{“状态”:“确定”,“消息类型”:“工作”,“信息版本”:“1.0.0”,“邮件”:{“索引”:{“日期-部件”:[[2024,6,7]],“日期-时间”:“2024-06-07T05:17:54Z”,“时间戳”:1717737474320},“引用-计数”:134,“发布者”:“IEEE”,“许可证”:[{“开始”:{-“日期-零件”:[2022,6,1]],”日期-时间“:”2022-06-01T00:00:00 Z“,”timestamp“:1654041600000},”content-version“:”stm-asf“,“delay-in-days”:0,“URL”:“https:\/\/doi.org\/10.15223\/policy-029”},{“start”:{“date-parts”:[2022,6,1]],“date-time”:“2022-06-01T00:00:00Z”,“timestamp”:1654041600000},“content-version”:“stm-asf”,“dellay-in-days”:0域“:{”域“:[],”交叉标记-限制“:false},”短容器-时间“:[],“published-print”:{“date-parts”:[[2022,6]]},“DOI”:“10.1109\/cvpr52688.2022.01589”,“type”:“proceedings-article”,“created”:{“date-ports”:[2022,9,27]],“date-time”:“2022-09-27T19:56:41Z”,“timestamp”:1664308601000},”source“Crossref”,“is-referenced-by-count”:54,“title”:[“MERLOT RESERVE:通过视觉和语言和声音“],“prefix”:“10.1109”,“author”:[{“given”:“Rowan”,“family”:“Zellers”,“sequence”:“first”,“affiliation”:[}“name”:“Paul G.Allen School of Computer Science&Engineering,Washington”}]},{“fixed”:“Jiasen”,“家庭”:“Lu”,《sequence》:“additional”,“feliation”:“Lu”,“sequence”:“additional”,“affiliation”:[{“name”:“Allen Institute for Artificial Intelligence”}]},{“given”:“Youngjae”,“family”:“Yu”,“serquence”:“additionable”,“feliation”:[{(名称):“Allen-Intertifical Intelligational”}],{(给定):“Yanpeng”,“家族”:“Zhao”,“sequence”(序列):“addressional”,“atriation”:“[{”名称“:“艾伦人工智能研究所”}]},{“given”:“Mohammadreza”,“family”:“Salehi”,”sequence“:”additional“,”affiliation“:[{“name”:“University of Edinburgh”}]neneneep,{”given“:”Aditya“,”family“:”Kusupati“,”session“:”additional“”affidiation“:[{”name“:”Paul G.Allen School of Computer Science&Engineering,Washington”}]{,“givent”:“Jack”,“family”:“Hessel”,“sequence”:“additional”,“affiliation”:[{“name”:“Allen Institute for Artificial Intelligence”}]},{“given”:“Ali”,“家庭”:“Farhadi”,“序列”:“附加”,“从属”:[}“name“:”Paul G.Allen School of Computer Science&Engineering,University of Washington”}]{,“affiliation”:[{“name”:“Allen Institute for Artificial Intelligence”}]}],“member”:“263”,“reference”:[}“key”:“ref39”,“article-title”:“vision and language representation learning的大规模对抗性训练”,“author”:“gan”,“year”:“2020”,“journal-title“:”ArXiv Preprint:“滚动展开第一人称视频中动作预期的lstms”,“author”:“furnari”,“year”:“2020”,“journal-title”:“IEEE模式分析与机器智能(PAMI)汇刊”},{“key”:“ref33”,“first-page”:“1422”,“article-title,“journal-title”:“IEEE国际计算机视觉会议论文集”},{“key”:“ref32”,“article-title“:”记录英语庞大的清洁爬虫语料库“,“volume”:”abs 2104 8758“,“author”:“dodge”,“year”:“2021”,“jornal-tittle”:《CoRR》},}“key:”ref31“,”doi-asserted-by“:”publisher“,”doi“:”10.1111\/j.1460-2466.2000.tb02845.x“},{“key”:“ref30”,“doi-asserted-by”:“publisher”,“doi”:“10.1111\/j.1460-2466.2007.00376.x”},{“密钥”:“ref37”,“doi-assertd-by”:“publisher”,“DI”:“10.3390\/info2010140”}、{“键”:“参考36”,“文章-标题”:“从生活方式日志到日常交互”,“作者”:“david”,“年份”:“2018”,“日志标题”:”CVPR“},“:”ref35“,”doi-asserted-by“:”publisher“,”doi“:“10.1016\/0896-6273(93)90304-A”},{“key”:“ref34”,“article-title”:“图像值16x16个单词:图像识别的变形金刚”,“author”:“dosovitskiy”,“year”:“2020”,“journal-title“:ArXiv Preprint”},“author”:“sunipa”,“year”:“2021”,“journal-title”:“ArXiv预印本”},{“key”:“ref27”,“doi-asserted-by”:“publisher”,“doi”:“10.1109\/CVPR46437.2021.01101”},{“键”:“参考20”,“doi由”断言:“publisher”,“doi”:“10.1017\/S021963099004953”},{“key”:“ref22”,“文章标题”:“通过文本生成统一视觉和语言任务”,“author”:“cho”,“year”:“2021”,“journal title”:“ICML”},{“key”:“ref21”,“first page”:“104”,“文章标题”:“Uniter:通用图像文本表示学习”,“author”:“chen”,“year”:“0”,“journal-title”:“European Conference on Computer Vision”},{“key”:“ref24”,“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/D19-1418”}、{“密钥”:“ref23”,“author”:“tweet”,“year”:“0”,“jornal-title:”描述中图像特征和词语之间的重要依赖性可以通过词语之间的依赖性来解释“},{“key”:“ref101”,“author”:“srnicek”,“year”:“2017”,“journal-title”:“Platform Capitalism”},{“key”:“ref26”,“article-title“:“Rescaling利己主义视觉:epic-kitchens-100的收集、管道和挑战”,“author”:“damen”,《year》:“2021”,“journal-ttitle”:《国际计算机视觉杂志》(IJCV)”}、{“key”:《ref100》,“doi-asserted-by”:“publisher”,“DOI”:“10.1162\/1064546053278973”},{“key”:“ref25”,“DOI-asserted-by”:“publicher”,“DOI”:”10.1177\/1461444816657096“}”,{”key“:”ref50“,”DOI-assert-by“:”publisher“,”DOI“:”10.1145\/2509558.2509563 7.670“},{”key“:”ref59“,”DOI-asserted-by“:”publisher“,”DOI“:“10.18653\/v1\/2020.emnlp-main.62”},{“key”:“ref58”,“doi-asserted-by”:“publisher”,“doi”:“10.4324\/9781410606105”}、{“密钥”:“ref57”,“doi-asserte-by”:“publisher”,“DI:”10.1109\/CVPR.2016.90,{“key”:“ref55”,“首页”:“1”,“文章标题”:“性别识别或性别还原主义?嵌入式性别识别系统的社会影响”,“作者”:“hamidi”,“年份”:“0”,“新闻标题”:“2018 CHI计算机系统人为因素会议论文集”},{“key”:“ref54”,“article-title”:“Audioclip:将剪辑扩展到图像、文本和音频”,“author”:“guzhov”,“year”:“2021”,“期刊标题”:“ArXiv预印本”},{“key”:“ref53”,“doi断言”:“publisher”,“doi”:“10.1109\/TASP.1984.1164317”},{“key”:“ref52”,“文章标题”:“好”还不够好,“作者”:“绿色”,“年份”:“0”,“期刊标题”:“NeurIPS人工智能促进社会公益研讨会论文集”},{“key”:“ref40”,“文章标题”:“The pile:用于语言建模的800gb多样性文本数据集”,“author”:“gao”,“year”:“2020”,“journal-title”:“ArXiv预打印”},{“key”:“ref4”,“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/D19-1219”}“ArXiv预打印”},{“key”:“ref6”,“doi-asserted-by”:“publisher”,“doi”:“10.1109\/TPAMI.2018.2798607”}“,”article-title“:“观察世界:从未标记视频中学习表征”,“author”:“gordon”,“year”:“2020”,“journal-title”:“ArXiv预印本”},{“key”:“ref7”,“first page”:《610》,“article-title》:“关于随机鹦鹉的危险:语言模型是否太大?”,“author”:“emily”,“年份”:“0”:“时空关注是视频理解所需要的吗?”,“作者”:“bertasius”,“年份”:“2021”,“新闻标题”:“ArXiv预打印”},{“key”:“ref46”,“doi-asserted-by”:“publisher”,“doi”:“10.1109\/ICCV48922.2021.01325”},“首页”:“1”,“文章标题”:“面向聋哑或听力障碍人群的用户驱动声音识别器个性化”,“音量”:“5”,“作者”:“史蒂文”,“年份”:“0”,“新闻标题”:“交互式移动穿戴和普及技术ACM会议录”},{“key”:“ref47”,“doi-asserted-by”:“publisher”,“doi”:“10.21437\/Interspeech.2021-698”},“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/2020.findings-emnlp.301”},{“key”:“ref41”,“article-title”:“数据集数据表”,“author”:“gebru”,“year”:“2018”,“journal-title“:“ArXiv预打印”}“key”:“ref43”,“doi-asserted-by”:“publisher”,“doi”:“10.1109”\/ICASSP.2017.7952261“},{“key”:“ref127”,“article-title”:“防御神经伪新闻”,“volume”:”32“,“author”:“zellers”,“year”:“2019”,“journal-title“:“Advances in neural information processing systems”},“key“:”ref126“,”doi-asserted-by“:”publisher“,“doi”:“10.109\/CVPR.2019.00688”}:“佛罗伦萨:计算机视觉的新基础模型”,“作者”:“元”,“年份”:“2021年”,“新闻标题”:“ArXiv预印本”},{“关键字”:“ref124”,“文章标题”:”Ernie-vil:“通过场景图增强视觉语言的知识表示”,“作家”:“yu”,“年”:“2020年”,”新闻标题“ArXiv-预印本:“混合:有效的正则化来微调大规模预训练的语言模型”,“作者”:“lee”,“年份”:“0”,“期刊标题”:“国际学习表征会议”},{“key”:“ref72”,“文章标题”:“Albert:语言表征自我监督学习的lite bert”,“作者”:“lan”,“年份”:“0”,“期刊标题”:“学习表征国际会议”},{“key”:“ref129”,“doi-asserted-by”:“publisher”,”doi“:“10.1109\/CVPR.2018.00611”},“journal-title”:“ArXiv Preprint”},{“key”:“ref70”,“doi-asserted-by”:“crossref”,“first page”:”491“,“doi”:“10.1007\/978-3030-58558-7_29”,“article-title“:大转移(比特):一般视觉表征学习”,“author”:“kolesnikov”,“year”:“2020”,“johnal-title:”计算机愿景-ECCV 2020第16届欧洲会议“},}“key:”ref76“,“doi-asserted-by”:“publisher”,“doi”:“10.1016\/j.cviu.2016.09.001”},{“key”:“ref130”,“author”:“zai”,“year”:“2021”,“journal-title”:“Scaling vision transformers”}、{“key”:”ref77“article-title“:”Visualbert:视觉和语言的简单和性能基线“,”author“:”harold li“,”year“2019”,“日记标题”:“ArXiv Preprint”},{“键”:“参考74”,“first page”:“7331”,“article-title”:“Less is more:Clipbert for video and language learning via sparse sampling”,“author”:“jie”,“year”:“0”,“journal title”:《IEEE\/CVF计算机视觉与模式识别会议论文集》},{“key”:《ref75》,“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/D18-1167”},“key“ref133”,“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/D17-1323”},{“key”:“ref134”,”doi-assert-by“:”publisher“,”doi“:”10.1057\/jit.2015.5“},”{“key”:”ref131“,“doi-asserted-by”:”publister“,”doi“:“10.1109\/CVPR46437.2021.00553”}“,{”key“:”ref78“,”doi-assert-by“:”publisher“,“doi”:“10.1109\/CVPR46437.2021.00693”},{“key”:“ref132”,“article-title”:“从成对图像和文本中对比学习医学视觉表征”,“作者”:“余浩”,“年份”:“2020年”,“期刊标题”:“ArXiv预印本”},{“关键”:“参考79”,“文章标题”:”光学:用于跨模态理解和生成的全感知预印本、“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/K19-1039”},{“key”:“ref62”,“doi-sserted-by“:”publisher“,”doi“:”10.18653\/v1\\/K19-1006“},”{“key”:”ref61“,”doi-assert-by“”:“publisher”,“doi”:“在嘈杂的文本监督下进行语言表征学习”,“作者”:“jia”,“year”:“2021”,“journal-title”:“ArXiv预打印”},{“key”:“ref64”,“doi-asserted-by”:“crossref”,”first page“:”64“,”doi“:”10.1162\/tacl_a_00300“,”article-title“:”Spanbert:“通过表示和预测跨度改进预训练”,“volume”:”8“,”author“:”mandar“,”year“:”2020“,”journal-title“:“计算语言学协会学报”},{“key”:“ref65”,“首页”:“39”,“article-title”:“我的数据无处不在:“互联网的用户心理模型以及对隐私和安全的影响”,“author”:“kang”,“year”:“0”,“journal-title“:“第十一届可用隐私和安全研讨会(SOUPS 2015)”}:“ref66”,“article-title”:“神经语言模型的缩放定律”,“author”:“jared”,“year”:“2020”,“journal-title“:”ArXiv预打印“},{“key”:“ref67”,“doi-asserted-by”:“publisher”,”doi“:”10.1080\/03080188.2020.1840224“}”,{”key“:”ref68“,”doi-assert-by“:”publisher“,”doi:“10.1609\/aaaai.v35i14.17556”},{“key”:“ref2”,“article-title”:“VATT:从原始视频、音频和文本进行多模式自主学习的变形金刚”,“作者”:“akbari”,“年份”:“2021年”,“新闻标题”:“ArXiv预印本”},{“关键”:“ref69”,“文章标题”:”Adam:随机优化方法“,”卷“:”abs 1412 6980“,”作者“:”kingma“,”年“:”2014年“新闻标题“:”CoRR“},”{“重点”:“ref1”,“论文标题”:“Youtube-8m:大型视频分类基准”,“author”:“abu-el-haija”,“year”:“2016”,“journal-title”:“ArXiv Preprint”},{“key”:“ref109”,”doi-asserted-by“:”publisher“,”doi“:”10.18653\/v1\/2020.findings-emnlp.417“”,“年份”:“0”,“新闻标题”:“2020年公平问责与透明度会议记录”},{“key”:“ref108”,“首页”:“8252”,“article-title”:“修复列车测试分辨率差异”,“volume”:”32“,“author”:“touvron”,“year”:“2019”,“journal-title“:“Advances in neural information processing systems”},“doi-asserted-by”:“publisher”,“doi”:“10.7312\/raju18532”},{“key”:“ref107”,“doi-asserted-by”:”publisher“,”doi“:”10.18653\/v1\/W17-1606“},”{“key”:”ref93“,”first page“:”1“,”article-title“:”使用统一的文本到文本转换器探索迁移学习的极限“,”volume“:“21”,“author”:“raffel”,“year”:“2020”,“新闻标题“:“Journal of Machine Learning Research”},{“key”:“ref106”,“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/D19-1514”}:“10.1109\/ICCV.2019.00756”},{“key”:“ref91”,“doi-asserted-by”:“publisher”,“doi”:“10.109\/AVSS.2009.53”}“:”10.1145\/2733373.2806390“},{”密钥“:“ref103”,“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/P19-1355”},{“key”:“ref102”,“author”:“strangelove”,“year”:“2020”,“journal title”:“Watching YouTube”}“,”文章标题“:“不确定的未来:使用变分自动编码器从静态图像进行预测”,“作者”:“walker”,“年份”:“0”,“新闻标题”:“欧洲计算机视觉会议”},{“关键字”:“ref110”,“首页”:“5998”,“文章标题”:”注意力是你所需要的”,“作家”:“vaswani”,“年”:“2017”,“杂志标题”:“神经信息处理系统的进展”},{“key”:“ref98”,“doi-asserted-by”:“publisher”,“doi”:“10.1145\/2647868.2655045”}:“第四届国际人工智能联合会议论文集-”},{“key”:“ref96”,“doi-asserted-by”:“publisher”,“doi”:“10.1007”,{《key》:“ref97”,“article-title”:“Avlnet:从教学视频中学习视听语言表征”,“author”:“rouditchenko”,“year”:“2020”,“journal-tittle”:“ArXiv预印”},{“key”:“ref10”,“author”:“biderman”,“year”:“2021”,“journal-title”:“Rotary embeddings A relative revolution”}“,”doi“:“10.1177\/1354856517736978”},{“key”:“ref13”,“doi-asserted-by”:“publisher”,“doi”:“10.18653\/v1\/2021.findings-emnlp.259”}ted-by“:”publisher“,”doi“:”10.18653\/v1\/D19-1176“},{“key”:“ref118”,“doi asserted by”:“publisher”,“doi”:“10.18653\/v1\/K17-1029”},{“key”:“ref16”,“文章标题”:“语言模型很少学习”,“作者”:“tom”,“年份”:“2020”,“期刊标题”:“ArXiv预印本”},{“key”:“ref82”,“文章标题”:“通过填空问答理解视频数据的数据集和模型探索”,“author”:“tegan”,“year”:“2017”,“journal title”:“Computer Vision and Pattern Recognition(CVPR)”},{“key”:“ref117”,“journal-title”:“ArXiv预打印”},{“key”:“ref17”,“first page”:”67“,“article-title“:“Vggface2:跨姿势和年龄识别人脸的数据集”,“author”:“qiong”,“year”:“0”,“jornal-tittle”:《2018年IEEE第13届自动人脸和手势识别国际会议(FG 2018)》}:“ViL-BERT:针对视觉和语言任务的预训练任务-认知视觉语言表征”,“author”:“lu”,“year”:“2019”,“journal-title”:“Advances in neural information processing systems”},{“key”:“ref18”,“article-title“:“Extracting training data from large language models”,“author”:“carlini”,“年份”:“2020”,“日记标题”:“ArXiv预印本”},{“key”:“ref84”,“doi-asserted-by”:“publisher”,“doi”:“10.1109”\/ICCV.2019.00272,{“key”:“ref83”,“doi-asserted-by”:“publisher”,“doi”:“10.1177\/1461444814543995”},{“key”:“ref114”,“article-title”:“通用音频表征的多模自指导学习”,“author”:“wang”,“year”:“2021”,“journal-title“:”ArXiv Preprint“},“key“:”ref113“,“doi-asserted-by”:”publisher“,”doi“:”10.1109\/CVPR46437.2021.00 252“},{”键“:”参考116“,“文章标题”:“Simvlm:弱监督下的简单视觉语言模型预训练”,“作者”:“wang”,“年份”:“2021”,“期刊标题”:“ArXiv预印本”},{“key”:“ref80”,“文章标题”:“解耦权重衰减正则化”,“作者”:“loshchilov”,“年份”:“2017”,“期刊标题”:“ArXiv预印本”},{“key”:“ref115”,“文章标题”:“Tacotron:走向端到端语音合成”,“author”:“wang”,“year”:“2017”,“journal-title”:“ArXiv预打印”},{“key”:“ref120”,“doi-asserted-by”:“publisher”,“doi”:“10.1145\/312366.3123427”}“出版商”、“内政部”:“10.18653\/v1\/2021.emnlp-main.544”},{“key”:“ref122”,“article-title”:“用于视频识别的多视图变换器”,“author”:“yan”,“year”:“2022”,“journal-title“:”ArXiv Preprint“},”{“密钥”:“ref123”,“doi-asserted-by”:“publisher”,”doi“:”10.29007\/qwpk“}”,{”key“:”ref85“,”doi-assert-by“:”publisher“,”doi“:”10.1109\/ICCV.2019.00272“},{”key“:”ref86“,“first page”:“1”,“article-title”:“探索youtube上的性别差异:对vlog的创建和接收的分析”,“volume”:”10“,“author”:“molyneaux”,“year”:“2008”,“journal-title“:“American Communication journal”},{“key”:《美国传播杂志》:“ref87”,“article-title”:“Voxceleb:一个大规模说话人识别数据集”,“auth”:“nagrani”,“年份”:“2017”,“journal-title”:“ArXiv预印本”},{“key”:“ref88”,“article-title“:“碳排放和大型神经网络训练”,“author”:“patterson”,“year”:“2021”,“jornal-tittle”:《ArXiv印本》}],“event”:{“name”:“2020 IEEE\/CVF计算机视觉与模式识别会议(CVPR)”,“location”:“New Orleans,LA,USA”,“start”:{“date-parts”:[2022,6,18]]},“end”:{“date-parts”:[[2022,6,24]]}},”container-title“:[”2022 IEEE\/CVF计算机视觉和模式识别会议(CVPR)“],”original-title”:[],“link”:[{“URL”:“http://\/xplorestaging.IEEE.org\/ielx7\/987378\/987366\/098799062.pdf?arnumber=9879062“,”content-type“未指定”,“content-version”:“vor”,“intended-application”:“相似性检查”}],“存放”:{“日期部分”:[[2022,10,14]],“日期时间”:“2022-10-14T20:58:15Z”,“时间戳”:1665781095000},“分数”:1,“资源”:{“主要”:{:“URL”:“https:\/\/ieeexplore.ieee.org\/document\/9879062\/”}},”副标题“:[],”短标题“:[],”已发布“:{”日期部分“:[2022,6]]},“references-count”:134,“URL”:“http://\/dx.doi.org\/10.109\/cvpr52688.2022.01589”,“关系”:{},“主题”:[],“发布”:{“日期部分”:[[2022,6]]}}}