{"data":{"id":"10.48550/arxiv.2006.06195","type":"dois","attributes":{"doi":"10.48550/arxiv.2006.06195","prefix":"10.48550","suffix":"arxiv.2006.06195","identifiers":[{"identifier":"2006.06195","identifierType":"arXiv"}],"alternateIdentifiers":[{"alternateIdentifierType":"arXiv","alternateIdentifier":"2006.06195"}],"creators":[{"name":"Gan, Zhe","nameType":"Personal","givenName":"Zhe","familyName":"Gan","affiliation":[],"nameIdentifiers":[]},{"name":"Chen, Yen-Chun","nameType":"Personal","givenName":"Yen-Chun","familyName":"Chen","affiliation":[],"nameIdentifiers":[]},{"name":"Li, Linjie","nameType":"Personal","givenName":"Linjie","familyName":"Li","affiliation":[],"nameIdentifiers":[]},{"name":"Zhu, Chen","nameType":"Personal","givenName":"Chen","familyName":"Zhu","affiliation":[],"nameIdentifiers":[]},{"name":"Cheng, Yu","nameType":"Personal","givenName":"Yu","familyName":"Cheng","affiliation":[],"nameIdentifiers":[]},{"name":"Liu, Jingjing","nameType":"Personal","givenName":"Jingjing","familyName":"Liu","affiliation":[],"nameIdentifiers":[]}],"titles":[{"title":"Large-Scale Adversarial Training for Vision-and-Language Representation Learning"}],"publisher":"arXiv","container":{},"publicationYear":2020,"subjects":[{"lang":"en","subject":"Computer Vision and Pattern Recognition (cs.CV)","subjectScheme":"arXiv"},{"lang":"en","subject":"Computation and Language (cs.CL)","subjectScheme":"arXiv"},{"lang":"en","subject":"Machine Learning (cs.LG)","subjectScheme":"arXiv"},{"subject":"FOS: Computer and information sciences","subjectScheme":"Fields of Science and Technology (FOS)"},{"subject":"FOS: Computer and information sciences","schemeUri":"http://www.oecd.org/science/inno/38235147.pdf","subjectScheme":"Fields of Science and Technology (FOS)"}],"contributors":[],"dates":[{"date":"2020-06-11T05:14:35Z","dateType":"Submitted","dateInformation":"v1"},{"date":"2020-06-12T00:07:15Z","dateType":"Updated","dateInformation":"v1"},{"date":"2020-10-22T18:12:53Z","dateType":"Submitted","dateInformation":"v2"},{"date":"2020-10-26T00:01:03Z","dateType":"Updated","dateInformation":"v2"},{"date":"2020-06","dateType":"Available","dateInformation":"v1"},{"date":"2020","dateType":"Issued"}],"language":null,"types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"Article","resourceTypeGeneral":"Preprint"},"relatedIdentifiers":[],"relatedItems":[],"sizes":[],"formats":[],"version":"2","rightsList":[{"rights":"arXiv.org perpetual, non-exclusive license","rightsUri":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}],"descriptions":[{"description":"We present VILLA, the first known effort on large-scale adversarial training for vision-and-language (V+L) representation learning. VILLA consists of two training stages: (i) task-agnostic adversarial pre-training; followed by (ii) task-specific adversarial finetuning. Instead of adding adversarial perturbations on image pixels and textual tokens, we propose to perform adversarial training in the embedding space of each modality. To enable large-scale training, we adopt the \"free\" adversarial training strategy, and combine it with KL-divergence-based regularization to promote higher invariance in the embedding space. We apply VILLA to current best-performing V+L models, and achieve new state of the art on a wide range of tasks, including Visual Question Answering, Visual Commonsense Reasoning, Image-Text Retrieval, Referring Expression Comprehension, Visual Entailment, and NLVR2.","descriptionType":"Abstract"},{"description":"NeurIPS 2020 Spotlight paper","descriptionType":"Other"}],"geoLocations":[],"fundingReferences":[],"xml":"PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPHJlc291cmNlIHhtbG5zPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCIgeG1sbnM6eHNpPSJodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYS1pbnN0YW5jZSIgeHNpOnNjaGVtYUxvY2F0aW9uPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCBodHRwOi8vc2NoZW1hLmRhdGFjaXRlLm9yZy9tZXRhL2tlcm5lbC00LjMvbWV0YWRhdGEueHNkIj4KICA8aWRlbnRpZmllciBpZGVudGlmaWVyVHlwZT0iRE9JIj4xMC40ODU1MC9BUlhJVi4yMDA2LjA2MTk1PC9pZGVudGlmaWVyPgogIDxhbHRlcm5hdGVJZGVudGlmaWVycz4KICAgIDxhbHRlcm5hdGVJZGVudGlmaWVyIGFsdGVybmF0ZUlkZW50aWZpZXJUeXBlPSJhclhpdiI+MjAwNi4wNjE5NTwvYWx0ZXJuYXRlSWRlbnRpZmllcj4KICA8L2FsdGVybmF0ZUlkZW50aWZpZXJzPgogIDxjcmVhdG9ycz4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5HYW4sIFpoZTwvY3JlYXRvck5hbWU+CiAgICAgIDxnaXZlbk5hbWU+WmhlPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPkdhbjwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5DaGVuLCBZZW4tQ2h1bjwvY3JlYXRvck5hbWU+CiAgICAgIDxnaXZlbk5hbWU+WWVuLUNodW48L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+Q2hlbjwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5MaSwgTGluamllPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5MaW5qaWU8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+TGk8L2ZhbWlseU5hbWU+CiAgICA8L2NyZWF0b3I+CiAgICA8Y3JlYXRvcj4KICAgICAgPGNyZWF0b3JOYW1lIG5hbWVUeXBlPSJQZXJzb25hbCI+Wmh1LCBDaGVuPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5DaGVuPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPlpodTwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5DaGVuZywgWXU8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPll1PC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPkNoZW5nPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPkxpdSwgSmluZ2ppbmc8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPkppbmdqaW5nPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPkxpdTwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICA8L2NyZWF0b3JzPgogIDx0aXRsZXM+CiAgICA8dGl0bGU+TGFyZ2UtU2NhbGUgQWR2ZXJzYXJpYWwgVHJhaW5pbmcgZm9yIFZpc2lvbi1hbmQtTGFuZ3VhZ2UgUmVwcmVzZW50YXRpb24gTGVhcm5pbmc8L3RpdGxlPgogIDwvdGl0bGVzPgogIDxwdWJsaXNoZXI+YXJYaXY8L3B1Ymxpc2hlcj4KICA8cHVibGljYXRpb25ZZWFyPjIwMjA8L3B1YmxpY2F0aW9uWWVhcj4KICA8c3ViamVjdHM+CiAgICA8c3ViamVjdCB4bWw6bGFuZz0iZW4iIHN1YmplY3RTY2hlbWU9ImFyWGl2Ij5Db21wdXRlciBWaXNpb24gYW5kIFBhdHRlcm4gUmVjb2duaXRpb24gKGNzLkNWKTwvc3ViamVjdD4KICAgIDxzdWJqZWN0IHhtbDpsYW5nPSJlbiIgc3ViamVjdFNjaGVtZT0iYXJYaXYiPkNvbXB1dGF0aW9uIGFuZCBMYW5ndWFnZSAoY3MuQ0wpPC9zdWJqZWN0PgogICAgPHN1YmplY3QgeG1sOmxhbmc9ImVuIiBzdWJqZWN0U2NoZW1lPSJhclhpdiI+TWFjaGluZSBMZWFybmluZyAoY3MuTEcpPC9zdWJqZWN0PgogICAgPHN1YmplY3Qgc3ViamVjdFNjaGVtZT0iRmllbGRzIG9mIFNjaWVuY2UgYW5kIFRlY2hub2xvZ3kgKEZPUykiPkZPUzogQ29tcHV0ZXIgYW5kIGluZm9ybWF0aW9uIHNjaWVuY2VzPC9zdWJqZWN0PgogIDwvc3ViamVjdHM+CiAgPGRhdGVzPgogICAgPGRhdGUgZGF0ZVR5cGU9IlN1Ym1pdHRlZCIgZGF0ZUluZm9ybWF0aW9uPSJ2MSI+MjAyMC0wNi0xMVQwNToxNDozNVo8L2RhdGU+CiAgICA8ZGF0ZSBkYXRlVHlwZT0iVXBkYXRlZCIgZGF0ZUluZm9ybWF0aW9uPSJ2MSI+MjAyMC0wNi0xMlQwMDowNzoxNVo8L2RhdGU+CiAgICA8ZGF0ZSBkYXRlVHlwZT0iU3VibWl0dGVkIiBkYXRlSW5mb3JtYXRpb249InYyIj4yMDIwLTEwLTIyVDE4OjEyOjUzWjwvZGF0ZT4KICAgIDxkYXRlIGRhdGVUeXBlPSJVcGRhdGVkIiBkYXRlSW5mb3JtYXRpb249InYyIj4yMDIwLTEwLTI2VDAwOjAxOjAzWjwvZGF0ZT4KICAgIDxkYXRlIGRhdGVUeXBlPSJBdmFpbGFibGUiIGRhdGVJbmZvcm1hdGlvbj0idjEiPjIwMjAtMDY8L2RhdGU+CiAgPC9kYXRlcz4KICA8cmVzb3VyY2VUeXBlIHJlc291cmNlVHlwZUdlbmVyYWw9IlByZXByaW50Ij5BcnRpY2xlPC9yZXNvdXJjZVR5cGU+CiAgPHZlcnNpb24+MjwvdmVyc2lvbj4KICA8cmlnaHRzTGlzdD4KICAgIDxyaWdodHMgcmlnaHRzVVJJPSJodHRwOi8vYXJ4aXYub3JnL2xpY2Vuc2VzL25vbmV4Y2x1c2l2ZS1kaXN0cmliLzEuMC8iPmFyWGl2Lm9yZyBwZXJwZXR1YWwsIG5vbi1leGNsdXNpdmUgbGljZW5zZTwvcmlnaHRzPgogIDwvcmlnaHRzTGlzdD4KICA8ZGVzY3JpcHRpb25zPgogICAgPGRlc2NyaXB0aW9uIGRlc2NyaXB0aW9uVHlwZT0iQWJzdHJhY3QiPldlIHByZXNlbnQgVklMTEEsIHRoZSBmaXJzdCBrbm93biBlZmZvcnQgb24gbGFyZ2Utc2NhbGUgYWR2ZXJzYXJpYWwgdHJhaW5pbmcgZm9yIHZpc2lvbi1hbmQtbGFuZ3VhZ2UgKFYrTCkgcmVwcmVzZW50YXRpb24gbGVhcm5pbmcuIFZJTExBIGNvbnNpc3RzIG9mIHR3byB0cmFpbmluZyBzdGFnZXM6IChpKSB0YXNrLWFnbm9zdGljIGFkdmVyc2FyaWFsIHByZS10cmFpbmluZzsgZm9sbG93ZWQgYnkgKGlpKSB0YXNrLXNwZWNpZmljIGFkdmVyc2FyaWFsIGZpbmV0dW5pbmcuIEluc3RlYWQgb2YgYWRkaW5nIGFkdmVyc2FyaWFsIHBlcnR1cmJhdGlvbnMgb24gaW1hZ2UgcGl4ZWxzIGFuZCB0ZXh0dWFsIHRva2Vucywgd2UgcHJvcG9zZSB0byBwZXJmb3JtIGFkdmVyc2FyaWFsIHRyYWluaW5nIGluIHRoZSBlbWJlZGRpbmcgc3BhY2Ugb2YgZWFjaCBtb2RhbGl0eS4gVG8gZW5hYmxlIGxhcmdlLXNjYWxlIHRyYWluaW5nLCB3ZSBhZG9wdCB0aGUgImZyZWUiIGFkdmVyc2FyaWFsIHRyYWluaW5nIHN0cmF0ZWd5LCBhbmQgY29tYmluZSBpdCB3aXRoIEtMLWRpdmVyZ2VuY2UtYmFzZWQgcmVndWxhcml6YXRpb24gdG8gcHJvbW90ZSBoaWdoZXIgaW52YXJpYW5jZSBpbiB0aGUgZW1iZWRkaW5nIHNwYWNlLiBXZSBhcHBseSBWSUxMQSB0byBjdXJyZW50IGJlc3QtcGVyZm9ybWluZyBWK0wgbW9kZWxzLCBhbmQgYWNoaWV2ZSBuZXcgc3RhdGUgb2YgdGhlIGFydCBvbiBhIHdpZGUgcmFuZ2Ugb2YgdGFza3MsIGluY2x1ZGluZyBWaXN1YWwgUXVlc3Rpb24gQW5zd2VyaW5nLCBWaXN1YWwgQ29tbW9uc2Vuc2UgUmVhc29uaW5nLCBJbWFnZS1UZXh0IFJldHJpZXZhbCwgUmVmZXJyaW5nIEV4cHJlc3Npb24gQ29tcHJlaGVuc2lvbiwgVmlzdWFsIEVudGFpbG1lbnQsIGFuZCBOTFZSMi48L2Rlc2NyaXB0aW9uPgogICAgPGRlc2NyaXB0aW9uIGRlc2NyaXB0aW9uVHlwZT0iT3RoZXIiPk5ldXJJUFMgMjAyMCBTcG90bGlnaHQgcGFwZXI8L2Rlc2NyaXB0aW9uPgogIDwvZGVzY3JpcHRpb25zPgo8L3Jlc291cmNlPg==","url":"https://arxiv.org/abs/2006.06195","contentUrl":null,"metadataVersion":0,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"viewCount":0,"viewsOverTime":[],"downloadCount":0,"downloadsOverTime":[],"referenceCount":0,"citationCount":0,"citationsOverTime":[],"partCount":0,"partOfCount":0,"versionCount":0,"versionOfCount":0,"created":"2022-02-25T06:34:45.000Z","registered":"2022-02-25T06:34:45.000Z","published":"2020","updated":"2022-02-25T06:34:45.000Z"},"relationships":{"client":{"data":{"id":"arxiv.content","type":"clients"}},"provider":{"data":{"id":"arxiv","type":"providers"}},"media":{"data":{"id":"10.48550/arxiv.2006.06195","type":"media"}},"references":{"data":[]},"citations":{"data":[]},"parts":{"data":[]},"partOf":{"data":[]},"versions":{"data":[]},"versionOf":{"data":[]}}}}