{"data":{"id":"10.48550/arxiv.1706.00932","type":"dois","attributes":{"doi":"10.48550/arxiv.1706.00932","prefix":"10.48550","suffix":"arxiv.1706.00932","identifiers":[{"identifier":"1706.00932","identifierType":"arXiv"}],"alternateIdentifiers":[{"alternateIdentifierType":"arXiv","alternateIdentifier":"1706.00932"}],"creators":[{"name":"Aytar, Yusuf","nameType":"Personal","givenName":"Yusuf","familyName":"Aytar","affiliation":[],"nameIdentifiers":[]},{"name":"Vondrick, Carl","nameType":"Personal","givenName":"Carl","familyName":"Vondrick","affiliation":[],"nameIdentifiers":[]},{"name":"Torralba, Antonio","nameType":"Personal","givenName":"Antonio","familyName":"Torralba","affiliation":[],"nameIdentifiers":[]}],"titles":[{"title":"See, Hear, and Read: Deep Aligned Representations"}],"publisher":"arXiv","container":{},"publicationYear":2017,"subjects":[{"lang":"en","subject":"Computer Vision and Pattern Recognition (cs.CV)","subjectScheme":"arXiv"},{"subject":"FOS: Computer and information sciences","subjectScheme":"Fields of Science and Technology (FOS)"},{"subject":"FOS: Computer and information sciences","schemeUri":"http://www.oecd.org/science/inno/38235147.pdf","subjectScheme":"Fields of Science and Technology (FOS)"}],"contributors":[],"dates":[{"date":"2017-06-03T11:11:13Z","dateType":"Submitted","dateInformation":"v1"},{"date":"2017-06-06T00:04:14Z","dateType":"Updated","dateInformation":"v1"},{"date":"2017-06","dateType":"Available","dateInformation":"v1"},{"date":"2017","dateType":"Issued"}],"language":null,"types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"Article","resourceTypeGeneral":"Preprint"},"relatedIdentifiers":[],"relatedItems":[],"sizes":[],"formats":[],"version":"1","rightsList":[{"rights":"arXiv.org perpetual, non-exclusive license","rightsUri":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}],"descriptions":[{"description":"We capitalize on large amounts of readily-available, synchronous data to learn a deep discriminative representations shared across three major natural modalities: vision, sound and language. By leveraging over a year of sound from video and millions of sentences paired with images, we jointly train a deep convolutional network for aligned representation learning. Our experiments suggest that this representation is useful for several tasks, such as cross-modal retrieval or transferring classifiers between modalities. Moreover, although our network is only trained with image+text and image+sound pairs, it can transfer between text and sound as well, a transfer the network never observed during training. Visualizations of our representation reveal many hidden units which automatically emerge to detect concepts, independent of the modality.","descriptionType":"Abstract"}],"geoLocations":[],"fundingReferences":[],"xml":"PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPHJlc291cmNlIHhtbG5zPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCIgeG1sbnM6eHNpPSJodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYS1pbnN0YW5jZSIgeHNpOnNjaGVtYUxvY2F0aW9uPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCBodHRwOi8vc2NoZW1hLmRhdGFjaXRlLm9yZy9tZXRhL2tlcm5lbC00LjMvbWV0YWRhdGEueHNkIj4KICA8aWRlbnRpZmllciBpZGVudGlmaWVyVHlwZT0iRE9JIj4xMC40ODU1MC9BUlhJVi4xNzA2LjAwOTMyPC9pZGVudGlmaWVyPgogIDxhbHRlcm5hdGVJZGVudGlmaWVycz4KICAgIDxhbHRlcm5hdGVJZGVudGlmaWVyIGFsdGVybmF0ZUlkZW50aWZpZXJUeXBlPSJhclhpdiI+MTcwNi4wMDkzMjwvYWx0ZXJuYXRlSWRlbnRpZmllcj4KICA8L2FsdGVybmF0ZUlkZW50aWZpZXJzPgogIDxjcmVhdG9ycz4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5BeXRhciwgWXVzdWY8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPll1c3VmPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPkF5dGFyPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPlZvbmRyaWNrLCBDYXJsPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5DYXJsPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPlZvbmRyaWNrPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPlRvcnJhbGJhLCBBbnRvbmlvPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5BbnRvbmlvPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPlRvcnJhbGJhPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogIDwvY3JlYXRvcnM+CiAgPHRpdGxlcz4KICAgIDx0aXRsZT5TZWUsIEhlYXIsIGFuZCBSZWFkOiBEZWVwIEFsaWduZWQgUmVwcmVzZW50YXRpb25zPC90aXRsZT4KICA8L3RpdGxlcz4KICA8cHVibGlzaGVyPmFyWGl2PC9wdWJsaXNoZXI+CiAgPHB1YmxpY2F0aW9uWWVhcj4yMDE3PC9wdWJsaWNhdGlvblllYXI+CiAgPHN1YmplY3RzPgogICAgPHN1YmplY3QgeG1sOmxhbmc9ImVuIiBzdWJqZWN0U2NoZW1lPSJhclhpdiI+Q29tcHV0ZXIgVmlzaW9uIGFuZCBQYXR0ZXJuIFJlY29nbml0aW9uIChjcy5DVik8L3N1YmplY3Q+CiAgICA8c3ViamVjdCBzdWJqZWN0U2NoZW1lPSJGaWVsZHMgb2YgU2NpZW5jZSBhbmQgVGVjaG5vbG9neSAoRk9TKSI+Rk9TOiBDb21wdXRlciBhbmQgaW5mb3JtYXRpb24gc2NpZW5jZXM8L3N1YmplY3Q+CiAgPC9zdWJqZWN0cz4KICA8ZGF0ZXM+CiAgICA8ZGF0ZSBkYXRlVHlwZT0iU3VibWl0dGVkIiBkYXRlSW5mb3JtYXRpb249InYxIj4yMDE3LTA2LTAzVDExOjExOjEzWjwvZGF0ZT4KICAgIDxkYXRlIGRhdGVUeXBlPSJVcGRhdGVkIiBkYXRlSW5mb3JtYXRpb249InYxIj4yMDE3LTA2LTA2VDAwOjA0OjE0WjwvZGF0ZT4KICAgIDxkYXRlIGRhdGVUeXBlPSJBdmFpbGFibGUiIGRhdGVJbmZvcm1hdGlvbj0idjEiPjIwMTctMDY8L2RhdGU+CiAgPC9kYXRlcz4KICA8cmVzb3VyY2VUeXBlIHJlc291cmNlVHlwZUdlbmVyYWw9IlByZXByaW50Ij5BcnRpY2xlPC9yZXNvdXJjZVR5cGU+CiAgPHZlcnNpb24+MTwvdmVyc2lvbj4KICA8cmlnaHRzTGlzdD4KICAgIDxyaWdodHMgcmlnaHRzVVJJPSJodHRwOi8vYXJ4aXYub3JnL2xpY2Vuc2VzL25vbmV4Y2x1c2l2ZS1kaXN0cmliLzEuMC8iPmFyWGl2Lm9yZyBwZXJwZXR1YWwsIG5vbi1leGNsdXNpdmUgbGljZW5zZTwvcmlnaHRzPgogIDwvcmlnaHRzTGlzdD4KICA8ZGVzY3JpcHRpb25zPgogICAgPGRlc2NyaXB0aW9uIGRlc2NyaXB0aW9uVHlwZT0iQWJzdHJhY3QiPldlIGNhcGl0YWxpemUgb24gbGFyZ2UgYW1vdW50cyBvZiByZWFkaWx5LWF2YWlsYWJsZSwgc3luY2hyb25vdXMgZGF0YSB0byBsZWFybiBhIGRlZXAgZGlzY3JpbWluYXRpdmUgcmVwcmVzZW50YXRpb25zIHNoYXJlZCBhY3Jvc3MgdGhyZWUgbWFqb3IgbmF0dXJhbCBtb2RhbGl0aWVzOiB2aXNpb24sIHNvdW5kIGFuZCBsYW5ndWFnZS4gQnkgbGV2ZXJhZ2luZyBvdmVyIGEgeWVhciBvZiBzb3VuZCBmcm9tIHZpZGVvIGFuZCBtaWxsaW9ucyBvZiBzZW50ZW5jZXMgcGFpcmVkIHdpdGggaW1hZ2VzLCB3ZSBqb2ludGx5IHRyYWluIGEgZGVlcCBjb252b2x1dGlvbmFsIG5ldHdvcmsgZm9yIGFsaWduZWQgcmVwcmVzZW50YXRpb24gbGVhcm5pbmcuIE91ciBleHBlcmltZW50cyBzdWdnZXN0IHRoYXQgdGhpcyByZXByZXNlbnRhdGlvbiBpcyB1c2VmdWwgZm9yIHNldmVyYWwgdGFza3MsIHN1Y2ggYXMgY3Jvc3MtbW9kYWwgcmV0cmlldmFsIG9yIHRyYW5zZmVycmluZyBjbGFzc2lmaWVycyBiZXR3ZWVuIG1vZGFsaXRpZXMuIE1vcmVvdmVyLCBhbHRob3VnaCBvdXIgbmV0d29yayBpcyBvbmx5IHRyYWluZWQgd2l0aCBpbWFnZSt0ZXh0IGFuZCBpbWFnZStzb3VuZCBwYWlycywgaXQgY2FuIHRyYW5zZmVyIGJldHdlZW4gdGV4dCBhbmQgc291bmQgYXMgd2VsbCwgYSB0cmFuc2ZlciB0aGUgbmV0d29yayBuZXZlciBvYnNlcnZlZCBkdXJpbmcgdHJhaW5pbmcuIFZpc3VhbGl6YXRpb25zIG9mIG91ciByZXByZXNlbnRhdGlvbiByZXZlYWwgbWFueSBoaWRkZW4gdW5pdHMgd2hpY2ggYXV0b21hdGljYWxseSBlbWVyZ2UgdG8gZGV0ZWN0IGNvbmNlcHRzLCBpbmRlcGVuZGVudCBvZiB0aGUgbW9kYWxpdHkuPC9kZXNjcmlwdGlvbj4KICA8L2Rlc2NyaXB0aW9ucz4KPC9yZXNvdXJjZT4=","url":"https://arxiv.org/abs/1706.00932","contentUrl":null,"metadataVersion":0,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"viewCount":0,"viewsOverTime":[],"downloadCount":0,"downloadsOverTime":[],"referenceCount":0,"citationCount":0,"citationsOverTime":[],"partCount":0,"partOfCount":0,"versionCount":0,"versionOfCount":0,"created":"2022-03-03T20:42:03.000Z","registered":"2022-03-03T20:42:03.000Z","published":"2017","updated":"2022-03-03T20:42:03.000Z"},"relationships":{"client":{"data":{"id":"arxiv.content","type":"clients"}},"provider":{"data":{"id":"arxiv","type":"providers"}},"media":{"data":{"id":"10.48550/arxiv.1706.00932","type":"media"}},"references":{"data":[]},"citations":{"data":[]},"parts":{"data":[]},"partOf":{"data":[]},"versions":{"data":[]},"versionOf":{"data":[]}}}}