{"data":{"id":"10.48550/arxiv.2308.06571","type":"dois","attributes":{"doi":"10.48550/arxiv.2308.06571","prefix":"10.48550","suffix":"arxiv.2308.06571","identifiers":[{"identifier":"2308.06571","identifierType":"arXiv"}],"alternateIdentifiers":[{"alternateIdentifierType":"arXiv","alternateIdentifier":"2308.06571"}],"creators":[{"name":"Wang, Jiuniu","nameType":"Personal","givenName":"Jiuniu","familyName":"Wang","affiliation":[],"nameIdentifiers":[]},{"name":"Yuan, Hangjie","nameType":"Personal","givenName":"Hangjie","familyName":"Yuan","affiliation":[],"nameIdentifiers":[]},{"name":"Chen, Dayou","nameType":"Personal","givenName":"Dayou","familyName":"Chen","affiliation":[],"nameIdentifiers":[]},{"name":"Zhang, Yingya","nameType":"Personal","givenName":"Yingya","familyName":"Zhang","affiliation":[],"nameIdentifiers":[]},{"name":"Wang, Xiang","nameType":"Personal","givenName":"Xiang","familyName":"Wang","affiliation":[],"nameIdentifiers":[]},{"name":"Zhang, Shiwei","nameType":"Personal","givenName":"Shiwei","familyName":"Zhang","affiliation":[],"nameIdentifiers":[]}],"titles":[{"title":"ModelScope Text-to-Video Technical Report"}],"publisher":"arXiv","container":{},"publicationYear":2023,"subjects":[{"lang":"en","subject":"Computer Vision and Pattern Recognition (cs.CV)","subjectScheme":"arXiv"},{"lang":"en","subject":"Artificial Intelligence (cs.AI)","subjectScheme":"arXiv"},{"subject":"FOS: Computer and information sciences","subjectScheme":"Fields of Science and Technology (FOS)"},{"subject":"FOS: Computer and information sciences","schemeUri":"http://www.oecd.org/science/inno/38235147.pdf","subjectScheme":"Fields of Science and Technology (FOS)"}],"contributors":[],"dates":[{"date":"2023-08-12T13:53:10Z","dateType":"Submitted","dateInformation":"v1"},{"date":"2023-08-15T00:12:17Z","dateType":"Updated","dateInformation":"v1"},{"date":"2023-08","dateType":"Available","dateInformation":"v1"},{"date":"2023","dateType":"Issued"}],"language":null,"types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"Article","resourceTypeGeneral":"Preprint"},"relatedIdentifiers":[],"relatedItems":[],"sizes":[],"formats":[],"version":"1","rightsList":[{"rights":"arXiv.org perpetual, non-exclusive license","rightsUri":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}],"descriptions":[{"description":"This paper introduces ModelScopeT2V, a text-to-video synthesis model that evolves from a text-to-image synthesis model (i.e., Stable Diffusion). ModelScopeT2V incorporates spatio-temporal blocks to ensure consistent frame generation and smooth movement transitions. The model could adapt to varying frame numbers during training and inference, rendering it suitable for both image-text and video-text datasets. ModelScopeT2V brings together three components (i.e., VQGAN, a text encoder, and a denoising UNet), totally comprising 1.7 billion parameters, in which 0.5 billion parameters are dedicated to temporal capabilities. The model demonstrates superior performance over state-of-the-art methods across three evaluation metrics. The code and an online demo are available at \\url{https://modelscope.cn/models/damo/text-to-video-synthesis/summary}.","descriptionType":"Abstract"},{"description":"Technical report. Project page: \\url{https://modelscope.cn/models/damo/text-to-video-synthesis/summary}","descriptionType":"Other"}],"geoLocations":[],"fundingReferences":[],"xml":"PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPHJlc291cmNlIHhtbG5zPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCIgeG1sbnM6eHNpPSJodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYS1pbnN0YW5jZSIgeHNpOnNjaGVtYUxvY2F0aW9uPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCBodHRwOi8vc2NoZW1hLmRhdGFjaXRlLm9yZy9tZXRhL2tlcm5lbC00LjMvbWV0YWRhdGEueHNkIj4KICA8aWRlbnRpZmllciBpZGVudGlmaWVyVHlwZT0iRE9JIj4xMC40ODU1MC9BUlhJVi4yMzA4LjA2NTcxPC9pZGVudGlmaWVyPgogIDxhbHRlcm5hdGVJZGVudGlmaWVycz4KICAgIDxhbHRlcm5hdGVJZGVudGlmaWVyIGFsdGVybmF0ZUlkZW50aWZpZXJUeXBlPSJhclhpdiI+MjMwOC4wNjU3MTwvYWx0ZXJuYXRlSWRlbnRpZmllcj4KICA8L2FsdGVybmF0ZUlkZW50aWZpZXJzPgogIDxjcmVhdG9ycz4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5XYW5nLCBKaXVuaXU8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPkppdW5pdTwvZ2l2ZW5OYW1lPgogICAgICA8ZmFtaWx5TmFtZT5XYW5nPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPll1YW4sIEhhbmdqaWU8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPkhhbmdqaWU8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+WXVhbjwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5DaGVuLCBEYXlvdTwvY3JlYXRvck5hbWU+CiAgICAgIDxnaXZlbk5hbWU+RGF5b3U8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+Q2hlbjwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5aaGFuZywgWWluZ3lhPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5ZaW5neWE8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+Wmhhbmc8L2ZhbWlseU5hbWU+CiAgICA8L2NyZWF0b3I+CiAgICA8Y3JlYXRvcj4KICAgICAgPGNyZWF0b3JOYW1lIG5hbWVUeXBlPSJQZXJzb25hbCI+V2FuZywgWGlhbmc8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPlhpYW5nPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPldhbmc8L2ZhbWlseU5hbWU+CiAgICA8L2NyZWF0b3I+CiAgICA8Y3JlYXRvcj4KICAgICAgPGNyZWF0b3JOYW1lIG5hbWVUeXBlPSJQZXJzb25hbCI+WmhhbmcsIFNoaXdlaTwvY3JlYXRvck5hbWU+CiAgICAgIDxnaXZlbk5hbWU+U2hpd2VpPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPlpoYW5nPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogIDwvY3JlYXRvcnM+CiAgPHRpdGxlcz4KICAgIDx0aXRsZT5Nb2RlbFNjb3BlIFRleHQtdG8tVmlkZW8gVGVjaG5pY2FsIFJlcG9ydDwvdGl0bGU+CiAgPC90aXRsZXM+CiAgPHB1Ymxpc2hlcj5hclhpdjwvcHVibGlzaGVyPgogIDxwdWJsaWNhdGlvblllYXI+MjAyMzwvcHVibGljYXRpb25ZZWFyPgogIDxzdWJqZWN0cz4KICAgIDxzdWJqZWN0IHhtbDpsYW5nPSJlbiIgc3ViamVjdFNjaGVtZT0iYXJYaXYiPkNvbXB1dGVyIFZpc2lvbiBhbmQgUGF0dGVybiBSZWNvZ25pdGlvbiAoY3MuQ1YpPC9zdWJqZWN0PgogICAgPHN1YmplY3QgeG1sOmxhbmc9ImVuIiBzdWJqZWN0U2NoZW1lPSJhclhpdiI+QXJ0aWZpY2lhbCBJbnRlbGxpZ2VuY2UgKGNzLkFJKTwvc3ViamVjdD4KICAgIDxzdWJqZWN0IHN1YmplY3RTY2hlbWU9IkZpZWxkcyBvZiBTY2llbmNlIGFuZCBUZWNobm9sb2d5IChGT1MpIj5GT1M6IENvbXB1dGVyIGFuZCBpbmZvcm1hdGlvbiBzY2llbmNlczwvc3ViamVjdD4KICA8L3N1YmplY3RzPgogIDxkYXRlcz4KICAgIDxkYXRlIGRhdGVUeXBlPSJTdWJtaXR0ZWQiIGRhdGVJbmZvcm1hdGlvbj0idjEiPjIwMjMtMDgtMTJUMTM6NTM6MTBaPC9kYXRlPgogICAgPGRhdGUgZGF0ZVR5cGU9IlVwZGF0ZWQiIGRhdGVJbmZvcm1hdGlvbj0idjEiPjIwMjMtMDgtMTVUMDA6MTI6MTdaPC9kYXRlPgogICAgPGRhdGUgZGF0ZVR5cGU9IkF2YWlsYWJsZSIgZGF0ZUluZm9ybWF0aW9uPSJ2MSI+MjAyMy0wODwvZGF0ZT4KICA8L2RhdGVzPgogIDxyZXNvdXJjZVR5cGUgcmVzb3VyY2VUeXBlR2VuZXJhbD0iUHJlcHJpbnQiPkFydGljbGU8L3Jlc291cmNlVHlwZT4KICA8dmVyc2lvbj4xPC92ZXJzaW9uPgogIDxyaWdodHNMaXN0PgogICAgPHJpZ2h0cyByaWdodHNVUkk9Imh0dHA6Ly9hcnhpdi5vcmcvbGljZW5zZXMvbm9uZXhjbHVzaXZlLWRpc3RyaWIvMS4wLyI+YXJYaXYub3JnIHBlcnBldHVhbCwgbm9uLWV4Y2x1c2l2ZSBsaWNlbnNlPC9yaWdodHM+CiAgPC9yaWdodHNMaXN0PgogIDxkZXNjcmlwdGlvbnM+CiAgICA8ZGVzY3JpcHRpb24gZGVzY3JpcHRpb25UeXBlPSJBYnN0cmFjdCI+VGhpcyBwYXBlciBpbnRyb2R1Y2VzIE1vZGVsU2NvcGVUMlYsIGEgdGV4dC10by12aWRlbyBzeW50aGVzaXMgbW9kZWwgdGhhdCBldm9sdmVzIGZyb20gYSB0ZXh0LXRvLWltYWdlIHN5bnRoZXNpcyBtb2RlbCAoaS5lLiwgU3RhYmxlIERpZmZ1c2lvbikuIE1vZGVsU2NvcGVUMlYgaW5jb3Jwb3JhdGVzIHNwYXRpby10ZW1wb3JhbCBibG9ja3MgdG8gZW5zdXJlIGNvbnNpc3RlbnQgZnJhbWUgZ2VuZXJhdGlvbiBhbmQgc21vb3RoIG1vdmVtZW50IHRyYW5zaXRpb25zLiBUaGUgbW9kZWwgY291bGQgYWRhcHQgdG8gdmFyeWluZyBmcmFtZSBudW1iZXJzIGR1cmluZyB0cmFpbmluZyBhbmQgaW5mZXJlbmNlLCByZW5kZXJpbmcgaXQgc3VpdGFibGUgZm9yIGJvdGggaW1hZ2UtdGV4dCBhbmQgdmlkZW8tdGV4dCBkYXRhc2V0cy4gTW9kZWxTY29wZVQyViBicmluZ3MgdG9nZXRoZXIgdGhyZWUgY29tcG9uZW50cyAoaS5lLiwgVlFHQU4sIGEgdGV4dCBlbmNvZGVyLCBhbmQgYSBkZW5vaXNpbmcgVU5ldCksIHRvdGFsbHkgY29tcHJpc2luZyAxLjcgYmlsbGlvbiBwYXJhbWV0ZXJzLCBpbiB3aGljaCAwLjUgYmlsbGlvbiBwYXJhbWV0ZXJzIGFyZSBkZWRpY2F0ZWQgdG8gdGVtcG9yYWwgY2FwYWJpbGl0aWVzLiBUaGUgbW9kZWwgZGVtb25zdHJhdGVzIHN1cGVyaW9yIHBlcmZvcm1hbmNlIG92ZXIgc3RhdGUtb2YtdGhlLWFydCBtZXRob2RzIGFjcm9zcyB0aHJlZSBldmFsdWF0aW9uIG1ldHJpY3MuIFRoZSBjb2RlIGFuZCBhbiBvbmxpbmUgZGVtbyBhcmUgYXZhaWxhYmxlIGF0IFx1cmx7aHR0cHM6Ly9tb2RlbHNjb3BlLmNuL21vZGVscy9kYW1vL3RleHQtdG8tdmlkZW8tc3ludGhlc2lzL3N1bW1hcnl9LjwvZGVzY3JpcHRpb24+CiAgICA8ZGVzY3JpcHRpb24gZGVzY3JpcHRpb25UeXBlPSJPdGhlciI+VGVjaG5pY2FsIHJlcG9ydC4gUHJvamVjdCBwYWdlOiBcdXJse2h0dHBzOi8vbW9kZWxzY29wZS5jbi9tb2RlbHMvZGFtby90ZXh0LXRvLXZpZGVvLXN5bnRoZXNpcy9zdW1tYXJ5fTwvZGVzY3JpcHRpb24+CiAgPC9kZXNjcmlwdGlvbnM+CjwvcmVzb3VyY2U+","url":"https://arxiv.org/abs/2308.06571","contentUrl":null,"metadataVersion":0,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"viewCount":0,"viewsOverTime":[],"downloadCount":0,"downloadsOverTime":[],"referenceCount":0,"citationCount":0,"citationsOverTime":[],"partCount":0,"partOfCount":0,"versionCount":0,"versionOfCount":0,"created":"2023-08-15T01:51:27.000Z","registered":"2023-08-15T01:51:28.000Z","published":"2023","updated":"2023-08-15T01:51:28.000Z"},"relationships":{"client":{"data":{"id":"arxiv.content","type":"clients"}},"provider":{"data":{"id":"arxiv","type":"providers"}},"media":{"data":{"id":"10.48550/arxiv.2308.06571","type":"media"}},"references":{"data":[]},"citations":{"data":[]},"parts":{"data":[]},"partOf":{"data":[]},"versions":{"data":[]},"versionOf":{"data":[]}}}}