{"data":{"id":"10.48550/arxiv.2310.02989","type":"dois","attributes":{"doi":"10.48550/arxiv.2310.02989","prefix":"10.48550","suffix":"arxiv.2310.02989","identifiers":[{"identifier":"2310.02989","identifierType":"arXiv"}],"alternateIdentifiers":[{"alternateIdentifierType":"arXiv","alternateIdentifier":"2310.02989"}],"creators":[{"name":"Golkar, Siavash","nameType":"Personal","givenName":"Siavash","familyName":"Golkar","affiliation":[],"nameIdentifiers":[]},{"name":"Pettee, Mariel","nameType":"Personal","givenName":"Mariel","familyName":"Pettee","affiliation":[],"nameIdentifiers":[]},{"name":"Eickenberg, Michael","nameType":"Personal","givenName":"Michael","familyName":"Eickenberg","affiliation":[],"nameIdentifiers":[]},{"name":"Bietti, Alberto","nameType":"Personal","givenName":"Alberto","familyName":"Bietti","affiliation":[],"nameIdentifiers":[]},{"name":"Cranmer, Miles","nameType":"Personal","givenName":"Miles","familyName":"Cranmer","affiliation":[],"nameIdentifiers":[]},{"name":"Krawezik, Geraud","nameType":"Personal","givenName":"Geraud","familyName":"Krawezik","affiliation":[],"nameIdentifiers":[]},{"name":"Lanusse, Francois","nameType":"Personal","givenName":"Francois","familyName":"Lanusse","affiliation":[],"nameIdentifiers":[]},{"name":"McCabe, Michael","nameType":"Personal","givenName":"Michael","familyName":"McCabe","affiliation":[],"nameIdentifiers":[]},{"name":"Ohana, Ruben","nameType":"Personal","givenName":"Ruben","familyName":"Ohana","affiliation":[],"nameIdentifiers":[]},{"name":"Parker, Liam","nameType":"Personal","givenName":"Liam","familyName":"Parker","affiliation":[],"nameIdentifiers":[]},{"name":"Blancard, Bruno Régaldo-Saint","nameType":"Personal","givenName":"Bruno Régaldo-Saint","familyName":"Blancard","affiliation":[],"nameIdentifiers":[]},{"name":"Tesileanu, Tiberiu","nameType":"Personal","givenName":"Tiberiu","familyName":"Tesileanu","affiliation":[],"nameIdentifiers":[]},{"name":"Cho, Kyunghyun","nameType":"Personal","givenName":"Kyunghyun","familyName":"Cho","affiliation":[],"nameIdentifiers":[]},{"name":"Ho, Shirley","nameType":"Personal","givenName":"Shirley","familyName":"Ho","affiliation":[],"nameIdentifiers":[]}],"titles":[{"title":"xVal: A Continuous Numerical Tokenization for Scientific Language Models"}],"publisher":"arXiv","container":{},"publicationYear":2023,"subjects":[{"lang":"en","subject":"Machine Learning (stat.ML)","subjectScheme":"arXiv"},{"lang":"en","subject":"Artificial Intelligence (cs.AI)","subjectScheme":"arXiv"},{"lang":"en","subject":"Computation and Language (cs.CL)","subjectScheme":"arXiv"},{"lang":"en","subject":"Machine Learning (cs.LG)","subjectScheme":"arXiv"},{"subject":"FOS: Computer and information sciences","subjectScheme":"Fields of Science and Technology (FOS)"},{"subject":"FOS: Computer and information sciences","schemeUri":"http://www.oecd.org/science/inno/38235147.pdf","subjectScheme":"Fields of Science and Technology (FOS)"}],"contributors":[],"dates":[{"date":"2023-10-04T17:26:16Z","dateType":"Submitted","dateInformation":"v1"},{"date":"2023-10-05T00:41:49Z","dateType":"Updated","dateInformation":"v1"},{"date":"2024-12-15T07:07:28Z","dateType":"Submitted","dateInformation":"v2"},{"date":"2024-12-17T01:47:17Z","dateType":"Updated","dateInformation":"v2"},{"date":"2023-10","dateType":"Available","dateInformation":"v1"},{"date":"2023","dateType":"Issued"}],"language":null,"types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"Article","resourceTypeGeneral":"Preprint"},"relatedIdentifiers":[],"relatedItems":[],"sizes":[],"formats":[],"version":"2","rightsList":[{"rights":"arXiv.org perpetual, non-exclusive license","rightsUri":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/"}],"descriptions":[{"description":"Due in part to their discontinuous and discrete default encodings for numbers, Large Language Models (LLMs) have not yet been commonly used to process numerically-dense scientific datasets. Rendering datasets as text, however, could help aggregate diverse and multi-modal scientific data into a single training corpus, thereby potentially facilitating the development of foundation models for science. In this work, we introduce xVal, a strategy for continuously tokenizing numbers within language models that results in a more appropriate inductive bias for scientific applications. By training specially-modified language models from scratch on a variety of scientific datasets formatted as text, we find that xVal generally outperforms other common numerical tokenization strategies on metrics including out-of-distribution generalization and computational efficiency.","descriptionType":"Abstract"},{"description":"15 pages, 12 figures. Appendix: 8 pages, 2 figures. Accepted contribution at the NeurIPS Workshop on ML for the Physical Sciences","descriptionType":"Other"}],"geoLocations":[],"fundingReferences":[],"xml":"PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHJlc291cmNlIHhtbG5zPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCIgeG1sbnM6eHNpPSJodHRwOi8vd3d3LnczLm9yZy8yMDAxL1hNTFNjaGVtYS1pbnN0YW5jZSIgeHNpOnNjaGVtYUxvY2F0aW9uPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCBodHRwOi8vc2NoZW1hLmRhdGFjaXRlLm9yZy9tZXRhL2tlcm5lbC00LjMvbWV0YWRhdGEueHNkIj4KICA8aWRlbnRpZmllciBpZGVudGlmaWVyVHlwZT0iRE9JIj4xMC40ODU1MC9BUlhJVi4yMzEwLjAyOTg5PC9pZGVudGlmaWVyPgogIDxhbHRlcm5hdGVJZGVudGlmaWVycz4KICAgIDxhbHRlcm5hdGVJZGVudGlmaWVyIGFsdGVybmF0ZUlkZW50aWZpZXJUeXBlPSJhclhpdiI+MjMxMC4wMjk4OTwvYWx0ZXJuYXRlSWRlbnRpZmllcj4KICA8L2FsdGVybmF0ZUlkZW50aWZpZXJzPgogIDxjcmVhdG9ycz4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5Hb2xrYXIsIFNpYXZhc2g8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPlNpYXZhc2g8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+R29sa2FyPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPlBldHRlZSwgTWFyaWVsPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5NYXJpZWw8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+UGV0dGVlPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPkVpY2tlbmJlcmcsIE1pY2hhZWw8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPk1pY2hhZWw8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+RWlja2VuYmVyZzwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5CaWV0dGksIEFsYmVydG88L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPkFsYmVydG88L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+QmlldHRpPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPkNyYW5tZXIsIE1pbGVzPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5NaWxlczwvZ2l2ZW5OYW1lPgogICAgICA8ZmFtaWx5TmFtZT5DcmFubWVyPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPktyYXdlemlrLCBHZXJhdWQ8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPkdlcmF1ZDwvZ2l2ZW5OYW1lPgogICAgICA8ZmFtaWx5TmFtZT5LcmF3ZXppazwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5MYW51c3NlLCBGcmFuY29pczwvY3JlYXRvck5hbWU+CiAgICAgIDxnaXZlbk5hbWU+RnJhbmNvaXM8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+TGFudXNzZTwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5NY0NhYmUsIE1pY2hhZWw8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPk1pY2hhZWw8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+TWNDYWJlPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPk9oYW5hLCBSdWJlbjwvY3JlYXRvck5hbWU+CiAgICAgIDxnaXZlbk5hbWU+UnViZW48L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+T2hhbmE8L2ZhbWlseU5hbWU+CiAgICA8L2NyZWF0b3I+CiAgICA8Y3JlYXRvcj4KICAgICAgPGNyZWF0b3JOYW1lIG5hbWVUeXBlPSJQZXJzb25hbCI+UGFya2VyLCBMaWFtPC9jcmVhdG9yTmFtZT4KICAgICAgPGdpdmVuTmFtZT5MaWFtPC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPlBhcmtlcjwvZmFtaWx5TmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWUgbmFtZVR5cGU9IlBlcnNvbmFsIj5CbGFuY2FyZCwgQnJ1bm8gUsOpZ2FsZG8tU2FpbnQ8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPkJydW5vIFLDqWdhbGRvLVNhaW50PC9naXZlbk5hbWU+CiAgICAgIDxmYW1pbHlOYW1lPkJsYW5jYXJkPC9mYW1pbHlOYW1lPgogICAgPC9jcmVhdG9yPgogICAgPGNyZWF0b3I+CiAgICAgIDxjcmVhdG9yTmFtZSBuYW1lVHlwZT0iUGVyc29uYWwiPlRlc2lsZWFudSwgVGliZXJpdTwvY3JlYXRvck5hbWU+CiAgICAgIDxnaXZlbk5hbWU+VGliZXJpdTwvZ2l2ZW5OYW1lPgogICAgICA8ZmFtaWx5TmFtZT5UZXNpbGVhbnU8L2ZhbWlseU5hbWU+CiAgICA8L2NyZWF0b3I+CiAgICA8Y3JlYXRvcj4KICAgICAgPGNyZWF0b3JOYW1lIG5hbWVUeXBlPSJQZXJzb25hbCI+Q2hvLCBLeXVuZ2h5dW48L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPkt5dW5naHl1bjwvZ2l2ZW5OYW1lPgogICAgICA8ZmFtaWx5TmFtZT5DaG88L2ZhbWlseU5hbWU+CiAgICA8L2NyZWF0b3I+CiAgICA8Y3JlYXRvcj4KICAgICAgPGNyZWF0b3JOYW1lIG5hbWVUeXBlPSJQZXJzb25hbCI+SG8sIFNoaXJsZXk8L2NyZWF0b3JOYW1lPgogICAgICA8Z2l2ZW5OYW1lPlNoaXJsZXk8L2dpdmVuTmFtZT4KICAgICAgPGZhbWlseU5hbWU+SG88L2ZhbWlseU5hbWU+CiAgICA8L2NyZWF0b3I+CiAgPC9jcmVhdG9ycz4KICA8dGl0bGVzPgogICAgPHRpdGxlPnhWYWw6IEEgQ29udGludW91cyBOdW1lcmljYWwgVG9rZW5pemF0aW9uIGZvciBTY2llbnRpZmljIExhbmd1YWdlIE1vZGVsczwvdGl0bGU+CiAgPC90aXRsZXM+CiAgPHB1Ymxpc2hlcj5hclhpdjwvcHVibGlzaGVyPgogIDxwdWJsaWNhdGlvblllYXI+MjAyMzwvcHVibGljYXRpb25ZZWFyPgogIDxzdWJqZWN0cz4KICAgIDxzdWJqZWN0IHhtbDpsYW5nPSJlbiIgc3ViamVjdFNjaGVtZT0iYXJYaXYiPk1hY2hpbmUgTGVhcm5pbmcgKHN0YXQuTUwpPC9zdWJqZWN0PgogICAgPHN1YmplY3QgeG1sOmxhbmc9ImVuIiBzdWJqZWN0U2NoZW1lPSJhclhpdiI+QXJ0aWZpY2lhbCBJbnRlbGxpZ2VuY2UgKGNzLkFJKTwvc3ViamVjdD4KICAgIDxzdWJqZWN0IHhtbDpsYW5nPSJlbiIgc3ViamVjdFNjaGVtZT0iYXJYaXYiPkNvbXB1dGF0aW9uIGFuZCBMYW5ndWFnZSAoY3MuQ0wpPC9zdWJqZWN0PgogICAgPHN1YmplY3QgeG1sOmxhbmc9ImVuIiBzdWJqZWN0U2NoZW1lPSJhclhpdiI+TWFjaGluZSBMZWFybmluZyAoY3MuTEcpPC9zdWJqZWN0PgogICAgPHN1YmplY3Qgc3ViamVjdFNjaGVtZT0iRmllbGRzIG9mIFNjaWVuY2UgYW5kIFRlY2hub2xvZ3kgKEZPUykiPkZPUzogQ29tcHV0ZXIgYW5kIGluZm9ybWF0aW9uIHNjaWVuY2VzPC9zdWJqZWN0PgogIDwvc3ViamVjdHM+CiAgPGRhdGVzPgogICAgPGRhdGUgZGF0ZVR5cGU9IlN1Ym1pdHRlZCIgZGF0ZUluZm9ybWF0aW9uPSJ2MSI+MjAyMy0xMC0wNFQxNzoyNjoxNlo8L2RhdGU+CiAgICA8ZGF0ZSBkYXRlVHlwZT0iVXBkYXRlZCIgZGF0ZUluZm9ybWF0aW9uPSJ2MSI+MjAyMy0xMC0wNVQwMDo0MTo0OVo8L2RhdGU+CiAgICA8ZGF0ZSBkYXRlVHlwZT0iU3VibWl0dGVkIiBkYXRlSW5mb3JtYXRpb249InYyIj4yMDI0LTEyLTE1VDA3OjA3OjI4WjwvZGF0ZT4KICAgIDxkYXRlIGRhdGVUeXBlPSJVcGRhdGVkIiBkYXRlSW5mb3JtYXRpb249InYyIj4yMDI0LTEyLTE3VDAxOjQ3OjE3WjwvZGF0ZT4KICAgIDxkYXRlIGRhdGVUeXBlPSJBdmFpbGFibGUiIGRhdGVJbmZvcm1hdGlvbj0idjEiPjIwMjMtMTA8L2RhdGU+CiAgPC9kYXRlcz4KICA8cmVzb3VyY2VUeXBlIHJlc291cmNlVHlwZUdlbmVyYWw9IlByZXByaW50Ij5BcnRpY2xlPC9yZXNvdXJjZVR5cGU+CiAgPHZlcnNpb24+MjwvdmVyc2lvbj4KICA8cmlnaHRzTGlzdD4KICAgIDxyaWdodHMgcmlnaHRzVVJJPSJodHRwOi8vYXJ4aXYub3JnL2xpY2Vuc2VzL25vbmV4Y2x1c2l2ZS1kaXN0cmliLzEuMC8iPmFyWGl2Lm9yZyBwZXJwZXR1YWwsIG5vbi1leGNsdXNpdmUgbGljZW5zZTwvcmlnaHRzPgogIDwvcmlnaHRzTGlzdD4KICA8ZGVzY3JpcHRpb25zPgogICAgPGRlc2NyaXB0aW9uIGRlc2NyaXB0aW9uVHlwZT0iQWJzdHJhY3QiPkR1ZSBpbiBwYXJ0IHRvIHRoZWlyIGRpc2NvbnRpbnVvdXMgYW5kIGRpc2NyZXRlIGRlZmF1bHQgZW5jb2RpbmdzIGZvciBudW1iZXJzLCBMYXJnZSBMYW5ndWFnZSBNb2RlbHMgKExMTXMpIGhhdmUgbm90IHlldCBiZWVuIGNvbW1vbmx5IHVzZWQgdG8gcHJvY2VzcyBudW1lcmljYWxseS1kZW5zZSBzY2llbnRpZmljIGRhdGFzZXRzLiBSZW5kZXJpbmcgZGF0YXNldHMgYXMgdGV4dCwgaG93ZXZlciwgY291bGQgaGVscCBhZ2dyZWdhdGUgZGl2ZXJzZSBhbmQgbXVsdGktbW9kYWwgc2NpZW50aWZpYyBkYXRhIGludG8gYSBzaW5nbGUgdHJhaW5pbmcgY29ycHVzLCB0aGVyZWJ5IHBvdGVudGlhbGx5IGZhY2lsaXRhdGluZyB0aGUgZGV2ZWxvcG1lbnQgb2YgZm91bmRhdGlvbiBtb2RlbHMgZm9yIHNjaWVuY2UuIEluIHRoaXMgd29yaywgd2UgaW50cm9kdWNlIHhWYWwsIGEgc3RyYXRlZ3kgZm9yIGNvbnRpbnVvdXNseSB0b2tlbml6aW5nIG51bWJlcnMgd2l0aGluIGxhbmd1YWdlIG1vZGVscyB0aGF0IHJlc3VsdHMgaW4gYSBtb3JlIGFwcHJvcHJpYXRlIGluZHVjdGl2ZSBiaWFzIGZvciBzY2llbnRpZmljIGFwcGxpY2F0aW9ucy4gQnkgdHJhaW5pbmcgc3BlY2lhbGx5LW1vZGlmaWVkIGxhbmd1YWdlIG1vZGVscyBmcm9tIHNjcmF0Y2ggb24gYSB2YXJpZXR5IG9mIHNjaWVudGlmaWMgZGF0YXNldHMgZm9ybWF0dGVkIGFzIHRleHQsIHdlIGZpbmQgdGhhdCB4VmFsIGdlbmVyYWxseSBvdXRwZXJmb3JtcyBvdGhlciBjb21tb24gbnVtZXJpY2FsIHRva2VuaXphdGlvbiBzdHJhdGVnaWVzIG9uIG1ldHJpY3MgaW5jbHVkaW5nIG91dC1vZi1kaXN0cmlidXRpb24gZ2VuZXJhbGl6YXRpb24gYW5kIGNvbXB1dGF0aW9uYWwgZWZmaWNpZW5jeS48L2Rlc2NyaXB0aW9uPgogICAgPGRlc2NyaXB0aW9uIGRlc2NyaXB0aW9uVHlwZT0iT3RoZXIiPjE1IHBhZ2VzLCAxMiBmaWd1cmVzLiBBcHBlbmRpeDogOCBwYWdlcywgMiBmaWd1cmVzLiBBY2NlcHRlZCBjb250cmlidXRpb24gYXQgdGhlIE5ldXJJUFMgV29ya3Nob3Agb24gTUwgZm9yIHRoZSBQaHlzaWNhbCBTY2llbmNlczwvZGVzY3JpcHRpb24+CiAgPC9kZXNjcmlwdGlvbnM+CjwvcmVzb3VyY2U+","url":"https://arxiv.org/abs/2310.02989","contentUrl":null,"metadataVersion":1,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"viewCount":0,"viewsOverTime":[],"downloadCount":0,"downloadsOverTime":[],"referenceCount":0,"citationCount":0,"citationsOverTime":[],"partCount":0,"partOfCount":0,"versionCount":0,"versionOfCount":0,"created":"2023-10-05T01:39:56.000Z","registered":"2023-10-05T01:39:56.000Z","published":"2023","updated":"2024-12-17T04:52:31.000Z"},"relationships":{"client":{"data":{"id":"arxiv.content","type":"clients"}},"provider":{"data":{"id":"arxiv","type":"providers"}},"media":{"data":{"id":"10.48550/arxiv.2310.02989","type":"media"}},"references":{"data":[]},"citations":{"data":[]},"parts":{"data":[]},"partOf":{"data":[]},"versions":{"data":[]},"versionOf":{"data":[]}}}}