MolT5-Tokenizer / tokenizer.json
haydn-jones's picture
Upload tokenizer
ee5b87a verified
{
"version": "1.0",
"truncation": null,
"padding": {
"strategy": "BatchLongest",
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 2,
"pad_type_id": 0,
"pad_token": "[PAD]"
},
"added_tokens": [
{
"id": 0,
"content": "[START]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "\\[START\\]|\\[STOP\\]|\\[MASK\\]|\\[PAD\\]|@TH|@AL|@SP|@TB|@OH|He|Li|Be|Ne|Na|Mg|Al|Si|Cl|Ar|Ca|Sc|Ti|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|Xe|Cs|Ba|La|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Hf|Ta|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|Fr|Ra|Ac|Th|Pa|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og|\\-|\\+|\\#|\\$|\\\\|@@|\\(|\\)|\\[|\\]|\\.|\\*|H|B|C|N|O|F|P|S|K|V|Y|I|W|U|=|:|/|@|0|1|2|3|4|5|6|7|8|9|%"
},
"behavior": "Isolated",
"invert": false
}
]
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[START]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[STOP]",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"[START]": {
"id": "[START]",
"ids": [
0
],
"tokens": [
"[START]"
]
},
"[STOP]": {
"id": "[STOP]",
"ids": [
1
],
"tokens": [
"[STOP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "",
"cleanup": true
},
"model": {
"type": "WordLevel",
"vocab": {
"[START]": 0,
"[STOP]": 1,
"[PAD]": 2,
"[MASK]": 3,
"H": 4,
"He": 5,
"Li": 6,
"Be": 7,
"B": 8,
"C": 9,
"N": 10,
"O": 11,
"F": 12,
"Ne": 13,
"Na": 14,
"Mg": 15,
"Al": 16,
"Si": 17,
"P": 18,
"S": 19,
"Cl": 20,
"Ar": 21,
"K": 22,
"Ca": 23,
"Sc": 24,
"Ti": 25,
"V": 26,
"Cr": 27,
"Mn": 28,
"Fe": 29,
"Co": 30,
"Ni": 31,
"Cu": 32,
"Zn": 33,
"Ga": 34,
"Ge": 35,
"As": 36,
"Se": 37,
"Br": 38,
"Kr": 39,
"Rb": 40,
"Sr": 41,
"Y": 42,
"Zr": 43,
"Nb": 44,
"Mo": 45,
"Tc": 46,
"Ru": 47,
"Rh": 48,
"Pd": 49,
"Ag": 50,
"Cd": 51,
"In": 52,
"Sn": 53,
"Sb": 54,
"Te": 55,
"I": 56,
"Xe": 57,
"Cs": 58,
"Ba": 59,
"La": 60,
"Ce": 61,
"Pr": 62,
"Nd": 63,
"Pm": 64,
"Sm": 65,
"Eu": 66,
"Gd": 67,
"Tb": 68,
"Dy": 69,
"Ho": 70,
"Er": 71,
"Tm": 72,
"Yb": 73,
"Lu": 74,
"Hf": 75,
"Ta": 76,
"W": 77,
"Re": 78,
"Os": 79,
"Ir": 80,
"Pt": 81,
"Au": 82,
"Hg": 83,
"Tl": 84,
"Pb": 85,
"Bi": 86,
"Po": 87,
"At": 88,
"Rn": 89,
"Fr": 90,
"Ra": 91,
"Ac": 92,
"Th": 93,
"Pa": 94,
"U": 95,
"Np": 96,
"Pu": 97,
"Am": 98,
"Cm": 99,
"Bk": 100,
"Cf": 101,
"Es": 102,
"Fm": 103,
"Md": 104,
"No": 105,
"Lr": 106,
"Rf": 107,
"Db": 108,
"Sg": 109,
"Bh": 110,
"Hs": 111,
"Mt": 112,
"Ds": 113,
"Rg": 114,
"Cn": 115,
"Nh": 116,
"Fl": 117,
"Mc": 118,
"Lv": 119,
"Ts": 120,
"Og": 121,
"-": 122,
"+": 123,
"=": 124,
"#": 125,
"$": 126,
":": 127,
"/": 128,
"\\": 129,
"@": 130,
"@@": 131,
"@TH": 132,
"@AL": 133,
"@SP": 134,
"@TB": 135,
"@OH": 136,
"0": 137,
"1": 138,
"2": 139,
"3": 140,
"4": 141,
"5": 142,
"6": 143,
"7": 144,
"8": 145,
"9": 146,
"(": 147,
")": 148,
"[": 149,
"]": 150,
".": 151,
"*": 152,
"%": 153,
"[UNK]": 154
},
"unk_token": "[UNK]"
}
}