mist-28M-0uiq7o7m-freesolv / tokenizer.json
anoushka2000's picture
Upload folder using huggingface_hub
9905e27 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 159,
"content": "[BOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 160,
"content": "[EOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 161,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 162,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 163,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 164,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Replace",
"pattern": {
"String": "++"
},
"content": "+2"
},
{
"type": "Replace",
"pattern": {
"String": "--"
},
"content": "-2"
},
{
"type": "Strip",
"strip_left": true,
"strip_right": true
}
]
},
"pre_tokenizer": {
"outer": "Br?|Cl?|F|I|N|O|P|S|b|c|n|o|p|s|\\*|[\\.\\-=\\#\\$:/\\\\]|\\d|%|\\(|\\)|\\[.*?]",
"inner": "(\\d+)?(A[c|g|l|m|r|s|t|u]|B[a|e|h|i|k|r]?|C[a|d|e|f|l|m|n|o|r|s|u]?|D[b|s|y]|E[r|s|u]|F[e|l|m|r]?|G[a|d|e]|H[e|f|g|o|s]?|I[n|r]?|Kr?|L[a|i|r|u|v]|M[c|d|g|n|o|t]|N[a|b|d|e|h|i|o|p]?|O[g|s]?|P[a|b|d|m|o|r|t|u]?|R[a|b|e|f|g|h|n|u]|S[b|c|e|g|i|m|n|r]?|T[a|b|c|e|h|i|l|m|s]|U|V|W|Xe|Yb?|Z[n|r]|as|b|c|n|o|p|se?|\\*)(?:(@(?:@|AL|OH|SP|T[B|H])?)(\\d{1,2})?)?(?:(H)(\\d)?)?(?:([+-]{1,2})(\\d{0,2}))?(?:(:)(\\d+))?"
},
"post_processor": null,
"decoder": {
"type": "Fuse"
},
"model": {
"type": "WordLevel",
"vocab": {
"[UNK]": 0,
"#": 1,
"$": 2,
"%": 3,
"(": 4,
")": 5,
"*": 6,
"+": 7,
"-": 8,
".": 9,
"/": 10,
"0": 11,
"1": 12,
"2": 13,
"3": 14,
"4": 15,
"5": 16,
"6": 17,
"7": 18,
"8": 19,
"9": 20,
":": 21,
"=": 22,
"@": 23,
"@@": 24,
"@AL": 25,
"@OH": 26,
"@SP": 27,
"@TB": 28,
"@TH": 29,
"Ac": 30,
"Ag": 31,
"Al": 32,
"Am": 33,
"Ar": 34,
"As": 35,
"At": 36,
"Au": 37,
"B": 38,
"Ba": 39,
"Be": 40,
"Bh": 41,
"Bi": 42,
"Bk": 43,
"Br": 44,
"C": 45,
"Ca": 46,
"Cd": 47,
"Ce": 48,
"Cf": 49,
"Cl": 50,
"Cm": 51,
"Cn": 52,
"Co": 53,
"Cr": 54,
"Cs": 55,
"Cu": 56,
"Db": 57,
"Ds": 58,
"Dy": 59,
"Er": 60,
"Es": 61,
"Eu": 62,
"F": 63,
"Fe": 64,
"Fl": 65,
"Fm": 66,
"Fr": 67,
"Ga": 68,
"Gd": 69,
"Ge": 70,
"H": 71,
"He": 72,
"Hf": 73,
"Hg": 74,
"Ho": 75,
"Hs": 76,
"I": 77,
"In": 78,
"Ir": 79,
"K": 80,
"Kr": 81,
"La": 82,
"Li": 83,
"Lr": 84,
"Lu": 85,
"Lv": 86,
"Mc": 87,
"Md": 88,
"Mg": 89,
"Mn": 90,
"Mo": 91,
"Mt": 92,
"N": 93,
"Na": 94,
"Nb": 95,
"Nd": 96,
"Ne": 97,
"Nh": 98,
"Ni": 99,
"No": 100,
"Np": 101,
"O": 102,
"Og": 103,
"Os": 104,
"P": 105,
"Pa": 106,
"Pb": 107,
"Pd": 108,
"Pm": 109,
"Po": 110,
"Pr": 111,
"Pt": 112,
"Pu": 113,
"Ra": 114,
"Rb": 115,
"Re": 116,
"Rf": 117,
"Rg": 118,
"Rh": 119,
"Rn": 120,
"Ru": 121,
"S": 122,
"Sb": 123,
"Sc": 124,
"Se": 125,
"Sg": 126,
"Si": 127,
"Sm": 128,
"Sn": 129,
"Sr": 130,
"Ta": 131,
"Tb": 132,
"Tc": 133,
"Te": 134,
"Th": 135,
"Ti": 136,
"Tl": 137,
"Tm": 138,
"Ts": 139,
"U": 140,
"V": 141,
"W": 142,
"Xe": 143,
"Y": 144,
"Yb": 145,
"Zn": 146,
"Zr": 147,
"[": 148,
"\\": 149,
"]": 150,
"as": 151,
"b": 152,
"c": 153,
"n": 154,
"o": 155,
"p": 156,
"s": 157,
"se": 158
},
"unk_token": "[UNK]"
}
}