nanocatalyst / tokenizer_v3 /tokenizer.json
everythingchalna's picture
Upload folder using huggingface_hub
35547c6 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|bos|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<|cond|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<|sep|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<|eos|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Replace",
"pattern": {
"Regex": "="
},
"content": " = "
},
{
"type": "Replace",
"pattern": {
"Regex": "(?<=[A-Za-z0-9])(?=[A-Z])"
},
"content": " "
}
]
},
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"String": "\n"
},
"behavior": "Isolated",
"invert": false
},
{
"type": "Split",
"pattern": {
"String": " "
},
"behavior": "Removed",
"invert": false
},
{
"type": "Digits",
"individual_digits": false
},
{
"type": "Split",
"pattern": {
"Regex": "\\d{2}|\\d"
},
"behavior": "Isolated",
"invert": false
}
]
},
"post_processor": null,
"decoder": {
"type": "Fuse"
},
"model": {
"type": "WordLevel",
"vocab": {
"[UNK]": 4,
".": 5,
"\n": 6,
"00": 7,
"2": 8,
"0": 9,
"4": 10,
"3": 11,
"1": 12,
"10": 13,
"6": 14,
"5": 15,
"11": 16,
"12": 17,
"13": 18,
"14": 19,
"7": 20,
"15": 21,
"16": 22,
"17": 23,
"18": 24,
"8": 25,
"19": 26,
"20": 27,
"9": 28,
"21": 29,
"22": 30,
"23": 31,
"24": 32,
"25": 33,
"26": 34,
"90": 35,
"27": 36,
"28": 37,
"29": 38,
"H": 39,
"30": 40,
"31": 41,
"32": 42,
"33": 43,
"01": 44,
"34": 45,
"S": 46,
"02": 47,
"03": 48,
"35": 49,
"04": 50,
"36": 51,
"37": 52,
"05": 53,
"38": 54,
"06": 55,
"39": 56,
"09": 57,
"95": 58,
"08": 59,
"42": 60,
"07": 61,
"98": 62,
"40": 63,
"93": 64,
"56": 65,
"94": 66,
"71": 67,
"97": 68,
"85": 69,
"91": 70,
"84": 71,
"41": 72,
"60": 73,
"92": 74,
"46": 75,
"55": 76,
"86": 77,
"49": 78,
"53": 79,
"79": 80,
"51": 81,
"43": 82,
"88": 83,
"89": 84,
"48": 85,
"87": 86,
"45": 87,
"54": 88,
"78": 89,
"63": 90,
"66": 91,
"69": 92,
"99": 93,
"57": 94,
"82": 95,
"62": 96,
"96": 97,
"74": 98,
"83": 99,
"47": 100,
"58": 101,
"80": 102,
"65": 103,
"77": 104,
"73": 105,
"68": 106,
"81": 107,
"75": 108,
"59": 109,
"72": 110,
"64": 111,
"61": 112,
"76": 113,
"52": 114,
"70": 115,
"44": 116,
"50": 117,
"67": 118,
"=": 119,
"-": 120,
"Se": 121,
"Al": 122,
"C": 123,
"Te": 124,
"Si": 125,
"Ti": 126,
"P": 127,
"Ga": 128,
"N": 129,
"Pd": 130,
"O": 131,
"Cl": 132,
"Ca": 133,
"Hf": 134,
"As": 135,
"In": 136,
"Pt": 137,
"Ni": 138,
"Na": 139,
"Ge": 140,
"Zn": 141,
"Sn": 142,
"Cu": 143,
"Zr": 144,
"Rh": 145,
"Au": 146,
"Sb": 147,
"Ag": 148,
"V": 149,
"Y": 150,
"K": 151,
"Sc": 152,
"Ta": 153,
"Nb": 154,
"<|bos|>": 155,
"<|cond|>": 156,
"<|eos|>": 157,
"<|sep|>": 158,
"ads": 159,
"bin": 160,
"composition": 161,
"relax": 162,
"target_bin": 163,
"task": 164,
"Sr": 165,
"Mo": 166,
"Co": 167,
"Pb": 168,
"Hg": 169,
"Ru": 170,
"Ir": 171,
"Bi": 172,
"Mn": 173,
"Fe": 174,
"Tl": 175,
"Cd": 176,
"Cr": 177,
"Rb": 178,
"W": 179,
"Re": 180,
"Tc": 181,
"Cs": 182,
"Os": 183,
"B": 184
},
"unk_token": "[UNK]"
}
}