SMIRK_Tokenizer / tokenizer.json
haydn-jones's picture
Upload tokenizer
8a73b01 verified
{
"version": "1.0",
"truncation": null,
"padding": {
"strategy": "BatchLongest",
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 0,
"pad_type_id": 0,
"pad_token": "[PAD]"
},
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[BOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[EOS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "\\[[^\\[\\]]+\\]|@TH|@AL|@SP|@TB|@OH|Cl|Br|se|as|\\-|\\#|\\$|\\\\|\\.|\\+|@@|\\(|\\)|\\*|He|Li|Be|Ne|Na|Mg|Al|Si|Ar|Ca|Ti|Cr|Mn|Fe|Ni|Cu|Zn|Ga|Ge|As|Se|Kr|Rb|Sr|Zr|Mo|Tc|Ru|Rh|Pd|Ag|Cd|Te|Xe|Ba|La|Hf|Ta|Re|Ir|Pt|Au|Hg|Tl|Bi|At|Rn|Fr|Ra|Ac|Rf|Db|Sg|Bh|Mt|Ds|Rg|Nh|Fl|Mc|Lv|Ts|Og|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Er|Tm|Lu|Th|Pa|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|Lr|te|si|B|C|N|O|P|S|F|I|b|c|n|o|p|s|=|:|/|@|0|1|2|3|4|5|6|7|8|9|%|H|K|V|Y|W|U"
},
"behavior": "Isolated",
"invert": false
},
{
"type": "Split",
"pattern": {
"Regex": "@TH|@AL|@SP|@TB|@OH|He|Li|Be|Ne|Na|Mg|Al|Si|Cl|Ar|Ca|Sc|Ti|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|Xe|Cs|Ba|La|Hf|Ta|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|Fr|Ra|Ac|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Th|Pa|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr|te|si|se|as|\\-|\\#|\\$|\\\\|\\.|\\+|@@|\\[|\\]|H|B|C|N|O|F|P|S|K|V|Y|I|W|U|b|c|n|o|p|s|=|:|/|@|0|1|2|3|4|5|6|7|8|9|%"
},
"behavior": "Isolated",
"invert": false
}
]
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[BOS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[EOS]",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"[BOS]": {
"id": "[BOS]",
"ids": [
1
],
"tokens": [
"[BOS]"
]
},
"[EOS]": {
"id": "[EOS]",
"ids": [
2
],
"tokens": [
"[EOS]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "",
"cleanup": true
},
"model": {
"type": "WordLevel",
"vocab": {
"[PAD]": 0,
"[BOS]": 1,
"[EOS]": 2,
"[MASK]": 3,
"[UNK]": 4,
"H": 5,
"He": 6,
"Li": 7,
"Be": 8,
"B": 9,
"C": 10,
"N": 11,
"O": 12,
"F": 13,
"Ne": 14,
"Na": 15,
"Mg": 16,
"Al": 17,
"Si": 18,
"P": 19,
"S": 20,
"Cl": 21,
"Ar": 22,
"K": 23,
"Ca": 24,
"Sc": 25,
"Ti": 26,
"V": 27,
"Cr": 28,
"Mn": 29,
"Fe": 30,
"Co": 31,
"Ni": 32,
"Cu": 33,
"Zn": 34,
"Ga": 35,
"Ge": 36,
"As": 37,
"Se": 38,
"Br": 39,
"Kr": 40,
"Rb": 41,
"Sr": 42,
"Y": 43,
"Zr": 44,
"Nb": 45,
"Mo": 46,
"Tc": 47,
"Ru": 48,
"Rh": 49,
"Pd": 50,
"Ag": 51,
"Cd": 52,
"In": 53,
"Sn": 54,
"Sb": 55,
"Te": 56,
"I": 57,
"Xe": 58,
"Cs": 59,
"Ba": 60,
"La": 61,
"Hf": 62,
"Ta": 63,
"W": 64,
"Re": 65,
"Os": 66,
"Ir": 67,
"Pt": 68,
"Au": 69,
"Hg": 70,
"Tl": 71,
"Pb": 72,
"Bi": 73,
"Po": 74,
"At": 75,
"Rn": 76,
"Fr": 77,
"Ra": 78,
"Ac": 79,
"Rf": 80,
"Db": 81,
"Sg": 82,
"Bh": 83,
"Hs": 84,
"Mt": 85,
"Ds": 86,
"Rg": 87,
"Cn": 88,
"Nh": 89,
"Fl": 90,
"Mc": 91,
"Lv": 92,
"Ts": 93,
"Og": 94,
"Ce": 95,
"Pr": 96,
"Nd": 97,
"Pm": 98,
"Sm": 99,
"Eu": 100,
"Gd": 101,
"Tb": 102,
"Dy": 103,
"Ho": 104,
"Er": 105,
"Tm": 106,
"Yb": 107,
"Lu": 108,
"Th": 109,
"Pa": 110,
"U": 111,
"Np": 112,
"Pu": 113,
"Am": 114,
"Cm": 115,
"Bk": 116,
"Cf": 117,
"Es": 118,
"Fm": 119,
"Md": 120,
"No": 121,
"Lr": 122,
"te": 123,
"si": 124,
"b": 125,
"c": 126,
"n": 127,
"o": 128,
"p": 129,
"s": 130,
"se": 131,
"as": 132,
"-": 133,
"=": 134,
"#": 135,
"$": 136,
":": 137,
"/": 138,
"\\": 139,
".": 140,
"+": 141,
"@": 142,
"@@": 143,
"@TH": 144,
"@AL": 145,
"@SP": 146,
"@TB": 147,
"@OH": 148,
"(": 149,
")": 150,
"*": 151,
"0": 152,
"1": 153,
"2": 154,
"3": 155,
"4": 156,
"5": 157,
"6": 158,
"7": 159,
"8": 160,
"9": 161,
"%": 162,
"[": 163,
"]": 164
},
"unk_token": "[UNK]"
}
}