perovskite-tokenizer / tokenizer.json
GoshKolotyan's picture
Initial upload of perovskite tokenizer - tokenizer.json
da7dd17 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "NFD"
},
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": "##",
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
".": 5,
"0": 6,
"1": 7,
"2": 8,
"3": 9,
"4": 10,
"5": 11,
"6": 12,
"7": 13,
"8": 14,
"9": 15,
"A": 16,
"B": 17,
"C": 18,
"D": 19,
"F": 20,
"G": 21,
"H": 22,
"I": 23,
"K": 24,
"L": 25,
"M": 26,
"N": 27,
"P": 28,
"R": 29,
"S": 30,
"T": 31,
"Y": 32,
"Z": 33,
"a": 34,
"b": 35,
"d": 36,
"e": 37,
"g": 38,
"i": 39,
"l": 40,
"n": 41,
"o": 42,
"r": 43,
"s": 44,
"u": 45,
"##A": 46,
"##a": 47,
"##3": 48,
"##7": 49,
"##o": 50,
"##2": 51,
"##l": 52,
"##M": 53,
"##n": 54,
"##4": 55,
"##b": 56,
"##9": 57,
"##s": 58,
"##g": 59,
"##6": 60,
"##8": 61,
"##e": 62,
"##r": 63,
"##5": 64,
"##d": 65,
"##1": 66,
"##u": 67,
"##i": 68,
"01": 69,
"In": 70,
"Na": 71,
"Yb": 72,
"Br": 73,
"Cl": 74,
"Se": 75,
"Te": 76,
"Ba": 77,
"Cs": 78,
"DM": 79,
"FA": 80,
"Hg": 81,
"La": 82,
"Li": 83,
"MA": 84,
"Rb": 85,
"Sr": 86,
"Tl": 87,
"DMA": 88,
"Ag": 89,
"Au": 90,
"Bi": 91,
"Co": 92,
"Cd": 93,
"Cu": 94,
"Fe": 95,
"Ga": 96,
"Ge": 97,
"Mn": 98,
"Mg": 99,
"Nb": 100,
"Ni": 101,
"Pb": 102,
"Pd": 103,
"Sn": 104,
"Sb": 105,
"Tb": 106,
"Ti": 107,
"Zn": 108,
"03": 109,
"09": 110,
"06": 111,
"12": 112,
"18": 113,
"15": 114,
"82": 115,
"88": 116,
"85": 117,
"94": 118,
"91": 119,
"97": 120,
"07": 121,
"02": 122,
"04": 123,
"08": 124,
"05": 125,
"13": 126,
"17": 127,
"14": 128,
"19": 129,
"16": 130,
"11": 131,
"83": 132,
"87": 133,
"84": 134,
"89": 135,
"86": 136,
"81": 137,
"93": 138,
"92": 139,
"96": 140,
"98": 141,
"95": 142,
"79": 143,
"99": 144,
"27": 145,
"24": 146,
"21": 147,
"33": 148,
"39": 149,
"36": 150,
"43": 151,
"42": 152,
"49": 153,
"46": 154,
"48": 155,
"45": 156,
"57": 157,
"52": 158,
"54": 159,
"58": 160,
"55": 161,
"51": 162,
"67": 163,
"64": 164,
"61": 165,
"73": 166,
"76": 167,
"37": 168
},
"merges": [
[
"0",
"##1"
],
[
"I",
"##n"
],
[
"N",
"##a"
],
[
"Y",
"##b"
],
[
"B",
"##r"
],
[
"C",
"##l"
],
[
"S",
"##e"
],
[
"T",
"##e"
],
[
"B",
"##a"
],
[
"C",
"##s"
],
[
"D",
"##M"
],
[
"F",
"##A"
],
[
"H",
"##g"
],
[
"L",
"##a"
],
[
"L",
"##i"
],
[
"M",
"##A"
],
[
"R",
"##b"
],
[
"S",
"##r"
],
[
"T",
"##l"
],
[
"DM",
"##A"
],
[
"A",
"##g"
],
[
"A",
"##u"
],
[
"B",
"##i"
],
[
"C",
"##o"
],
[
"C",
"##d"
],
[
"C",
"##u"
],
[
"F",
"##e"
],
[
"G",
"##a"
],
[
"G",
"##e"
],
[
"M",
"##n"
],
[
"M",
"##g"
],
[
"N",
"##b"
],
[
"N",
"##i"
],
[
"P",
"##b"
],
[
"P",
"##d"
],
[
"S",
"##n"
],
[
"S",
"##b"
],
[
"T",
"##b"
],
[
"T",
"##i"
],
[
"Z",
"##n"
],
[
"0",
"##3"
],
[
"0",
"##9"
],
[
"0",
"##6"
],
[
"1",
"##2"
],
[
"1",
"##8"
],
[
"1",
"##5"
],
[
"8",
"##2"
],
[
"8",
"##8"
],
[
"8",
"##5"
],
[
"9",
"##4"
],
[
"9",
"##1"
],
[
"9",
"##7"
],
[
"0",
"##7"
],
[
"0",
"##2"
],
[
"0",
"##4"
],
[
"0",
"##8"
],
[
"0",
"##5"
],
[
"1",
"##3"
],
[
"1",
"##7"
],
[
"1",
"##4"
],
[
"1",
"##9"
],
[
"1",
"##6"
],
[
"1",
"##1"
],
[
"8",
"##3"
],
[
"8",
"##7"
],
[
"8",
"##4"
],
[
"8",
"##9"
],
[
"8",
"##6"
],
[
"8",
"##1"
],
[
"9",
"##3"
],
[
"9",
"##2"
],
[
"9",
"##6"
],
[
"9",
"##8"
],
[
"9",
"##5"
],
[
"7",
"##9"
],
[
"9",
"##9"
],
[
"2",
"##7"
],
[
"2",
"##4"
],
[
"2",
"##1"
],
[
"3",
"##3"
],
[
"3",
"##9"
],
[
"3",
"##6"
],
[
"4",
"##3"
],
[
"4",
"##2"
],
[
"4",
"##9"
],
[
"4",
"##6"
],
[
"4",
"##8"
],
[
"4",
"##5"
],
[
"5",
"##7"
],
[
"5",
"##2"
],
[
"5",
"##4"
],
[
"5",
"##8"
],
[
"5",
"##5"
],
[
"5",
"##1"
],
[
"6",
"##7"
],
[
"6",
"##4"
],
[
"6",
"##1"
],
[
"7",
"##3"
],
[
"7",
"##6"
],
[
"3",
"##7"
]
]
}
}