cz_ec_bpe250 / tokenizer.json
iszoke's picture
Upload tokenizer
5cb150a verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "([bos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "([eos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "([unk])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "([pad])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "([mask])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 247,
"content": "(LNG)",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 248,
"content": "(UNK)",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 249,
"content": "(SPN)",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 1
}
}
],
"special_tokens": {
"([bos])": {
"id": "([bos])",
"ids": [
0
],
"tokens": [
"([bos])"
]
},
"([eos])": {
"id": "([eos])",
"ids": [
1
],
"tokens": [
"([eos])"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "([unk])",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"([bos])": 0,
"([eos])": 1,
"([unk])": 2,
"([pad])": 3,
"([mask])": 4,
"!": 5,
"%": 6,
"'": 7,
"(": 8,
")": 9,
"*": 10,
"+": 11,
",": 12,
"-": 13,
".": 14,
"/": 15,
"0": 16,
"1": 17,
"2": 18,
"3": 19,
"4": 20,
"5": 21,
"6": 22,
"7": 23,
"8": 24,
"9": 25,
":": 26,
";": 27,
"?": 28,
"A": 29,
"B": 30,
"C": 31,
"D": 32,
"E": 33,
"F": 34,
"G": 35,
"H": 36,
"I": 37,
"J": 38,
"K": 39,
"L": 40,
"M": 41,
"N": 42,
"O": 43,
"P": 44,
"Q": 45,
"R": 46,
"S": 47,
"T": 48,
"U": 49,
"V": 50,
"W": 51,
"X": 52,
"Y": 53,
"Z": 54,
"a": 55,
"b": 56,
"c": 57,
"d": 58,
"e": 59,
"f": 60,
"g": 61,
"h": 62,
"i": 63,
"j": 64,
"k": 65,
"l": 66,
"m": 67,
"n": 68,
"o": 69,
"p": 70,
"q": 71,
"r": 72,
"s": 73,
"t": 74,
"u": 75,
"v": 76,
"w": 77,
"x": 78,
"y": 79,
"z": 80,
"¡": 81,
"£": 82,
"¤": 83,
"¥": 84,
"¦": 85,
"§": 86,
"¨": 87,
"©": 88,
"ª": 89,
"¬": 90,
"®": 91,
"¯": 92,
"±": 93,
"²": 94,
"³": 95,
"´": 96,
"µ": 97,
"¶": 98,
"·": 99,
"¸": 100,
"¹": 101,
"º": 102,
"»": 103,
"¼": 104,
"½": 105,
"¾": 106,
"¿": 107,
"Â": 108,
"Ã": 109,
"Ä": 110,
"Å": 111,
"Ì": 112,
"Î": 113,
"Ï": 114,
"â": 115,
"Ġ": 116,
"Ģ": 117,
"ģ": 118,
"Ĥ": 119,
"ĥ": 120,
"Ħ": 121,
"ħ": 122,
"Ĩ": 123,
"ĩ": 124,
"Ī": 125,
"ī": 126,
"Ĭ": 127,
"ĭ": 128,
"Į": 129,
"į": 130,
"İ": 131,
"Ĳ": 132,
"ĳ": 133,
"Ĵ": 134,
"ĵ": 135,
"Ķ": 136,
"ķ": 137,
"ĸ": 138,
"Ĺ": 139,
"ĺ": 140,
"Ļ": 141,
"ļ": 142,
"Ľ": 143,
"ľ": 144,
"Ŀ": 145,
"ŀ": 146,
"Ł": 147,
"ł": 148,
"Ń": 149,
"ĠÎ": 150,
"Î±": 151,
"ÏĦ": 152,
"Î¹": 153,
"Î¿": 154,
"Îµ": 155,
"Î½": 156,
"Ïģ": 157,
"ĠÏ": 158,
"Ïħ": 159,
"ÏĤ": 160,
"Î·": 161,
"Ïĥ": 162,
"ĠÏĦ": 163,
"Î¼": 164,
"Î¯": 165,
"ÏĢ": 166,
"Î»": 167,
"Îº": 168,
"ĠÏĢ": 169,
"Î¬": 170,
"Î¿Ïħ": 171,
"ĠÎº": 172,
"ÏĮ": 173,
"ĠÎµ": 174,
"ĠÎ±": 175,
"ÎŃ": 176,
"ĠÏĥ": 177,
"Ïī": 178,
"Î±Î¹": 179,
"Î®": 180,
"Ïį": 181,
"ĠÎ¼": 182,
"Î³": 183,
"ÎµÎ¹": 184,
"Ïĩ": 185,
"Ïİ": 186,
"Î¹Î±": 187,
"Î¼Îµ": 188,
"ĠÎ½": 189,
"Î¹Îº": 190,
"ĠÎ´": 191,
"ĠÏĦÎ·": 192,
"ĠÎºÎ±Î¹": 193,
"ĠÎ½Î±": 194,
"Î´": 195,
"Î¸": 196,
"ÏģÎ¿": 197,
"ÏĨ": 198,
"ÏĦÎ±": 199,
"ÏĦÎµ": 200,
"Î¯Î±": 201,
"ÏĢÎ¿": 202,
"ĠÏĦÎ¿": 203,
"ĠÎ³": 204,
"Î¼Î±": 205,
"ÏģÎ±": 206,
"ÏĥÎ·": 207,
"ĠÎ¿": 208,
"ÏĦÎ·": 209,
"ÏģÎ¹": 210,
"ÏĦÎ¿": 211,
"ĠÎŃ": 212,
"ĠÎ¸": 213,
"Î¯Î½": 214,
"ÏīÎ½": 215,
"ĠÏĮ": 216,
"Î²": 217,
"ĠÎ³Î¹Î±": 218,
"Î¿ÏħÎ¼Îµ": 219,
"ÏĦÎ¹": 220,
"ĠÎ·": 221,
"ĠÏĦÎ·Î½": 222,
"Î½Î±": 223,
"Î½Î¿": 224,
"ĠÏĦÎ¿Ïħ": 225,
"ĠÎ±Ïħ": 226,
"Ġ,": 227,
"ĠÎ¼Îµ": 228,
"Ġ.": 229,
"ĠÏĢÏģ": 230,
"Î¿Î»": 231,
"ÏĦÎ¹Îº": 232,
"ĠÏĥÏħ": 233,
"ÎµÏĤ": 234,
"Î¾": 235,
"ĠÎºÎ±": 236,
"ÏħÏģ": 237,
"ÏĢÏĮ": 238,
"Î¿Ïį": 239,
"ĠÎ±ÏħÏĦ": 240,
"ĠÎ¸Î±": 241,
"Î¶": 242,
"ĠÏĦÎ·ÏĤ": 243,
"Î±ÏĤ": 244,
"Î¿Î¹": 245,
"ÎŃÏĤ": 246
},
"merges": [
"Ġ Î",
"Î ±",
"Ï Ħ",
"Î ¹",
"Î ¿",
"Î µ",
"Î ½",
"Ï ģ",
"Ġ Ï",
"Ï ħ",
"Ï Ĥ",
"Î ·",
"Ï ĥ",
"Ġ ÏĦ",
"Î ¼",
"Î ¯",
"Ï Ģ",
"Î »",
"Î º",
"ĠÏ Ģ",
"Î ¬",
"Î¿ Ïħ",
"ĠÎ º",
"Ï Į",
"ĠÎ µ",
"ĠÎ ±",
"Î Ń",
"ĠÏ ĥ",
"Ï ī",
"Î± Î¹",
"Î ®",
"Ï į",
"ĠÎ ¼",
"Î ³",
"Îµ Î¹",
"Ï ĩ",
"Ï İ",
"Î¹ Î±",
"Î¼ Îµ",
"ĠÎ ½",
"Î¹ Îº",
"ĠÎ ´",
"ĠÏĦ Î·",
"ĠÎº Î±Î¹",
"ĠÎ½ Î±",
"Î ´",
"Î ¸",
"Ïģ Î¿",
"Ï Ĩ",
"ÏĦ Î±",
"ÏĦ Îµ",
"Î¯ Î±",
"ÏĢ Î¿",
"ĠÏĦ Î¿",
"ĠÎ ³",
"Î¼ Î±",
"Ïģ Î±",
"Ïĥ Î·",
"ĠÎ ¿",
"ÏĦ Î·",
"Ïģ Î¹",
"ÏĦ Î¿",
"ĠÎ Ń",
"ĠÎ ¸",
"Î¯ Î½",
"Ïī Î½",
"ĠÏ Į",
"Î ²",
"ĠÎ³ Î¹Î±",
"Î¿Ïħ Î¼Îµ",
"ÏĦ Î¹",
"ĠÎ ·",
"ĠÏĦÎ· Î½",
"Î½ Î±",
"Î½ Î¿",
"ĠÏĦ Î¿Ïħ",
"ĠÎ± Ïħ",
"Ġ ,",
"ĠÎ¼ Îµ",
"Ġ .",
"ĠÏĢ Ïģ",
"Î¿ Î»",
"ÏĦ Î¹Îº",
"ĠÏĥ Ïħ",
"Îµ ÏĤ",
"Î ¾",
"ĠÎº Î±",
"Ïħ Ïģ",
"ÏĢ ÏĮ",
"Î¿ Ïį",
"ĠÎ±Ïħ ÏĦ",
"ĠÎ¸ Î±",
"Î ¶",
"ĠÏĦÎ· ÏĤ",
"Î± ÏĤ",
"Î¿ Î¹",
"ÎŃ ÏĤ"
]
}
}