et_ec_bpe250 / tokenizer.json
iszoke's picture
Upload tokenizer
28b2429 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "([bos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "([eos])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "([unk])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "([pad])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "([mask])",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 247,
"content": "(LNG)",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 248,
"content": "(UNK)",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
},
{
"id": 249,
"content": "(SPN)",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": true,
"special": false
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "([eos])",
"type_id": 1
}
}
],
"special_tokens": {
"([bos])": {
"id": "([bos])",
"ids": [
0
],
"tokens": [
"([bos])"
]
},
"([eos])": {
"id": "([eos])",
"ids": [
1
],
"tokens": [
"([eos])"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "([unk])",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"([bos])": 0,
"([eos])": 1,
"([unk])": 2,
"([pad])": 3,
"([mask])": 4,
"!": 5,
"%": 6,
"'": 7,
"(": 8,
")": 9,
"*": 10,
"+": 11,
",": 12,
"-": 13,
".": 14,
"/": 15,
"0": 16,
"1": 17,
"2": 18,
"3": 19,
"4": 20,
"5": 21,
"6": 22,
"7": 23,
"8": 24,
"9": 25,
":": 26,
";": 27,
"?": 28,
"A": 29,
"B": 30,
"C": 31,
"D": 32,
"E": 33,
"F": 34,
"G": 35,
"H": 36,
"I": 37,
"J": 38,
"K": 39,
"L": 40,
"M": 41,
"N": 42,
"O": 43,
"P": 44,
"Q": 45,
"R": 46,
"S": 47,
"T": 48,
"U": 49,
"V": 50,
"W": 51,
"X": 52,
"Y": 53,
"Z": 54,
"a": 55,
"b": 56,
"c": 57,
"d": 58,
"e": 59,
"f": 60,
"g": 61,
"h": 62,
"i": 63,
"j": 64,
"k": 65,
"l": 66,
"m": 67,
"n": 68,
"o": 69,
"p": 70,
"q": 71,
"r": 72,
"s": 73,
"t": 74,
"u": 75,
"v": 76,
"w": 77,
"x": 78,
"y": 79,
"z": 80,
"¡": 81,
"¢": 82,
"£": 83,
"¤": 84,
"¥": 85,
"§": 86,
"¨": 87,
"©": 88,
"ª": 89,
"«": 90,
"¬": 91,
"®": 92,
"¯": 93,
"°": 94,
"±": 95,
"²": 96,
"³": 97,
"´": 98,
"µ": 99,
"¶": 100,
"·": 101,
"¸": 102,
"º": 103,
"¼": 104,
"½": 105,
"¾": 106,
"Â": 107,
"Ã": 108,
"Ä": 109,
"Å": 110,
"È": 111,
"â": 112,
"Ġ": 113,
"Ģ": 114,
"ģ": 115,
"Ĥ": 116,
"ĥ": 117,
"Ħ": 118,
"ħ": 119,
"ĩ": 120,
"ī": 121,
"Į": 122,
"į": 123,
"İ": 124,
"Ĳ": 125,
"ķ": 126,
"ĸ": 127,
"Ĺ": 128,
"ĺ": 129,
"Ļ": 130,
"Ľ": 131,
"ľ": 132,
"Ł": 133,
"ł": 134,
"Ń": 135,
"se": 136,
"Ġk": 137,
"Ġ,": 138,
"st": 139,
"Ġ.": 140,
"le": 141,
"Ã¤": 142,
"Ġt": 143,
"Ġm": 144,
"Ġo": 145,
"Ãµ": 146,
"Ġe": 147,
"id": 148,
"in": 149,
"Ġp": 150,
"Ġv": 151,
"ja": 152,
"Ġs": 153,
"da": 154,
"li": 155,
"Ġse": 156,
"ma": 157,
"me": 158,
"Ġa": 159,
"Ġn": 160,
"oo": 161,
"it": 162,
"Ã¼": 163,
"is": 164,
"Ġon": 165,
"ga": 166,
"ud": 167,
"Ġja": 168,
"ra": 169,
"ks": 170,
"Ġme": 171,
"us": 172,
"te": 173,
"va": 174,
"ta": 175,
"ik": 176,
"Ġte": 177,
"ur": 178,
"Ġka": 179,
"en": 180,
"Ġet": 181,
"Ġva": 182,
"la": 183,
"Ġko": 184,
"si": 185,
"lle": 186,
"es": 187,
"aa": 188,
"ust": 189,
"lt": 190,
"na": 191,
"Ãµi": 192,
"mi": 193,
"ri": 194,
"use": 195,
"Ġh": 196,
"Ġj": 197,
"pa": 198,
"Ã¶": 199,
"ge": 200,
"gi": 201,
"ne": 202,
"Ġku": 203,
"ee": 204,
"Ġ(": 205,
"lu": 206,
"ea": 207,
"il": 208,
"Ġselle": 209,
"Ġpa": 210,
"ĠÃ¼": 211,
"de": 212,
"ĠE": 213,
"gu": 214,
"Ġole": 215,
"Ġr": 216,
"Ġsee": 217,
"ĠvÃ¤": 218,
"uroo": 219,
"sta": 220,
"nd": 221,
"ine": 222,
"ku": 223,
"Ġta": 224,
"uroopa": 225,
"ti": 226,
"Ġei": 227,
"ĠEuroopa": 228,
"Ã¤r": 229,
"ida": 230,
"ko": 231,
"ha": 232,
"Ġl": 233,
"el": 234,
"ii": 235,
"Ġpea": 236,
"Ġtu": 237,
"ni": 238,
"vad": 239,
"tud": 240,
"Ġsu": 241,
"Ġsaa": 242,
"ĠtÃ¤": 243,
"est": 244,
"Ġsi": 245,
"Ġma": 246
},
"merges": [
"s e",
"Ġ k",
"Ġ ,",
"s t",
"Ġ .",
"l e",
"Ã ¤",
"Ġ t",
"Ġ m",
"Ġ o",
"Ã µ",
"Ġ e",
"i d",
"i n",
"Ġ p",
"Ġ v",
"j a",
"Ġ s",
"d a",
"l i",
"Ġ se",
"m a",
"m e",
"Ġ a",
"Ġ n",
"o o",
"i t",
"Ã ¼",
"i s",
"Ġo n",
"g a",
"u d",
"Ġ ja",
"r a",
"k s",
"Ġm e",
"u s",
"t e",
"v a",
"t a",
"i k",
"Ġt e",
"u r",
"Ġk a",
"e n",
"Ġe t",
"Ġv a",
"l a",
"Ġk o",
"s i",
"l le",
"e s",
"a a",
"u st",
"l t",
"n a",
"Ãµ i",
"m i",
"r i",
"u se",
"Ġ h",
"Ġ j",
"p a",
"Ã ¶",
"g e",
"g i",
"n e",
"Ġk u",
"e e",
"Ġ (",
"l u",
"e a",
"i l",
"Ġse lle",
"Ġp a",
"Ġ Ã¼",
"d e",
"Ġ E",
"g u",
"Ġo le",
"Ġ r",
"Ġse e",
"Ġv Ã¤",
"ur oo",
"st a",
"n d",
"in e",
"k u",
"Ġt a",
"uroo pa",
"t i",
"Ġe i",
"ĠE uroopa",
"Ã¤ r",
"id a",
"k o",
"h a",
"Ġ l",
"e l",
"i i",
"Ġp ea",
"Ġt u",
"n i",
"va d",
"t ud",
"Ġs u",
"Ġs aa",
"Ġt Ã¤",
"e st",
"Ġs i",
"Ġm a"
]
}
}