bpe-character-tokenizer / tokenizer.json
qikp's picture
Upload 3 files
80cf276 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": null,
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"!": 0,
"\"": 1,
"#": 2,
"$": 3,
"%": 4,
"&": 5,
"'": 6,
"(": 7,
")": 8,
"*": 9,
"+": 10,
",": 11,
"-": 12,
".": 13,
"/": 14,
"0": 15,
"1": 16,
"2": 17,
"3": 18,
"4": 19,
"5": 20,
"6": 21,
"7": 22,
"8": 23,
"9": 24,
":": 25,
";": 26,
"<": 27,
"=": 28,
">": 29,
"?": 30,
"@": 31,
"A": 32,
"B": 33,
"C": 34,
"D": 35,
"E": 36,
"F": 37,
"G": 38,
"H": 39,
"I": 40,
"J": 41,
"K": 42,
"L": 43,
"M": 44,
"N": 45,
"O": 46,
"P": 47,
"Q": 48,
"R": 49,
"S": 50,
"T": 51,
"U": 52,
"V": 53,
"W": 54,
"X": 55,
"Y": 56,
"Z": 57,
"[": 58,
"\\": 59,
"]": 60,
"^": 61,
"_": 62,
"`": 63,
"a": 64,
"b": 65,
"c": 66,
"d": 67,
"e": 68,
"f": 69,
"g": 70,
"h": 71,
"i": 72,
"j": 73,
"k": 74,
"l": 75,
"m": 76,
"n": 77,
"o": 78,
"p": 79,
"q": 80,
"r": 81,
"s": 82,
"t": 83,
"u": 84,
"v": 85,
"w": 86,
"x": 87,
"y": 88,
"z": 89,
"{": 90,
"|": 91,
"}": 92,
"~": 93,
"¡": 94,
"¢": 95,
"£": 96,
"¤": 97,
"¥": 98,
"¦": 99,
"§": 100,
"¨": 101,
"©": 102,
"ª": 103,
"«": 104,
"¬": 105,
"®": 106,
"¯": 107,
"°": 108,
"±": 109,
"²": 110,
"³": 111,
"´": 112,
"µ": 113,
"¶": 114,
"·": 115,
"¸": 116,
"¹": 117,
"º": 118,
"»": 119,
"¼": 120,
"½": 121,
"¾": 122,
"¿": 123,
"Â": 124,
"Ã": 125,
"Ā": 126,
"ā": 127,
"Ă": 128,
"ă": 129,
"Ą": 130,
"ą": 131,
"Ć": 132,
"ć": 133,
"Ĉ": 134,
"ĉ": 135,
"Ċ": 136,
"ċ": 137,
"Č": 138,
"č": 139,
"Ď": 140,
"ď": 141,
"Đ": 142,
"đ": 143,
"Ē": 144,
"ē": 145,
"Ĕ": 146,
"ĕ": 147,
"Ė": 148,
"ė": 149,
"Ę": 150,
"ę": 151,
"Ě": 152,
"ě": 153,
"Ĝ": 154,
"ĝ": 155,
"Ğ": 156,
"ğ": 157,
"Ġ": 158,
"ġ": 159,
"Ģ": 160,
"ģ": 161,
"Ĥ": 162,
"ĥ": 163,
"Ħ": 164,
"ħ": 165,
"Ĩ": 166,
"ĩ": 167,
"Ī": 168,
"ī": 169,
"Ĭ": 170,
"ĭ": 171,
"Į": 172,
"į": 173,
"İ": 174,
"ı": 175,
"Ĳ": 176,
"ĳ": 177,
"Ĵ": 178,
"ĵ": 179,
"Ķ": 180,
"ķ": 181,
"ĸ": 182,
"Ĺ": 183,
"ĺ": 184,
"Ļ": 185,
"ļ": 186,
"Ľ": 187,
"ľ": 188,
"Ŀ": 189,
"ŀ": 190,
"Ł": 191,
"ł": 192,
"Ń": 193,
"Â¡": 194,
"Â¢": 195,
"Â£": 196,
"Â¤": 197,
"Â¥": 198,
"Â¦": 199,
"Â§": 200,
"Â¨": 201,
"Â©": 202,
"Âª": 203,
"Â«": 204,
"Â¬": 205,
"Â®": 206,
"Â¯": 207,
"Â°": 208,
"Â±": 209,
"Â²": 210,
"Â³": 211,
"Â´": 212,
"Âµ": 213,
"Â¶": 214,
"Â·": 215,
"Â¸": 216,
"Â¹": 217,
"Âº": 218,
"Â»": 219,
"Â¼": 220,
"Â½": 221,
"Â¾": 222,
"Â¿": 223,
"ÂĢ": 224,
"Âģ": 225,
"ÂĤ": 226,
"Âĥ": 227,
"ÂĦ": 228,
"Âħ": 229,
"ÂĨ": 230,
"Âĩ": 231,
"ÂĪ": 232,
"Âī": 233,
"ÂĬ": 234,
"Âĭ": 235,
"ÂĮ": 236,
"Âį": 237,
"Âİ": 238,
"Âı": 239,
"ÂĲ": 240,
"Âĳ": 241,
"ÂĴ": 242,
"Âĵ": 243,
"ÂĶ": 244,
"Âķ": 245,
"Âĸ": 246,
"ÂĹ": 247,
"Âĺ": 248,
"ÂĻ": 249,
"Âļ": 250,
"ÂĽ": 251,
"Âľ": 252,
"ÂĿ": 253,
"Âŀ": 254,
"ÂŁ": 255,
"Âł": 256,
"ÂŃ": 257,
"Ã¡": 258,
"Ã¢": 259,
"Ã£": 260,
"Ã¤": 261,
"Ã¥": 262,
"Ã¦": 263,
"Ã§": 264,
"Ã¨": 265,
"Ã©": 266,
"Ãª": 267,
"Ã«": 268,
"Ã¬": 269,
"Ã®": 270,
"Ã¯": 271,
"Ã°": 272,
"Ã±": 273,
"Ã²": 274,
"Ã³": 275,
"Ã´": 276,
"Ãµ": 277,
"Ã¶": 278,
"Ã·": 279,
"Ã¸": 280,
"Ã¹": 281,
"Ãº": 282,
"Ã»": 283,
"Ã¼": 284,
"Ã½": 285,
"Ã¾": 286,
"Ã¿": 287,
"ÃĢ": 288,
"Ãģ": 289,
"ÃĤ": 290,
"Ãĥ": 291,
"ÃĦ": 292,
"Ãħ": 293,
"ÃĨ": 294,
"Ãĩ": 295,
"ÃĪ": 296,
"Ãī": 297,
"ÃĬ": 298,
"Ãĭ": 299,
"ÃĮ": 300,
"Ãį": 301,
"Ãİ": 302,
"Ãı": 303,
"ÃĲ": 304,
"Ãĳ": 305,
"ÃĴ": 306,
"Ãĵ": 307,
"ÃĶ": 308,
"Ãķ": 309,
"Ãĸ": 310,
"ÃĹ": 311,
"Ãĺ": 312,
"ÃĻ": 313,
"Ãļ": 314,
"ÃĽ": 315,
"Ãľ": 316,
"ÃĿ": 317,
"Ãŀ": 318,
"ÃŁ": 319,
"Ãł": 320,
"ÃŃ": 321
},
"merges": [
[
"Â",
"¡"
],
[
"Â",
"¢"
],
[
"Â",
"£"
],
[
"Â",
"¤"
],
[
"Â",
"¥"
],
[
"Â",
"¦"
],
[
"Â",
"§"
],
[
"Â",
"¨"
],
[
"Â",
"©"
],
[
"Â",
"ª"
],
[
"Â",
"«"
],
[
"Â",
"¬"
],
[
"Â",
"®"
],
[
"Â",
"¯"
],
[
"Â",
"°"
],
[
"Â",
"±"
],
[
"Â",
"²"
],
[
"Â",
"³"
],
[
"Â",
"´"
],
[
"Â",
"µ"
],
[
"Â",
"¶"
],
[
"Â",
"·"
],
[
"Â",
"¸"
],
[
"Â",
"¹"
],
[
"Â",
"º"
],
[
"Â",
"»"
],
[
"Â",
"¼"
],
[
"Â",
"½"
],
[
"Â",
"¾"
],
[
"Â",
"¿"
],
[
"Â",
"Ģ"
],
[
"Â",
"ģ"
],
[
"Â",
"Ĥ"
],
[
"Â",
"ĥ"
],
[
"Â",
"Ħ"
],
[
"Â",
"ħ"
],
[
"Â",
"Ĩ"
],
[
"Â",
"ĩ"
],
[
"Â",
"Ī"
],
[
"Â",
"ī"
],
[
"Â",
"Ĭ"
],
[
"Â",
"ĭ"
],
[
"Â",
"Į"
],
[
"Â",
"į"
],
[
"Â",
"İ"
],
[
"Â",
"ı"
],
[
"Â",
"Ĳ"
],
[
"Â",
"ĳ"
],
[
"Â",
"Ĵ"
],
[
"Â",
"ĵ"
],
[
"Â",
"Ķ"
],
[
"Â",
"ķ"
],
[
"Â",
"ĸ"
],
[
"Â",
"Ĺ"
],
[
"Â",
"ĺ"
],
[
"Â",
"Ļ"
],
[
"Â",
"ļ"
],
[
"Â",
"Ľ"
],
[
"Â",
"ľ"
],
[
"Â",
"Ŀ"
],
[
"Â",
"ŀ"
],
[
"Â",
"Ł"
],
[
"Â",
"ł"
],
[
"Â",
"Ń"
],
[
"Ã",
"¡"
],
[
"Ã",
"¢"
],
[
"Ã",
"£"
],
[
"Ã",
"¤"
],
[
"Ã",
"¥"
],
[
"Ã",
"¦"
],
[
"Ã",
"§"
],
[
"Ã",
"¨"
],
[
"Ã",
"©"
],
[
"Ã",
"ª"
],
[
"Ã",
"«"
],
[
"Ã",
"¬"
],
[
"Ã",
"®"
],
[
"Ã",
"¯"
],
[
"Ã",
"°"
],
[
"Ã",
"±"
],
[
"Ã",
"²"
],
[
"Ã",
"³"
],
[
"Ã",
"´"
],
[
"Ã",
"µ"
],
[
"Ã",
"¶"
],
[
"Ã",
"·"
],
[
"Ã",
"¸"
],
[
"Ã",
"¹"
],
[
"Ã",
"º"
],
[
"Ã",
"»"
],
[
"Ã",
"¼"
],
[
"Ã",
"½"
],
[
"Ã",
"¾"
],
[
"Ã",
"¿"
],
[
"Ã",
"Ģ"
],
[
"Ã",
"ģ"
],
[
"Ã",
"Ĥ"
],
[
"Ã",
"ĥ"
],
[
"Ã",
"Ħ"
],
[
"Ã",
"ħ"
],
[
"Ã",
"Ĩ"
],
[
"Ã",
"ĩ"
],
[
"Ã",
"Ī"
],
[
"Ã",
"ī"
],
[
"Ã",
"Ĭ"
],
[
"Ã",
"ĭ"
],
[
"Ã",
"Į"
],
[
"Ã",
"į"
],
[
"Ã",
"İ"
],
[
"Ã",
"ı"
],
[
"Ã",
"Ĳ"
],
[
"Ã",
"ĳ"
],
[
"Ã",
"Ĵ"
],
[
"Ã",
"ĵ"
],
[
"Ã",
"Ķ"
],
[
"Ã",
"ķ"
],
[
"Ã",
"ĸ"
],
[
"Ã",
"Ĺ"
],
[
"Ã",
"ĺ"
],
[
"Ã",
"Ļ"
],
[
"Ã",
"ļ"
],
[
"Ã",
"Ľ"
],
[
"Ã",
"ľ"
],
[
"Ã",
"Ŀ"
],
[
"Ã",
"ŀ"
],
[
"Ã",
"Ł"
],
[
"Ã",
"ł"
],
[
"Ã",
"Ń"
]
]
}
}