OpenWebMath-tokenizer-1 / tokenizer.json
parislo's picture
Upload tokenizer
b5d2f99 verified
raw
history blame
11.1 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"!": 0,
"\"": 1,
"#": 2,
"$": 3,
"%": 4,
"&": 5,
"'": 6,
"(": 7,
")": 8,
"*": 9,
"+": 10,
",": 11,
"-": 12,
".": 13,
"/": 14,
"0": 15,
"1": 16,
"2": 17,
"3": 18,
"4": 19,
"5": 20,
"6": 21,
"7": 22,
"8": 23,
"9": 24,
":": 25,
";": 26,
"<": 27,
"=": 28,
">": 29,
"?": 30,
"@": 31,
"A": 32,
"B": 33,
"C": 34,
"D": 35,
"E": 36,
"F": 37,
"G": 38,
"H": 39,
"I": 40,
"J": 41,
"K": 42,
"L": 43,
"M": 44,
"N": 45,
"O": 46,
"P": 47,
"Q": 48,
"R": 49,
"S": 50,
"T": 51,
"U": 52,
"V": 53,
"W": 54,
"X": 55,
"Y": 56,
"Z": 57,
"[": 58,
"\\": 59,
"]": 60,
"^": 61,
"_": 62,
"`": 63,
"a": 64,
"b": 65,
"c": 66,
"d": 67,
"e": 68,
"f": 69,
"g": 70,
"h": 71,
"i": 72,
"j": 73,
"k": 74,
"l": 75,
"m": 76,
"n": 77,
"o": 78,
"p": 79,
"q": 80,
"r": 81,
"s": 82,
"t": 83,
"u": 84,
"v": 85,
"w": 86,
"x": 87,
"y": 88,
"z": 89,
"{": 90,
"|": 91,
"}": 92,
"~": 93,
"¡": 94,
"¢": 95,
"£": 96,
"¤": 97,
"¥": 98,
"¦": 99,
"§": 100,
"¨": 101,
"©": 102,
"ª": 103,
"«": 104,
"¬": 105,
"®": 106,
"¯": 107,
"°": 108,
"±": 109,
"²": 110,
"³": 111,
"´": 112,
"µ": 113,
"¶": 114,
"·": 115,
"¸": 116,
"¹": 117,
"º": 118,
"»": 119,
"¼": 120,
"½": 121,
"¾": 122,
"¿": 123,
"À": 124,
"Á": 125,
"Â": 126,
"Ã": 127,
"Ä": 128,
"Å": 129,
"Æ": 130,
"Ç": 131,
"È": 132,
"É": 133,
"Ê": 134,
"Ë": 135,
"Ì": 136,
"Í": 137,
"Î": 138,
"Ï": 139,
"Ð": 140,
"Ñ": 141,
"Ò": 142,
"Ó": 143,
"Ô": 144,
"Õ": 145,
"Ö": 146,
"×": 147,
"Ø": 148,
"Ù": 149,
"Ú": 150,
"Û": 151,
"Ü": 152,
"Ý": 153,
"Þ": 154,
"ß": 155,
"à": 156,
"á": 157,
"â": 158,
"ã": 159,
"ä": 160,
"å": 161,
"æ": 162,
"ç": 163,
"è": 164,
"é": 165,
"ê": 166,
"ë": 167,
"ì": 168,
"í": 169,
"î": 170,
"ï": 171,
"ð": 172,
"ñ": 173,
"ò": 174,
"ó": 175,
"ô": 176,
"õ": 177,
"ö": 178,
"÷": 179,
"ø": 180,
"ù": 181,
"ú": 182,
"û": 183,
"ü": 184,
"ý": 185,
"þ": 186,
"ÿ": 187,
"Ā": 188,
"ā": 189,
"Ă": 190,
"ă": 191,
"Ą": 192,
"ą": 193,
"Ć": 194,
"ć": 195,
"Ĉ": 196,
"ĉ": 197,
"Ċ": 198,
"ċ": 199,
"Č": 200,
"č": 201,
"Ď": 202,
"ď": 203,
"Đ": 204,
"đ": 205,
"Ē": 206,
"ē": 207,
"Ĕ": 208,
"ĕ": 209,
"Ė": 210,
"ė": 211,
"Ę": 212,
"ę": 213,
"Ě": 214,
"ě": 215,
"Ĝ": 216,
"ĝ": 217,
"Ğ": 218,
"ğ": 219,
"Ġ": 220,
"ġ": 221,
"Ģ": 222,
"ģ": 223,
"Ĥ": 224,
"ĥ": 225,
"Ħ": 226,
"ħ": 227,
"Ĩ": 228,
"ĩ": 229,
"Ī": 230,
"ī": 231,
"Ĭ": 232,
"ĭ": 233,
"Į": 234,
"į": 235,
"İ": 236,
"ı": 237,
"IJ": 238,
"ij": 239,
"Ĵ": 240,
"ĵ": 241,
"Ķ": 242,
"ķ": 243,
"ĸ": 244,
"Ĺ": 245,
"ĺ": 246,
"Ļ": 247,
"ļ": 248,
"Ľ": 249,
"ľ": 250,
"Ŀ": 251,
"ŀ": 252,
"Ł": 253,
"ł": 254,
"Ń": 255,
"th": 256,
"ν": 257,
"ÏĦ": 258,
"the": 259,
"Ïģ": 260,
"ι": 261,
"Ġthe": 262,
"ε": 263,
"Ġi": 264,
"Ïģι": 265,
"Ġa": 266,
"ÏĦÏģι": 267,
"en": 268,
"re": 269,
"ĠÎ": 270,
"ÏĦÏģιÎ": 271,
"Ġo": 272,
"νÎ": 273,
"Ïī": 274,
"si": 275,
"Ïİ": 276,
"Ġis": 277,
"al": 278,
"ri": 279,
"Ïĥ": 280,
"Ïİν": 281,
"at": 282,
"es": 283,
"le": 284,
"on": 285,
"³Ïī": 286,
"¼Îµ": 287,
"¿Î": 288,
"Ġf": 289,
"ĠÏĦÏģιÎ": 290,
"Ġof": 291,
"³ÏīνÎ": 292,
"nd": 293,
"ºÏİν": 294,
"η": 295,
"Ġ2": 296,
"ÏĦÏģικÏİν": 297,
"¼ÎµÏĦÏģικÏİν": 298,
"¿Î¼ÎµÏĦÏģικÏİν": 299,
"ĠÏĦÏģιγÏīνÎ": 300,
"ĠÏĦÏģιγÏīνομεÏĦÏģικÏİν": 301,
"an": 302,
"he": 303,
"Ïħ": 304,
"Ġb": 305,
"Ġc": 306,
"Ġe": 307,
"Ġs": 308,
"Ġt": 309,
"Eu": 310,
"ion": 311,
"la": 312,
"mu": 313,
"om": 314,
"or": 315,
"ore": 316,
"se": 317,
"ten": 318,
"α": 319,
"ĠT": 320,
"Ġsi": 321,
"ĠEu": 322,
"Ġand": 323,
"Ġfu": 324,
"mula": 325,
"ormula": 326,
"Ġsid": 327,
"'s": 328,
"ag": 329,
"et": 330,
"hy": 331,
"po": 332,
"qu": 333,
"use": 334,
"ÏĢ": 335,
"ÏĤ": 336,
"Ġ+": 337,
"Ġ-": 338,
"Ġ=": 339,
"Ġl": 340,
"Ġth": 341,
"ĠÏĦ": 342,
"Ġre": 343,
"ĠÏĥ": 344,
"ther": 345,
"εÎ": 346,
"Ġin": 347,
"eng": 348,
"ent": 349,
"Ġγ": 350,
"Ġother": 351,
"rig": 352,
"Ïĥη": 353,
"ler": 354,
"Ġformula": 355,
"ĠThe": 356,
"ĠEuler": 357,
"Ġsides": 358,
"It": 359,
"Py": 360,
"am": 361,
"are": 362,
"ct": 363,
"gle": 364,
"hi": 365,
"ht": 366,
"in": 367,
"li": 368,
"lat": 369,
"nct": 370,
"ple": 371,
"ry": 372,
"um": 373,
"wo": 374,
"whe": 375,
"ºÎ": 376,
"¿Ïģ": 377,
"¿Ïħ": 378,
"Ġg": 379,
"Ġn": 380,
"Ġhy": 381,
"Ġrig": 382,
"ĠIt": 383,
"ĠPy": 384,
"thag": 385,
"ÏĦÎ": 386,
"Ġan": 387,
"Ġα": 388,
"Ġβ": 389,
"να": 390,
"Ïīν": 391,
"ndam": 392,
"Ġcom": 393,
"Ġex": 394,
"Ġsqu": 395,
"Ġtwo": 396,
"ions": 397,
"omet": 398,
"orem": 399,
"orean": 400,
"tenuse": 401,
"Ġfunct": 402,
"Ġfundam": 403,
"potenuse": 404,
"ÏĢÎ": 405,
"Ġleng": 406,
"Ġthat": 407,
"ĠÏĦη": 408,
"Ġrelat": 409,
"ĠÏĥÏħ": 410,
"εί": 411,
"ental": 412,
"plex": 413,
"where": 414,
"Ġhypotenuse": 415,
"Ġright": 416,
"ĠPythag": 417,
"Ġcomplex": 418,
"Ġsquare": 419,
"Ġfundamental": 420,
"Ġlength": 421,
"ĠPythagorean": 422
},
"merges": [
"t h",
"Î ½",
"Ï Ħ",
"th e",
"Ï ģ",
"Î ¹",
"Ġ the",
"Î µ",
"Ġ i",
"Ïģ ι",
"Ġ a",
"ÏĦ Ïģι",
"e n",
"r e",
"Ġ Î",
"ÏĦÏģι Î",
"Ġ o",
"ν Î",
"Ï ī",
"s i",
"Ï İ",
"Ġi s",
"a l",
"r i",
"Ï ĥ",
"Ïİ Î½",
"a t",
"e s",
"l e",
"o n",
"³ Ïī",
"¼ ε",
"¿ Î",
"Ġ f",
"Ġ ÏĦÏģιÎ",
"Ġo f",
"³Ïī νÎ",
"n d",
"º Ïİν",
"Î ·",
"Ġ 2",
"ÏĦÏģιΠºÏİν",
"¼Îµ ÏĦÏģικÏİν",
"¿Î ¼ÎµÏĦÏģικÏİν",
"ĠÏĦÏģιΠ³ÏīνÎ",
"ĠÏĦÏģιγÏīνΠ¿Î¼ÎµÏĦÏģικÏİν",
"a n",
"h e",
"Ï ħ",
"Ġ b",
"Ġ c",
"Ġ e",
"Ġ s",
"Ġ t",
"E u",
"i on",
"l a",
"m u",
"o m",
"o r",
"o re",
"s e",
"t en",
"Î ±",
"Ġ T",
"Ġ si",
"Ġ Eu",
"Ġa nd",
"Ġf u",
"mu la",
"or mula",
"Ġsi d",
"' s",
"a g",
"e t",
"h y",
"p o",
"q u",
"u se",
"Ï Ģ",
"Ï Ĥ",
"Ġ +",
"Ġ -",
"Ġ =",
"Ġ l",
"Ġ th",
"Ġ ÏĦ",
"Ġ re",
"Ġ Ïĥ",
"the r",
"ε Î",
"Ġi n",
"en g",
"en t",
"ĠÎ ³",
"Ġo ther",
"ri g",
"Ïĥ η",
"le r",
"Ġf ormula",
"ĠT he",
"ĠEu ler",
"Ġsid es",
"I t",
"P y",
"a m",
"a re",
"c t",
"g le",
"h i",
"h t",
"i n",
"l i",
"l at",
"n ct",
"p le",
"r y",
"u m",
"w o",
"w he",
"º Î",
"¿ Ïģ",
"¿ Ïħ",
"Ġ g",
"Ġ n",
"Ġ hy",
"Ġ rig",
"Ġ It",
"Ġ Py",
"th ag",
"ÏĦ Î",
"Ġa n",
"ĠÎ ±",
"ĠÎ ²",
"νΠ±",
"Ïī ν",
"nd am",
"Ġc om",
"Ġe x",
"Ġs qu",
"Ġt wo",
"ion s",
"om et",
"ore m",
"ore an",
"ten use",
"Ġfu nct",
"Ġfu ndam",
"po tenuse",
"ÏĢ Î",
"Ġl eng",
"Ġth at",
"ĠÏĦ η",
"Ġre lat",
"ĠÏĥ Ïħ",
"εΠ¯",
"ent al",
"ple x",
"whe re",
"Ġhy potenuse",
"Ġrig ht",
"ĠPy thag",
"Ġcom plex",
"Ġsqu are",
"Ġfundam ental",
"Ġleng th",
"ĠPythag orean"
]
}
}