maddition_ell_Grek_2000 / tokenizer.json
gsaltintas's picture
Upload folder using huggingface_hub
24dc7ef verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "NFC"
},
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"Regex": "[+=]|[^\\S\\r\\n]*[\\n\\r]+|[^\\S\\r\\n]+"
},
"behavior": "Isolated",
"invert": false
},
{
"type": "Split",
"pattern": {
"Regex": "\\p{N}{1,3}"
},
"behavior": "Isolated",
"invert": false
},
{
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": false
}
]
},
"post_processor": null,
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<s>": 0,
"</s>": 1,
"<pad>": 2,
"!": 3,
"\"": 4,
"#": 5,
"$": 6,
"%": 7,
"&": 8,
"'": 9,
"(": 10,
")": 11,
"*": 12,
"+": 13,
",": 14,
"-": 15,
".": 16,
"/": 17,
"0": 18,
"1": 19,
"2": 20,
"3": 21,
"4": 22,
"5": 23,
"6": 24,
"7": 25,
"8": 26,
"9": 27,
":": 28,
";": 29,
"<": 30,
"=": 31,
">": 32,
"?": 33,
"@": 34,
"A": 35,
"B": 36,
"C": 37,
"D": 38,
"E": 39,
"F": 40,
"G": 41,
"H": 42,
"I": 43,
"J": 44,
"K": 45,
"L": 46,
"M": 47,
"N": 48,
"O": 49,
"P": 50,
"Q": 51,
"R": 52,
"S": 53,
"T": 54,
"U": 55,
"V": 56,
"W": 57,
"X": 58,
"Y": 59,
"Z": 60,
"[": 61,
"\\": 62,
"]": 63,
"^": 64,
"_": 65,
"`": 66,
"a": 67,
"b": 68,
"c": 69,
"d": 70,
"e": 71,
"f": 72,
"g": 73,
"h": 74,
"i": 75,
"j": 76,
"k": 77,
"l": 78,
"m": 79,
"n": 80,
"o": 81,
"p": 82,
"q": 83,
"r": 84,
"s": 85,
"t": 86,
"u": 87,
"v": 88,
"w": 89,
"x": 90,
"y": 91,
"z": 92,
"{": 93,
"|": 94,
"}": 95,
"~": 96,
"¡": 97,
"¢": 98,
"£": 99,
"¤": 100,
"¥": 101,
"¦": 102,
"§": 103,
"¨": 104,
"©": 105,
"ª": 106,
"«": 107,
"¬": 108,
"®": 109,
"¯": 110,
"°": 111,
"±": 112,
"²": 113,
"³": 114,
"´": 115,
"µ": 116,
"¶": 117,
"·": 118,
"¸": 119,
"¹": 120,
"º": 121,
"»": 122,
"¼": 123,
"½": 124,
"¾": 125,
"¿": 126,
"À": 127,
"Á": 128,
"Â": 129,
"Ã": 130,
"Ä": 131,
"Å": 132,
"Æ": 133,
"Ç": 134,
"È": 135,
"É": 136,
"Ê": 137,
"Ë": 138,
"Ì": 139,
"Í": 140,
"Î": 141,
"Ï": 142,
"Ð": 143,
"Ñ": 144,
"Ò": 145,
"Ó": 146,
"Ô": 147,
"Õ": 148,
"Ö": 149,
"×": 150,
"Ø": 151,
"Ù": 152,
"Ú": 153,
"Û": 154,
"Ü": 155,
"Ý": 156,
"Þ": 157,
"ß": 158,
"à": 159,
"á": 160,
"â": 161,
"ã": 162,
"ä": 163,
"å": 164,
"æ": 165,
"ç": 166,
"è": 167,
"é": 168,
"ê": 169,
"ë": 170,
"ì": 171,
"í": 172,
"î": 173,
"ï": 174,
"ð": 175,
"ñ": 176,
"ò": 177,
"ó": 178,
"ô": 179,
"õ": 180,
"ö": 181,
"÷": 182,
"ø": 183,
"ù": 184,
"ú": 185,
"û": 186,
"ü": 187,
"ý": 188,
"þ": 189,
"ÿ": 190,
"Ā": 191,
"ā": 192,
"Ă": 193,
"ă": 194,
"Ą": 195,
"ą": 196,
"Ć": 197,
"ć": 198,
"Ĉ": 199,
"ĉ": 200,
"Ċ": 201,
"ċ": 202,
"Č": 203,
"č": 204,
"Ď": 205,
"ď": 206,
"Đ": 207,
"đ": 208,
"Ē": 209,
"ē": 210,
"Ĕ": 211,
"ĕ": 212,
"Ė": 213,
"ė": 214,
"Ę": 215,
"ę": 216,
"Ě": 217,
"ě": 218,
"Ĝ": 219,
"ĝ": 220,
"Ğ": 221,
"ğ": 222,
"Ġ": 223,
"ġ": 224,
"Ģ": 225,
"ģ": 226,
"Ĥ": 227,
"ĥ": 228,
"Ħ": 229,
"ħ": 230,
"Ĩ": 231,
"ĩ": 232,
"Ī": 233,
"ī": 234,
"Ĭ": 235,
"ĭ": 236,
"Į": 237,
"į": 238,
"İ": 239,
"ı": 240,
"IJ": 241,
"ij": 242,
"Ĵ": 243,
"ĵ": 244,
"Ķ": 245,
"ķ": 246,
"ĸ": 247,
"Ĺ": 248,
"ĺ": 249,
"Ļ": 250,
"ļ": 251,
"Ľ": 252,
"ľ": 253,
"Ŀ": 254,
"ŀ": 255,
"Ł": 256,
"ł": 257,
"Ń": 258,
"α": 259,
"ÏĦ": 260,
"ν": 261,
"ε": 262,
"ι": 263,
"κ": 264,
"ια": 265,
"Ïĥ": 266,
"ÏĮ": 267,
"ÏĦα": 268,
"νÏĦα": 269,
"κÏĮ": 270,
"Ïĥια": 271,
"κÏĮÏĥια": 272,
"Ïģ": 273,
"ο": 274,
"ÎŃ": 275,
"ÏĢ": 276,
"εν": 277,
"δ": 278,
"®Î½ÏĦα": 279,
"ήνÏĦα": 280,
"ÏĦÏģ": 281,
"εÎ": 282,
"ά": 283,
"ιακÏĮÏĥια": 284,
"ί": 285,
"ÏĦε": 286,
"ÎŃν": 287,
"ακÏĮÏĥια": 288,
"εÏĢ": 289,
"κα": 290,
"ÏĦακÏĮÏĥια": 291,
"οÎ": 292,
"οκ": 293,
"ενν": 294,
"ενήνÏĦα": 295,
"εξ": 296,
"άνÏĦα": 297,
"εκα": 298,
"»Î¹Î±": 299,
"λια": 300,
"Ïĩ": 301,
"ίλια": 302,
"Ïĩίλια": 303,
"Ïİ": 304,
"ÎŃνÏĦε": 305,
"εκαÏĦ": 306,
"¯Îº": 307,
"²Î´": 308,
"³Î´": 309,
"¼Î®Î½ÏĦα": 310,
"¾Î¹": 311,
"ξι": 312,
"αÏģ": 313,
"ÏĦά": 314,
"ÏĦÏİ": 315,
"ενÏĦα": 316,
"εÏģ": 317,
"ιάνÏĦα": 318,
"Ïĥι": 319,
"ÏĥÏĥ": 320,
"ÏĥαÏģ": 321,
"ÏĮνÏĦα": 322,
"οÏĥι": 323,
"ÎŃα": 324,
"ÎŃξι": 325,
"ÎŃÏĥÏĥ": 326,
"ÏĢενήνÏĦα": 327,
"ÏĢÎŃνÏĦε": 328,
"ÏĢενÏĦα": 329,
"ενενήνÏĦα": 330,
"διακÏĮÏĥια": 331,
"ÏĦÏģιακÏĮÏĥια": 332,
"ÏĦÏģί": 333,
"ÏĦÏģακÏĮÏĥια": 334,
"ÏĦÏģιάνÏĦα": 335,
"είκ": 336,
"εβδ": 337,
"ÏĦεÏĦÏģακÏĮÏĥια": 338,
"εÏĢÏĦακÏĮÏĥια": 339,
"εÏĢÏĦά": 340,
"ογδ": 341,
"ομήνÏĦα": 342,
"οκÏĦακÏĮÏĥια": 343,
"οκÏĦÏİ": 344,
"εννιακÏĮÏĥια": 345,
"εννÎŃα": 346,
"εξήνÏĦα": 347,
"εξακÏĮÏĥια": 348,
"εκαÏĦÏĮ": 349,
"εÏģα": 350,
"ÏĥαÏģάνÏĦα": 351,
"ÎŃÏĥÏĥεÏģα": 352,
"ÏĢενÏĦακÏĮÏĥια": 353,
"ÏĦÏģία": 354,
"είκοÏĥι": 355,
"εβδομήνÏĦα": 356,
"ογδÏĮνÏĦα": 357,
"εκαÏĦÏĮν": 358,
"Ïį": 359,
"ÏĦÎŃÏĥÏĥεÏģα": 360,
"δÏį": 361,
"ÎŃνα": 362,
"δÏįο": 363,
"δεκα": 364,
"ÎŃκα": 365,
"δÏİ": 366,
"δεκαÏĦ": 367,
"δÎŃκα": 368,
"ÎŃνÏĦεκα": 369,
"δεκαÎŃξι": 370,
"δεκαÏĢÎŃνÏĦε": 371,
"δεκαεÏĢÏĦά": 372,
"δεκαοκÏĦÏİ": 373,
"δεκαεννÎŃα": 374,
"δεκαÏĦÏģία": 375,
"δÏİδεκα": 376,
"δεκαÏĦÎŃÏĥÏĥεÏģα": 377,
"·Î´": 378,
"¼Î": 379,
"μÎ": 380,
"·Î´ÎŃν": 381,
"μηδÎŃν": 382
},
"merges": [
[
"Î",
"±"
],
[
"Ï",
"Ħ"
],
[
"Î",
"½"
],
[
"Î",
"µ"
],
[
"Î",
"¹"
],
[
"Î",
"º"
],
[
"ι",
"α"
],
[
"Ï",
"ĥ"
],
[
"Ï",
"Į"
],
[
"ÏĦ",
"α"
],
[
"ν",
"ÏĦα"
],
[
"κ",
"ÏĮ"
],
[
"Ïĥ",
"ια"
],
[
"κÏĮ",
"Ïĥια"
],
[
"Ï",
"ģ"
],
[
"Î",
"¿"
],
[
"Î",
"Ń"
],
[
"Ï",
"Ģ"
],
[
"ε",
"ν"
],
[
"Î",
"´"
],
[
"®",
"νÏĦα"
],
[
"Î",
"®Î½ÏĦα"
],
[
"ÏĦ",
"Ïģ"
],
[
"ε",
"Î"
],
[
"Î",
"¬"
],
[
"ια",
"κÏĮÏĥια"
],
[
"Î",
"¯"
],
[
"ÏĦ",
"ε"
],
[
"ÎŃ",
"ν"
],
[
"α",
"κÏĮÏĥια"
],
[
"ε",
"ÏĢ"
],
[
"κ",
"α"
],
[
"ÏĦα",
"κÏĮÏĥια"
],
[
"ο",
"Î"
],
[
"ο",
"κ"
],
[
"εν",
"ν"
],
[
"εν",
"ήνÏĦα"
],
[
"εÎ",
"¾"
],
[
"ά",
"νÏĦα"
],
[
"ε",
"κα"
],
[
"»",
"ια"
],
[
"Î",
"»Î¹Î±"
],
[
"Ï",
"ĩ"
],
[
"ί",
"λια"
],
[
"Ïĩ",
"ίλια"
],
[
"Ï",
"İ"
],
[
"ÎŃν",
"ÏĦε"
],
[
"εκα",
"ÏĦ"
],
[
"¯",
"κ"
],
[
"²",
"δ"
],
[
"³",
"δ"
],
[
"¼",
"ήνÏĦα"
],
[
"¾",
"ι"
],
[
"Î",
"¾Î¹"
],
[
"α",
"Ïģ"
],
[
"ÏĦ",
"ά"
],
[
"ÏĦ",
"Ïİ"
],
[
"ε",
"νÏĦα"
],
[
"ε",
"Ïģ"
],
[
"ι",
"άνÏĦα"
],
[
"Ïĥ",
"ι"
],
[
"Ïĥ",
"Ïĥ"
],
[
"Ïĥ",
"αÏģ"
],
[
"ÏĮ",
"νÏĦα"
],
[
"ο",
"Ïĥι"
],
[
"ÎŃ",
"α"
],
[
"ÎŃ",
"ξι"
],
[
"ÎŃ",
"ÏĥÏĥ"
],
[
"ÏĢ",
"ενήνÏĦα"
],
[
"ÏĢ",
"ÎŃνÏĦε"
],
[
"ÏĢ",
"ενÏĦα"
],
[
"εν",
"ενήνÏĦα"
],
[
"δ",
"ιακÏĮÏĥια"
],
[
"ÏĦÏģ",
"ιακÏĮÏĥια"
],
[
"ÏĦÏģ",
"ί"
],
[
"ÏĦÏģ",
"ακÏĮÏĥια"
],
[
"ÏĦÏģ",
"ιάνÏĦα"
],
[
"εÎ",
"¯Îº"
],
[
"εÎ",
"²Î´"
],
[
"ÏĦε",
"ÏĦÏģακÏĮÏĥια"
],
[
"εÏĢ",
"ÏĦακÏĮÏĥια"
],
[
"εÏĢ",
"ÏĦά"
],
[
"οÎ",
"³Î´"
],
[
"οÎ",
"¼Î®Î½ÏĦα"
],
[
"οκ",
"ÏĦακÏĮÏĥια"
],
[
"οκ",
"ÏĦÏİ"
],
[
"ενν",
"ιακÏĮÏĥια"
],
[
"ενν",
"ÎŃα"
],
[
"εξ",
"ήνÏĦα"
],
[
"εξ",
"ακÏĮÏĥια"
],
[
"εκαÏĦ",
"ÏĮ"
],
[
"εÏģ",
"α"
],
[
"ÏĥαÏģ",
"άνÏĦα"
],
[
"ÎŃÏĥÏĥ",
"εÏģα"
],
[
"ÏĢενÏĦα",
"κÏĮÏĥια"
],
[
"ÏĦÏģί",
"α"
],
[
"είκ",
"οÏĥι"
],
[
"εβδ",
"ομήνÏĦα"
],
[
"ογδ",
"ÏĮνÏĦα"
],
[
"εκαÏĦÏĮ",
"ν"
],
[
"Ï",
"į"
],
[
"ÏĦ",
"ÎŃÏĥÏĥεÏģα"
],
[
"δ",
"Ïį"
],
[
"ÎŃν",
"α"
],
[
"δÏį",
"ο"
],
[
"δ",
"εκα"
],
[
"ÎŃ",
"κα"
],
[
"δ",
"Ïİ"
],
[
"δ",
"εκαÏĦ"
],
[
"δ",
"ÎŃκα"
],
[
"ÎŃνÏĦε",
"κα"
],
[
"δεκα",
"ÎŃξι"
],
[
"δεκα",
"ÏĢÎŃνÏĦε"
],
[
"δεκα",
"εÏĢÏĦά"
],
[
"δεκα",
"οκÏĦÏİ"
],
[
"δεκα",
"εννÎŃα"
],
[
"δεκα",
"ÏĦÏģία"
],
[
"δÏİ",
"δεκα"
],
[
"δεκαÏĦ",
"ÎŃÏĥÏĥεÏģα"
],
[
"·",
"δ"
],
[
"¼",
"Î"
],
[
"Î",
"¼Î"
],
[
"·Î´",
"ÎŃν"
],
[
"μÎ",
"·Î´ÎŃν"
]
]
}
}