Upload tokenizer
241cc30 verified | | { |
| | "version": "1.0", |
| | "truncation": null, |
| | "padding": null, |
| | "added_tokens": [ |
| | { |
| | "id": 0, |
| | "content": "[PAD]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | }, |
| | { |
| | "id": 1, |
| | "content": "[UNK]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | }, |
| | { |
| | "id": 2, |
| | "content": "[CLS]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | }, |
| | { |
| | "id": 3, |
| | "content": "[SEP]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | }, |
| | { |
| | "id": 4, |
| | "content": "[MASK]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | }, |
| | { |
| | "id": 5, |
| | "content": "[SOS]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | }, |
| | { |
| | "id": 6, |
| | "content": "[EOS]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | }, |
| | { |
| | "id": 7, |
| | "content": "[SPACE]", |
| | "single_word": false, |
| | "lstrip": false, |
| | "rstrip": false, |
| | "normalized": false, |
| | "special": true |
| | } |
| | ], |
| | "normalizer": null, |
| | "pre_tokenizer": { |
| | "type": "Whitespace" |
| | }, |
| | "post_processor": null, |
| | "decoder": null, |
| | "model": { |
| | "type": "WordLevel", |
| | "vocab": { |
| | "[PAD]": 0, |
| | "[UNK]": 1, |
| | "[CLS]": 2, |
| | "[SEP]": 3, |
| | "[MASK]": 4, |
| | "[SOS]": 5, |
| | "[EOS]": 6, |
| | "[SPACE]": 7, |
| | "’": 8, |
| | "Ng": 9, |
| | "a": 10, |
| | "e": 11, |
| | "ng": 12, |
| | "o": 13, |
| | "\u0000": 14, |
| | "\u0001": 15, |
| | "\u0002": 16, |
| | "\u0003": 17, |
| | "\u0004": 18, |
| | "\u0005": 19, |
| | "\u0006": 20, |
| | "\u0007": 21, |
| | "\b": 22, |
| | "\u000e": 23, |
| | "\u000f": 24, |
| | "\u0010": 25, |
| | "\u0011": 26, |
| | "\u0012": 27, |
| | "\u0013": 28, |
| | "\u0014": 29, |
| | "\u0015": 30, |
| | "\u0016": 31, |
| | "\u0017": 32, |
| | "\u0018": 33, |
| | "\u0019": 34, |
| | "\u001a": 35, |
| | "\u001b": 36, |
| | "\u001c": 37, |
| | "\u001d": 38, |
| | "\u001e": 39, |
| | "\u001f": 40, |
| | "!": 41, |
| | "\"": 42, |
| | "#": 43, |
| | "$": 44, |
| | "%": 45, |
| | "&": 46, |
| | "'": 47, |
| | "(": 48, |
| | ")": 49, |
| | "*": 50, |
| | "+": 51, |
| | ",": 52, |
| | "-": 53, |
| | ".": 54, |
| | "/": 55, |
| | "0": 56, |
| | "1": 57, |
| | "2": 58, |
| | "3": 59, |
| | "4": 60, |
| | "5": 61, |
| | "6": 62, |
| | "7": 63, |
| | "8": 64, |
| | "9": 65, |
| | ":": 66, |
| | ";": 67, |
| | "<": 68, |
| | "=": 69, |
| | ">": 70, |
| | "?": 71, |
| | "@": 72, |
| | "A": 73, |
| | "B": 74, |
| | "Ba": 75, |
| | "Be": 76, |
| | "Bi": 77, |
| | "Bo": 78, |
| | "Bu": 79, |
| | "Bwa": 80, |
| | "Bwe": 81, |
| | "Bwi": 82, |
| | "C": 83, |
| | "Cha": 84, |
| | "Che": 85, |
| | "Chi": 86, |
| | "Cho": 87, |
| | "Chu": 88, |
| | "Chwa": 89, |
| | "Chwe": 90, |
| | "Chwi": 91, |
| | "D": 92, |
| | "Da": 93, |
| | "De": 94, |
| | "Dha": 95, |
| | "Dhe": 96, |
| | "Dhi": 97, |
| | "Dho": 98, |
| | "Dhu": 99, |
| | "Di": 100, |
| | "Do": 101, |
| | "Du": 102, |
| | "E": 103, |
| | "F": 104, |
| | "Fa": 105, |
| | "Fe": 106, |
| | "Fi": 107, |
| | "Fo": 108, |
| | "Fu": 109, |
| | "G": 110, |
| | "Ga": 111, |
| | "Ge": 112, |
| | "Gha": 113, |
| | "Ghe": 114, |
| | "Ghi": 115, |
| | "Gho": 116, |
| | "Ghu": 117, |
| | "Gi": 118, |
| | "Go": 119, |
| | "Gu": 120, |
| | "Gwa": 121, |
| | "Gwe": 122, |
| | "Gwi": 123, |
| | "H": 124, |
| | "Ha": 125, |
| | "He": 126, |
| | "Hi": 127, |
| | "Ho": 128, |
| | "Hu": 129, |
| | "I": 130, |
| | "J": 131, |
| | "Ja": 132, |
| | "Je": 133, |
| | "Ji": 134, |
| | "Jo": 135, |
| | "Ju": 136, |
| | "Jwa": 137, |
| | "Jwe": 138, |
| | "Jwi": 139, |
| | "K": 140, |
| | "Ka": 141, |
| | "Ke": 142, |
| | "Kha": 143, |
| | "Khe": 144, |
| | "Kho": 145, |
| | "Khu": 146, |
| | "Ki": 147, |
| | "Ko": 148, |
| | "Ku": 149, |
| | "Kwa": 150, |
| | "Kwe": 151, |
| | "Kwi": 152, |
| | "L": 153, |
| | "La": 154, |
| | "Le": 155, |
| | "Li": 156, |
| | "Lo": 157, |
| | "Lu": 158, |
| | "Lwa": 159, |
| | "Lwe": 160, |
| | "Lwi": 161, |
| | "M": 162, |
| | "Ma": 163, |
| | "Mba": 164, |
| | "Mbe": 165, |
| | "Mbi": 166, |
| | "Mbo": 167, |
| | "Mbu": 168, |
| | "Mbwa": 169, |
| | "Mbwe": 170, |
| | "Mbwi": 171, |
| | "Me": 172, |
| | "Mi": 173, |
| | "Mo": 174, |
| | "Mu": 175, |
| | "Mwa": 176, |
| | "Mwe": 177, |
| | "Mwi": 178, |
| | "N": 179, |
| | "Na": 180, |
| | "Nda": 181, |
| | "Nde": 182, |
| | "Ndi": 183, |
| | "Ndo": 184, |
| | "Ndu": 185, |
| | "Ndwa": 186, |
| | "Ndwe": 187, |
| | "Ndwi": 188, |
| | "Ne": 189, |
| | "Nga": 190, |
| | "Nge": 191, |
| | "Ngi": 192, |
| | "Ngo": 193, |
| | "Ngu": 194, |
| | "Ngwa": 195, |
| | "Ngwe": 196, |
| | "Ngwi": 197, |
| | "Ni": 198, |
| | "Nja": 199, |
| | "Nje": 200, |
| | "Nji": 201, |
| | "Njo": 202, |
| | "Nju": 203, |
| | "Njwa": 204, |
| | "Njwe": 205, |
| | "Njwi": 206, |
| | "No": 207, |
| | "Nu": 208, |
| | "Nya": 209, |
| | "Nye": 210, |
| | "Nyi": 211, |
| | "Nyo": 212, |
| | "Nyu": 213, |
| | "Nywa": 214, |
| | "Nywe": 215, |
| | "Nza": 216, |
| | "Nze": 217, |
| | "Nzi": 218, |
| | "Nzo": 219, |
| | "Nzu": 220, |
| | "O": 221, |
| | "P": 222, |
| | "Pa": 223, |
| | "Pe": 224, |
| | "Pi": 225, |
| | "Po": 226, |
| | "Pu": 227, |
| | "Pwa": 228, |
| | "Pwe": 229, |
| | "Pwi": 230, |
| | "Pwo": 231, |
| | "Q": 232, |
| | "R": 233, |
| | "Ra": 234, |
| | "Re": 235, |
| | "Ri": 236, |
| | "Ro": 237, |
| | "Ru": 238, |
| | "S": 239, |
| | "Sa": 240, |
| | "Se": 241, |
| | "Sha": 242, |
| | "She": 243, |
| | "Shi": 244, |
| | "Sho": 245, |
| | "Shu": 246, |
| | "Shwa": 247, |
| | "Shwe": 248, |
| | "Shwi": 249, |
| | "Si": 250, |
| | "So": 251, |
| | "Su": 252, |
| | "Swa": 253, |
| | "Swe": 254, |
| | "Swi": 255, |
| | "T": 256, |
| | "Ta": 257, |
| | "Te": 258, |
| | "Tha": 259, |
| | "The": 260, |
| | "Thi": 261, |
| | "Tho": 262, |
| | "Thu": 263, |
| | "Ti": 264, |
| | "To": 265, |
| | "Twa": 266, |
| | "Twe": 267, |
| | "Twi": 268, |
| | "U": 269, |
| | "V": 270, |
| | "Va": 271, |
| | "Ve": 272, |
| | "Vi": 273, |
| | "Vo": 274, |
| | "Vu": 275, |
| | "Vya": 276, |
| | "Vye": 277, |
| | "Vyo": 278, |
| | "W": 279, |
| | "Wa": 280, |
| | "We": 281, |
| | "Wi": 282, |
| | "Wo": 283, |
| | "Wu": 284, |
| | "X": 285, |
| | "Y": 286, |
| | "Ya": 287, |
| | "Ye": 288, |
| | "Yi": 289, |
| | "Yo": 290, |
| | "Yu": 291, |
| | "Z": 292, |
| | "Za": 293, |
| | "Ze": 294, |
| | "Zi": 295, |
| | "Zo": 296, |
| | "Zu": 297, |
| | "Zwa": 298, |
| | "Zwe": 299, |
| | "Zwi": 300, |
| | "[": 301, |
| | "\\": 302, |
| | "]": 303, |
| | "^": 304, |
| | "_": 305, |
| | "`": 306, |
| | "b": 307, |
| | "ba": 308, |
| | "be": 309, |
| | "bi": 310, |
| | "bo": 311, |
| | "bu": 312, |
| | "bwa": 313, |
| | "bwe": 314, |
| | "bwi": 315, |
| | "c": 316, |
| | "cha": 317, |
| | "che": 318, |
| | "chi": 319, |
| | "cho": 320, |
| | "chu": 321, |
| | "chwa": 322, |
| | "chwe": 323, |
| | "chwi": 324, |
| | "d": 325, |
| | "da": 326, |
| | "de": 327, |
| | "dha": 328, |
| | "dhe": 329, |
| | "dhi": 330, |
| | "dho": 331, |
| | "dhu": 332, |
| | "di": 333, |
| | "do": 334, |
| | "du": 335, |
| | "f": 336, |
| | "fa": 337, |
| | "fe": 338, |
| | "fi": 339, |
| | "fo": 340, |
| | "fu": 341, |
| | "g": 342, |
| | "ga": 343, |
| | "ge": 344, |
| | "gha": 345, |
| | "ghe": 346, |
| | "ghi": 347, |
| | "gho": 348, |
| | "ghu": 349, |
| | "gi": 350, |
| | "go": 351, |
| | "gu": 352, |
| | "gwa": 353, |
| | "gwe": 354, |
| | "gwi": 355, |
| | "h": 356, |
| | "ha": 357, |
| | "he": 358, |
| | "hi": 359, |
| | "ho": 360, |
| | "hu": 361, |
| | "i": 362, |
| | "j": 363, |
| | "ja": 364, |
| | "je": 365, |
| | "ji": 366, |
| | "jo": 367, |
| | "ju": 368, |
| | "jwa": 369, |
| | "jwe": 370, |
| | "jwi": 371, |
| | "k": 372, |
| | "ka": 373, |
| | "ke": 374, |
| | "kha": 375, |
| | "khe": 376, |
| | "kho": 377, |
| | "khu": 378, |
| | "ki": 379, |
| | "ko": 380, |
| | "ku": 381, |
| | "kwa": 382, |
| | "kwe": 383, |
| | "kwi": 384, |
| | "l": 385, |
| | "la": 386, |
| | "le": 387, |
| | "li": 388, |
| | "lo": 389, |
| | "lu": 390, |
| | "lwa": 391, |
| | "lwe": 392, |
| | "lwi": 393, |
| | "m": 394, |
| | "ma": 395, |
| | "mba": 396, |
| | "mbe": 397, |
| | "mbi": 398, |
| | "mbo": 399, |
| | "mbu": 400, |
| | "mbwa": 401, |
| | "mbwe": 402, |
| | "mbwi": 403, |
| | "me": 404, |
| | "mi": 405, |
| | "mo": 406, |
| | "mu": 407, |
| | "mwa": 408, |
| | "mwe": 409, |
| | "mwi": 410, |
| | "n": 411, |
| | "na": 412, |
| | "nda": 413, |
| | "nde": 414, |
| | "ndi": 415, |
| | "ndo": 416, |
| | "ndu": 417, |
| | "ndwa": 418, |
| | "ndwe": 419, |
| | "ndwi": 420, |
| | "ne": 421, |
| | "nga": 422, |
| | "nge": 423, |
| | "ngi": 424, |
| | "ngo": 425, |
| | "ngu": 426, |
| | "ngwa": 427, |
| | "ngwe": 428, |
| | "ngwi": 429, |
| | "ni": 430, |
| | "nja": 431, |
| | "nje": 432, |
| | "nji": 433, |
| | "njo": 434, |
| | "nju": 435, |
| | "njwa": 436, |
| | "njwe": 437, |
| | "njwi": 438, |
| | "no": 439, |
| | "nu": 440, |
| | "nya": 441, |
| | "nye": 442, |
| | "nyi": 443, |
| | "nyo": 444, |
| | "nyu": 445, |
| | "nywa": 446, |
| | "nywe": 447, |
| | "nza": 448, |
| | "nze": 449, |
| | "nzi": 450, |
| | "nzo": 451, |
| | "nzu": 452, |
| | "p": 453, |
| | "pa": 454, |
| | "pe": 455, |
| | "pi": 456, |
| | "po": 457, |
| | "pu": 458, |
| | "pwa": 459, |
| | "pwe": 460, |
| | "pwi": 461, |
| | "pwo": 462, |
| | "q": 463, |
| | "r": 464, |
| | "ra": 465, |
| | "re": 466, |
| | "ri": 467, |
| | "ro": 468, |
| | "ru": 469, |
| | "s": 470, |
| | "sa": 471, |
| | "se": 472, |
| | "sha": 473, |
| | "she": 474, |
| | "shi": 475, |
| | "sho": 476, |
| | "shu": 477, |
| | "shwa": 478, |
| | "shwe": 479, |
| | "shwi": 480, |
| | "si": 481, |
| | "so": 482, |
| | "su": 483, |
| | "swa": 484, |
| | "swe": 485, |
| | "swi": 486, |
| | "t": 487, |
| | "ta": 488, |
| | "te": 489, |
| | "tha": 490, |
| | "the": 491, |
| | "thi": 492, |
| | "tho": 493, |
| | "thu": 494, |
| | "ti": 495, |
| | "to": 496, |
| | "twa": 497, |
| | "twe": 498, |
| | "twi": 499, |
| | "u": 500, |
| | "v": 501, |
| | "va": 502, |
| | "ve": 503, |
| | "vi": 504, |
| | "vo": 505, |
| | "vu": 506, |
| | "vya": 507, |
| | "vye": 508, |
| | "vyo": 509, |
| | "w": 510, |
| | "wa": 511, |
| | "we": 512, |
| | "wi": 513, |
| | "wo": 514, |
| | "wu": 515, |
| | "x": 516, |
| | "y": 517, |
| | "ya": 518, |
| | "ye": 519, |
| | "yi": 520, |
| | "yo": 521, |
| | "yu": 522, |
| | "z": 523, |
| | "za": 524, |
| | "ze": 525, |
| | "zi": 526, |
| | "zo": 527, |
| | "zu": 528, |
| | "zwa": 529, |
| | "zwe": 530, |
| | "zwi": 531, |
| | "{": 532, |
| | "|": 533, |
| | "}": 534, |
| | "~": 535, |
| | "\u007f": 536, |
| | "\u0080": 537, |
| | "\u0081": 538, |
| | "\u0082": 539, |
| | "\u0083": 540, |
| | "\u0084": 541, |
| | "\u0086": 542, |
| | "\u0087": 543, |
| | "\u0088": 544, |
| | "\u0089": 545, |
| | "\u008a": 546, |
| | "\u008b": 547, |
| | "\u008c": 548, |
| | "\u008d": 549, |
| | "\u008e": 550, |
| | "\u008f": 551, |
| | "\u0090": 552, |
| | "\u0091": 553, |
| | "\u0092": 554, |
| | "\u0093": 555, |
| | "\u0094": 556, |
| | "\u0095": 557, |
| | "\u0096": 558, |
| | "\u0097": 559, |
| | "\u0098": 560, |
| | "\u0099": 561, |
| | "\u009a": 562, |
| | "\u009b": 563, |
| | "\u009c": 564, |
| | "\u009d": 565, |
| | "\u009e": 566, |
| | "\u009f": 567, |
| | "¡": 568, |
| | "¢": 569, |
| | "£": 570, |
| | "¤": 571, |
| | "¥": 572, |
| | "¦": 573, |
| | "§": 574, |
| | "¨": 575, |
| | "©": 576, |
| | "ª": 577, |
| | "«": 578, |
| | "¬": 579, |
| | "\u00ad": 580, |
| | "®": 581, |
| | "¯": 582, |
| | "°": 583, |
| | "±": 584, |
| | "²": 585, |
| | "³": 586, |
| | "´": 587, |
| | "µ": 588, |
| | "¶": 589, |
| | "·": 590, |
| | "¸": 591, |
| | "¹": 592, |
| | "º": 593, |
| | "»": 594, |
| | "¼": 595, |
| | "½": 596, |
| | "¾": 597, |
| | "¿": 598, |
| | "À": 599, |
| | "Á": 600, |
| | "Â": 601, |
| | "Ã": 602, |
| | "Ä": 603, |
| | "Å": 604, |
| | "Æ": 605, |
| | "Ç": 606, |
| | "È": 607, |
| | "É": 608, |
| | "Ê": 609, |
| | "Ë": 610, |
| | "Ì": 611, |
| | "Í": 612, |
| | "Î": 613, |
| | "Ï": 614, |
| | "Ð": 615, |
| | "Ñ": 616, |
| | "Ò": 617, |
| | "Ó": 618, |
| | "Ô": 619, |
| | "Õ": 620, |
| | "Ö": 621, |
| | "×": 622, |
| | "Ø": 623, |
| | "Ù": 624, |
| | "Ú": 625, |
| | "Û": 626, |
| | "Ü": 627, |
| | "Ý": 628, |
| | "Þ": 629, |
| | "ß": 630, |
| | "à": 631, |
| | "á": 632, |
| | "â": 633, |
| | "ã": 634, |
| | "ä": 635, |
| | "å": 636, |
| | "æ": 637, |
| | "ç": 638, |
| | "è": 639, |
| | "é": 640, |
| | "ê": 641, |
| | "ë": 642, |
| | "ì": 643, |
| | "í": 644, |
| | "î": 645, |
| | "ï": 646, |
| | "ð": 647, |
| | "ñ": 648, |
| | "ò": 649, |
| | "ó": 650, |
| | "ô": 651, |
| | "õ": 652, |
| | "ö": 653, |
| | "÷": 654, |
| | "ø": 655, |
| | "ù": 656, |
| | "ú": 657, |
| | "û": 658, |
| | "ü": 659, |
| | "ý": 660, |
| | "þ": 661 |
| | }, |
| | "unk_token": "[UNK]" |
| | } |
| | } |