| { | |
| "version": "1.0", | |
| "truncation": null, | |
| "padding": null, | |
| "added_tokens": [ | |
| { | |
| "id": 0, | |
| "content": "[STOP]", | |
| "single_word": false, | |
| "lstrip": false, | |
| "rstrip": false, | |
| "normalized": false, | |
| "special": true | |
| }, | |
| { | |
| "id": 1, | |
| "content": "[UNK]", | |
| "single_word": false, | |
| "lstrip": false, | |
| "rstrip": false, | |
| "normalized": false, | |
| "special": true | |
| }, | |
| { | |
| "id": 2, | |
| "content": "[SPACE]", | |
| "single_word": false, | |
| "lstrip": false, | |
| "rstrip": false, | |
| "normalized": false, | |
| "special": true | |
| }, | |
| { | |
| "id": 255, | |
| "content": "[START]", | |
| "single_word": false, | |
| "lstrip": false, | |
| "rstrip": false, | |
| "normalized": false, | |
| "special": true | |
| } | |
| ], | |
| "normalizer": null, | |
| "pre_tokenizer": { | |
| "type": "Whitespace" | |
| }, | |
| "post_processor": null, | |
| "decoder": null, | |
| "model": { | |
| "type": "BPE", | |
| "dropout": null, | |
| "unk_token": "[UNK]", | |
| "continuing_subword_prefix": null, | |
| "end_of_word_suffix": null, | |
| "fuse_unk": false, | |
| "byte_fallback": false, | |
| "vocab": { | |
| "[STOP]": 0, | |
| "[UNK]": 1, | |
| "[SPACE]": 2, | |
| " ": 3, | |
| "!": 4, | |
| "%": 5, | |
| "&": 6, | |
| "'": 7, | |
| ",": 8, | |
| "-": 9, | |
| ".": 10, | |
| "0": 11, | |
| "1": 12, | |
| "2": 13, | |
| "3": 14, | |
| "4": 15, | |
| "5": 16, | |
| "6": 17, | |
| "7": 18, | |
| "8": 19, | |
| "9": 20, | |
| "?": 21, | |
| "a": 22, | |
| "b": 23, | |
| "c": 24, | |
| "d": 25, | |
| "e": 26, | |
| "f": 27, | |
| "g": 28, | |
| "h": 29, | |
| "i": 30, | |
| "j": 31, | |
| "k": 32, | |
| "l": 33, | |
| "m": 34, | |
| "n": 35, | |
| "o": 36, | |
| "p": 37, | |
| "q": 38, | |
| "r": 39, | |
| "s": 40, | |
| "t": 41, | |
| "u": 42, | |
| "v": 43, | |
| "w": 44, | |
| "x": 45, | |
| "y": 46, | |
| "z": 47, | |
| "à": 48, | |
| "á": 49, | |
| "ả": 50, | |
| "ã": 51, | |
| "ạ": 52, | |
| "ă": 53, | |
| "ắ": 54, | |
| "ằ": 55, | |
| "ẳ": 56, | |
| "ẵ": 57, | |
| "ặ": 58, | |
| "â": 59, | |
| "ấ": 60, | |
| "ầ": 61, | |
| "ẩ": 62, | |
| "ẫ": 63, | |
| "ậ": 64, | |
| "è": 65, | |
| "é": 66, | |
| "ẻ": 67, | |
| "ẽ": 68, | |
| "ẹ": 69, | |
| "ê": 70, | |
| "ế": 71, | |
| "ề": 72, | |
| "ể": 73, | |
| "ễ": 74, | |
| "ệ": 75, | |
| "ì": 76, | |
| "í": 77, | |
| "ỉ": 78, | |
| "ĩ": 79, | |
| "ị": 80, | |
| "ò": 81, | |
| "ó": 82, | |
| "ỏ": 83, | |
| "õ": 84, | |
| "ọ": 85, | |
| "ô": 86, | |
| "ố": 87, | |
| "ồ": 88, | |
| "ổ": 89, | |
| "ỗ": 90, | |
| "ộ": 91, | |
| "ơ": 92, | |
| "ớ": 93, | |
| "ờ": 94, | |
| "ở": 95, | |
| "ỡ": 96, | |
| "ợ": 97, | |
| "ù": 98, | |
| "ú": 99, | |
| "ủ": 100, | |
| "ũ": 101, | |
| "ụ": 102, | |
| "ư": 103, | |
| "ứ": 104, | |
| "ừ": 105, | |
| "ử": 106, | |
| "ữ": 107, | |
| "ự": 108, | |
| "ỳ": 109, | |
| "ý": 110, | |
| "ỷ": 111, | |
| "ỹ": 112, | |
| "ỵ": 113, | |
| "đ": 114, | |
| "A": 115, | |
| "B": 116, | |
| "C": 117, | |
| "D": 118, | |
| "E": 119, | |
| "F": 120, | |
| "G": 121, | |
| "H": 122, | |
| "I": 123, | |
| "J": 124, | |
| "K": 125, | |
| "L": 126, | |
| "M": 127, | |
| "N": 128, | |
| "O": 129, | |
| "P": 130, | |
| "Q": 131, | |
| "R": 132, | |
| "S": 133, | |
| "T": 134, | |
| "U": 135, | |
| "V": 136, | |
| "W": 137, | |
| "X": 138, | |
| "Y": 139, | |
| "Z": 140, | |
| "À": 141, | |
| "Á": 142, | |
| "Ả": 143, | |
| "Ã": 144, | |
| "Ạ": 145, | |
| "Ă": 146, | |
| "Ắ": 147, | |
| "Ằ": 148, | |
| "Ẳ": 149, | |
| "Ẵ": 150, | |
| "Ặ": 151, | |
| "Â": 152, | |
| "Ấ": 153, | |
| "Ầ": 154, | |
| "Ẩ": 155, | |
| "Ẫ": 156, | |
| "Ậ": 157, | |
| "È": 158, | |
| "É": 159, | |
| "Ẻ": 160, | |
| "Ẽ": 161, | |
| "Ẹ": 162, | |
| "Ê": 163, | |
| "Ế": 164, | |
| "Ề": 165, | |
| "Ể": 166, | |
| "Ễ": 167, | |
| "Ệ": 168, | |
| "Ì": 169, | |
| "Í": 170, | |
| "Ỉ": 171, | |
| "Ĩ": 172, | |
| "Ị": 173, | |
| "Ò": 174, | |
| "Ó": 175, | |
| "Ỏ": 176, | |
| "Õ": 177, | |
| "Ọ": 178, | |
| "Ô": 179, | |
| "Ố": 180, | |
| "Ồ": 181, | |
| "Ổ": 182, | |
| "Ỗ": 183, | |
| "Ộ": 184, | |
| "Ơ": 185, | |
| "Ớ": 186, | |
| "Ờ": 187, | |
| "Ở": 188, | |
| "Ỡ": 189, | |
| "Ợ": 190, | |
| "Ù": 191, | |
| "Ú": 192, | |
| "Ủ": 193, | |
| "Ũ": 194, | |
| "Ụ": 195, | |
| "Ư": 196, | |
| "Ứ": 197, | |
| "Ừ": 198, | |
| "Ử": 199, | |
| "Ữ": 200, | |
| "Ự": 201, | |
| "Ỳ": 202, | |
| "Ý": 203, | |
| "Ỷ": 204, | |
| "Ỹ": 205, | |
| "Ỵ": 206, | |
| "Đ": 207, | |
| ":": 208, | |
| ";": 209, | |
| "(": 210, | |
| ")": 211, | |
| "[": 212, | |
| "]": 213, | |
| "{": 214, | |
| "}": 215, | |
| "/": 216, | |
| "\\": 217, | |
| "@": 218, | |
| "#": 219, | |
| "$": 220, | |
| "*": 221, | |
| "+": 222, | |
| "=": 223, | |
| "<": 224, | |
| ">": 225, | |
| "~": 226, | |
| "`": 227, | |
| "^": 228, | |
| "_": 229, | |
| "|": 230, | |
| "\"": 231, | |
| ", ": 232, | |
| "…": 233, | |
| "—": 234, | |
| "–": 235, | |
| ",": 236, | |
| "、": 237, | |
| "。": 238, | |
| "!": 239, | |
| "?": 240, | |
| "°": 241, | |
| "±": 242, | |
| "×": 243, | |
| "÷": 244, | |
| "€": 245, | |
| "£": 246, | |
| "¥": 247, | |
| "ƀ": 248, | |
| "Ɓ": 249, | |
| "Ƃ": 250, | |
| "ƃ": 251, | |
| "Ƅ": 252, | |
| "ƅ": 253, | |
| "Ɔ": 254, | |
| "[START]": 255, | |
| "ng": 256, | |
| "nh": 257, | |
| "th": 258, | |
| "ch": 259, | |
| "tr": 260, | |
| "kh": 261, | |
| "ph": 262, | |
| "gh": 263, | |
| "gi": 264, | |
| "qu": 265, | |
| "có": 266, | |
| "là": 267, | |
| "và": 268, | |
| "một": 269, | |
| "của": 270, | |
| "không": 271, | |
| "thể": 272, | |
| "người": 273, | |
| "các": 274, | |
| "trong": 275, | |
| "những": 276, | |
| "cho": 277, | |
| "để": 278, | |
| "được": 279, | |
| "tôi": 280, | |
| "bạn": 281, | |
| "với": 282, | |
| "đã": 283, | |
| "sự": 284, | |
| "ta": 285, | |
| "việc": 286, | |
| "sẽ": 287, | |
| "chúng": 288, | |
| "khi": 289, | |
| "cũng": 290, | |
| "như": 291, | |
| "mà": 292, | |
| "đến": 293, | |
| "ra": 294, | |
| "này": 295, | |
| "từ": 296, | |
| "về": 297, | |
| "nên": 298, | |
| "sau": 299, | |
| "thì": 300, | |
| "năm": 301, | |
| "ngày": 302, | |
| "họ": 303, | |
| "mình": 304, | |
| "rất": 305, | |
| "đang": 306, | |
| "còn": 307, | |
| "vẫn": 308, | |
| "đều": 309, | |
| "cả": 310, | |
| "nhiều": 311, | |
| "nào": 312, | |
| "hay": 313, | |
| "đó": 314, | |
| "nó": 315, | |
| "ai": 316, | |
| "gì": 317, | |
| "đây": 318, | |
| "đấy": 319, | |
| "ấy": 320, | |
| "kia": 321, | |
| "nọ": 322, | |
| "bao": 323, | |
| "bất": 324, | |
| "cứ": 325, | |
| "mỗi": 326, | |
| "mọi": 327, | |
| "tất": 328, | |
| "toàn": 329, | |
| "cùng": 330, | |
| "nhau": 331, | |
| "nhất": 332, | |
| "hơn": 333, | |
| "lại": 334, | |
| "nữa": 335, | |
| "thêm": 336, | |
| "luôn": 337, | |
| "vừa": 338, | |
| "mới": 339, | |
| "sắp": 340, | |
| "rồi": 341, | |
| "xong": 342, | |
| "hết": 343, | |
| "bị": 344, | |
| "phải": 345, | |
| "muốn": 346, | |
| "thích": 347, | |
| "yêu": 348, | |
| "ghét": 349, | |
| "biết": 350, | |
| "hiểu": 351, | |
| "nghĩ": 352, | |
| "tin": 353, | |
| "làm": 354, | |
| "nói": 355, | |
| "hỏi": 356, | |
| "trả": 357, | |
| "lời": 358, | |
| "kể": 359, | |
| "bảo": 360, | |
| "gọi": 361, | |
| "đọc": 362, | |
| "viết": 363, | |
| "nghe": 364, | |
| "nhìn": 365, | |
| "thấy": 366, | |
| "Ā": 367, | |
| "ā": 368, | |
| "Ą": 369, | |
| "ą": 370, | |
| "Ć": 371, | |
| "ć": 372, | |
| "Ĉ": 373, | |
| "ĉ": 374, | |
| "Ċ": 375, | |
| "ċ": 376, | |
| "Č": 377, | |
| "č": 378, | |
| "Ď": 379, | |
| "ď": 380, | |
| "Ē": 381, | |
| "ē": 382, | |
| "Ĕ": 383, | |
| "ĕ": 384, | |
| "Ė": 385, | |
| "ė": 386, | |
| "Ę": 387, | |
| "ę": 388, | |
| "Ě": 389, | |
| "ě": 390, | |
| "Ĝ": 391, | |
| "ĝ": 392, | |
| "Ğ": 393, | |
| "ğ": 394, | |
| "Ġ": 395, | |
| "ġ": 396, | |
| "Ģ": 397, | |
| "ģ": 398, | |
| "Ĥ": 399, | |
| "ĥ": 400, | |
| "Ħ": 401, | |
| "ħ": 402, | |
| "Ī": 403, | |
| "ī": 404, | |
| "Ĭ": 405, | |
| "ĭ": 406, | |
| "Į": 407, | |
| "į": 408, | |
| "İ": 409, | |
| "ı": 410, | |
| "IJ": 411, | |
| "ij": 412, | |
| "Ĵ": 413, | |
| "ĵ": 414, | |
| "Ķ": 415, | |
| "ķ": 416, | |
| "ĸ": 417, | |
| "Ĺ": 418, | |
| "ĺ": 419, | |
| "Ļ": 420, | |
| "ļ": 421, | |
| "Ľ": 422, | |
| "ľ": 423, | |
| "Ŀ": 424, | |
| "ŀ": 425, | |
| "Ł": 426, | |
| "ł": 427, | |
| "Ń": 428, | |
| "ń": 429, | |
| "Ņ": 430, | |
| "ņ": 431, | |
| "Ň": 432, | |
| "ň": 433, | |
| "ʼn": 434, | |
| "Ŋ": 435, | |
| "ŋ": 436, | |
| "Ō": 437, | |
| "ō": 438, | |
| "Ŏ": 439, | |
| "ŏ": 440, | |
| "Ő": 441, | |
| "ő": 442, | |
| "Œ": 443, | |
| "œ": 444, | |
| "Ŕ": 445, | |
| "ŕ": 446, | |
| "Ŗ": 447, | |
| "ŗ": 448, | |
| "Ř": 449, | |
| "ř": 450, | |
| "Ś": 451, | |
| "ś": 452, | |
| "Ŝ": 453, | |
| "ŝ": 454, | |
| "Ş": 455, | |
| "ş": 456, | |
| "Š": 457, | |
| "š": 458, | |
| "Ţ": 459, | |
| "ţ": 460, | |
| "Ť": 461, | |
| "ť": 462, | |
| "Ŧ": 463, | |
| "ŧ": 464, | |
| "Ū": 465, | |
| "ū": 466, | |
| "Ŭ": 467, | |
| "ŭ": 468, | |
| "Ů": 469, | |
| "ů": 470, | |
| "Ű": 471, | |
| "ű": 472, | |
| "Ų": 473, | |
| "ų": 474, | |
| "Ŵ": 475, | |
| "ŵ": 476, | |
| "Ŷ": 477, | |
| "ŷ": 478, | |
| "Ÿ": 479, | |
| "Ź": 480, | |
| "ź": 481, | |
| "Ż": 482, | |
| "ż": 483, | |
| "Ž": 484, | |
| "ž": 485, | |
| "ſ": 486, | |
| "Ƈ": 487, | |
| "ƈ": 488, | |
| "Ɖ": 489, | |
| "Ɗ": 490, | |
| "Ƌ": 491, | |
| "ƌ": 492, | |
| "ƍ": 493, | |
| "Ǝ": 494, | |
| "Ə": 495, | |
| "Ɛ": 496, | |
| "Ƒ": 497, | |
| "ƒ": 498, | |
| "Ɠ": 499, | |
| "Ɣ": 500, | |
| "ƕ": 501, | |
| "Ɩ": 502, | |
| "Ɨ": 503, | |
| "Ƙ": 504, | |
| "ƙ": 505, | |
| "ƚ": 506, | |
| "ƛ": 507, | |
| "Ɯ": 508, | |
| "Ɲ": 509, | |
| "ƞ": 510, | |
| "Ɵ": 511, | |
| "Ƣ": 512, | |
| "ƣ": 513, | |
| "Ƥ": 514, | |
| "ƥ": 515, | |
| "Ʀ": 516, | |
| "Ƨ": 517, | |
| "ƨ": 518, | |
| "Ʃ": 519, | |
| "ƪ": 520, | |
| "ƫ": 521, | |
| "Ƭ": 522, | |
| "ƭ": 523, | |
| "Ʈ": 524, | |
| "Ʊ": 525, | |
| "Ʋ": 526, | |
| "Ƴ": 527, | |
| "ƴ": 528, | |
| "Ƶ": 529, | |
| "ƶ": 530, | |
| "Ʒ": 531, | |
| "Ƹ": 532, | |
| "ƹ": 533, | |
| "ƺ": 534, | |
| "ƻ": 535, | |
| "Ƽ": 536, | |
| "ƽ": 537, | |
| "ƾ": 538, | |
| "ƿ": 539, | |
| "ǀ": 540, | |
| "ǁ": 541, | |
| "ǂ": 542, | |
| "ǃ": 543, | |
| "DŽ": 544, | |
| "Dž": 545, | |
| "dž": 546, | |
| "LJ": 547, | |
| "Lj": 548, | |
| "lj": 549, | |
| "NJ": 550, | |
| "Nj": 551, | |
| "nj": 552, | |
| "Ǎ": 553, | |
| "ǎ": 554, | |
| "Ǐ": 555, | |
| "ǐ": 556, | |
| "Ǒ": 557, | |
| "ǒ": 558, | |
| "Ǔ": 559, | |
| "ǔ": 560, | |
| "Ǖ": 561, | |
| "ǖ": 562, | |
| "Ǘ": 563, | |
| "ǘ": 564, | |
| "Ǚ": 565, | |
| "ǚ": 566, | |
| "Ǜ": 567, | |
| "ǜ": 568, | |
| "ǝ": 569, | |
| "Ǟ": 570, | |
| "ǟ": 571, | |
| "Ǡ": 572, | |
| "ǡ": 573, | |
| "Ǣ": 574, | |
| "ǣ": 575, | |
| "Ǥ": 576, | |
| "ǥ": 577, | |
| "Ǧ": 578, | |
| "ǧ": 579, | |
| "Ǩ": 580, | |
| "ǩ": 581, | |
| "Ǫ": 582, | |
| "ǫ": 583, | |
| "Ǭ": 584, | |
| "ǭ": 585, | |
| "Ǯ": 586, | |
| "ǯ": 587, | |
| "ǰ": 588, | |
| "DZ": 589, | |
| "Dz": 590, | |
| "dz": 591, | |
| "Ǵ": 592, | |
| "ǵ": 593, | |
| "Ƕ": 594, | |
| "Ƿ": 595, | |
| "Ǹ": 596, | |
| "ǹ": 597, | |
| "Ǻ": 598, | |
| "ǻ": 599, | |
| "Ǽ": 600, | |
| "ǽ": 601, | |
| "Ǿ": 602, | |
| "ǿ": 603, | |
| "[UH]": 604, | |
| "[UM]": 605, | |
| "[giggle]": 606, | |
| "[laughter]": 607, | |
| "[guffaw]": 608, | |
| "[inhale]": 609, | |
| "[exhale]": 610, | |
| "[sigh]": 611, | |
| "[cry]": 612, | |
| "[bark]": 613, | |
| "[howl]": 614, | |
| "[meow]": 615, | |
| "[singing]": 616, | |
| "[music]": 617, | |
| "[whistle]": 618, | |
| "[humming]": 619, | |
| "[gasp]": 620, | |
| "[groan]": 621, | |
| "[whisper]": 622, | |
| "[mumble]": 623, | |
| "[sniff]": 624, | |
| "[sneeze]": 625, | |
| "[cough]": 626, | |
| "[snore]": 627, | |
| "[chew]": 628, | |
| "[sip]": 629, | |
| "[clear_throat]": 630, | |
| "[kiss]": 631, | |
| "[shhh]": 632, | |
| "[gibberish]": 633, | |
| "[fr]": 634, | |
| "[es]": 635, | |
| "[de]": 636, | |
| "[it]": 637, | |
| "[ipa]": 638, | |
| "[end_of_label]": 639, | |
| "θ": 640, | |
| "ð": 641, | |
| "ʃ": 642, | |
| "ʒ": 643, | |
| "tʃ": 644, | |
| "dʒ": 645, | |
| "ʔ": 646, | |
| "ɑː": 647, | |
| "æ": 648, | |
| "ʌ": 649, | |
| "ɒ": 650, | |
| "ɔː": 651, | |
| "ɜː": 652, | |
| "ə": 653, | |
| "ɪ": 654, | |
| "iː": 655, | |
| "ʊ": 656, | |
| "uː": 657, | |
| "eɪ": 658, | |
| "aɪ": 659, | |
| "ɔɪ": 660, | |
| "aʊ": 661, | |
| "əʊ": 662, | |
| "ɯ": 663, | |
| "ɤ": 664, | |
| "ɨ": 665, | |
| "ʉ": 666, | |
| "ɘ": 667, | |
| "ɵ": 668, | |
| "ɜ": 669, | |
| "ɞ": 670, | |
| "ɐ": 671, | |
| "ɶ": 672, | |
| "ɑ": 673, | |
| "ɔ": 674, | |
| "˧": 675, | |
| "˥": 676, | |
| "˩˧": 677, | |
| "˧˥": 678, | |
| "˧˩˧": 679, | |
| "˧˩": 680, | |
| "ɓ": 681, | |
| "ɗ": 682, | |
| "ɠ": 683, | |
| "ʄ": 684, | |
| "ʛ": 685, | |
| "ɲ": 686, | |
| "ɳ": 687, | |
| "ɱ": 688, | |
| "ʈ": 689, | |
| "ɖ": 690, | |
| "ɟ": 691, | |
| "ɡ": 692, | |
| "ɢ": 693, | |
| "ʡ": 694, | |
| "[PLACEHOLDER55]": 695, | |
| "[PLACEHOLDER56]": 696, | |
| "[PLACEHOLDER57]": 697, | |
| "[PLACEHOLDER58]": 698, | |
| "[PLACEHOLDER59]": 699, | |
| "[PLACEHOLDER60]": 700, | |
| "[PLACEHOLDER61]": 701, | |
| "[PLACEHOLDER62]": 702, | |
| "[PLACEHOLDER63]": 703 | |
| }, | |
| "merges": [ | |
| "n g", | |
| "n h", | |
| "t h", | |
| "c h", | |
| "t r", | |
| "k h", | |
| "p h", | |
| "g h", | |
| "g i", | |
| "q u" | |
| ], | |
| "language": "vi" | |
| } | |
| } |