char-t5-custom-init / tokenizer.json
tranhuyHoang's picture
Upload tokenizer
2348427 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<sos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<eos>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Split",
"pattern": {
"String": ""
},
"behavior": "Isolated",
"invert": false
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<sos>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<eos>",
"type_id": 0
}
}
],
"pair": [
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<eos>": {
"id": "<eos>",
"ids": [
3
],
"tokens": [
"<eos>"
]
},
"<sos>": {
"id": "<sos>",
"ids": [
2
],
"tokens": [
"<sos>"
]
}
}
},
"decoder": {
"type": "Sequence",
"decoders": []
},
"model": {
"type": "WordLevel",
"vocab": {
"<pad>": 0,
"<unk>": 1,
"<sos>": 2,
"<eos>": 3,
"8": 4,
".": 5,
"1": 6,
"3": 7,
" ": 8,
"D": 9,
"u": 10,
"n": 11,
"c": 12,
"a": 13,
"h": 14,
"t": 15,
"y": 16,
"d": 17,
"g": 18,
"i": 19,
"m": 20,
"e": 21,
"o": 22,
"k": 23,
"5": 24,
"0": 25,
"C": 26,
",": 27,
"s": 28,
"l": 29,
"f": 30,
"(": 31,
"N": 32,
"H": 33,
"4": 34,
")": 35,
"2": 36,
"S": 37,
"v": 38,
"L": 39,
"q": 40,
"-": 41,
"μ": 42,
"6": 43,
"b": 44,
"p": 45,
"r": 46,
"Đ": 47,
"á": 48,
"ủ": 49,
"ị": 50,
"ẫ": 51,
"đ": 52,
"ế": 53,
"ệ": 54,
"ộ": 55,
"ả": 56,
"ỏ": 57,
"ừ": 58,
"ọ": 59,
"à": 60,
"ù": 61,
"ũ": 62,
"ấ": 63,
"ề": 64,
"ạ": 65,
"ê": 66,
"ó": 67,
"ố": 68,
"ư": 69,
"ỡ": 70,
"ỗ": 71,
"ì": 72,
"â": 73,
"ử": 74,
"ằ": 75,
"ớ": 76,
"/": 77,
"V": 78,
"ă": 79,
"ò": 80,
"í": 81,
"ý": 82,
"ở": 83,
"ơ": 84,
"B": 85,
"ự": 86,
"T": 87,
"ầ": 88,
"ậ": 89,
"ô": 90,
"U": 91,
"ỉ": 92,
"ú": 93,
"ẵ": 94,
"ồ": 95,
"ặ": 96,
"ổ": 97,
"ẹ": 98,
"P": 99,
"ẽ": 100,
"ỷ": 101,
"ể": 102,
"ẻ": 103,
"ữ": 104,
"ụ": 105,
"Q": 106,
"ờ": 107,
"Y": 108,
"ợ": 109,
"=": 110,
"x": 111,
"@": 112,
"+": 113,
"^": 114,
"Ủ": 115,
"&": 116,
"$": 117,
";": 118,
"!": 119,
"*": 120,
"#": 121,
"%": 122,
"~": 123,
"õ": 124,
"ứ": 125,
"X": 126,
":": 127,
"Ề": 128,
"Ạ": 129,
"Ứ": 130,
"Ụ": 131,
"A": 132,
"G": 133,
"Ư": 134,
"Ờ": 135,
"I": 136,
"K": 137,
"Ý": 138,
"ẩ": 139,
"ã": 140,
"é": 141,
"F": 142,
"W": 143,
"ĩ": 144,
"è": 145,
"M": 146,
"ễ": 147,
"ắ": 148,
"\"": 149,
"7": 150,
"ỳ": 151,
"ẳ": 152,
"ỵ": 153,
"?": 154,
"z": 155,
"•": 156,
"9": 157,
"j": 158,
"Ẩ": 159,
"\n": 160,
"O": 161,
"ỹ": 162,
"E": 163,
"R": 164,
"Ế": 165,
"Ị": 166,
"Ộ": 167,
"Ở": 168,
"Ọ": 169,
"À": 170,
"Ô": 171,
"Ệ": 172,
"Ê": 173,
"Ặ": 174,
"Ó": 175,
"’": 176,
"w": 177,
"Ể": 178,
"Ú": 179,
"Í": 180,
"É": 181,
"Ễ": 182,
"Ỡ": 183,
"Â": 184,
"Á": 185,
"Ỹ": 186,
"Ậ": 187,
"Ã": 188,
"Ố": 189,
"Ầ": 190,
"“": 191,
"”": 192,
"Ừ": 193,
"Ả": 194,
"Ằ": 195,
"Ớ": 196,
"Ự": 197,
"Õ": 198,
"Ĩ": 199,
"Ữ": 200,
"Ò": 201,
"Ử": 202,
"≥": 203,
"Ì": 204,
"Ổ": 205,
"°": 206,
"'": 207,
"Ơ": 208,
"Ă": 209,
"Ỉ": 210,
"Ẹ": 211,
"[": 212,
"]": 213,
"<": 214,
">": 215,
"…": 216,
"Z": 217,
"J": 218,
"Ấ": 219,
"Ợ": 220,
"🥺": 221,
"–": 222,
"Ẫ": 223,
"ē": 224,
"ō": 225,
"ā": 226,
",": 227,
"П": 228,
"Ù": 229,
"Ũ": 230,
"Ẵ": 231,
"|": 232,
"±": 233,
"ę": 234,
"³": 235,
"_": 236,
"Ỗ": 237,
"È": 238,
"Ỏ": 239,
"Ắ": 240,
"Ẳ": 241,
"∆": 242,
"Ẽ": 243,
"Ồ": 244,
"Ỳ": 245,
"Ỵ": 246,
"—": 247,
"ǎ": 248,
"Ỷ": 249,
"ϕ": 250,
"≤": 251,
"α": 252,
"θ": 253,
"Σ": 254,
"­": 255,
"÷": 256,
"Ω": 257,
"β": 258,
"ç": 259,
"{": 260,
"}": 261,
"ü": 262,
"̀": 263,
"Ẻ": 264,
"□": 265,
"ϑ": 266,
"ʃ": 267,
"£": 268,
"√": 269,
"∑": 270,
"‰": 271,
"ǐ": 272,
"►": 273,
"́": 274,
"̉": 275,
"̣": 276,
"·": 277,
"‘": 278,
"ǔ": 279,
"Î": 280,
"ï": 281,
"ǒ": 282,
"̃": 283,
"å": 284,
"½": 285,
"Μ": 286,
"ύ": 287,
"ς": 288,
"ū": 289,
"●": 290,
"δ": 291,
"ž": 292,
"Å": 293,
"ö": 294,
"ß": 295,
"λ": 296,
"²": 297,
"↔": 298,
"║": 299,
"ɸ": 300,
"≠": 301,
"γ": 302,
"ě": 303,
"¹": 304,
"µ": 305,
"≈": 306,
"์": 307,
"¾": 308,
"ą": 309,
"异": 310,
"物": 311,
"ī": 312,
"¨": 313,
"®": 314,
"þ": 315,
"♦": 316,
"§": 317,
"▪": 318,
"σ": 319,
"Ä": 320,
"ε": 321,
"∙": 322,
"û": 323,
"ρ": 324,
"令": 325,
"ద": 326,
"ి": 327,
"ం": 328,
"Δ": 329,
"¬": 330,
"?": 331,
"φ": 332,
"": 333,
"‚": 334,
"ƒ": 335,
"„": 336,
"π": 337,
"🗿": 338,
"Ø": 339,
"′": 340,
"�": 341,
"业": 342,
"主": 343,
"下": 344,
"一": 345,
"ǰ": 346,
"ξ": 347,
"■": 348,
"→": 349,
"ӧ": 350,
"린": 351,
"च": 352,
"्": 353,
"य": 354,
"ं": 355,
"ु": 356,
"ग": 357,
"म": 358,
"ू": 359,
"स": 360,
"ा": 361,
"क": 362,
"श": 363,
"े": 364,
"त": 365,
"ै": 366,
"प": 367,
"ृ": 368,
"थ": 369,
"ि": 370,
"भ": 371,
"ह": 372,
"॑": 373,
"व": 374,
"灌": 375,
"ठ": 376,
"ी": 377,
"工": 378,
"商": 379,
"ข": 380,
"ึ": 381,
"要": 382,
"坚": 383,
"持": 384,
"‎": 385,
"\\": 386,
"×": 387,
"": 388,
"ä": 389,
"Ö": 390,
";": 391,
"Η": 392,
"Ο": 393,
"Α": 394,
"Τ": 395,
"Θ": 396,
"Ν": 397,
"Γ": 398,
"Β": 399,
"Ψ": 400,
"Ι": 401,
"Д": 402,
"ø": 403,
"辜": 404,
"¼": 405,
"培": 406,
"标": 407,
"త": 408,
"等": 409,
"》": 410,
"、": 411,
"항": 412,
"„": 413,
"ラ": 414,
"ン": 415,
"讓": 416,
"嘻": 417,
"在": 418,
"人": 419,
"ँ": 420,
"景": 421,
"点": 422,
"ప": 423,
"ు": 424,
"却": 425,
"不": 426,
"平": 427,
"均": 428,
"🫠": 429,
"`": 430,
"😭": 431,
"ล": 432,
"้": 433,
"า": 434,
"น": 435,
"章": 436,
"¥": 437,
"η": 438,
"中": 439,
"心": 440,
"的": 441,
"陛": 442,
"朝": 443,
"群": 444,
"众": 445,
"ใ": 446,
"缺": 447,
"陷": 448,
"以": 449,
"防": 450,
"本": 451,
"Ð": 452,
"👀": 453,
"永": 454,
"久": 455,
"可": 456,
"某": 457,
"种": 458,
"方": 459,
"差": 460,
"↑": 461,
"和": 462,
"外": 463,
"危": 464,
"出": 465,
"版": 466,
"社": 467,
"ë": 468,
"є": 469,
"Æ": 470,
"받": 471,
"️": 472,
"有": 473,
"效": 474,
"期": 475,
"现": 476,
"我": 477,
"τ": 478,
"開": 479,
"会": 480,
"": 481,
"न": 482,
"ण": 483,
"ो": 484,
"र": 485,
"్": 486,
"ุ": 487,
"ḏ": 488,
"路": 489,
"口": 490,
"↓": 491,
"│": 492,
"框": 493,
"о": 494,
"к": 495,
"教": 496,
"育": 497,
"活": 498,
"动": 499,
"≡": 500,
"с": 501,
"ல": 502,
"்": 503,
"服": 504,
"务": 505,
"氛": 506,
"阶": 507,
"级": 508,
"🇻": 509,
"🇳": 510,
"☆": 511,
"😒": 512,
"遮": 513,
"挡": 514,
"体": 515,
"检": 516,
"数": 517,
"据": 518,
"显": 519,
"示": 520,
"ก": 521,
"ั": 522,
"บ": 523,
"실": 524,
"😻": 525,
"涵": 526,
"盖": 527,
"©": 528,
"墩": 529,
"❤": 530,
"宣": 531,
"言": 532,
"ὅ": 533,
"ι": 534,
"ὀ": 535,
"ὑ": 536,
"ὁ": 537,
"ὄ": 538,
"∞": 539,
"ɒ": 540,
"ɛ": 541,
"ɑ": 542,
"ᾶ": 543,
"υ": 544,
"Κ": 545,
"Ε": 546,
"Π": 547,
"Χ": 548,
"Λ": 549,
"¸": 550,
"値": 551,
"º": 552,
"Ü": 553,
"😏": 554,
"第": 555,
"三": 556,
"步": 557,
"肖": 558,
"š": 559,
"😂": 560,
"ళ": 561,
"乔": 562,
"木": 563,
"线": 564,
"​": 565,
"作": 566,
"ı": 567,
"😔": 568,
"😃": 569,
"💔": 570,
"걸": 571,
"😌": 572,
"ఉ": 573,
"エ": 574,
"无": 575,
"总": 576,
"竞": 577,
"赛": 578,
"Φ": 579,
"»": 580,
"语": 581,
"表": 582,
"达": 583,
"🙃": 584,
"顯": 585,
"ో": 586,
"Ἐ": 587,
"ἀ": 588,
"ὶ": 589,
"ν": 590,
"ἄ": 591,
"ο": 592,
")": 593,
"💢": 594,
"哉": 595,
"开": 596,
"实": 597,
"็": 598,
"🥰": 599,
"ˉ": 600,
"学": 601,
"报": 602,
"ณ": 603,
"ె": 604,
"🙂": 605,
"ř": 606,
"": 607,
"ð": 608,
"右": 609,
"上": 610,
"音": 611,
"调": 612,
"奎": 613,
"": 614,
"": 615,
"兑": 616,
":": 617,
"ట": 618,
"ῆ": 619,
"ᶻ": 620,
"𝗓": 621,
"𐰁": 622,
"暴": 623,
"د": 624,
"َ": 625,
"ا": 626,
"ت": 627,
"ُ": 628,
"و": 629,
"ْ": 630,
"م": 631,
"غ": 632,
"ق": 633,
"ن": 634,
"ل": 635,
"ِ": 636,
"ي": 637,
"ح": 638,
"ك": 639,
"足": 640,
"™": 641,
"与": 642,
"应": 643,
"用": 644,
"毛": 645,
"纠": 646,
"息": 647,
"번": 648,
"授": 649,
"课": 650,
"ధ": 651,
"ฺ": 652,
"剩": 653,
"☺": 654,
"声": 655,
"మ": 656,
"ా": 657,
"డ": 658,
"信": 659,
"ć": 660,
"č": 661,
"Ï": 662,
"ʊ": 663,
"\t": 664,
"挂": 665,
"ซ": 666,
"😊": 667,
"岗": 668,
"终": 669,
"身": 670,
"Ӏ": 671,
"成": 672,
"り": 673,
"市": 674,
"民": 675,
"讀": 676,
"🤭": 677,
"🙄": 678,
"罕": 679,
"‹": 680,
"涡": 681,
"ื": 682,
"": 683,
"🫰": 684,
"🏻": 685,
"ń": 686,
"欲": 687,
"这": 688,
"组": 689,
"之": 690,
"那": 691,
"樓": 692,
"贯": 693,
"穿": 694,
"阙": 695,
"Ÿ": 696,
"咬": 697,
"ర": 698,
"确": 699,
"定": 700,
"项": 701,
"葬": 702,
"相": 703,
"匈": 704,
"奴": 705,
"ẍ": 706,
"个": 707,
"词": 708,
"刑": 709,
"鼻": 710,
"钢": 711,
"板": 712,
"驰": 713,
"串": 714,
"ม": 715,
"🤬": 716,
"อ": 717,
"ง": 718,
"拉": 719,
"목": 720,
"̂": 721,
"̛": 722,
"ख": 723,
"€": 724,
"😁": 725,
"捐": 726,
"赠": 727,
"其": 728,
"所": 729,
"必": 730,
"须": 731,
"盛": 732,
"ẏ": 733,
"檀": 734,
"闲": 735,
"从": 736,
"头": 737,
"被": 738,
"打": 739,
"四": 740,
"旅": 741,
"游": 742,
"œ": 743,
"ด": 744,
"粹": 745,
"합": 746,
"こ": 747,
"と": 748,
"󠇄": 749,
"హ": 750,
"చ": 751,
"ϱ": 752,
"న": 753,
"గ": 754,
"द": 755,
"ध": 756,
"ौ": 757,
"文": 758,
"句": 759,
"首": 760,
"恰": 761,
"当": 762,
"格": 763,
"山": 764,
"区": 765,
"🥲": 766,
"将": 767,
"ῦ": 768,
"ὸ": 769,
"制": 770,
"Υ": 771,
"Ώ": 772,
"Ξ": 773,
"Ρ": 774,
"😇": 775,
"з": 776,
"м": 777,
"專": 778,
"经": 779,
"济": 780,
"技": 781,
"术": 782,
"世": 783,
"界": 784,
"最": 785,
"😍": 786,
"술": 787,
"克": 788,
"н": 789,
"а": 790,
"р": 791,
"ω": 792,
"覆": 793,
"率": 794,
"士": 795,
"虞": 796,
"": 797,
"을": 798,
"指": 799,
"接": 800,
"收": 801,
"в": 802,
"星": 803,
"空": 804,
"ǚ": 805,
"함": 806,
"并": 807,
"提": 808,
"實": 809,
"斗": 810,
"遁": 811,
"稿": 812,
"素": 813,
"限": 814,
"时": 815,
"ˆ": 816,
"验": 817,
"客": 818,
"龙": 819,
"企": 820,
"它": 821,
"堡": 822,
"垒": 823,
"集": 824,
"任": 825,
"何": 826,
"他": 827,
"😞": 828,
"집": 829,
"冬": 830,
"奏": 831,
"土": 832,
"性": 833,
"地": 834,
"Ñ": 835,
"ท": 836,
"ี": 837,
"่": 838,
"⁴": 839,
"Ⅱ": 840,
"派": 841,
"薪": 842,
"ే": 843,
"洛": 844,
"基": 845,
"准": 846,
"函": 847,
"委": 848,
"托": 849,
"牵": 850,
"歇": 851,
"抽": 852,
"晖": 853,
"媒": 854,
"医": 855,
"ẋ": 856,
"明": 857,
"있": 858,
"´": 859,
"十": 860,
"条": 861,
"ş": 862,
"黒": 863,
"劲": 864,
"🤗": 865,
"始": 866,
"皇": 867,
"气": 868,
"❌": 869,
"✅": 870,
"慎": 871,
"😋": 872,
"ต": 873,
"ู": 874,
"ย": 875,
"ธ": 876,
"ช": 877,
"ิ": 878,
"ศ": 879,
"😮": 880,
"‍": 881,
"💨": 882,
"绩": 883,
"勒": 884,
"斯": 885,
"Ṣ": 886,
"ṯ": 887,
"华": 888,
"南": 889,
"字": 890,
"过": 891,
"М": 892,
"у": 893,
"风": 894,
"电": 895,
"🥵": 896,
"도": 897,
"線": 898,
"逼": 899,
"行": 900,
"争": 901,
"设": 902,
"备": 903,
"ट": 904,
"末": 905,
"ల": 906,
"щ": 907,
"и": 908,
"族": 909,
"自": 910,
"治": 911,
"县": 912,
"😘": 913,
"窗": 914,
"饮": 915,
"占": 916,
"比": 917,
"产": 918,
"化": 919,
"百": 920,
"度": 921,
"़": 922,
"对": 923,
"完": 924,
"为": 925,
"核": 926,
"靠": 927,
"д": 928,
"话": 929,
"你": 930,
"联": 931,
"系": 932,
"是": 933,
"。": 934,
"尚": 935,
"知": 936,
"样": 937,
"涅": 938,
"ś": 939,
"畔": 940,
"할": 941,
"都": 942,
"很": 943,
"习": 944,
"近": 945,
"遗": 946,
"憾": 947,
" ": 948,
"✓": 949,
"เ": 950,
"많": 951,
"脑": 952,
"续": 953,
"增": 954,
"长": 955,
"公": 956,
"司": 957,
"😳": 958,
"视": 959,
"🤨": 960,
"需": 961,
"簿": 962,
"犁": 963,
"存": 964,
"货": 965,
"芙": 966,
"份": 967,
"典": 968,
"们": 969,
"来": 970,
"写": 971,
"强": 972,
"帆": 973,
"資": 974,
"🤣": 975,
"橋": 976,
"⁠": 977,
"🫨": 978,
"╭": 979,
" ̄": 980,
"♡": 981,
"广": 982,
"₫": 983,
"ꈨ": 984,
"ຶ": 985,
"˙": 986,
"̫": 987,
"̮": 988,
"`": 989,
"ノ": 990,
"𝐁": 991,
"𝐂": 992,
"𝐀": 993,
"𝐬": 994,
"𝐢": 995,
"𝐠": 996,
"𝐧": 997,
"𝐦": 998,
"𝐞": 999,
"𝐭": 1000,
"・": 1001,
"✈": 1002,
"̆": 1003,
"᷄": 1004,
"⌓": 1005,
"᷅": 1006
},
"unk_token": "<unk>"
}
}