spell_epoch_1 / char_tokenizer.json
PhucMinh111's picture
Uploading tokenizer content
de3e960 verified
{
"version": "1.0",
"truncation": null,
"padding": {
"strategy": "BatchLongest",
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 0,
"pad_type_id": 0,
"pad_token": "[PAD]"
},
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "WhitespaceSplit"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"n": 4,
"h": 5,
"t": 6,
"c": 7,
"i": 8,
"g": 9,
"a": 10,
"u": 11,
"đ": 12,
"m": 13,
"o": 14,
"r": 15,
"v": 16,
"à": 17,
"l": 18,
"p": 19,
",": 20,
"ư": 21,
"y": 22,
"á": 23,
"s": 24,
"b": 25,
"k": 26,
".": 27,
"d": 28,
"ế": 29,
"ệ": 30,
"ạ": 31,
"ô": 32,
"e": 33,
"ả": 34,
"ê": 35,
"ộ": 36,
"ó": 37,
"ố": 38,
"ớ": 39,
"ấ": 40,
"ờ": 41,
"ị": 42,
"â": 43,
"q": 44,
"ề": 45,
"ủ": 46,
"1": 47,
"ể": 48,
"2": 49,
"ợ": 50,
"0": 51,
"ơ": 52,
"ậ": 53,
"x": 54,
"ì": 55,
"ầ": 56,
"ự": 57,
"ă": 58,
"ứ": 59,
"ở": 60,
"í": 61,
"ã": 62,
"ụ": 63,
"ọ": 64,
"ồ": 65,
"ữ": 66,
"-": 67,
"ắ": 68,
"ú": 69,
"ổ": 70,
"ừ": 71,
"ò": 72,
"ù": 73,
"ặ": 74,
"3": 75,
"ỉ": 76,
"9": 77,
")": 78,
"(": 79,
"ũ": 80,
"5": 81,
"\"": 82,
"ễ": 83,
":": 84,
"ử": 85,
"4": 86,
"/": 87,
"f": 88,
"ẽ": 89,
"ý": 90,
"ỏ": 91,
"6": 92,
"ẩ": 93,
"é": 94,
"8": 95,
"7": 96,
"ẫ": 97,
"ằ": 98,
"ỗ": 99,
"ĩ": 100,
"w": 101,
"ẻ": 102,
";": 103,
"'": 104,
"ỹ": 105,
"ẹ": 106,
"ỷ": 107,
"%": 108,
"ỳ": 109,
"z": 110,
"j": 111,
"ỡ": 112,
"õ": 113,
"è": 114,
"ẳ": 115,
"?": 116,
"ẵ": 117,
"–": 118,
"&": 119,
"!": 120,
"*": 121,
"’": 122,
"+": 123,
"‘": 124,
">": 125,
"|": 126,
"_": 127,
"ỵ": 128,
"=": 129,
"@": 130,
"[": 131,
"]": 132,
"•": 133,
"#": 134,
"●": 135,
"·": 136,
"ð": 137,
"°": 138,
"<": 139,
"ö": 140,
"≥": 141,
"$": 142,
"ü": 143,
"о": 144,
"⁄": 145,
"а": 146,
"\\": 147,
"т": 148,
"е": 149,
"и": 150,
"~": 151,
"с": 152,
"н": 153,
"ä": 154,
"—": 155,
"ç": 156,
"р": 157,
"ø": 158,
"≤": 159,
"ë": 160,
"к": 161,
"п": 162,
"ć": 163,
"л": 164,
"‐": 165,
"μ": 166,
"«": 167,
"š": 168,
"ь": 169,
"×": 170,
"ā": 171,
"м": 172,
"у": 173,
"å": 174,
"β": 175,
"ч": 176,
"я": 177,
"в": 178,
"`": 179,
"ō": 180,
"ï": 181,
"č": 182,
"ŋ": 183,
"̣": 184,
"−": 185,
"α": 186,
"ы": 187,
"�": 188,
"ß": 189,
"д": 190,
"′": 191,
"ń": 192,
"́": 193,
"б": 194,
"̀": 195,
"з": 196,
"ş": 197,
"̉": 198,
"‰": 199,
"»": 200,
"æ": 201,
"δ": 202,
"ν": 203,
"ж": 204,
"ğ": 205,
"ı": 206,
"ɛ": 207,
"虎": 208,
"ǎ": 209,
"г": 210,
"ᅲ": 211,
"人": 212,
"家": 213,
"空": 214,
"ġ": 215,
"ī": 216,
"ł": 217,
"ū": 218,
"ɪ": 219,
"ς": 220,
"ц": 221,
"子": 222,
"礼": 223,
"精": 224,
"自": 225,
"花": 226
},
"unk_token": "[UNK]"
}
}