PuppetLover committed on
Commit
e1776b7
·
verified ·
1 Parent(s): e1cc7e4

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +319 -3
  2. special_tokens_map.json +51 -51
  3. tokenizer_config.json +0 -0
added_tokens.json CHANGED
@@ -1,3 +1,319 @@
1
- {
2
- "<mask>": 64000
3
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1.600_gia_đì": 64247,
3
+ "27-29": 64018,
4
+ "30/4/2025": 64169,
5
+ "<mask>": 64000,
6
+ "A_i": 64122,
7
+ "Akihi_o": 64048,
8
+ "Aoki_Kazuhiko": 64036,
9
+ "Ba_A": 64041,
10
+ "Bê_cạ": 64147,
11
+ "Bùi_Tha_h_Sơ": 64026,
12
+ "Bả": 64085,
13
+ "Bả_Ishiba": 64004,
14
+ "Chuyế": 64019,
15
+ "Chuyế_hăm": 64059,
16
+ "Chá_h": 64034,
17
+ "Chí_h": 64022,
18
+ "Chủ_ịch": 64218,
19
+ "Cố_vấ": 64037,
20
+ "Hai_bê": 64139,
21
+ "Hi_oo": 64051,
22
+ "Hiệ": 64179,
23
+ "Huy_h": 64285,
24
+ "Hòa": 64225,
25
+ "Hội_gười": 64304,
26
+ "I_o": 64097,
27
+ "Iijima_Isao": 64046,
28
+ "Ishiba": 64007,
29
+ "Jea_e": 64284,
30
+ "Kaiha_a": 64054,
31
+ "Ke": 64055,
32
+ "Koji": 64053,
33
+ "Lê_Thị_Bích_T": 64023,
34
+ "Lươ": 64151,
35
+ "Masa_aka": 64044,
36
+ "Mộ_cộ": 64279,
37
+ "Nagashima_Akihisa": 64039,
38
+ "Nakashima": 64050,
39
+ "Nam_Tow": 64264,
40
+ "Nam_chố": 64159,
41
+ "Nam_hực": 64128,
42
+ "Nam_vẫ": 64282,
43
+ "Nam_ă": 64311,
44
+ "Nam_ại": 64031,
45
+ "Nam_ỉ": 64243,
46
+ "Nam_ừ": 64016,
47
+ "Naoki": 64098,
48
+ "Nguyễ_Mi": 64029,
49
+ "Nhậ": 64003,
50
+ "Nhậ_Bả": 64011,
51
+ "Nhữ": 64229,
52
+ "Oka_o": 64043,
53
+ "Phu_hâ": 64006,
54
+ "Phạm_Mi_h": 64021,
55
+ "Phạm_Qua": 64032,
56
+ "Phầ_lớ": 64249,
57
+ "Phố_Việ": 64281,
58
+ "S_isuk": 64271,
59
+ "Shige_u": 64005,
60
+ "TPHCM.": 64183,
61
+ "TPHCM_phá": 64184,
62
+ "TPHCM_sá": 64160,
63
+ "T_o": 64195,
64
+ "T_u": 64216,
65
+ "Tha_i": 64221,
66
+ "Thá": 64272,
67
+ "Thái_La": 64153,
68
+ "Tháp_ù": 64033,
69
+ "Thườ": 64028,
70
+ "Thủ_ướ": 64002,
71
+ "Tsuchimichi": 64047,
72
+ "Tì": 64211,
73
+ "Tổ_g": 64040,
74
+ "Udo": 64220,
75
+ "Udo_Tha": 64244,
76
+ "Vie": 64265,
77
+ "Việ": 64015,
78
+ "Việ_Nam": 64057,
79
+ "Việ_kiều": 64260,
80
+ "Việ_ma": 64256,
81
+ "Việ_ại": 64219,
82
+ "Vũ_Thị_Huỳ_h": 64172,
83
+ "Xuâ": 64273,
84
+ "Xuâ_Hòa": 64152,
85
+ "Yoshi_o": 64052,
86
+ "Yoshiko": 64008,
87
+ "a_g": 64296,
88
+ "a_h": 64278,
89
+ "a_hà_h": 64309,
90
+ "a_o": 64056,
91
+ "aka_Mao": 64049,
92
+ "am_Tow": 64266,
93
+ "biế": 64099,
94
+ "biế_ơ": 64240,
95
+ "biể_hà": 64269,
96
+ "biểu_dươ": 64162,
97
+ "biệ": 64176,
98
+ "buô_bá": 64255,
99
+ "bê_hư": 64145,
100
+ "bì_h": 64071,
101
+ "bả_sắc": 64257,
102
+ "bắ": 64012,
103
+ "bắ_đầu": 64274,
104
+ "bằ": 64251,
105
+ "bổ_g": 64316,
106
+ "chiế": 64226,
107
+ "chiế_lược": 64067,
108
+ "chuyê_cơ": 64001,
109
+ "chuyê_gia": 64205,
110
+ "chuyế": 64062,
111
+ "chuyể": 64091,
112
+ "chuyể_đổi": 64089,
113
+ "chí_h": 64084,
114
+ "chấ": 64077,
115
+ "chức_ă": 64283,
116
+ "co_gười": 64254,
117
+ "co_phố": 64267,
118
+ "co_si": 64250,
119
+ "co_số": 64291,
120
+ "co_xa": 64155,
121
+ "cà_g": 64261,
122
+ "cá_hâ": 64117,
123
+ "cù_g": 64009,
124
+ "cơ_sở_hạ": 64131,
125
+ "cườ": 64102,
126
+ "cải_hiệ": 64313,
127
+ "cậy_chí": 64081,
128
+ "diễ_a": 64199,
129
+ "diệ": 64069,
130
+ "doa": 64207,
131
+ "doa_h": 64193,
132
+ "dâ": 64258,
133
+ "dự_g": 64164,
134
+ "dự_á": 64095,
135
+ "eu_o": 64315,
136
+ "ghiệp": 64134,
137
+ "ghiệp_hố": 64236,
138
+ "ghĩ": 64299,
139
+ "ghĩa": 64060,
140
+ "ghị": 64198,
141
+ "giai_đọa": 64196,
142
+ "goại": 64111,
143
+ "guyệ": 64306,
144
+ "guồ": 64132,
145
+ "guồ_lực": 64200,
146
+ "gày": 64017,
147
+ "gôi_hà": 64292,
148
+ "gười": 64154,
149
+ "gầ": 64201,
150
+ "gắ": 64268,
151
+ "h_Vũ": 64030,
152
+ "h_chí": 64270,
153
+ "h_cư": 64287,
154
+ "h_cảm": 64212,
155
+ "h_hổ": 64181,
156
+ "h_hức": 64014,
157
+ "h_leo": 64146,
158
+ "h_vực": 64080,
159
+ "h_đạo": 64115,
160
+ "hau": 64276,
161
+ "hiêu": 64301,
162
+ "hiế": 64227,
163
+ "hiế_hực": 64191,
164
+ "hiều": 64094,
165
+ "hiều_hội": 64197,
166
+ "hiều_ý": 64203,
167
+ "hiệ": 64129,
168
+ "hiệm": 64170,
169
+ "hà_cửa": 64294,
170
+ "hà_g": 64110,
171
+ "hà_h": 64189,
172
+ "hà_lã": 64118,
173
+ "hào_hơ": 64262,
174
+ "hào_ằ": 64231,
175
+ "hách_hức": 64124,
176
+ "hâ": 64166,
177
+ "hâ_lực": 64133,
178
+ "hì_h": 64150,
179
+ "hòa": 64070,
180
+ "hòa_òa": 64302,
181
+ "hóa": 64135,
182
+ "hăm_TPHCM": 64222,
183
+ "hăm_chí": 64013,
184
+ "hăm_diễ": 64020,
185
+ "hăm_hâ": 64192,
186
+ "hăm_lầ": 64113,
187
+ "hăm_đầu": 64063,
188
+ "hươ": 64137,
189
+ "hướ": 64210,
190
+ "hấ": 64138,
191
+ "hấy": 64290,
192
+ "hậ": 64202,
193
+ "hằm": 64101,
194
+ "hế": 64061,
195
+ "hế_giới": 64107,
196
+ "hể": 64096,
197
+ "hể_chế": 64130,
198
+ "hị": 64141,
199
+ "họa_độ": 64190,
200
+ "hỏ": 64235,
201
+ "hỗ_ợ": 64127,
202
+ "hợp_ác": 64082,
203
+ "hứ": 64297,
204
+ "hữ": 64108,
205
+ "hữu_ghị": 64259,
206
+ "i_Masafumi": 64045,
207
+ "i_cậy": 64116,
208
+ "i_h": 64042,
209
+ "i_ước": 64245,
210
+ "iê": 64064,
211
+ "iế": 64092,
212
+ "iếp_ục": 64086,
213
+ "iề": 64156,
214
+ "iể": 64093,
215
+ "iể_hà": 64165,
216
+ "iể_hực": 64076,
217
+ "kha_g": 64295,
218
+ "khô_g": 64289,
219
+ "khă": 64314,
220
+ "khả_ă": 64143,
221
+ "khẳ": 64119,
222
+ "khỏa": 64180,
223
+ "kiế": 64204,
224
+ "kiệ": 64248,
225
+ "kỷ_iệm": 64167,
226
+ "lam_hắ": 64280,
227
+ "liê_hệ": 64182,
228
+ "luô": 64238,
229
+ "luô_dà": 64174,
230
+ "lã": 64114,
231
+ "lĩ": 64079,
232
+ "lượ": 64232,
233
+ "lầ": 64100,
234
+ "lớ": 64233,
235
+ "miề": 64158,
236
+ "miề_Nam": 64230,
237
+ "muố": 64112,
238
+ "máu_hị": 64178,
239
+ "mặ": 64161,
240
+ "mặ_hà": 64144,
241
+ "mừ": 64157,
242
+ "o_Naoki": 64058,
243
+ "o_lớ": 64300,
244
+ "phươ": 64121,
245
+ "phấ_đấu": 64140,
246
+ "phầ": 64177,
247
+ "phồ": 64072,
248
+ "quyề": 64252,
249
+ "quốc_phò": 64123,
250
+ "quốc_ế": 64125,
251
+ "si_h": 64173,
252
+ "suấ": 64312,
253
+ "sâ": 64024,
254
+ "sả_xuấ": 64194,
255
+ "u_g": 64105,
256
+ "vi_h": 64073,
257
+ "viê": 64215,
258
+ "vă": 64035,
259
+ "vượ": 64185,
260
+ "vấ": 64208,
261
+ "vậ_lực": 64234,
262
+ "vữ": 64186,
263
+ "xa_h": 64090,
264
+ "xuố": 64288,
265
+ "xúc_độ": 64303,
266
+ "xươ": 64228,
267
+ "Ô_g": 64213,
268
+ "ác_o": 64088,
269
+ "âm_sự": 64286,
270
+ "âm_ă": 64106,
271
+ "âm_đặc": 64175,
272
+ "âm_ới": 64307,
273
+ "ê_hế": 64263,
274
+ "ê_hế_giới": 64074,
275
+ "ê_ấ": 64078,
276
+ "ì_h": 64149,
277
+ "í_h": 64246,
278
+ "í_hức": 64206,
279
+ "òa": 64068,
280
+ "ô_g": 64310,
281
+ "ă_mó": 64277,
282
+ "Đô_g": 64103,
283
+ "Đả_g": 64241,
284
+ "Đối_ác": 64066,
285
+ "đa_g": 64104,
286
+ "đà_phá": 64075,
287
+ "đòa": 64010,
288
+ "đú": 64223,
289
+ "đơ_sơ": 64293,
290
+ "đạ": 64188,
291
+ "đại_hóa": 64136,
292
+ "đấ_ước": 64168,
293
+ "đầu_àu": 64187,
294
+ "đặc_biệ": 64038,
295
+ "đề_liê": 64209,
296
+ "đị": 64120,
297
+ "đối_ác": 64087,
298
+ "đứ": 64308,
299
+ "ơ_Chí_h": 64242,
300
+ "ưu_iê": 64109,
301
+ "ươ": 64217,
302
+ "ước_goài": 64163,
303
+ "ước_hà": 64237,
304
+ "ước_hằm": 64148,
305
+ "ước_â": 64065,
306
+ "ước_đa": 64224,
307
+ "ườ": 64142,
308
+ "ưở": 64027,
309
+ "ưở_g": 64025,
310
+ "ấ_hiều": 64298,
311
+ "ấ_đô": 64253,
312
+ "ẩm_hực": 64275,
313
+ "ập_hợp": 64305,
314
+ "ập_u": 64126,
315
+ "ụ_cộ": 64083,
316
+ "Ủy": 64171,
317
+ "ủy": 64214,
318
+ "ự_hào": 64239
319
+ }
special_tokens_map.json CHANGED
@@ -1,51 +1,51 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "<s>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "</s>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "mask_token": {
24
- "content": "<mask>",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "pad_token": {
31
- "content": "<pad>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false
36
- },
37
- "sep_token": {
38
- "content": "</s>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false
43
- },
44
- "unk_token": {
45
- "content": "<unk>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": false,
49
- "single_word": false
50
- }
51
- }
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff