PuppetLover commited on
Commit
8cd4582
·
verified ·
1 Parent(s): e1776b7

Upload tokenizer

Browse files
Files changed (2) hide show
  1. added_tokens.json +0 -36
  2. tokenizer_config.json +0 -664
added_tokens.json CHANGED
@@ -20,13 +20,10 @@
20
  "Hai_bê": 64139,
21
  "Hi_oo": 64051,
22
  "Hiệ": 64179,
23
- "Huy_h": 64285,
24
  "Hòa": 64225,
25
- "Hội_gười": 64304,
26
  "I_o": 64097,
27
  "Iijima_Isao": 64046,
28
  "Ishiba": 64007,
29
- "Jea_e": 64284,
30
  "Kaiha_a": 64054,
31
  "Ke": 64055,
32
  "Koji": 64053,
@@ -39,8 +36,6 @@
39
  "Nam_Tow": 64264,
40
  "Nam_chố": 64159,
41
  "Nam_hực": 64128,
42
- "Nam_vẫ": 64282,
43
- "Nam_ă": 64311,
44
  "Nam_ại": 64031,
45
  "Nam_ỉ": 64243,
46
  "Nam_ừ": 64016,
@@ -54,7 +49,6 @@
54
  "Phạm_Mi_h": 64021,
55
  "Phạm_Qua": 64032,
56
  "Phầ_lớ": 64249,
57
- "Phố_Việ": 64281,
58
  "S_isuk": 64271,
59
  "Shige_u": 64005,
60
  "TPHCM.": 64183,
@@ -84,9 +78,7 @@
84
  "Xuâ_Hòa": 64152,
85
  "Yoshi_o": 64052,
86
  "Yoshiko": 64008,
87
- "a_g": 64296,
88
  "a_h": 64278,
89
- "a_hà_h": 64309,
90
  "a_o": 64056,
91
  "aka_Mao": 64049,
92
  "am_Tow": 64266,
@@ -102,7 +94,6 @@
102
  "bắ": 64012,
103
  "bắ_đầu": 64274,
104
  "bằ": 64251,
105
- "bổ_g": 64316,
106
  "chiế": 64226,
107
  "chiế_lược": 64067,
108
  "chuyê_cơ": 64001,
@@ -112,18 +103,15 @@
112
  "chuyể_đổi": 64089,
113
  "chí_h": 64084,
114
  "chấ": 64077,
115
- "chức_ă": 64283,
116
  "co_gười": 64254,
117
  "co_phố": 64267,
118
  "co_si": 64250,
119
- "co_số": 64291,
120
  "co_xa": 64155,
121
  "cà_g": 64261,
122
  "cá_hâ": 64117,
123
  "cù_g": 64009,
124
  "cơ_sở_hạ": 64131,
125
  "cườ": 64102,
126
- "cải_hiệ": 64313,
127
  "cậy_chí": 64081,
128
  "diễ_a": 64199,
129
  "diệ": 64069,
@@ -132,25 +120,20 @@
132
  "dâ": 64258,
133
  "dự_g": 64164,
134
  "dự_á": 64095,
135
- "eu_o": 64315,
136
  "ghiệp": 64134,
137
  "ghiệp_hố": 64236,
138
- "ghĩ": 64299,
139
  "ghĩa": 64060,
140
  "ghị": 64198,
141
  "giai_đọa": 64196,
142
  "goại": 64111,
143
- "guyệ": 64306,
144
  "guồ": 64132,
145
  "guồ_lực": 64200,
146
  "gày": 64017,
147
- "gôi_hà": 64292,
148
  "gười": 64154,
149
  "gầ": 64201,
150
  "gắ": 64268,
151
  "h_Vũ": 64030,
152
  "h_chí": 64270,
153
- "h_cư": 64287,
154
  "h_cảm": 64212,
155
  "h_hổ": 64181,
156
  "h_hức": 64014,
@@ -158,7 +141,6 @@
158
  "h_vực": 64080,
159
  "h_đạo": 64115,
160
  "hau": 64276,
161
- "hiêu": 64301,
162
  "hiế": 64227,
163
  "hiế_hực": 64191,
164
  "hiều": 64094,
@@ -166,7 +148,6 @@
166
  "hiều_ý": 64203,
167
  "hiệ": 64129,
168
  "hiệm": 64170,
169
- "hà_cửa": 64294,
170
  "hà_g": 64110,
171
  "hà_h": 64189,
172
  "hà_lã": 64118,
@@ -177,7 +158,6 @@
177
  "hâ_lực": 64133,
178
  "hì_h": 64150,
179
  "hòa": 64070,
180
- "hòa_òa": 64302,
181
  "hóa": 64135,
182
  "hăm_TPHCM": 64222,
183
  "hăm_chí": 64013,
@@ -188,7 +168,6 @@
188
  "hươ": 64137,
189
  "hướ": 64210,
190
  "hấ": 64138,
191
- "hấy": 64290,
192
  "hậ": 64202,
193
  "hằm": 64101,
194
  "hế": 64061,
@@ -200,7 +179,6 @@
200
  "hỏ": 64235,
201
  "hỗ_ợ": 64127,
202
  "hợp_ác": 64082,
203
- "hứ": 64297,
204
  "hữ": 64108,
205
  "hữu_ghị": 64259,
206
  "i_Masafumi": 64045,
@@ -214,9 +192,6 @@
214
  "iể": 64093,
215
  "iể_hà": 64165,
216
  "iể_hực": 64076,
217
- "kha_g": 64295,
218
- "khô_g": 64289,
219
- "khă": 64314,
220
  "khả_ă": 64143,
221
  "khẳ": 64119,
222
  "khỏa": 64180,
@@ -240,7 +215,6 @@
240
  "mặ_hà": 64144,
241
  "mừ": 64157,
242
  "o_Naoki": 64058,
243
- "o_lớ": 64300,
244
  "phươ": 64121,
245
  "phấ_đấu": 64140,
246
  "phầ": 64177,
@@ -249,7 +223,6 @@
249
  "quốc_phò": 64123,
250
  "quốc_ế": 64125,
251
  "si_h": 64173,
252
- "suấ": 64312,
253
  "sâ": 64024,
254
  "sả_xuấ": 64194,
255
  "u_g": 64105,
@@ -261,15 +234,11 @@
261
  "vậ_lực": 64234,
262
  "vữ": 64186,
263
  "xa_h": 64090,
264
- "xuố": 64288,
265
- "xúc_độ": 64303,
266
  "xươ": 64228,
267
  "Ô_g": 64213,
268
  "ác_o": 64088,
269
- "âm_sự": 64286,
270
  "âm_ă": 64106,
271
  "âm_đặc": 64175,
272
- "âm_ới": 64307,
273
  "ê_hế": 64263,
274
  "ê_hế_giới": 64074,
275
  "ê_ấ": 64078,
@@ -277,7 +246,6 @@
277
  "í_h": 64246,
278
  "í_hức": 64206,
279
  "òa": 64068,
280
- "ô_g": 64310,
281
  "ă_mó": 64277,
282
  "Đô_g": 64103,
283
  "Đả_g": 64241,
@@ -286,7 +254,6 @@
286
  "đà_phá": 64075,
287
  "đòa": 64010,
288
  "đú": 64223,
289
- "đơ_sơ": 64293,
290
  "đạ": 64188,
291
  "đại_hóa": 64136,
292
  "đấ_ước": 64168,
@@ -295,7 +262,6 @@
295
  "đề_liê": 64209,
296
  "đị": 64120,
297
  "đối_ác": 64087,
298
- "đứ": 64308,
299
  "ơ_Chí_h": 64242,
300
  "ưu_iê": 64109,
301
  "ươ": 64217,
@@ -307,10 +273,8 @@
307
  "ườ": 64142,
308
  "ưở": 64027,
309
  "ưở_g": 64025,
310
- "ấ_hiều": 64298,
311
  "ấ_đô": 64253,
312
  "ẩm_hực": 64275,
313
- "ập_hợp": 64305,
314
  "ập_u": 64126,
315
  "ụ_cộ": 64083,
316
  "Ủy": 64171,
 
20
  "Hai_bê": 64139,
21
  "Hi_oo": 64051,
22
  "Hiệ": 64179,
 
23
  "Hòa": 64225,
 
24
  "I_o": 64097,
25
  "Iijima_Isao": 64046,
26
  "Ishiba": 64007,
 
27
  "Kaiha_a": 64054,
28
  "Ke": 64055,
29
  "Koji": 64053,
 
36
  "Nam_Tow": 64264,
37
  "Nam_chố": 64159,
38
  "Nam_hực": 64128,
 
 
39
  "Nam_ại": 64031,
40
  "Nam_ỉ": 64243,
41
  "Nam_ừ": 64016,
 
49
  "Phạm_Mi_h": 64021,
50
  "Phạm_Qua": 64032,
51
  "Phầ_lớ": 64249,
 
52
  "S_isuk": 64271,
53
  "Shige_u": 64005,
54
  "TPHCM.": 64183,
 
78
  "Xuâ_Hòa": 64152,
79
  "Yoshi_o": 64052,
80
  "Yoshiko": 64008,
 
81
  "a_h": 64278,
 
82
  "a_o": 64056,
83
  "aka_Mao": 64049,
84
  "am_Tow": 64266,
 
94
  "bắ": 64012,
95
  "bắ_đầu": 64274,
96
  "bằ": 64251,
 
97
  "chiế": 64226,
98
  "chiế_lược": 64067,
99
  "chuyê_cơ": 64001,
 
103
  "chuyể_đổi": 64089,
104
  "chí_h": 64084,
105
  "chấ": 64077,
 
106
  "co_gười": 64254,
107
  "co_phố": 64267,
108
  "co_si": 64250,
 
109
  "co_xa": 64155,
110
  "cà_g": 64261,
111
  "cá_hâ": 64117,
112
  "cù_g": 64009,
113
  "cơ_sở_hạ": 64131,
114
  "cườ": 64102,
 
115
  "cậy_chí": 64081,
116
  "diễ_a": 64199,
117
  "diệ": 64069,
 
120
  "dâ": 64258,
121
  "dự_g": 64164,
122
  "dự_á": 64095,
 
123
  "ghiệp": 64134,
124
  "ghiệp_hố": 64236,
 
125
  "ghĩa": 64060,
126
  "ghị": 64198,
127
  "giai_đọa": 64196,
128
  "goại": 64111,
 
129
  "guồ": 64132,
130
  "guồ_lực": 64200,
131
  "gày": 64017,
 
132
  "gười": 64154,
133
  "gầ": 64201,
134
  "gắ": 64268,
135
  "h_Vũ": 64030,
136
  "h_chí": 64270,
 
137
  "h_cảm": 64212,
138
  "h_hổ": 64181,
139
  "h_hức": 64014,
 
141
  "h_vực": 64080,
142
  "h_đạo": 64115,
143
  "hau": 64276,
 
144
  "hiế": 64227,
145
  "hiế_hực": 64191,
146
  "hiều": 64094,
 
148
  "hiều_ý": 64203,
149
  "hiệ": 64129,
150
  "hiệm": 64170,
 
151
  "hà_g": 64110,
152
  "hà_h": 64189,
153
  "hà_lã": 64118,
 
158
  "hâ_lực": 64133,
159
  "hì_h": 64150,
160
  "hòa": 64070,
 
161
  "hóa": 64135,
162
  "hăm_TPHCM": 64222,
163
  "hăm_chí": 64013,
 
168
  "hươ": 64137,
169
  "hướ": 64210,
170
  "hấ": 64138,
 
171
  "hậ": 64202,
172
  "hằm": 64101,
173
  "hế": 64061,
 
179
  "hỏ": 64235,
180
  "hỗ_ợ": 64127,
181
  "hợp_ác": 64082,
 
182
  "hữ": 64108,
183
  "hữu_ghị": 64259,
184
  "i_Masafumi": 64045,
 
192
  "iể": 64093,
193
  "iể_hà": 64165,
194
  "iể_hực": 64076,
 
 
 
195
  "khả_ă": 64143,
196
  "khẳ": 64119,
197
  "khỏa": 64180,
 
215
  "mặ_hà": 64144,
216
  "mừ": 64157,
217
  "o_Naoki": 64058,
 
218
  "phươ": 64121,
219
  "phấ_đấu": 64140,
220
  "phầ": 64177,
 
223
  "quốc_phò": 64123,
224
  "quốc_ế": 64125,
225
  "si_h": 64173,
 
226
  "sâ": 64024,
227
  "sả_xuấ": 64194,
228
  "u_g": 64105,
 
234
  "vậ_lực": 64234,
235
  "vữ": 64186,
236
  "xa_h": 64090,
 
 
237
  "xươ": 64228,
238
  "Ô_g": 64213,
239
  "ác_o": 64088,
 
240
  "âm_ă": 64106,
241
  "âm_đặc": 64175,
 
242
  "ê_hế": 64263,
243
  "ê_hế_giới": 64074,
244
  "ê_ấ": 64078,
 
246
  "í_h": 64246,
247
  "í_hức": 64206,
248
  "òa": 64068,
 
249
  "ă_mó": 64277,
250
  "Đô_g": 64103,
251
  "Đả_g": 64241,
 
254
  "đà_phá": 64075,
255
  "đòa": 64010,
256
  "đú": 64223,
 
257
  "đạ": 64188,
258
  "đại_hóa": 64136,
259
  "đấ_ước": 64168,
 
262
  "đề_liê": 64209,
263
  "đị": 64120,
264
  "đối_ác": 64087,
 
265
  "ơ_Chí_h": 64242,
266
  "ưu_iê": 64109,
267
  "ươ": 64217,
 
273
  "ườ": 64142,
274
  "ưở": 64027,
275
  "ưở_g": 64025,
 
276
  "ấ_đô": 64253,
277
  "ẩm_hực": 64275,
 
278
  "ập_u": 64126,
279
  "ụ_cộ": 64083,
280
  "Ủy": 64171,
tokenizer_config.json CHANGED
@@ -360,14 +360,6 @@
360
  "single_word": false,
361
  "special": false
362
  },
363
- "99": {
364
- "content": "1",
365
- "lstrip": false,
366
- "normalized": true,
367
- "rstrip": false,
368
- "single_word": false,
369
- "special": false
370
- },
371
  "100": {
372
  "content": "số",
373
  "lstrip": false,
@@ -400,14 +392,6 @@
400
  "single_word": false,
401
  "special": false
402
  },
403
- "118": {
404
- "content": "hay",
405
- "lstrip": false,
406
- "normalized": true,
407
- "rstrip": false,
408
- "single_word": false,
409
- "special": false
410
- },
411
  "120": {
412
  "content": "Hà_Nội",
413
  "lstrip": false,
@@ -432,14 +416,6 @@
432
  "single_word": false,
433
  "special": false
434
  },
435
- "133": {
436
- "content": "giá",
437
- "lstrip": false,
438
- "normalized": true,
439
- "rstrip": false,
440
- "single_word": false,
441
- "special": false
442
- },
443
  "135": {
444
  "content": "...",
445
  "lstrip": false,
@@ -472,14 +448,6 @@
472
  "single_word": false,
473
  "special": false
474
  },
475
- "162": {
476
- "content": "Sau",
477
- "lstrip": false,
478
- "normalized": true,
479
- "rstrip": false,
480
- "single_word": false,
481
- "special": false
482
- },
483
  "163": {
484
  "content": "4",
485
  "lstrip": false,
@@ -504,14 +472,6 @@
504
  "single_word": false,
505
  "special": false
506
  },
507
- "173": {
508
- "content": "5",
509
- "lstrip": false,
510
- "normalized": true,
511
- "rstrip": false,
512
- "single_word": false,
513
- "special": false
514
- },
515
  "181": {
516
  "content": "cấp",
517
  "lstrip": false,
@@ -528,38 +488,6 @@
528
  "single_word": false,
529
  "special": false
530
  },
531
- "193": {
532
- "content": "em",
533
- "lstrip": false,
534
- "normalized": true,
535
- "rstrip": false,
536
- "single_word": false,
537
- "special": false
538
- },
539
- "207": {
540
- "content": "mọi",
541
- "lstrip": false,
542
- "normalized": true,
543
- "rstrip": false,
544
- "single_word": false,
545
- "special": false
546
- },
547
- "218": {
548
- "content": "Tôi",
549
- "lstrip": false,
550
- "normalized": true,
551
- "rstrip": false,
552
- "single_word": false,
553
- "special": false
554
- },
555
- "219": {
556
- "content": "lúc",
557
- "lstrip": false,
558
- "normalized": true,
559
- "rstrip": false,
560
- "single_word": false,
561
- "special": false
562
- },
563
  "222": {
564
  "content": "học",
565
  "lstrip": false,
@@ -640,14 +568,6 @@
640
  "single_word": false,
641
  "special": false
642
  },
643
- "267": {
644
- "content": "xã_hội",
645
- "lstrip": false,
646
- "normalized": true,
647
- "rstrip": false,
648
- "single_word": false,
649
- "special": false
650
- },
651
  "268": {
652
  "content": "Phó",
653
  "lstrip": false,
@@ -744,14 +664,6 @@
744
  "single_word": false,
745
  "special": false
746
  },
747
- "359": {
748
- "content": "khó",
749
- "lstrip": false,
750
- "normalized": true,
751
- "rstrip": false,
752
- "single_word": false,
753
- "special": false
754
- },
755
  "378": {
756
  "content": "độ",
757
  "lstrip": false,
@@ -776,22 +688,6 @@
776
  "single_word": false,
777
  "special": false
778
  },
779
- "426": {
780
- "content": "9",
781
- "lstrip": false,
782
- "normalized": true,
783
- "rstrip": false,
784
- "single_word": false,
785
- "special": false
786
- },
787
- "434": {
788
- "content": "Năm",
789
- "lstrip": false,
790
- "normalized": true,
791
- "rstrip": false,
792
- "single_word": false,
793
- "special": false
794
- },
795
  "447": {
796
  "content": "hỏi",
797
  "lstrip": false,
@@ -800,14 +696,6 @@
800
  "single_word": false,
801
  "special": false
802
  },
803
- "450": {
804
- "content": "Pháp",
805
- "lstrip": false,
806
- "normalized": true,
807
- "rstrip": false,
808
- "single_word": false,
809
- "special": false
810
- },
811
  "471": {
812
  "content": "sớm",
813
  "lstrip": false,
@@ -832,14 +720,6 @@
832
  "single_word": false,
833
  "special": false
834
  },
835
- "499": {
836
- "content": "bước",
837
- "lstrip": false,
838
- "normalized": true,
839
- "rstrip": false,
840
- "single_word": false,
841
- "special": false
842
- },
843
  "506": {
844
  "content": "hộ",
845
  "lstrip": false,
@@ -960,14 +840,6 @@
960
  "single_word": false,
961
  "special": false
962
  },
963
- "654": {
964
- "content": "hôm",
965
- "lstrip": false,
966
- "normalized": true,
967
- "rstrip": false,
968
- "single_word": false,
969
- "special": false
970
- },
971
  "669": {
972
  "content": "Đó",
973
  "lstrip": false,
@@ -984,14 +856,6 @@
984
  "single_word": false,
985
  "special": false
986
  },
987
- "680": {
988
- "content": "Bà",
989
- "lstrip": false,
990
- "normalized": true,
991
- "rstrip": false,
992
- "single_word": false,
993
- "special": false
994
- },
995
  "681": {
996
  "content": "giới",
997
  "lstrip": false,
@@ -1000,14 +864,6 @@
1000
  "single_word": false,
1001
  "special": false
1002
  },
1003
- "696": {
1004
- "content": "100",
1005
- "lstrip": false,
1006
- "normalized": true,
1007
- "rstrip": false,
1008
- "single_word": false,
1009
- "special": false
1010
- },
1011
  "711": {
1012
  "content": "bức",
1013
  "lstrip": false,
@@ -1016,14 +872,6 @@
1016
  "single_word": false,
1017
  "special": false
1018
  },
1019
- "733": {
1020
- "content": "đời",
1021
- "lstrip": false,
1022
- "normalized": true,
1023
- "rstrip": false,
1024
- "single_word": false,
1025
- "special": false
1026
- },
1027
  "740": {
1028
  "content": "lập",
1029
  "lstrip": false,
@@ -1080,14 +928,6 @@
1080
  "single_word": false,
1081
  "special": false
1082
  },
1083
- "825": {
1084
- "content": "chủ_yếu",
1085
- "lstrip": false,
1086
- "normalized": true,
1087
- "rstrip": false,
1088
- "single_word": false,
1089
- "special": false
1090
- },
1091
  "835": {
1092
  "content": "phố",
1093
  "lstrip": false,
@@ -1152,14 +992,6 @@
1152
  "single_word": false,
1153
  "special": false
1154
  },
1155
- "956": {
1156
- "content": "đại_học",
1157
- "lstrip": false,
1158
- "normalized": true,
1159
- "rstrip": false,
1160
- "single_word": false,
1161
- "special": false
1162
- },
1163
  "988": {
1164
  "content": "chở",
1165
  "lstrip": false,
@@ -1192,22 +1024,6 @@
1192
  "single_word": false,
1193
  "special": false
1194
  },
1195
- "1093": {
1196
- "content": "hội",
1197
- "lstrip": false,
1198
- "normalized": true,
1199
- "rstrip": false,
1200
- "single_word": false,
1201
- "special": false
1202
- },
1203
- "1106": {
1204
- "content": "Mỗi",
1205
- "lstrip": false,
1206
- "normalized": true,
1207
- "rstrip": false,
1208
- "single_word": false,
1209
- "special": false
1210
- },
1211
  "1111": {
1212
  "content": "i",
1213
  "lstrip": false,
@@ -1248,14 +1064,6 @@
1248
  "single_word": false,
1249
  "special": false
1250
  },
1251
- "1287": {
1252
- "content": "cầm",
1253
- "lstrip": false,
1254
- "normalized": true,
1255
- "rstrip": false,
1256
- "single_word": false,
1257
- "special": false
1258
- },
1259
  "1292": {
1260
  "content": "châu_Á",
1261
  "lstrip": false,
@@ -1272,14 +1080,6 @@
1272
  "single_word": false,
1273
  "special": false
1274
  },
1275
- "1338": {
1276
- "content": "bây_giờ",
1277
- "lstrip": false,
1278
- "normalized": true,
1279
- "rstrip": false,
1280
- "single_word": false,
1281
- "special": false
1282
- },
1283
  "1351": {
1284
  "content": "Cả",
1285
  "lstrip": false,
@@ -1328,14 +1128,6 @@
1328
  "single_word": false,
1329
  "special": false
1330
  },
1331
- "1464": {
1332
- "content": "bao",
1333
- "lstrip": false,
1334
- "normalized": true,
1335
- "rstrip": false,
1336
- "single_word": false,
1337
- "special": false
1338
- },
1339
  "1517": {
1340
  "content": "a",
1341
  "lstrip": false,
@@ -1368,14 +1160,6 @@
1368
  "single_word": false,
1369
  "special": false
1370
  },
1371
- "1606": {
1372
- "content": "lúa",
1373
- "lstrip": false,
1374
- "normalized": true,
1375
- "rstrip": false,
1376
- "single_word": false,
1377
- "special": false
1378
- },
1379
  "1615": {
1380
  "content": "Chiều",
1381
  "lstrip": false,
@@ -1384,14 +1168,6 @@
1384
  "single_word": false,
1385
  "special": false
1386
  },
1387
- "1663": {
1388
- "content": "70",
1389
- "lstrip": false,
1390
- "normalized": true,
1391
- "rstrip": false,
1392
- "single_word": false,
1393
- "special": false
1394
- },
1395
  "1664": {
1396
  "content": "h",
1397
  "lstrip": false,
@@ -1408,14 +1184,6 @@
1408
  "single_word": false,
1409
  "special": false
1410
  },
1411
- "1750": {
1412
- "content": "lao",
1413
- "lstrip": false,
1414
- "normalized": true,
1415
- "rstrip": false,
1416
- "single_word": false,
1417
- "special": false
1418
- },
1419
  "1775": {
1420
  "content": "đám",
1421
  "lstrip": false,
@@ -1440,14 +1208,6 @@
1440
  "single_word": false,
1441
  "special": false
1442
  },
1443
- "1912": {
1444
- "content": "độc_lập",
1445
- "lstrip": false,
1446
- "normalized": true,
1447
- "rstrip": false,
1448
- "single_word": false,
1449
- "special": false
1450
- },
1451
  "2089": {
1452
  "content": "pháp_lý",
1453
  "lstrip": false,
@@ -1472,14 +1232,6 @@
1472
  "single_word": false,
1473
  "special": false
1474
  },
1475
- "2153": {
1476
- "content": "sở",
1477
- "lstrip": false,
1478
- "normalized": true,
1479
- "rstrip": false,
1480
- "single_word": false,
1481
- "special": false
1482
- },
1483
  "2185": {
1484
  "content": "phó",
1485
  "lstrip": false,
@@ -1504,14 +1256,6 @@
1504
  "single_word": false,
1505
  "special": false
1506
  },
1507
- "2209": {
1508
- "content": "cơ",
1509
- "lstrip": false,
1510
- "normalized": true,
1511
- "rstrip": false,
1512
- "single_word": false,
1513
- "special": false
1514
- },
1515
  "2217": {
1516
  "content": "đậm",
1517
  "lstrip": false,
@@ -1568,14 +1312,6 @@
1568
  "single_word": false,
1569
  "special": false
1570
  },
1571
- "2469": {
1572
- "content": "hưu",
1573
- "lstrip": false,
1574
- "normalized": true,
1575
- "rstrip": false,
1576
- "single_word": false,
1577
- "special": false
1578
- },
1579
  "2557": {
1580
  "content": "chức_vụ",
1581
  "lstrip": false,
@@ -1624,14 +1360,6 @@
1624
  "single_word": false,
1625
  "special": false
1626
  },
1627
- "2913": {
1628
- "content": "lò",
1629
- "lstrip": false,
1630
- "normalized": true,
1631
- "rstrip": false,
1632
- "single_word": false,
1633
- "special": false
1634
- },
1635
  "2991": {
1636
  "content": "ô",
1637
  "lstrip": false,
@@ -1688,14 +1416,6 @@
1688
  "single_word": false,
1689
  "special": false
1690
  },
1691
- "3810": {
1692
- "content": "kỹ_sư",
1693
- "lstrip": false,
1694
- "normalized": true,
1695
- "rstrip": false,
1696
- "single_word": false,
1697
- "special": false
1698
- },
1699
  "3988": {
1700
  "content": "ổ",
1701
  "lstrip": false,
@@ -1728,22 +1448,6 @@
1728
  "single_word": false,
1729
  "special": false
1730
  },
1731
- "4216": {
1732
- "content": "1992",
1733
- "lstrip": false,
1734
- "normalized": true,
1735
- "rstrip": false,
1736
- "single_word": false,
1737
- "special": false
1738
- },
1739
- "4278": {
1740
- "content": "1994",
1741
- "lstrip": false,
1742
- "normalized": true,
1743
- "rstrip": false,
1744
- "single_word": false,
1745
- "special": false
1746
- },
1747
  "4318": {
1748
  "content": "MTTQ",
1749
  "lstrip": false,
@@ -1856,14 +1560,6 @@
1856
  "single_word": false,
1857
  "special": false
1858
  },
1859
- "5854": {
1860
- "content": "Nội_Bài",
1861
- "lstrip": false,
1862
- "normalized": true,
1863
- "rstrip": false,
1864
- "single_word": false,
1865
- "special": false
1866
- },
1867
  "6007": {
1868
  "content": "hư",
1869
  "lstrip": false,
@@ -1992,14 +1688,6 @@
1992
  "single_word": false,
1993
  "special": false
1994
  },
1995
- "8915": {
1996
- "content": "ời",
1997
- "lstrip": false,
1998
- "normalized": true,
1999
- "rstrip": false,
2000
- "single_word": false,
2001
- "special": false
2002
- },
2003
  "8942": {
2004
  "content": "lê",
2005
  "lstrip": false,
@@ -2032,14 +1720,6 @@
2032
  "single_word": false,
2033
  "special": false
2034
  },
2035
- "9412": {
2036
- "content": "ài",
2037
- "lstrip": false,
2038
- "normalized": true,
2039
- "rstrip": false,
2040
- "single_word": false,
2041
- "special": false
2042
- },
2043
  "9456": {
2044
  "content": "ề",
2045
  "lstrip": false,
@@ -2168,14 +1848,6 @@
2168
  "single_word": false,
2169
  "special": false
2170
  },
2171
- "12416": {
2172
- "content": "ợ",
2173
- "lstrip": false,
2174
- "normalized": true,
2175
- "rstrip": false,
2176
- "single_word": false,
2177
- "special": false
2178
- },
2179
  "13291": {
2180
  "content": "ộc",
2181
  "lstrip": false,
@@ -2216,14 +1888,6 @@
2216
  "single_word": false,
2217
  "special": false
2218
  },
2219
- "15002": {
2220
- "content": "Cò",
2221
- "lstrip": false,
2222
- "normalized": true,
2223
- "rstrip": false,
2224
- "single_word": false,
2225
- "special": false
2226
- },
2227
  "15145": {
2228
  "content": "úc",
2229
  "lstrip": false,
@@ -2264,14 +1928,6 @@
2264
  "single_word": false,
2265
  "special": false
2266
  },
2267
- "16788": {
2268
- "content": "iểu",
2269
- "lstrip": false,
2270
- "normalized": true,
2271
- "rstrip": false,
2272
- "single_word": false,
2273
- "special": false
2274
- },
2275
  "17341": {
2276
  "content": "iếp",
2277
  "lstrip": false,
@@ -2296,14 +1952,6 @@
2296
  "single_word": false,
2297
  "special": false
2298
  },
2299
- "20017": {
2300
- "content": "đỡ_đầu",
2301
- "lstrip": false,
2302
- "normalized": true,
2303
- "rstrip": false,
2304
- "single_word": false,
2305
- "special": false
2306
- },
2307
  "20463": {
2308
  "content": "kiều_hối",
2309
  "lstrip": false,
@@ -2352,14 +2000,6 @@
2352
  "single_word": false,
2353
  "special": false
2354
  },
2355
- "24096": {
2356
- "content": "há",
2357
- "lstrip": false,
2358
- "normalized": true,
2359
- "rstrip": false,
2360
- "single_word": false,
2361
- "special": false
2362
- },
2363
  "30251": {
2364
  "content": "ă",
2365
  "lstrip": false,
@@ -2400,14 +2040,6 @@
2400
  "single_word": false,
2401
  "special": false
2402
  },
2403
- "35477": {
2404
- "content": "hy",
2405
- "lstrip": false,
2406
- "normalized": true,
2407
- "rstrip": false,
2408
- "single_word": false,
2409
- "special": false
2410
- },
2411
  "37463": {
2412
  "content": "ổ_chức",
2413
  "lstrip": false,
@@ -2456,14 +2088,6 @@
2456
  "single_word": false,
2457
  "special": false
2458
  },
2459
- "46921": {
2460
- "content": "a_đời",
2461
- "lstrip": false,
2462
- "normalized": true,
2463
- "rstrip": false,
2464
- "single_word": false,
2465
- "special": false
2466
- },
2467
  "48978": {
2468
  "content": "â",
2469
  "lstrip": false,
@@ -4743,294 +4367,6 @@
4743
  "rstrip": false,
4744
  "single_word": false,
4745
  "special": false
4746
- },
4747
- "64281": {
4748
- "content": "Phố_Việ",
4749
- "lstrip": false,
4750
- "normalized": true,
4751
- "rstrip": false,
4752
- "single_word": false,
4753
- "special": false
4754
- },
4755
- "64282": {
4756
- "content": "Nam_vẫ",
4757
- "lstrip": false,
4758
- "normalized": true,
4759
- "rstrip": false,
4760
- "single_word": false,
4761
- "special": false
4762
- },
4763
- "64283": {
4764
- "content": "chức_ă",
4765
- "lstrip": false,
4766
- "normalized": true,
4767
- "rstrip": false,
4768
- "single_word": false,
4769
- "special": false
4770
- },
4771
- "64284": {
4772
- "content": "Jea_e",
4773
- "lstrip": false,
4774
- "normalized": true,
4775
- "rstrip": false,
4776
- "single_word": false,
4777
- "special": false
4778
- },
4779
- "64285": {
4780
- "content": "Huy_h",
4781
- "lstrip": false,
4782
- "normalized": true,
4783
- "rstrip": false,
4784
- "single_word": false,
4785
- "special": false
4786
- },
4787
- "64286": {
4788
- "content": "âm_sự",
4789
- "lstrip": false,
4790
- "normalized": true,
4791
- "rstrip": false,
4792
- "single_word": false,
4793
- "special": false
4794
- },
4795
- "64287": {
4796
- "content": "h_cư",
4797
- "lstrip": false,
4798
- "normalized": true,
4799
- "rstrip": false,
4800
- "single_word": false,
4801
- "special": false
4802
- },
4803
- "64288": {
4804
- "content": "xuố",
4805
- "lstrip": false,
4806
- "normalized": true,
4807
- "rstrip": false,
4808
- "single_word": false,
4809
- "special": false
4810
- },
4811
- "64289": {
4812
- "content": "khô_g",
4813
- "lstrip": false,
4814
- "normalized": true,
4815
- "rstrip": false,
4816
- "single_word": false,
4817
- "special": false
4818
- },
4819
- "64290": {
4820
- "content": "hấy",
4821
- "lstrip": false,
4822
- "normalized": true,
4823
- "rstrip": false,
4824
- "single_word": false,
4825
- "special": false
4826
- },
4827
- "64291": {
4828
- "content": "co_số",
4829
- "lstrip": false,
4830
- "normalized": true,
4831
- "rstrip": false,
4832
- "single_word": false,
4833
- "special": false
4834
- },
4835
- "64292": {
4836
- "content": "gôi_hà",
4837
- "lstrip": false,
4838
- "normalized": true,
4839
- "rstrip": false,
4840
- "single_word": false,
4841
- "special": false
4842
- },
4843
- "64293": {
4844
- "content": "đơ_sơ",
4845
- "lstrip": false,
4846
- "normalized": true,
4847
- "rstrip": false,
4848
- "single_word": false,
4849
- "special": false
4850
- },
4851
- "64294": {
4852
- "content": "hà_cửa",
4853
- "lstrip": false,
4854
- "normalized": true,
4855
- "rstrip": false,
4856
- "single_word": false,
4857
- "special": false
4858
- },
4859
- "64295": {
4860
- "content": "kha_g",
4861
- "lstrip": false,
4862
- "normalized": true,
4863
- "rstrip": false,
4864
- "single_word": false,
4865
- "special": false
4866
- },
4867
- "64296": {
4868
- "content": "a_g",
4869
- "lstrip": false,
4870
- "normalized": true,
4871
- "rstrip": false,
4872
- "single_word": false,
4873
- "special": false
4874
- },
4875
- "64297": {
4876
- "content": "hứ",
4877
- "lstrip": false,
4878
- "normalized": true,
4879
- "rstrip": false,
4880
- "single_word": false,
4881
- "special": false
4882
- },
4883
- "64298": {
4884
- "content": "ấ_hiều",
4885
- "lstrip": false,
4886
- "normalized": true,
4887
- "rstrip": false,
4888
- "single_word": false,
4889
- "special": false
4890
- },
4891
- "64299": {
4892
- "content": "ghĩ",
4893
- "lstrip": false,
4894
- "normalized": true,
4895
- "rstrip": false,
4896
- "single_word": false,
4897
- "special": false
4898
- },
4899
- "64300": {
4900
- "content": "o_lớ",
4901
- "lstrip": false,
4902
- "normalized": true,
4903
- "rstrip": false,
4904
- "single_word": false,
4905
- "special": false
4906
- },
4907
- "64301": {
4908
- "content": "hiêu",
4909
- "lstrip": false,
4910
- "normalized": true,
4911
- "rstrip": false,
4912
- "single_word": false,
4913
- "special": false
4914
- },
4915
- "64302": {
4916
- "content": "hòa_òa",
4917
- "lstrip": false,
4918
- "normalized": true,
4919
- "rstrip": false,
4920
- "single_word": false,
4921
- "special": false
4922
- },
4923
- "64303": {
4924
- "content": "xúc_độ",
4925
- "lstrip": false,
4926
- "normalized": true,
4927
- "rstrip": false,
4928
- "single_word": false,
4929
- "special": false
4930
- },
4931
- "64304": {
4932
- "content": "Hội_gười",
4933
- "lstrip": false,
4934
- "normalized": true,
4935
- "rstrip": false,
4936
- "single_word": false,
4937
- "special": false
4938
- },
4939
- "64305": {
4940
- "content": "ập_hợp",
4941
- "lstrip": false,
4942
- "normalized": true,
4943
- "rstrip": false,
4944
- "single_word": false,
4945
- "special": false
4946
- },
4947
- "64306": {
4948
- "content": "guyệ",
4949
- "lstrip": false,
4950
- "normalized": true,
4951
- "rstrip": false,
4952
- "single_word": false,
4953
- "special": false
4954
- },
4955
- "64307": {
4956
- "content": "âm_ới",
4957
- "lstrip": false,
4958
- "normalized": true,
4959
- "rstrip": false,
4960
- "single_word": false,
4961
- "special": false
4962
- },
4963
- "64308": {
4964
- "content": "đứ",
4965
- "lstrip": false,
4966
- "normalized": true,
4967
- "rstrip": false,
4968
- "single_word": false,
4969
- "special": false
4970
- },
4971
- "64309": {
4972
- "content": "a_hà_h",
4973
- "lstrip": false,
4974
- "normalized": true,
4975
- "rstrip": false,
4976
- "single_word": false,
4977
- "special": false
4978
- },
4979
- "64310": {
4980
- "content": "ô_g",
4981
- "lstrip": false,
4982
- "normalized": true,
4983
- "rstrip": false,
4984
- "single_word": false,
4985
- "special": false
4986
- },
4987
- "64311": {
4988
- "content": "Nam_ă",
4989
- "lstrip": false,
4990
- "normalized": true,
4991
- "rstrip": false,
4992
- "single_word": false,
4993
- "special": false
4994
- },
4995
- "64312": {
4996
- "content": "suấ",
4997
- "lstrip": false,
4998
- "normalized": true,
4999
- "rstrip": false,
5000
- "single_word": false,
5001
- "special": false
5002
- },
5003
- "64313": {
5004
- "content": "cải_hiệ",
5005
- "lstrip": false,
5006
- "normalized": true,
5007
- "rstrip": false,
5008
- "single_word": false,
5009
- "special": false
5010
- },
5011
- "64314": {
5012
- "content": "khă",
5013
- "lstrip": false,
5014
- "normalized": true,
5015
- "rstrip": false,
5016
- "single_word": false,
5017
- "special": false
5018
- },
5019
- "64315": {
5020
- "content": "eu_o",
5021
- "lstrip": false,
5022
- "normalized": true,
5023
- "rstrip": false,
5024
- "single_word": false,
5025
- "special": false
5026
- },
5027
- "64316": {
5028
- "content": "bổ_g",
5029
- "lstrip": false,
5030
- "normalized": true,
5031
- "rstrip": false,
5032
- "single_word": false,
5033
- "special": false
5034
  }
5035
  },
5036
  "bos_token": "<s>",
 
360
  "single_word": false,
361
  "special": false
362
  },
 
 
 
 
 
 
 
 
363
  "100": {
364
  "content": "số",
365
  "lstrip": false,
 
392
  "single_word": false,
393
  "special": false
394
  },
 
 
 
 
 
 
 
 
395
  "120": {
396
  "content": "Hà_Nội",
397
  "lstrip": false,
 
416
  "single_word": false,
417
  "special": false
418
  },
 
 
 
 
 
 
 
 
419
  "135": {
420
  "content": "...",
421
  "lstrip": false,
 
448
  "single_word": false,
449
  "special": false
450
  },
 
 
 
 
 
 
 
 
451
  "163": {
452
  "content": "4",
453
  "lstrip": false,
 
472
  "single_word": false,
473
  "special": false
474
  },
 
 
 
 
 
 
 
 
475
  "181": {
476
  "content": "cấp",
477
  "lstrip": false,
 
488
  "single_word": false,
489
  "special": false
490
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
  "222": {
492
  "content": "học",
493
  "lstrip": false,
 
568
  "single_word": false,
569
  "special": false
570
  },
 
 
 
 
 
 
 
 
571
  "268": {
572
  "content": "Phó",
573
  "lstrip": false,
 
664
  "single_word": false,
665
  "special": false
666
  },
 
 
 
 
 
 
 
 
667
  "378": {
668
  "content": "độ",
669
  "lstrip": false,
 
688
  "single_word": false,
689
  "special": false
690
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
  "447": {
692
  "content": "hỏi",
693
  "lstrip": false,
 
696
  "single_word": false,
697
  "special": false
698
  },
 
 
 
 
 
 
 
 
699
  "471": {
700
  "content": "sớm",
701
  "lstrip": false,
 
720
  "single_word": false,
721
  "special": false
722
  },
 
 
 
 
 
 
 
 
723
  "506": {
724
  "content": "hộ",
725
  "lstrip": false,
 
840
  "single_word": false,
841
  "special": false
842
  },
 
 
 
 
 
 
 
 
843
  "669": {
844
  "content": "Đó",
845
  "lstrip": false,
 
856
  "single_word": false,
857
  "special": false
858
  },
 
 
 
 
 
 
 
 
859
  "681": {
860
  "content": "giới",
861
  "lstrip": false,
 
864
  "single_word": false,
865
  "special": false
866
  },
 
 
 
 
 
 
 
 
867
  "711": {
868
  "content": "bức",
869
  "lstrip": false,
 
872
  "single_word": false,
873
  "special": false
874
  },
 
 
 
 
 
 
 
 
875
  "740": {
876
  "content": "lập",
877
  "lstrip": false,
 
928
  "single_word": false,
929
  "special": false
930
  },
 
 
 
 
 
 
 
 
931
  "835": {
932
  "content": "phố",
933
  "lstrip": false,
 
992
  "single_word": false,
993
  "special": false
994
  },
 
 
 
 
 
 
 
 
995
  "988": {
996
  "content": "chở",
997
  "lstrip": false,
 
1024
  "single_word": false,
1025
  "special": false
1026
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1027
  "1111": {
1028
  "content": "i",
1029
  "lstrip": false,
 
1064
  "single_word": false,
1065
  "special": false
1066
  },
 
 
 
 
 
 
 
 
1067
  "1292": {
1068
  "content": "châu_Á",
1069
  "lstrip": false,
 
1080
  "single_word": false,
1081
  "special": false
1082
  },
 
 
 
 
 
 
 
 
1083
  "1351": {
1084
  "content": "Cả",
1085
  "lstrip": false,
 
1128
  "single_word": false,
1129
  "special": false
1130
  },
 
 
 
 
 
 
 
 
1131
  "1517": {
1132
  "content": "a",
1133
  "lstrip": false,
 
1160
  "single_word": false,
1161
  "special": false
1162
  },
 
 
 
 
 
 
 
 
1163
  "1615": {
1164
  "content": "Chiều",
1165
  "lstrip": false,
 
1168
  "single_word": false,
1169
  "special": false
1170
  },
 
 
 
 
 
 
 
 
1171
  "1664": {
1172
  "content": "h",
1173
  "lstrip": false,
 
1184
  "single_word": false,
1185
  "special": false
1186
  },
 
 
 
 
 
 
 
 
1187
  "1775": {
1188
  "content": "đám",
1189
  "lstrip": false,
 
1208
  "single_word": false,
1209
  "special": false
1210
  },
 
 
 
 
 
 
 
 
1211
  "2089": {
1212
  "content": "pháp_lý",
1213
  "lstrip": false,
 
1232
  "single_word": false,
1233
  "special": false
1234
  },
 
 
 
 
 
 
 
 
1235
  "2185": {
1236
  "content": "phó",
1237
  "lstrip": false,
 
1256
  "single_word": false,
1257
  "special": false
1258
  },
 
 
 
 
 
 
 
 
1259
  "2217": {
1260
  "content": "đậm",
1261
  "lstrip": false,
 
1312
  "single_word": false,
1313
  "special": false
1314
  },
 
 
 
 
 
 
 
 
1315
  "2557": {
1316
  "content": "chức_vụ",
1317
  "lstrip": false,
 
1360
  "single_word": false,
1361
  "special": false
1362
  },
 
 
 
 
 
 
 
 
1363
  "2991": {
1364
  "content": "ô",
1365
  "lstrip": false,
 
1416
  "single_word": false,
1417
  "special": false
1418
  },
 
 
 
 
 
 
 
 
1419
  "3988": {
1420
  "content": "ổ",
1421
  "lstrip": false,
 
1448
  "single_word": false,
1449
  "special": false
1450
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1451
  "4318": {
1452
  "content": "MTTQ",
1453
  "lstrip": false,
 
1560
  "single_word": false,
1561
  "special": false
1562
  },
 
 
 
 
 
 
 
 
1563
  "6007": {
1564
  "content": "hư",
1565
  "lstrip": false,
 
1688
  "single_word": false,
1689
  "special": false
1690
  },
 
 
 
 
 
 
 
 
1691
  "8942": {
1692
  "content": "lê",
1693
  "lstrip": false,
 
1720
  "single_word": false,
1721
  "special": false
1722
  },
 
 
 
 
 
 
 
 
1723
  "9456": {
1724
  "content": "ề",
1725
  "lstrip": false,
 
1848
  "single_word": false,
1849
  "special": false
1850
  },
 
 
 
 
 
 
 
 
1851
  "13291": {
1852
  "content": "ộc",
1853
  "lstrip": false,
 
1888
  "single_word": false,
1889
  "special": false
1890
  },
 
 
 
 
 
 
 
 
1891
  "15145": {
1892
  "content": "úc",
1893
  "lstrip": false,
 
1928
  "single_word": false,
1929
  "special": false
1930
  },
 
 
 
 
 
 
 
 
1931
  "17341": {
1932
  "content": "iếp",
1933
  "lstrip": false,
 
1952
  "single_word": false,
1953
  "special": false
1954
  },
 
 
 
 
 
 
 
 
1955
  "20463": {
1956
  "content": "kiều_hối",
1957
  "lstrip": false,
 
2000
  "single_word": false,
2001
  "special": false
2002
  },
 
 
 
 
 
 
 
 
2003
  "30251": {
2004
  "content": "ă",
2005
  "lstrip": false,
 
2040
  "single_word": false,
2041
  "special": false
2042
  },
 
 
 
 
 
 
 
 
2043
  "37463": {
2044
  "content": "ổ_chức",
2045
  "lstrip": false,
 
2088
  "single_word": false,
2089
  "special": false
2090
  },
 
 
 
 
 
 
 
 
2091
  "48978": {
2092
  "content": "â",
2093
  "lstrip": false,
 
4367
  "rstrip": false,
4368
  "single_word": false,
4369
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4370
  }
4371
  },
4372
  "bos_token": "<s>",