PuppetLover committed on
Commit
0fe97ca
·
verified ·
1 Parent(s): 8cd4582

Upload tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +359 -281
  2. special_tokens_map.json +98 -28
  3. spiece.model +3 -0
  4. tokenizer_config.json +0 -0
added_tokens.json CHANGED
@@ -1,283 +1,361 @@
1
  {
2
- "1.600_gia_đì": 64247,
3
- "27-29": 64018,
4
- "30/4/2025": 64169,
5
- "<mask>": 64000,
6
- "A_i": 64122,
7
- "Akihi_o": 64048,
8
- "Aoki_Kazuhiko": 64036,
9
- "Ba_A": 64041,
10
- "Bê_cạ": 64147,
11
- "Bùi_Tha_h_Sơ": 64026,
12
- "Bả": 64085,
13
- "Bả_Ishiba": 64004,
14
- "Chuyế": 64019,
15
- "Chuyế_hăm": 64059,
16
- "Chá_h": 64034,
17
- "Chí_h": 64022,
18
- "Chủ_ịch": 64218,
19
- "Cố_vấ": 64037,
20
- "Hai_bê": 64139,
21
- "Hi_oo": 64051,
22
- "Hiệ": 64179,
23
- "Hòa": 64225,
24
- "I_o": 64097,
25
- "Iijima_Isao": 64046,
26
- "Ishiba": 64007,
27
- "Kaiha_a": 64054,
28
- "Ke": 64055,
29
- "Koji": 64053,
30
- "Lê_Thị_Bích_T": 64023,
31
- "Lươ": 64151,
32
- "Masa_aka": 64044,
33
- "Mộ_cộ": 64279,
34
- "Nagashima_Akihisa": 64039,
35
- "Nakashima": 64050,
36
- "Nam_Tow": 64264,
37
- "Nam_chố": 64159,
38
- "Nam_hực": 64128,
39
- "Nam_ại": 64031,
40
- "Nam_ỉ": 64243,
41
- "Nam_ừ": 64016,
42
- "Naoki": 64098,
43
- "Nguyễ_Mi": 64029,
44
- "Nhậ": 64003,
45
- "Nhậ_Bả": 64011,
46
- "Nhữ": 64229,
47
- "Oka_o": 64043,
48
- "Phu_hâ": 64006,
49
- "Phạm_Mi_h": 64021,
50
- "Phạm_Qua": 64032,
51
- "Phầ_lớ": 64249,
52
- "S_isuk": 64271,
53
- "Shige_u": 64005,
54
- "TPHCM.": 64183,
55
- "TPHCM_phá": 64184,
56
- "TPHCM_sá": 64160,
57
- "T_o": 64195,
58
- "T_u": 64216,
59
- "Tha_i": 64221,
60
- "Thá": 64272,
61
- "Thái_La": 64153,
62
- "Tháp_ù": 64033,
63
- "Thườ": 64028,
64
- "Thủ_ướ": 64002,
65
- "Tsuchimichi": 64047,
66
- "": 64211,
67
- "Tổ_g": 64040,
68
- "Udo": 64220,
69
- "Udo_Tha": 64244,
70
- "Vie": 64265,
71
- "Việ": 64015,
72
- "Việ_Nam": 64057,
73
- "Việ_kiều": 64260,
74
- "Việ_ma": 64256,
75
- "Việ_ại": 64219,
76
- "Vũ_Thị_Huỳ_h": 64172,
77
- "Xuâ": 64273,
78
- "Xuâ_Hòa": 64152,
79
- "Yoshi_o": 64052,
80
- "Yoshiko": 64008,
81
- "a_h": 64278,
82
- "a_o": 64056,
83
- "aka_Mao": 64049,
84
- "am_Tow": 64266,
85
- "biế": 64099,
86
- "biế_ơ": 64240,
87
- "biể_hà": 64269,
88
- "biểu_dươ": 64162,
89
- "biệ": 64176,
90
- "buô_bá": 64255,
91
- "bê_hư": 64145,
92
- "bì_h": 64071,
93
- "bả_sắc": 64257,
94
- "bắ": 64012,
95
- "bắ_đầu": 64274,
96
- "bằ": 64251,
97
- "chiế": 64226,
98
- "chiế_lược": 64067,
99
- "chuyê_cơ": 64001,
100
- "chuyê_gia": 64205,
101
- "chuyế": 64062,
102
- "chuyể": 64091,
103
- "chuyể_đổi": 64089,
104
- "chí_h": 64084,
105
- "chấ": 64077,
106
- "co_gười": 64254,
107
- "co_phố": 64267,
108
- "co_si": 64250,
109
- "co_xa": 64155,
110
- "cà_g": 64261,
111
- "cá_hâ": 64117,
112
- "cù_g": 64009,
113
- "cơ_sở_hạ": 64131,
114
- "cườ": 64102,
115
- "cậy_chí": 64081,
116
- "diễ_a": 64199,
117
- "diệ": 64069,
118
- "doa": 64207,
119
- "doa_h": 64193,
120
- "": 64258,
121
- "dự_g": 64164,
122
- "dự_á": 64095,
123
- "ghiệp": 64134,
124
- "ghiệp_hố": 64236,
125
- "ghĩa": 64060,
126
- "ghị": 64198,
127
- "giai_đọa": 64196,
128
- "goại": 64111,
129
- "guồ": 64132,
130
- "guồ_lực": 64200,
131
- "gày": 64017,
132
- "gười": 64154,
133
- "gầ": 64201,
134
- "gắ": 64268,
135
- "h_Vũ": 64030,
136
- "h_chí": 64270,
137
- "h_cảm": 64212,
138
- "h_hổ": 64181,
139
- "h_hức": 64014,
140
- "h_leo": 64146,
141
- "h_vực": 64080,
142
- "h_đạo": 64115,
143
- "hau": 64276,
144
- "hiế": 64227,
145
- "hiế_hực": 64191,
146
- "hiều": 64094,
147
- "hiều_hội": 64197,
148
- "hiều_ý": 64203,
149
- "hiệ": 64129,
150
- "hiệm": 64170,
151
- "hà_g": 64110,
152
- "hà_h": 64189,
153
- "hà_lã": 64118,
154
- "hào_hơ": 64262,
155
- "hào_ằ": 64231,
156
- "hách_hức": 64124,
157
- "": 64166,
158
- "hâ_lực": 64133,
159
- "hì_h": 64150,
160
- "hòa": 64070,
161
- "hóa": 64135,
162
- "hăm_TPHCM": 64222,
163
- "hăm_chí": 64013,
164
- "hăm_diễ": 64020,
165
- "hăm_hâ": 64192,
166
- "hăm_lầ": 64113,
167
- "hăm_đầu": 64063,
168
- "hươ": 64137,
169
- "hướ": 64210,
170
- "hấ": 64138,
171
- "hậ": 64202,
172
- "hằm": 64101,
173
- "hế": 64061,
174
- "hế_giới": 64107,
175
- "hể": 64096,
176
- "hể_chế": 64130,
177
- "hị": 64141,
178
- "họa_độ": 64190,
179
- "hỏ": 64235,
180
- "hỗ_ợ": 64127,
181
- "hợp_ác": 64082,
182
- "hữ": 64108,
183
- "hữu_ghị": 64259,
184
- "i_Masafumi": 64045,
185
- "i_cậy": 64116,
186
- "i_h": 64042,
187
- "i_ước": 64245,
188
- "": 64064,
189
- "iế": 64092,
190
- "iếp_ục": 64086,
191
- "iề": 64156,
192
- "iể": 64093,
193
- "iể_hà": 64165,
194
- "iể_hực": 64076,
195
- "khả_ă": 64143,
196
- "khẳ": 64119,
197
- "khỏa": 64180,
198
- "kiế": 64204,
199
- "kiệ": 64248,
200
- "kỷ_iệm": 64167,
201
- "lam_hắ": 64280,
202
- "liê_hệ": 64182,
203
- "luô": 64238,
204
- "luô_dà": 64174,
205
- "": 64114,
206
- "": 64079,
207
- "lượ": 64232,
208
- "lầ": 64100,
209
- "lớ": 64233,
210
- "miề": 64158,
211
- "miề_Nam": 64230,
212
- "muố": 64112,
213
- "máu_hị": 64178,
214
- "mặ": 64161,
215
- "mặ_hà": 64144,
216
- "mừ": 64157,
217
- "o_Naoki": 64058,
218
- "phươ": 64121,
219
- "phấ_đấu": 64140,
220
- "phầ": 64177,
221
- "phồ": 64072,
222
- "quyề": 64252,
223
- "quốc_phò": 64123,
224
- "quốc_ế": 64125,
225
- "si_h": 64173,
226
- "": 64024,
227
- "sả_xuấ": 64194,
228
- "u_g": 64105,
229
- "vi_h": 64073,
230
- "viê": 64215,
231
- "": 64035,
232
- "vượ": 64185,
233
- "vấ": 64208,
234
- "vậ_lực": 64234,
235
- "vữ": 64186,
236
- "xa_h": 64090,
237
- "xươ": 64228,
238
- "Ô_g": 64213,
239
- "ác_o": 64088,
240
- "âm_ă": 64106,
241
- "âm_đặc": 64175,
242
- "ê_hế": 64263,
243
- "ê_hế_giới": 64074,
244
- "ê_ấ": 64078,
245
- "ì_h": 64149,
246
- "í_h": 64246,
247
- "í_hức": 64206,
248
- "òa": 64068,
249
- "ă_mó": 64277,
250
- "Đô_g": 64103,
251
- "Đả_g": 64241,
252
- "Đối_ác": 64066,
253
- "đa_g": 64104,
254
- "đà_phá": 64075,
255
- "đòa": 64010,
256
- "đú": 64223,
257
- "đạ": 64188,
258
- "đại_hóa": 64136,
259
- "đấ_ước": 64168,
260
- "đầu_àu": 64187,
261
- "đặc_biệ": 64038,
262
- "đề_liê": 64209,
263
- "đị": 64120,
264
- "đối_ác": 64087,
265
- "ơ_Chí_h": 64242,
266
- "ưu_iê": 64109,
267
- "ươ": 64217,
268
- "ước_goài": 64163,
269
- "ước_hà": 64237,
270
- "ước_hằm": 64148,
271
- "ước_â": 64065,
272
- "ước_đa": 64224,
273
- "ườ": 64142,
274
- "ưở": 64027,
275
- "ưở_g": 64025,
276
- "ấ_đô": 64253,
277
- "ẩm_hực": 64275,
278
- "ập_u": 64126,
279
- "ụ_cộ": 64083,
280
- "Ủy": 64171,
281
- "ủy": 64214,
282
- "ự_hào": 64239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  }
 
1
  {
2
+ "1.600_gia_đì": 36290,
3
+ "10/2017": 36340,
4
+ "130": 36173,
5
+ "15.000": 36291,
6
+ "1970": 36098,
7
+ "25/4": 36126,
8
+ "30/4/1975": 36144,
9
+ "30/4/2025": 36145,
10
+ "50": 36140,
11
+ "<extra_id_0>": 36095,
12
+ "<extra_id_10>": 36085,
13
+ "<extra_id_11>": 36084,
14
+ "<extra_id_12>": 36083,
15
+ "<extra_id_13>": 36082,
16
+ "<extra_id_14>": 36081,
17
+ "<extra_id_15>": 36080,
18
+ "<extra_id_16>": 36079,
19
+ "<extra_id_17>": 36078,
20
+ "<extra_id_18>": 36077,
21
+ "<extra_id_19>": 36076,
22
+ "<extra_id_1>": 36094,
23
+ "<extra_id_20>": 36075,
24
+ "<extra_id_21>": 36074,
25
+ "<extra_id_22>": 36073,
26
+ "<extra_id_23>": 36072,
27
+ "<extra_id_24>": 36071,
28
+ "<extra_id_25>": 36070,
29
+ "<extra_id_26>": 36069,
30
+ "<extra_id_27>": 36068,
31
+ "<extra_id_28>": 36067,
32
+ "<extra_id_29>": 36066,
33
+ "<extra_id_2>": 36093,
34
+ "<extra_id_30>": 36065,
35
+ "<extra_id_31>": 36064,
36
+ "<extra_id_32>": 36063,
37
+ "<extra_id_33>": 36062,
38
+ "<extra_id_34>": 36061,
39
+ "<extra_id_35>": 36060,
40
+ "<extra_id_36>": 36059,
41
+ "<extra_id_37>": 36058,
42
+ "<extra_id_38>": 36057,
43
+ "<extra_id_39>": 36056,
44
+ "<extra_id_3>": 36092,
45
+ "<extra_id_40>": 36055,
46
+ "<extra_id_41>": 36054,
47
+ "<extra_id_42>": 36053,
48
+ "<extra_id_43>": 36052,
49
+ "<extra_id_44>": 36051,
50
+ "<extra_id_45>": 36050,
51
+ "<extra_id_46>": 36049,
52
+ "<extra_id_47>": 36048,
53
+ "<extra_id_48>": 36047,
54
+ "<extra_id_49>": 36046,
55
+ "<extra_id_4>": 36091,
56
+ "<extra_id_50>": 36045,
57
+ "<extra_id_51>": 36044,
58
+ "<extra_id_52>": 36043,
59
+ "<extra_id_53>": 36042,
60
+ "<extra_id_54>": 36041,
61
+ "<extra_id_55>": 36040,
62
+ "<extra_id_56>": 36039,
63
+ "<extra_id_57>": 36038,
64
+ "<extra_id_58>": 36037,
65
+ "<extra_id_59>": 36036,
66
+ "<extra_id_5>": 36090,
67
+ "<extra_id_60>": 36035,
68
+ "<extra_id_61>": 36034,
69
+ "<extra_id_62>": 36033,
70
+ "<extra_id_63>": 36032,
71
+ "<extra_id_64>": 36031,
72
+ "<extra_id_65>": 36030,
73
+ "<extra_id_66>": 36029,
74
+ "<extra_id_67>": 36028,
75
+ "<extra_id_68>": 36027,
76
+ "<extra_id_69>": 36026,
77
+ "<extra_id_6>": 36089,
78
+ "<extra_id_70>": 36025,
79
+ "<extra_id_71>": 36024,
80
+ "<extra_id_72>": 36023,
81
+ "<extra_id_73>": 36022,
82
+ "<extra_id_74>": 36021,
83
+ "<extra_id_75>": 36020,
84
+ "<extra_id_76>": 36019,
85
+ "<extra_id_77>": 36018,
86
+ "<extra_id_78>": 36017,
87
+ "<extra_id_79>": 36016,
88
+ "<extra_id_7>": 36088,
89
+ "<extra_id_80>": 36015,
90
+ "<extra_id_81>": 36014,
91
+ "<extra_id_82>": 36013,
92
+ "<extra_id_83>": 36012,
93
+ "<extra_id_84>": 36011,
94
+ "<extra_id_85>": 36010,
95
+ "<extra_id_86>": 36009,
96
+ "<extra_id_87>": 36008,
97
+ "<extra_id_88>": 36007,
98
+ "<extra_id_89>": 36006,
99
+ "<extra_id_8>": 36087,
100
+ "<extra_id_90>": 36005,
101
+ "<extra_id_91>": 36004,
102
+ "<extra_id_92>": 36003,
103
+ "<extra_id_93>": 36002,
104
+ "<extra_id_94>": 36001,
105
+ "<extra_id_95>": 36000,
106
+ "<extra_id_9>": 36086,
107
+ "Bác_Hồ": 36156,
108
+ "Chùa": 36355,
109
+ "Chú": 36264,
110
+ "Chủ": 36146,
111
+ "Chủ_ịch": 36238,
112
+ "Cả": 36351,
113
+ "Hiệ": 36168,
114
+ "Hòa": 36250,
115
+ "Hội": 36239,
116
+ "Lươ": 36099,
117
+ "MTTQ": 36237,
118
+ "Mai": 36152,
119
+ "Mộ_cộ": 36356,
120
+ "Mỹ": 36123,
121
+ "Nam_Tow": 36329,
122
+ "Nam_chố": 36122,
123
+ "Nam_ại": 36278,
124
+ "Nam_ỉ": 36286,
125
+ "Ngay": 36194,
126
+ "Ngày": 36276,
127
+ "Nhớ": 36096,
128
+ "Nhữ": 36261,
129
+ "Phầ_lớ": 36294,
130
+ "Phố": 36345,
131
+ "S_isuk": 36338,
132
+ "TPHCM": 36149,
133
+ "TPHCM.": 36179,
134
+ "TPHCM_phá": 36181,
135
+ "TPHCM_sá": 36125,
136
+ "T_o": 36208,
137
+ "T_u": 36236,
138
+ "Tha_i": 36242,
139
+ "Theo": 36180,
140
+ "Thá": 36339,
141
+ "Thái_La": 36102,
142
+ "": 36231,
143
+ "Tại": 36302,
144
+ "UBND": 36124,
145
+ "Udo": 36241,
146
+ "Udo_Tha": 36287,
147
+ "Vie": 36330,
148
+ "Việ": 36277,
149
+ "Việ_Nam": 36130,
150
+ "Việ_kiều": 36322,
151
+ "Việ_ma": 36309,
152
+ "Việ_ại": 36240,
153
+ "Vũ_Thị_Huỳ_h": 36151,
154
+ "Xuâ": 36343,
155
+ "Xuâ_Hòa": 36100,
156
+ "a_h": 36354,
157
+ "am_Tow": 36331,
158
+ "biế": 36103,
159
+ "biế_ơ": 36282,
160
+ "biể_hà": 36335,
161
+ "biểu_dươ": 36129,
162
+ "biệ": 36160,
163
+ "buô_bá": 36306,
164
+ "": 36150,
165
+ "bào": 36120,
166
+ "": 36270,
167
+ "bả_sắc": 36311,
168
+ "bậc": 36183,
169
+ "bắ_đầu": 36344,
170
+ "bằ": 36296,
171
+ "bởi": 36161,
172
+ "bức": 36353,
173
+ "chia_sẻ": 36153,
174
+ "chiế": 36252,
175
+ "chuyê_gia": 36223,
176
+ "cháu": 36281,
177
+ "chí_h": 36300,
178
+ "chú": 36262,
179
+ "chỉ": 36254,
180
+ "chợ": 36303,
181
+ "chức_vụ": 36342,
182
+ "co_gười": 36305,
183
+ "co_phố": 36333,
184
+ "co_si": 36295,
185
+ "co_xa": 36112,
186
+ "cuộc": 36199,
187
+ "cà_g": 36323,
188
+ "câu_lạc_bộ": 36210,
189
+ "": 36258,
190
+ "cả": 36188,
191
+ "cầu": 36314,
192
+ "cố": 36255,
193
+ "cộ": 36196,
194
+ "diễ_a": 36214,
195
+ "doa": 36225,
196
+ "doa_h": 36205,
197
+ "": 36312,
198
+ "dịp": 36247,
199
+ "dự": 36244,
200
+ "dự_g": 36134,
201
+ "ghiệp_hố": 36272,
202
+ "ghề": 36297,
203
+ "ghị": 36213,
204
+ "giai_đọa": 36209,
205
+ "giao_lưu": 36315,
206
+ "giả": 36108,
207
+ "giới": 36328,
208
+ "giữ": 36184,
209
+ "giữa": 36321,
210
+ "guồ_lực": 36217,
211
+ "gày": 36249,
212
+ "góp": 36132,
213
+ "": 36337,
214
+ "gười": 36111,
215
+ "gầ": 36218,
216
+ "gắ": 36334,
217
+ "gặp": 36348,
218
+ "gửi": 36118,
219
+ "h_chí": 36336,
220
+ "h_cảm": 36232,
221
+ "h_hổ": 36177,
222
+ "hau": 36349,
223
+ "hiế": 36256,
224
+ "hiế_hực": 36198,
225
+ "hiều": 36106,
226
+ "hiều_hội": 36212,
227
+ "hiều_ý": 36221,
228
+ "hiệ": 36332,
229
+ "hiệm": 36147,
230
+ "huy": 36216,
231
+ "": 36191,
232
+ "hà_h": 36193,
233
+ "hào_hơ": 36324,
234
+ "hào_ằ": 36265,
235
+ "": 36138,
236
+ "hì_h": 36307,
237
+ "hóa": 36317,
238
+ "hăm_TPHCM": 36245,
239
+ "hăm_hâ": 36203,
240
+ "": 36352,
241
+ "hươ": 36166,
242
+ "hướ": 36230,
243
+ "hấ": 36142,
244
+ "hậ": 36220,
245
+ "hằm": 36215,
246
+ "họa_độ": 36197,
247
+ "học": 36172,
248
+ "họp": 36127,
249
+ "hỏ": 36269,
250
+ "hỏi": 36110,
251
+ "hố": 36141,
252
+ "hồi": 36202,
253
+ "hớ": 36274,
254
+ "hời": 36155,
255
+ "hữ": 36097,
256
+ "hữu_ghị": 36320,
257
+ "i_ước": 36288,
258
+ "iể_hà": 36136,
259
+ "khi": 36104,
260
+ "khu": 36308,
261
+ "khu_phố": 36326,
262
+ "khá": 36251,
263
+ "khô": 36253,
264
+ "khỏa": 36169,
265
+ "kiế": 36222,
266
+ "kiều_bào": 36101,
267
+ "kiều_hối": 36207,
268
+ "kiệ": 36293,
269
+ "kỷ_iệm": 36139,
270
+ "lam_hắ": 36357,
271
+ "liê_hệ": 36178,
272
+ "luô": 36279,
273
+ "luô_dà": 36157,
274
+ "làm_việc": 36171,
275
+ "": 36176,
276
+ "lượ": 36266,
277
+ "lại": 36275,
278
+ "lấy": 36116,
279
+ "lầ": 36325,
280
+ "lập": 36211,
281
+ "lớ": 36267,
282
+ "miề": 36121,
283
+ "miề_Nam": 36263,
284
+ "mà": 36257,
285
+ "máu": 36260,
286
+ "máu_hị": 36165,
287
+ "mặ": 36128,
288
+ "mộ": 36162,
289
+ "mở": 36318,
290
+ "mừ": 36117,
291
+ "phá": 36135,
292
+ "pháp_lý": 36299,
293
+ "phầ": 36163,
294
+ "phố": 36137,
295
+ "phủ": 36285,
296
+ "quyề": 36301,
297
+ "quê": 36113,
298
+ "quả": 36192,
299
+ "quốc_gia": 36174,
300
+ "sau": 36341,
301
+ "si_h": 36154,
302
+ "sâu_đậm": 36233,
303
+ "sả_xuấ": 36206,
304
+ "số": 36170,
305
+ "sự": 36158,
306
+ "vi_h": 36243,
307
+ "viê": 36235,
308
+ "việc": 36201,
309
+ "vào": 36271,
310
+ "vù": 36175,
311
+ "vă": 36316,
312
+ "vượ": 36182,
313
+ "vấ": 36226,
314
+ "vậ_lực": 36268,
315
+ "vị": 36186,
316
+ "vừa": 36195,
317
+ "vữ": 36185,
318
+ "xây": 36133,
319
+ "xươ": 36259,
320
+ "Ô_g": 36234,
321
+ "âm_đặc": 36159,
322
+ "ê_hế": 36327,
323
+ "ì_h": 36319,
324
+ "í_h": 36289,
325
+ "í_hức": 36224,
326
+ "ă_mó": 36350,
327
+ "Đây": 36347,
328
+ "Đả_g": 36283,
329
+ "Để": 36189,
330
+ "Đồ": 36313,
331
+ "điều": 36292,
332
+ "đám": 36109,
333
+ "đám_cưới": 36107,
334
+ "đây": 36219,
335
+ "đó": 36105,
336
+ "đú": 36246,
337
+ "đạ": 36190,
338
+ "đấ_ước": 36143,
339
+ "đầu": 36204,
340
+ "đầu_àu": 36187,
341
+ "đậm": 36310,
342
+ "đẹp": 36358,
343
+ "đế": 36228,
344
+ "đề_liê": 36227,
345
+ "để": 36115,
346
+ "đị": 36229,
347
+ "địa_vị": 36298,
348
+ "đồ": 36119,
349
+ "đổi_mới": 36200,
350
+ "ơ_Chí_h": 36284,
351
+ "ước_goài": 36131,
352
+ "ước_hà": 36273,
353
+ "ước_đa": 36248,
354
+ "ấ_đô": 36304,
355
+ "ẩm_hực": 36346,
356
+ "ổ_chức": 36114,
357
+ "Ủy": 36148,
358
+ "ự_hào": 36280,
359
+ "“": 36164,
360
+ "”": 36167
361
  }
special_tokens_map.json CHANGED
@@ -1,18 +1,102 @@
1
  {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "<s>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  "eos_token": {
17
  "content": "</s>",
18
  "lstrip": false,
@@ -20,13 +104,6 @@
20
  "rstrip": false,
21
  "single_word": false
22
  },
23
- "mask_token": {
24
- "content": "<mask>",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
  "pad_token": {
31
  "content": "<pad>",
32
  "lstrip": false,
@@ -34,13 +111,6 @@
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
- "sep_token": {
38
- "content": "</s>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false
43
- },
44
  "unk_token": {
45
  "content": "<unk>",
46
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>"
99
+ ],
100
  "eos_token": {
101
  "content": "</s>",
102
  "lstrip": false,
 
104
  "rstrip": false,
105
  "single_word": false
106
  },
 
 
 
 
 
 
 
107
  "pad_token": {
108
  "content": "<pad>",
109
  "lstrip": false,
 
111
  "rstrip": false,
112
  "single_word": false
113
  },
 
 
 
 
 
 
 
114
  "unk_token": {
115
  "content": "<unk>",
116
  "lstrip": false,
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59986b62f9f0b90edafb9b073ea7b93d21114a5841219a1ea2399ade73f729c6
3
+ size 820370
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff