Upload tokenizer

#3
by ArthurZ HF Staff - opened
Files changed (4) hide show
  1. added_tokens.json +209 -0
  2. special_tokens_map.json +215 -50
  3. tokenizer.json +2 -2
  4. tokenizer_config.json +209 -2
added_tokens.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</s>": 2,
3
+ "<mask>": 256203,
4
+ "<pad>": 1,
5
+ "<s>": 0,
6
+ "<unk>": 3,
7
+ "ace_Arab": 256001,
8
+ "ace_Latn": 256002,
9
+ "acm_Arab": 256003,
10
+ "acq_Arab": 256004,
11
+ "aeb_Arab": 256005,
12
+ "afr_Latn": 256006,
13
+ "ajp_Arab": 256007,
14
+ "aka_Latn": 256008,
15
+ "als_Latn": 256162,
16
+ "amh_Ethi": 256009,
17
+ "apc_Arab": 256010,
18
+ "arb_Arab": 256011,
19
+ "ars_Arab": 256012,
20
+ "ary_Arab": 256013,
21
+ "arz_Arab": 256014,
22
+ "asm_Beng": 256015,
23
+ "ast_Latn": 256016,
24
+ "awa_Deva": 256017,
25
+ "ayr_Latn": 256018,
26
+ "azb_Arab": 256019,
27
+ "azj_Latn": 256020,
28
+ "bak_Cyrl": 256021,
29
+ "bam_Latn": 256022,
30
+ "ban_Latn": 256023,
31
+ "bel_Cyrl": 256024,
32
+ "bem_Latn": 256025,
33
+ "ben_Beng": 256026,
34
+ "bho_Deva": 256027,
35
+ "bjn_Arab": 256028,
36
+ "bjn_Latn": 256029,
37
+ "bod_Tibt": 256030,
38
+ "bos_Latn": 256031,
39
+ "bug_Latn": 256032,
40
+ "bul_Cyrl": 256033,
41
+ "cat_Latn": 256034,
42
+ "ceb_Latn": 256035,
43
+ "ces_Latn": 256036,
44
+ "cjk_Latn": 256037,
45
+ "ckb_Arab": 256038,
46
+ "crh_Latn": 256039,
47
+ "cym_Latn": 256040,
48
+ "dan_Latn": 256041,
49
+ "deu_Latn": 256042,
50
+ "dik_Latn": 256043,
51
+ "dyu_Latn": 256044,
52
+ "dzo_Tibt": 256045,
53
+ "ell_Grek": 256046,
54
+ "eng_Latn": 256047,
55
+ "epo_Latn": 256048,
56
+ "est_Latn": 256049,
57
+ "eus_Latn": 256050,
58
+ "ewe_Latn": 256051,
59
+ "fao_Latn": 256052,
60
+ "fij_Latn": 256054,
61
+ "fin_Latn": 256055,
62
+ "fon_Latn": 256056,
63
+ "fra_Latn": 256057,
64
+ "fur_Latn": 256058,
65
+ "fuv_Latn": 256059,
66
+ "gaz_Latn": 256135,
67
+ "gla_Latn": 256060,
68
+ "gle_Latn": 256061,
69
+ "glg_Latn": 256062,
70
+ "grn_Latn": 256063,
71
+ "guj_Gujr": 256064,
72
+ "hat_Latn": 256065,
73
+ "hau_Latn": 256066,
74
+ "heb_Hebr": 256067,
75
+ "hin_Deva": 256068,
76
+ "hne_Deva": 256069,
77
+ "hrv_Latn": 256070,
78
+ "hun_Latn": 256071,
79
+ "hye_Armn": 256072,
80
+ "ibo_Latn": 256073,
81
+ "ilo_Latn": 256074,
82
+ "ind_Latn": 256075,
83
+ "isl_Latn": 256076,
84
+ "ita_Latn": 256077,
85
+ "jav_Latn": 256078,
86
+ "jpn_Jpan": 256079,
87
+ "kab_Latn": 256080,
88
+ "kac_Latn": 256081,
89
+ "kam_Latn": 256082,
90
+ "kan_Knda": 256083,
91
+ "kas_Arab": 256084,
92
+ "kas_Deva": 256085,
93
+ "kat_Geor": 256086,
94
+ "kaz_Cyrl": 256089,
95
+ "kbp_Latn": 256090,
96
+ "kea_Latn": 256091,
97
+ "khk_Cyrl": 256122,
98
+ "khm_Khmr": 256092,
99
+ "kik_Latn": 256093,
100
+ "kin_Latn": 256094,
101
+ "kir_Cyrl": 256095,
102
+ "kmb_Latn": 256096,
103
+ "kmr_Latn": 256099,
104
+ "knc_Arab": 256087,
105
+ "knc_Latn": 256088,
106
+ "kon_Latn": 256097,
107
+ "kor_Hang": 256098,
108
+ "lao_Laoo": 256100,
109
+ "lij_Latn": 256102,
110
+ "lim_Latn": 256103,
111
+ "lin_Latn": 256104,
112
+ "lit_Latn": 256105,
113
+ "lmo_Latn": 256106,
114
+ "ltg_Latn": 256107,
115
+ "ltz_Latn": 256108,
116
+ "lua_Latn": 256109,
117
+ "lug_Latn": 256110,
118
+ "luo_Latn": 256111,
119
+ "lus_Latn": 256112,
120
+ "lvs_Latn": 256101,
121
+ "mag_Deva": 256113,
122
+ "mai_Deva": 256114,
123
+ "mal_Mlym": 256115,
124
+ "mar_Deva": 256116,
125
+ "min_Latn": 256117,
126
+ "mkd_Cyrl": 256118,
127
+ "mlt_Latn": 256120,
128
+ "mni_Beng": 256121,
129
+ "mos_Latn": 256123,
130
+ "mri_Latn": 256124,
131
+ "mya_Mymr": 256126,
132
+ "nld_Latn": 256127,
133
+ "nno_Latn": 256128,
134
+ "nob_Latn": 256129,
135
+ "npi_Deva": 256130,
136
+ "nso_Latn": 256131,
137
+ "nus_Latn": 256132,
138
+ "nya_Latn": 256133,
139
+ "oci_Latn": 256134,
140
+ "ory_Orya": 256136,
141
+ "pag_Latn": 256137,
142
+ "pan_Guru": 256138,
143
+ "pap_Latn": 256139,
144
+ "pbt_Arab": 256143,
145
+ "pes_Arab": 256053,
146
+ "plt_Latn": 256119,
147
+ "pol_Latn": 256140,
148
+ "por_Latn": 256141,
149
+ "prs_Arab": 256142,
150
+ "quy_Latn": 256144,
151
+ "ron_Latn": 256145,
152
+ "run_Latn": 256146,
153
+ "rus_Cyrl": 256147,
154
+ "sag_Latn": 256148,
155
+ "san_Deva": 256149,
156
+ "sat_Beng": 256150,
157
+ "scn_Latn": 256151,
158
+ "shn_Mymr": 256152,
159
+ "sin_Sinh": 256153,
160
+ "slk_Latn": 256154,
161
+ "slv_Latn": 256155,
162
+ "smo_Latn": 256156,
163
+ "sna_Latn": 256157,
164
+ "snd_Arab": 256158,
165
+ "som_Latn": 256159,
166
+ "sot_Latn": 256160,
167
+ "spa_Latn": 256161,
168
+ "srd_Latn": 256163,
169
+ "srp_Cyrl": 256164,
170
+ "ssw_Latn": 256165,
171
+ "sun_Latn": 256166,
172
+ "swe_Latn": 256167,
173
+ "swh_Latn": 256168,
174
+ "szl_Latn": 256169,
175
+ "tam_Taml": 256170,
176
+ "taq_Latn": 256177,
177
+ "taq_Tfng": 256178,
178
+ "tat_Cyrl": 256171,
179
+ "tel_Telu": 256172,
180
+ "tgk_Cyrl": 256173,
181
+ "tgl_Latn": 256174,
182
+ "tha_Thai": 256175,
183
+ "tir_Ethi": 256176,
184
+ "tpi_Latn": 256179,
185
+ "tsn_Latn": 256180,
186
+ "tso_Latn": 256181,
187
+ "tuk_Latn": 256182,
188
+ "tum_Latn": 256183,
189
+ "tur_Latn": 256184,
190
+ "twi_Latn": 256185,
191
+ "tzm_Tfng": 256186,
192
+ "uig_Arab": 256187,
193
+ "ukr_Cyrl": 256188,
194
+ "umb_Latn": 256189,
195
+ "urd_Arab": 256190,
196
+ "uzn_Latn": 256191,
197
+ "vec_Latn": 256192,
198
+ "vie_Latn": 256193,
199
+ "war_Latn": 256194,
200
+ "wol_Latn": 256195,
201
+ "xho_Latn": 256196,
202
+ "ydd_Hebr": 256197,
203
+ "yor_Latn": 256198,
204
+ "yue_Hant": 256199,
205
+ "zho_Hans": 256200,
206
+ "zho_Hant": 256201,
207
+ "zsm_Latn": 256125,
208
+ "zul_Latn": 256202
209
+ }
special_tokens_map.json CHANGED
@@ -201,55 +201,220 @@
201
  "yue_Hant",
202
  "zho_Hans",
203
  "zho_Hant",
204
- "zul_Latn"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  ],
206
- "bos_token": {
207
- "content": "<s>",
208
- "lstrip": false,
209
- "normalized": false,
210
- "rstrip": false,
211
- "single_word": false
212
- },
213
- "cls_token": {
214
- "content": "<s>",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": false,
218
- "single_word": false
219
- },
220
- "eos_token": {
221
- "content": "</s>",
222
- "lstrip": false,
223
- "normalized": false,
224
- "rstrip": false,
225
- "single_word": false
226
- },
227
- "mask_token": {
228
- "content": "<mask>",
229
- "lstrip": true,
230
- "normalized": true,
231
- "rstrip": false,
232
- "single_word": false
233
- },
234
- "pad_token": {
235
- "content": "<pad>",
236
- "lstrip": false,
237
- "normalized": false,
238
- "rstrip": false,
239
- "single_word": false
240
- },
241
- "sep_token": {
242
- "content": "</s>",
243
- "lstrip": false,
244
- "normalized": false,
245
- "rstrip": false,
246
- "single_word": false
247
- },
248
- "unk_token": {
249
- "content": "<unk>",
250
- "lstrip": false,
251
- "normalized": false,
252
- "rstrip": false,
253
- "single_word": false
254
- }
255
  }
 
201
  "yue_Hant",
202
  "zho_Hans",
203
  "zho_Hant",
204
+ "zul_Latn",
205
+ "<s>",
206
+ "<pad>",
207
+ "</s>",
208
+ "<unk>",
209
+ "ace_Arab",
210
+ "ace_Latn",
211
+ "acm_Arab",
212
+ "acq_Arab",
213
+ "aeb_Arab",
214
+ "afr_Latn",
215
+ "ajp_Arab",
216
+ "aka_Latn",
217
+ "amh_Ethi",
218
+ "apc_Arab",
219
+ "arb_Arab",
220
+ "ars_Arab",
221
+ "ary_Arab",
222
+ "arz_Arab",
223
+ "asm_Beng",
224
+ "ast_Latn",
225
+ "awa_Deva",
226
+ "ayr_Latn",
227
+ "azb_Arab",
228
+ "azj_Latn",
229
+ "bak_Cyrl",
230
+ "bam_Latn",
231
+ "ban_Latn",
232
+ "bel_Cyrl",
233
+ "bem_Latn",
234
+ "ben_Beng",
235
+ "bho_Deva",
236
+ "bjn_Arab",
237
+ "bjn_Latn",
238
+ "bod_Tibt",
239
+ "bos_Latn",
240
+ "bug_Latn",
241
+ "bul_Cyrl",
242
+ "cat_Latn",
243
+ "ceb_Latn",
244
+ "ces_Latn",
245
+ "cjk_Latn",
246
+ "ckb_Arab",
247
+ "crh_Latn",
248
+ "cym_Latn",
249
+ "dan_Latn",
250
+ "deu_Latn",
251
+ "dik_Latn",
252
+ "dyu_Latn",
253
+ "dzo_Tibt",
254
+ "ell_Grek",
255
+ "eng_Latn",
256
+ "epo_Latn",
257
+ "est_Latn",
258
+ "eus_Latn",
259
+ "ewe_Latn",
260
+ "fao_Latn",
261
+ "pes_Arab",
262
+ "fij_Latn",
263
+ "fin_Latn",
264
+ "fon_Latn",
265
+ "fra_Latn",
266
+ "fur_Latn",
267
+ "fuv_Latn",
268
+ "gla_Latn",
269
+ "gle_Latn",
270
+ "glg_Latn",
271
+ "grn_Latn",
272
+ "guj_Gujr",
273
+ "hat_Latn",
274
+ "hau_Latn",
275
+ "heb_Hebr",
276
+ "hin_Deva",
277
+ "hne_Deva",
278
+ "hrv_Latn",
279
+ "hun_Latn",
280
+ "hye_Armn",
281
+ "ibo_Latn",
282
+ "ilo_Latn",
283
+ "ind_Latn",
284
+ "isl_Latn",
285
+ "ita_Latn",
286
+ "jav_Latn",
287
+ "jpn_Jpan",
288
+ "kab_Latn",
289
+ "kac_Latn",
290
+ "kam_Latn",
291
+ "kan_Knda",
292
+ "kas_Arab",
293
+ "kas_Deva",
294
+ "kat_Geor",
295
+ "knc_Arab",
296
+ "knc_Latn",
297
+ "kaz_Cyrl",
298
+ "kbp_Latn",
299
+ "kea_Latn",
300
+ "khm_Khmr",
301
+ "kik_Latn",
302
+ "kin_Latn",
303
+ "kir_Cyrl",
304
+ "kmb_Latn",
305
+ "kon_Latn",
306
+ "kor_Hang",
307
+ "kmr_Latn",
308
+ "lao_Laoo",
309
+ "lvs_Latn",
310
+ "lij_Latn",
311
+ "lim_Latn",
312
+ "lin_Latn",
313
+ "lit_Latn",
314
+ "lmo_Latn",
315
+ "ltg_Latn",
316
+ "ltz_Latn",
317
+ "lua_Latn",
318
+ "lug_Latn",
319
+ "luo_Latn",
320
+ "lus_Latn",
321
+ "mag_Deva",
322
+ "mai_Deva",
323
+ "mal_Mlym",
324
+ "mar_Deva",
325
+ "min_Latn",
326
+ "mkd_Cyrl",
327
+ "plt_Latn",
328
+ "mlt_Latn",
329
+ "mni_Beng",
330
+ "khk_Cyrl",
331
+ "mos_Latn",
332
+ "mri_Latn",
333
+ "zsm_Latn",
334
+ "mya_Mymr",
335
+ "nld_Latn",
336
+ "nno_Latn",
337
+ "nob_Latn",
338
+ "npi_Deva",
339
+ "nso_Latn",
340
+ "nus_Latn",
341
+ "nya_Latn",
342
+ "oci_Latn",
343
+ "gaz_Latn",
344
+ "ory_Orya",
345
+ "pag_Latn",
346
+ "pan_Guru",
347
+ "pap_Latn",
348
+ "pol_Latn",
349
+ "por_Latn",
350
+ "prs_Arab",
351
+ "pbt_Arab",
352
+ "quy_Latn",
353
+ "ron_Latn",
354
+ "run_Latn",
355
+ "rus_Cyrl",
356
+ "sag_Latn",
357
+ "san_Deva",
358
+ "sat_Beng",
359
+ "scn_Latn",
360
+ "shn_Mymr",
361
+ "sin_Sinh",
362
+ "slk_Latn",
363
+ "slv_Latn",
364
+ "smo_Latn",
365
+ "sna_Latn",
366
+ "snd_Arab",
367
+ "som_Latn",
368
+ "sot_Latn",
369
+ "spa_Latn",
370
+ "als_Latn",
371
+ "srd_Latn",
372
+ "srp_Cyrl",
373
+ "ssw_Latn",
374
+ "sun_Latn",
375
+ "swe_Latn",
376
+ "swh_Latn",
377
+ "szl_Latn",
378
+ "tam_Taml",
379
+ "tat_Cyrl",
380
+ "tel_Telu",
381
+ "tgk_Cyrl",
382
+ "tgl_Latn",
383
+ "tha_Thai",
384
+ "tir_Ethi",
385
+ "taq_Latn",
386
+ "taq_Tfng",
387
+ "tpi_Latn",
388
+ "tsn_Latn",
389
+ "tso_Latn",
390
+ "tuk_Latn",
391
+ "tum_Latn",
392
+ "tur_Latn",
393
+ "twi_Latn",
394
+ "tzm_Tfng",
395
+ "uig_Arab",
396
+ "ukr_Cyrl",
397
+ "umb_Latn",
398
+ "urd_Arab",
399
+ "uzn_Latn",
400
+ "vec_Latn",
401
+ "vie_Latn",
402
+ "war_Latn",
403
+ "wol_Latn",
404
+ "xho_Latn",
405
+ "ydd_Hebr",
406
+ "yor_Latn",
407
+ "yue_Hant",
408
+ "zho_Hans",
409
+ "zho_Hant",
410
+ "zul_Latn",
411
+ "<mask>"
412
  ],
413
+ "bos_token": "<s>",
414
+ "cls_token": "<s>",
415
+ "eos_token": "</s>",
416
+ "mask_token": "<mask>",
417
+ "pad_token": "<pad>",
418
+ "sep_token": "</s>",
419
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:610501fe8857739dbb451ab69a0a795cb87dadcf8873d7e2227764d165e72e72
3
- size 17331379
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9633f7d33a10432bf06e8865c3c7d4e4798ed432be7bbc7a4a29051a7f5e594
3
+ size 17331380
tokenizer_config.json CHANGED
@@ -1651,7 +1651,7 @@
1651
  "256203": {
1652
  "content": "<mask>",
1653
  "lstrip": true,
1654
- "normalized": true,
1655
  "rstrip": false,
1656
  "single_word": false,
1657
  "special": true
@@ -1859,7 +1859,214 @@
1859
  "yue_Hant",
1860
  "zho_Hans",
1861
  "zho_Hant",
1862
- "zul_Latn"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1863
  ],
1864
  "bos_token": "<s>",
1865
  "clean_up_tokenization_spaces": true,
 
1651
  "256203": {
1652
  "content": "<mask>",
1653
  "lstrip": true,
1654
+ "normalized": false,
1655
  "rstrip": false,
1656
  "single_word": false,
1657
  "special": true
 
1859
  "yue_Hant",
1860
  "zho_Hans",
1861
  "zho_Hant",
1862
+ "zul_Latn",
1863
+ "<s>",
1864
+ "<pad>",
1865
+ "</s>",
1866
+ "<unk>",
1867
+ "ace_Arab",
1868
+ "ace_Latn",
1869
+ "acm_Arab",
1870
+ "acq_Arab",
1871
+ "aeb_Arab",
1872
+ "afr_Latn",
1873
+ "ajp_Arab",
1874
+ "aka_Latn",
1875
+ "amh_Ethi",
1876
+ "apc_Arab",
1877
+ "arb_Arab",
1878
+ "ars_Arab",
1879
+ "ary_Arab",
1880
+ "arz_Arab",
1881
+ "asm_Beng",
1882
+ "ast_Latn",
1883
+ "awa_Deva",
1884
+ "ayr_Latn",
1885
+ "azb_Arab",
1886
+ "azj_Latn",
1887
+ "bak_Cyrl",
1888
+ "bam_Latn",
1889
+ "ban_Latn",
1890
+ "bel_Cyrl",
1891
+ "bem_Latn",
1892
+ "ben_Beng",
1893
+ "bho_Deva",
1894
+ "bjn_Arab",
1895
+ "bjn_Latn",
1896
+ "bod_Tibt",
1897
+ "bos_Latn",
1898
+ "bug_Latn",
1899
+ "bul_Cyrl",
1900
+ "cat_Latn",
1901
+ "ceb_Latn",
1902
+ "ces_Latn",
1903
+ "cjk_Latn",
1904
+ "ckb_Arab",
1905
+ "crh_Latn",
1906
+ "cym_Latn",
1907
+ "dan_Latn",
1908
+ "deu_Latn",
1909
+ "dik_Latn",
1910
+ "dyu_Latn",
1911
+ "dzo_Tibt",
1912
+ "ell_Grek",
1913
+ "eng_Latn",
1914
+ "epo_Latn",
1915
+ "est_Latn",
1916
+ "eus_Latn",
1917
+ "ewe_Latn",
1918
+ "fao_Latn",
1919
+ "pes_Arab",
1920
+ "fij_Latn",
1921
+ "fin_Latn",
1922
+ "fon_Latn",
1923
+ "fra_Latn",
1924
+ "fur_Latn",
1925
+ "fuv_Latn",
1926
+ "gla_Latn",
1927
+ "gle_Latn",
1928
+ "glg_Latn",
1929
+ "grn_Latn",
1930
+ "guj_Gujr",
1931
+ "hat_Latn",
1932
+ "hau_Latn",
1933
+ "heb_Hebr",
1934
+ "hin_Deva",
1935
+ "hne_Deva",
1936
+ "hrv_Latn",
1937
+ "hun_Latn",
1938
+ "hye_Armn",
1939
+ "ibo_Latn",
1940
+ "ilo_Latn",
1941
+ "ind_Latn",
1942
+ "isl_Latn",
1943
+ "ita_Latn",
1944
+ "jav_Latn",
1945
+ "jpn_Jpan",
1946
+ "kab_Latn",
1947
+ "kac_Latn",
1948
+ "kam_Latn",
1949
+ "kan_Knda",
1950
+ "kas_Arab",
1951
+ "kas_Deva",
1952
+ "kat_Geor",
1953
+ "knc_Arab",
1954
+ "knc_Latn",
1955
+ "kaz_Cyrl",
1956
+ "kbp_Latn",
1957
+ "kea_Latn",
1958
+ "khm_Khmr",
1959
+ "kik_Latn",
1960
+ "kin_Latn",
1961
+ "kir_Cyrl",
1962
+ "kmb_Latn",
1963
+ "kon_Latn",
1964
+ "kor_Hang",
1965
+ "kmr_Latn",
1966
+ "lao_Laoo",
1967
+ "lvs_Latn",
1968
+ "lij_Latn",
1969
+ "lim_Latn",
1970
+ "lin_Latn",
1971
+ "lit_Latn",
1972
+ "lmo_Latn",
1973
+ "ltg_Latn",
1974
+ "ltz_Latn",
1975
+ "lua_Latn",
1976
+ "lug_Latn",
1977
+ "luo_Latn",
1978
+ "lus_Latn",
1979
+ "mag_Deva",
1980
+ "mai_Deva",
1981
+ "mal_Mlym",
1982
+ "mar_Deva",
1983
+ "min_Latn",
1984
+ "mkd_Cyrl",
1985
+ "plt_Latn",
1986
+ "mlt_Latn",
1987
+ "mni_Beng",
1988
+ "khk_Cyrl",
1989
+ "mos_Latn",
1990
+ "mri_Latn",
1991
+ "zsm_Latn",
1992
+ "mya_Mymr",
1993
+ "nld_Latn",
1994
+ "nno_Latn",
1995
+ "nob_Latn",
1996
+ "npi_Deva",
1997
+ "nso_Latn",
1998
+ "nus_Latn",
1999
+ "nya_Latn",
2000
+ "oci_Latn",
2001
+ "gaz_Latn",
2002
+ "ory_Orya",
2003
+ "pag_Latn",
2004
+ "pan_Guru",
2005
+ "pap_Latn",
2006
+ "pol_Latn",
2007
+ "por_Latn",
2008
+ "prs_Arab",
2009
+ "pbt_Arab",
2010
+ "quy_Latn",
2011
+ "ron_Latn",
2012
+ "run_Latn",
2013
+ "rus_Cyrl",
2014
+ "sag_Latn",
2015
+ "san_Deva",
2016
+ "sat_Beng",
2017
+ "scn_Latn",
2018
+ "shn_Mymr",
2019
+ "sin_Sinh",
2020
+ "slk_Latn",
2021
+ "slv_Latn",
2022
+ "smo_Latn",
2023
+ "sna_Latn",
2024
+ "snd_Arab",
2025
+ "som_Latn",
2026
+ "sot_Latn",
2027
+ "spa_Latn",
2028
+ "als_Latn",
2029
+ "srd_Latn",
2030
+ "srp_Cyrl",
2031
+ "ssw_Latn",
2032
+ "sun_Latn",
2033
+ "swe_Latn",
2034
+ "swh_Latn",
2035
+ "szl_Latn",
2036
+ "tam_Taml",
2037
+ "tat_Cyrl",
2038
+ "tel_Telu",
2039
+ "tgk_Cyrl",
2040
+ "tgl_Latn",
2041
+ "tha_Thai",
2042
+ "tir_Ethi",
2043
+ "taq_Latn",
2044
+ "taq_Tfng",
2045
+ "tpi_Latn",
2046
+ "tsn_Latn",
2047
+ "tso_Latn",
2048
+ "tuk_Latn",
2049
+ "tum_Latn",
2050
+ "tur_Latn",
2051
+ "twi_Latn",
2052
+ "tzm_Tfng",
2053
+ "uig_Arab",
2054
+ "ukr_Cyrl",
2055
+ "umb_Latn",
2056
+ "urd_Arab",
2057
+ "uzn_Latn",
2058
+ "vec_Latn",
2059
+ "vie_Latn",
2060
+ "war_Latn",
2061
+ "wol_Latn",
2062
+ "xho_Latn",
2063
+ "ydd_Hebr",
2064
+ "yor_Latn",
2065
+ "yue_Hant",
2066
+ "zho_Hans",
2067
+ "zho_Hant",
2068
+ "zul_Latn",
2069
+ "<mask>"
2070
  ],
2071
  "bos_token": "<s>",
2072
  "clean_up_tokenization_spaces": true,