Samuael committed on
Commit
04d7243
·
verified ·
1 Parent(s): ca56b17

Upload tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +213 -6
tokenizer_config.json CHANGED
@@ -33,7 +33,7 @@
33
  "special": true
34
  },
35
  "20001": {
36
- "content": "am_CR",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
@@ -41,7 +41,191 @@
41
  "special": true
42
  },
43
  "20002": {
44
- "content": "am_IC",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
@@ -50,8 +234,31 @@
50
  }
51
  },
52
  "additional_special_tokens": [
53
- "am_CR",
54
- "am_IC"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  ],
56
  "bos_token": "<s>",
57
  "clean_up_tokenization_spaces": true,
@@ -62,8 +269,8 @@
62
  "pad_token": "<pad>",
63
  "sep_token": "</s>",
64
  "sp_model_kwargs": {},
65
- "src_lang": "am_IC",
66
- "tgt_lang": "am_CR",
67
  "tokenizer_class": "MBartTokenizer",
68
  "tokenizer_file": null,
69
  "unk_token": "<unk>"
 
33
  "special": true
34
  },
35
  "20001": {
36
+ "content": "ar_AR",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
 
41
  "special": true
42
  },
43
  "20002": {
44
+ "content": "cs_CZ",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "20003": {
52
+ "content": "de_DE",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "20004": {
60
+ "content": "en_XX",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "20005": {
68
+ "content": "es_XX",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "20006": {
76
+ "content": "et_EE",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "20007": {
84
+ "content": "fi_FI",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "20008": {
92
+ "content": "fr_XX",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "20009": {
100
+ "content": "gu_IN",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "20010": {
108
+ "content": "hi_IN",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "20011": {
116
+ "content": "it_IT",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "20012": {
124
+ "content": "ja_XX",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "20013": {
132
+ "content": "kk_KZ",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "20014": {
140
+ "content": "ko_KR",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "20015": {
148
+ "content": "lt_LT",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "20016": {
156
+ "content": "lv_LV",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "20017": {
164
+ "content": "my_MM",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "20018": {
172
+ "content": "ne_NP",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "20019": {
180
+ "content": "nl_XX",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "20020": {
188
+ "content": "ro_RO",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "20021": {
196
+ "content": "ru_RU",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "20022": {
204
+ "content": "si_LK",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "20023": {
212
+ "content": "tr_TR",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "20024": {
220
+ "content": "vi_VN",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "20025": {
228
+ "content": "zh_CN",
229
  "lstrip": false,
230
  "normalized": false,
231
  "rstrip": false,
 
234
  }
235
  },
236
  "additional_special_tokens": [
237
+ "ar_AR",
238
+ "cs_CZ",
239
+ "de_DE",
240
+ "en_XX",
241
+ "es_XX",
242
+ "et_EE",
243
+ "fi_FI",
244
+ "fr_XX",
245
+ "gu_IN",
246
+ "hi_IN",
247
+ "it_IT",
248
+ "ja_XX",
249
+ "kk_KZ",
250
+ "ko_KR",
251
+ "lt_LT",
252
+ "lv_LV",
253
+ "my_MM",
254
+ "ne_NP",
255
+ "nl_XX",
256
+ "ro_RO",
257
+ "ru_RU",
258
+ "si_LK",
259
+ "tr_TR",
260
+ "vi_VN",
261
+ "zh_CN"
262
  ],
263
  "bos_token": "<s>",
264
  "clean_up_tokenization_spaces": true,
 
269
  "pad_token": "<pad>",
270
  "sep_token": "</s>",
271
  "sp_model_kwargs": {},
272
+ "src_lang": "en_XX",
273
+ "tgt_lang": null,
274
  "tokenizer_class": "MBartTokenizer",
275
  "tokenizer_file": null,
276
  "unk_token": "<unk>"