Samuael committed on
Commit
109a4f1
·
verified ·
1 Parent(s): 8ead7ab

Upload tokenizer

Browse files
sentencepiece.bpe.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34727325a68a6a1e8580bf1e33934313f5f232bc92448d5ff77a5a850f25dbaa
3
- size 240462
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc579c42f174c74d5a56bc4a02e6eef7759711b9415d2fd1bf4810683bdb69c5
3
+ size 240450
special_tokens_map.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "additional_special_tokens": [
3
- "",
4
- "ar_AR"
5
  ],
6
  "bos_token": "<s>",
7
  "cls_token": "<s>",
 
1
  {
2
  "additional_special_tokens": [
3
+ "ar_AR",
4
+ "cs_CZ"
5
  ],
6
  "bos_token": "<s>",
7
  "cls_token": "<s>",
tokenizer_config.json CHANGED
@@ -32,7 +32,7 @@
32
  "single_word": false,
33
  "special": true
34
  },
35
- "235": {
36
  "content": "ar_AR",
37
  "lstrip": false,
38
  "normalized": false,
@@ -40,7 +40,7 @@
40
  "single_word": false,
41
  "special": true
42
  },
43
- "236": {
44
  "content": "cs_CZ",
45
  "lstrip": false,
46
  "normalized": false,
@@ -48,7 +48,7 @@
48
  "single_word": false,
49
  "special": true
50
  },
51
- "237": {
52
  "content": "de_DE",
53
  "lstrip": false,
54
  "normalized": false,
@@ -56,7 +56,7 @@
56
  "single_word": false,
57
  "special": true
58
  },
59
- "238": {
60
  "content": "en_XX",
61
  "lstrip": false,
62
  "normalized": false,
@@ -64,7 +64,7 @@
64
  "single_word": false,
65
  "special": true
66
  },
67
- "239": {
68
  "content": "es_XX",
69
  "lstrip": false,
70
  "normalized": false,
@@ -72,7 +72,7 @@
72
  "single_word": false,
73
  "special": true
74
  },
75
- "240": {
76
  "content": "et_EE",
77
  "lstrip": false,
78
  "normalized": false,
@@ -80,7 +80,7 @@
80
  "single_word": false,
81
  "special": true
82
  },
83
- "241": {
84
  "content": "fi_FI",
85
  "lstrip": false,
86
  "normalized": false,
@@ -88,7 +88,7 @@
88
  "single_word": false,
89
  "special": true
90
  },
91
- "242": {
92
  "content": "fr_XX",
93
  "lstrip": false,
94
  "normalized": false,
@@ -96,7 +96,7 @@
96
  "single_word": false,
97
  "special": true
98
  },
99
- "243": {
100
  "content": "gu_IN",
101
  "lstrip": false,
102
  "normalized": false,
@@ -104,7 +104,7 @@
104
  "single_word": false,
105
  "special": true
106
  },
107
- "244": {
108
  "content": "hi_IN",
109
  "lstrip": false,
110
  "normalized": false,
@@ -112,7 +112,7 @@
112
  "single_word": false,
113
  "special": true
114
  },
115
- "245": {
116
  "content": "it_IT",
117
  "lstrip": false,
118
  "normalized": false,
@@ -120,7 +120,7 @@
120
  "single_word": false,
121
  "special": true
122
  },
123
- "246": {
124
  "content": "ja_XX",
125
  "lstrip": false,
126
  "normalized": false,
@@ -128,7 +128,7 @@
128
  "single_word": false,
129
  "special": true
130
  },
131
- "247": {
132
  "content": "kk_KZ",
133
  "lstrip": false,
134
  "normalized": false,
@@ -136,7 +136,7 @@
136
  "single_word": false,
137
  "special": true
138
  },
139
- "248": {
140
  "content": "ko_KR",
141
  "lstrip": false,
142
  "normalized": false,
@@ -144,7 +144,7 @@
144
  "single_word": false,
145
  "special": true
146
  },
147
- "249": {
148
  "content": "lt_LT",
149
  "lstrip": false,
150
  "normalized": false,
@@ -152,7 +152,7 @@
152
  "single_word": false,
153
  "special": true
154
  },
155
- "250": {
156
  "content": "lv_LV",
157
  "lstrip": false,
158
  "normalized": false,
@@ -160,7 +160,7 @@
160
  "single_word": false,
161
  "special": true
162
  },
163
- "251": {
164
  "content": "my_MM",
165
  "lstrip": false,
166
  "normalized": false,
@@ -168,7 +168,7 @@
168
  "single_word": false,
169
  "special": true
170
  },
171
- "252": {
172
  "content": "ne_NP",
173
  "lstrip": false,
174
  "normalized": false,
@@ -176,7 +176,7 @@
176
  "single_word": false,
177
  "special": true
178
  },
179
- "253": {
180
  "content": "nl_XX",
181
  "lstrip": false,
182
  "normalized": false,
@@ -184,7 +184,7 @@
184
  "single_word": false,
185
  "special": true
186
  },
187
- "254": {
188
  "content": "ro_RO",
189
  "lstrip": false,
190
  "normalized": false,
@@ -192,7 +192,7 @@
192
  "single_word": false,
193
  "special": true
194
  },
195
- "255": {
196
  "content": "ru_RU",
197
  "lstrip": false,
198
  "normalized": false,
@@ -200,7 +200,7 @@
200
  "single_word": false,
201
  "special": true
202
  },
203
- "256": {
204
  "content": "si_LK",
205
  "lstrip": false,
206
  "normalized": false,
@@ -208,7 +208,7 @@
208
  "single_word": false,
209
  "special": true
210
  },
211
- "257": {
212
  "content": "tr_TR",
213
  "lstrip": false,
214
  "normalized": false,
@@ -216,7 +216,7 @@
216
  "single_word": false,
217
  "special": true
218
  },
219
- "258": {
220
  "content": "vi_VN",
221
  "lstrip": false,
222
  "normalized": false,
@@ -224,7 +224,7 @@
224
  "single_word": false,
225
  "special": true
226
  },
227
- "259": {
228
  "content": "zh_CN",
229
  "lstrip": false,
230
  "normalized": false,
@@ -234,8 +234,8 @@
234
  }
235
  },
236
  "additional_special_tokens": [
237
- "",
238
- "ar_AR"
239
  ],
240
  "bos_token": "<s>",
241
  "clean_up_tokenization_spaces": true,
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "234": {
36
  "content": "ar_AR",
37
  "lstrip": false,
38
  "normalized": false,
 
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "235": {
44
  "content": "cs_CZ",
45
  "lstrip": false,
46
  "normalized": false,
 
48
  "single_word": false,
49
  "special": true
50
  },
51
+ "236": {
52
  "content": "de_DE",
53
  "lstrip": false,
54
  "normalized": false,
 
56
  "single_word": false,
57
  "special": true
58
  },
59
+ "237": {
60
  "content": "en_XX",
61
  "lstrip": false,
62
  "normalized": false,
 
64
  "single_word": false,
65
  "special": true
66
  },
67
+ "238": {
68
  "content": "es_XX",
69
  "lstrip": false,
70
  "normalized": false,
 
72
  "single_word": false,
73
  "special": true
74
  },
75
+ "239": {
76
  "content": "et_EE",
77
  "lstrip": false,
78
  "normalized": false,
 
80
  "single_word": false,
81
  "special": true
82
  },
83
+ "240": {
84
  "content": "fi_FI",
85
  "lstrip": false,
86
  "normalized": false,
 
88
  "single_word": false,
89
  "special": true
90
  },
91
+ "241": {
92
  "content": "fr_XX",
93
  "lstrip": false,
94
  "normalized": false,
 
96
  "single_word": false,
97
  "special": true
98
  },
99
+ "242": {
100
  "content": "gu_IN",
101
  "lstrip": false,
102
  "normalized": false,
 
104
  "single_word": false,
105
  "special": true
106
  },
107
+ "243": {
108
  "content": "hi_IN",
109
  "lstrip": false,
110
  "normalized": false,
 
112
  "single_word": false,
113
  "special": true
114
  },
115
+ "244": {
116
  "content": "it_IT",
117
  "lstrip": false,
118
  "normalized": false,
 
120
  "single_word": false,
121
  "special": true
122
  },
123
+ "245": {
124
  "content": "ja_XX",
125
  "lstrip": false,
126
  "normalized": false,
 
128
  "single_word": false,
129
  "special": true
130
  },
131
+ "246": {
132
  "content": "kk_KZ",
133
  "lstrip": false,
134
  "normalized": false,
 
136
  "single_word": false,
137
  "special": true
138
  },
139
+ "247": {
140
  "content": "ko_KR",
141
  "lstrip": false,
142
  "normalized": false,
 
144
  "single_word": false,
145
  "special": true
146
  },
147
+ "248": {
148
  "content": "lt_LT",
149
  "lstrip": false,
150
  "normalized": false,
 
152
  "single_word": false,
153
  "special": true
154
  },
155
+ "249": {
156
  "content": "lv_LV",
157
  "lstrip": false,
158
  "normalized": false,
 
160
  "single_word": false,
161
  "special": true
162
  },
163
+ "250": {
164
  "content": "my_MM",
165
  "lstrip": false,
166
  "normalized": false,
 
168
  "single_word": false,
169
  "special": true
170
  },
171
+ "251": {
172
  "content": "ne_NP",
173
  "lstrip": false,
174
  "normalized": false,
 
176
  "single_word": false,
177
  "special": true
178
  },
179
+ "252": {
180
  "content": "nl_XX",
181
  "lstrip": false,
182
  "normalized": false,
 
184
  "single_word": false,
185
  "special": true
186
  },
187
+ "253": {
188
  "content": "ro_RO",
189
  "lstrip": false,
190
  "normalized": false,
 
192
  "single_word": false,
193
  "special": true
194
  },
195
+ "254": {
196
  "content": "ru_RU",
197
  "lstrip": false,
198
  "normalized": false,
 
200
  "single_word": false,
201
  "special": true
202
  },
203
+ "255": {
204
  "content": "si_LK",
205
  "lstrip": false,
206
  "normalized": false,
 
208
  "single_word": false,
209
  "special": true
210
  },
211
+ "256": {
212
  "content": "tr_TR",
213
  "lstrip": false,
214
  "normalized": false,
 
216
  "single_word": false,
217
  "special": true
218
  },
219
+ "257": {
220
  "content": "vi_VN",
221
  "lstrip": false,
222
  "normalized": false,
 
224
  "single_word": false,
225
  "special": true
226
  },
227
+ "258": {
228
  "content": "zh_CN",
229
  "lstrip": false,
230
  "normalized": false,
 
234
  }
235
  },
236
  "additional_special_tokens": [
237
+ "ar_AR",
238
+ "cs_CZ"
239
  ],
240
  "bos_token": "<s>",
241
  "clean_up_tokenization_spaces": true,