Lakoc committed (verified)
Commit 436f6a2 · 1 Parent(s): 1865892

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +0 -1
  2. tokenizer.json +65 -74
  3. tokenizer_config.json +0 -9
special_tokens_map.json CHANGED
@@ -3,6 +3,5 @@
  "eos_token": "([eos])",
  "mask_token": "([mask])",
  "pad_token": "([pad])",
- "sep_token": " ",
  "unk_token": "([unk])"
  }
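
After this change, a tokenizer loaded from the repository should no longer report a separator token. A minimal sketch with the transformers library (the repository id below is a placeholder, not taken from the commit):

from transformers import AutoTokenizer

# Placeholder repository id -- substitute the repo this commit belongs to.
tok = AutoTokenizer.from_pretrained("Lakoc/example-tokenizer")

# "sep_token" was removed from special_tokens_map.json, so none is configured.
print(tok.sep_token)           # expected: None
print(tok.special_tokens_map)  # eos, mask, pad, unk (plus bos from tokenizer_config.json)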
tokenizer.json CHANGED
@@ -47,21 +47,12 @@
  "rstrip": false,
  "normalized": false,
  "special": true
- },
- {
-   "id": 100,
-   "content": " ",
-   "single_word": false,
-   "lstrip": false,
-   "rstrip": false,
-   "normalized": false,
-   "special": true
  }
  ],
  "normalizer": null,
  "pre_tokenizer": {
  "type": "ByteLevel",
- "add_prefix_space": false,
+ "add_prefix_space": true,
  "trim_offsets": true,
  "use_regex": true
  },
@@ -179,139 +170,139 @@
  "Ġt": 33,
  "he": 34,
  "Ġa": 35,
- "in": 36,
- "Ġthe": 37,
+ "Ġthe": 36,
+ "in": 37,
  "Ġs": 38,
  "Ġw": 39,
  "Ġo": 40,
  "re": 41,
  "nd": 42,
- "Ġh": 43,
- "Ġb": 44,
+ "Ġb": 43,
+ "Ġh": 44,
  "er": 45,
  "Ġm": 46,
- "ou": 47,
- "Ġi": 48,
+ "Ġi": 47,
+ "ou": 48,
  "Ġc": 49,
  "Ġf": 50,
  "at": 51,
  "ed": 52,
- "en": 53,
- "on": 54,
- "Ġof": 55,
- "Ġto": 56,
- "Ġand": 57,
+ "Ġand": 53,
+ "en": 54,
+ "Ġto": 55,
+ "Ġof": 56,
+ "on": 57,
  "is": 58,
- "ing": 59,
- "Ġd": 60,
- "Ġp": 61,
- "Ġth": 62,
- "or": 63,
- "Ġhe": 64,
- "es": 65,
- "it": 66,
- "Ġl": 67,
- "as": 68,
- "ar": 69,
- "ll": 70,
- "an": 71,
- "Ġin": 72,
+ "Ġd": 59,
+ "ing": 60,
+ "Ġth": 61,
+ "Ġp": 62,
+ "Ġhe": 63,
+ "or": 64,
+ "Ġl": 65,
+ "es": 66,
+ "Ġin": 67,
+ "ll": 68,
+ "it": 69,
+ "ar": 70,
+ "as": 71,
+ "an": 72,
  "Ġn": 73,
  "Ġg": 74,
  "om": 75,
  "Ġbe": 76,
  "Ġha": 77,
- "le": 78,
- "Ġe": 79,
+ "Ġe": 78,
+ "le": 79,
  "ot": 80,
- "ut": 81,
- "ow": 82,
- "ic": 83,
- "Ġy": 84,
- "ld": 85,
- "ve": 86,
- "ly": 87,
- "Ġit": 88,
- "Ġwh": 89,
- "Ġwas": 90,
- "se": 91,
- "Ġthat": 92,
- "id": 93,
+ "Ġy": 81,
+ "ut": 82,
+ "ow": 83,
+ "ic": 84,
+ "Ġwh": 85,
+ "Ġit": 86,
+ "ld": 87,
+ "ve": 88,
+ "Ġthat": 89,
+ "ly": 90,
+ "Ġwas": 91,
+ "id": 92,
+ "se": 93,
  "st": 94,
- "gh": 95,
- "Ġon": 96,
+ "Ġon": 95,
+ "gh": 96,
  "ent": 97,
  "Ġre": 98,
- "im": 99
+ "Ġyou": 99
  },
  "merges": [
  "Ġ t",
  "h e",
  "Ġ a",
- "i n",
  "Ġt he",
+ "i n",
  "Ġ s",
  "Ġ w",
  "Ġ o",
  "r e",
  "n d",
- "Ġ h",
  "Ġ b",
+ "Ġ h",
  "e r",
  "Ġ m",
- "o u",
  "Ġ i",
+ "o u",
  "Ġ c",
  "Ġ f",
  "a t",
  "e d",
+ "Ġa nd",
  "e n",
- "o n",
- "Ġo f",
  "Ġt o",
- "Ġa nd",
+ "Ġo f",
+ "o n",
  "i s",
- "in g",
  "Ġ d",
- "Ġ p",
+ "in g",
  "Ġt h",
- "o r",
+ "Ġ p",
  "Ġ he",
+ "o r",
+ "Ġ l",
  "e s",
+ "Ġ in",
+ "l l",
  "i t",
- "Ġ l",
- "a s",
  "a r",
- "l l",
+ "a s",
  "a n",
- "Ġ in",
  "Ġ n",
  "Ġ g",
  "o m",
  "Ġb e",
  "Ġh a",
- "l e",
  "Ġ e",
+ "l e",
  "o t",
+ "Ġ y",
  "u t",
  "o w",
  "i c",
- "Ġ y",
+ "Ġw h",
+ "Ġi t",
  "l d",
  "v e",
+ "Ġth at",
  "l y",
- "Ġi t",
- "Ġw h",
  "Ġw as",
- "s e",
- "Ġth at",
  "i d",
+ "s e",
  "s t",
- "g h",
  "Ġo n",
+ "g h",
  "en t",
  "Ġ re",
- "i m"
+ "Ġy ou"
  ]
  }
  }
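
Besides dropping the whitespace special token (id 100) and re-ranking the BPE vocabulary and merges, the key behavioural change in tokenizer.json is flipping the ByteLevel pre-tokenizer's "add_prefix_space" flag from false to true. A minimal sketch of what that flag does, using the standalone tokenizers library (illustration only, not part of the commit):

from tokenizers.pre_tokenizers import ByteLevel

text = "the cat"

# Previous setting: the first word carries no leading-space marker, e.g. ('the', 'Ġcat').
print(ByteLevel(add_prefix_space=False).pre_tokenize_str(text))

# New setting: a space is prepended before pre-tokenization, so the leading word
# also becomes a 'Ġ'-prefixed piece, e.g. ('Ġthe', 'Ġcat'), matching re-ranked
# vocabulary entries such as "Ġthe": 36.
print(ByteLevel(add_prefix_space=True).pre_tokenize_str(text))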
tokenizer_config.json CHANGED
@@ -39,14 +39,6 @@
  "rstrip": false,
  "single_word": false,
  "special": true
- },
- "100": {
-   "content": " ",
-   "lstrip": false,
-   "normalized": false,
-   "rstrip": false,
-   "single_word": false,
-   "special": true
  }
  },
  "bos_token": "([bos])",
@@ -55,7 +47,6 @@
  "mask_token": "([mask])",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "([pad])",
- "sep_token": " ",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "([unk])"
  }
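
With the stray added token (id 100) and "sep_token" also removed from tokenizer_config.json, the fast tokenizer should load and round-trip text cleanly. A minimal sketch (placeholder repository id, illustration only):

from transformers import AutoTokenizer

# Placeholder repository id -- substitute the repo this commit belongs to.
tok = AutoTokenizer.from_pretrained("Lakoc/example-tokenizer")

ids = tok("the cat was on the mat")["input_ids"]
print(ids)                         # ids follow the re-ranked vocabulary (e.g. "Ġthe" is now 36)
print(tok.decode(ids))             # should round-trip (possibly with a leading space from add_prefix_space)
print(100 in tok.all_special_ids)  # expected: False, id 100 is no longer a special token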