go76dof committed on
Commit
fcdbc8e
·
verified ·
1 Parent(s): ed52679

feat: upload custom trained BPE tokenizer from scratch

Browse files
Files changed (2) hide show
  1. tokenizer.json +469 -0
  2. tokenizer_config.json +10 -0
tokenizer.json ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[PAD]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[CLS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "Lowercase"
54
+ },
55
+ "pre_tokenizer": {
56
+ "type": "Whitespace"
57
+ },
58
+ "post_processor": {
59
+ "type": "TemplateProcessing",
60
+ "single": [
61
+ {
62
+ "Sequence": {
63
+ "id": "A",
64
+ "type_id": 0
65
+ }
66
+ }
67
+ ],
68
+ "pair": [
69
+ {
70
+ "Sequence": {
71
+ "id": "A",
72
+ "type_id": 0
73
+ }
74
+ },
75
+ {
76
+ "Sequence": {
77
+ "id": "B",
78
+ "type_id": 1
79
+ }
80
+ }
81
+ ],
82
+ "special_tokens": {}
83
+ },
84
+ "decoder": null,
85
+ "model": {
86
+ "type": "BPE",
87
+ "dropout": null,
88
+ "unk_token": "[UNK]",
89
+ "continuing_subword_prefix": null,
90
+ "end_of_word_suffix": null,
91
+ "fuse_unk": false,
92
+ "byte_fallback": false,
93
+ "ignore_merges": false,
94
+ "vocab": {
95
+ "[UNK]": 0,
96
+ "[PAD]": 1,
97
+ "[CLS]": 2,
98
+ "[SEP]": 3,
99
+ "[MASK]": 4,
100
+ "a": 5,
101
+ "b": 6,
102
+ "c": 7,
103
+ "d": 8,
104
+ "e": 9,
105
+ "f": 10,
106
+ "g": 11,
107
+ "h": 12,
108
+ "i": 13,
109
+ "k": 14,
110
+ "l": 15,
111
+ "m": 16,
112
+ "n": 17,
113
+ "o": 18,
114
+ "r": 19,
115
+ "s": 20,
116
+ "t": 21,
117
+ "u": 22,
118
+ "v": 23,
119
+ "z": 24,
120
+ "en": 25,
121
+ "is": 26,
122
+ "st": 27,
123
+ "te": 28,
124
+ "ar": 29,
125
+ "ce": 30,
126
+ "ch": 31,
127
+ "ct": 32,
128
+ "er": 33,
129
+ "in": 34,
130
+ "om": 35,
131
+ "to": 36,
132
+ "ing": 37,
133
+ "ab": 38,
134
+ "an": 39,
135
+ "at": 40,
136
+ "ate": 41,
137
+ "ace": 42,
138
+ "bu": 43,
139
+ "cl": 44,
140
+ "cr": 45,
141
+ "cu": 46,
142
+ "cab": 47,
143
+ "ding": 48,
144
+ "ear": 49,
145
+ "fr": 50,
146
+ "face": 51,
147
+ "gg": 52,
148
+ "gen": 53,
149
+ "hu": 54,
150
+ "his": 55,
151
+ "il": 56,
152
+ "iz": 57,
153
+ "ite": 58,
154
+ "ict": 59,
155
+ "ken": 60,
156
+ "ocab": 61,
157
+ "re": 62,
158
+ "rict": 63,
159
+ "sen": 64,
160
+ "scr": 65,
161
+ "ten": 66,
162
+ "this": 67,
163
+ "ure": 68,
164
+ "vocab": 69,
165
+ "stom": 70,
166
+ "strict": 71,
167
+ "test": 72,
168
+ "arch": 73,
169
+ "cture": 74,
170
+ "erate": 75,
171
+ "token": 76,
172
+ "and": 77,
173
+ "atch": 78,
174
+ "buil": 79,
175
+ "clear": 80,
176
+ "custom": 81,
177
+ "from": 82,
178
+ "gging": 83,
179
+ "generate": 84,
180
+ "hugging": 85,
181
+ "izer": 86,
182
+ "itecture": 87,
183
+ "senten": 88,
184
+ "scratch": 89,
185
+ "architecture": 90,
186
+ "tokenizer": 91,
187
+ "building": 92,
188
+ "sentence": 93
189
+ },
190
+ "merges": [
191
+ [
192
+ "e",
193
+ "n"
194
+ ],
195
+ [
196
+ "i",
197
+ "s"
198
+ ],
199
+ [
200
+ "s",
201
+ "t"
202
+ ],
203
+ [
204
+ "t",
205
+ "e"
206
+ ],
207
+ [
208
+ "a",
209
+ "r"
210
+ ],
211
+ [
212
+ "c",
213
+ "e"
214
+ ],
215
+ [
216
+ "c",
217
+ "h"
218
+ ],
219
+ [
220
+ "c",
221
+ "t"
222
+ ],
223
+ [
224
+ "e",
225
+ "r"
226
+ ],
227
+ [
228
+ "i",
229
+ "n"
230
+ ],
231
+ [
232
+ "o",
233
+ "m"
234
+ ],
235
+ [
236
+ "t",
237
+ "o"
238
+ ],
239
+ [
240
+ "in",
241
+ "g"
242
+ ],
243
+ [
244
+ "a",
245
+ "b"
246
+ ],
247
+ [
248
+ "a",
249
+ "n"
250
+ ],
251
+ [
252
+ "a",
253
+ "t"
254
+ ],
255
+ [
256
+ "a",
257
+ "te"
258
+ ],
259
+ [
260
+ "a",
261
+ "ce"
262
+ ],
263
+ [
264
+ "b",
265
+ "u"
266
+ ],
267
+ [
268
+ "c",
269
+ "l"
270
+ ],
271
+ [
272
+ "c",
273
+ "r"
274
+ ],
275
+ [
276
+ "c",
277
+ "u"
278
+ ],
279
+ [
280
+ "c",
281
+ "ab"
282
+ ],
283
+ [
284
+ "d",
285
+ "ing"
286
+ ],
287
+ [
288
+ "e",
289
+ "ar"
290
+ ],
291
+ [
292
+ "f",
293
+ "r"
294
+ ],
295
+ [
296
+ "f",
297
+ "ace"
298
+ ],
299
+ [
300
+ "g",
301
+ "g"
302
+ ],
303
+ [
304
+ "g",
305
+ "en"
306
+ ],
307
+ [
308
+ "h",
309
+ "u"
310
+ ],
311
+ [
312
+ "h",
313
+ "is"
314
+ ],
315
+ [
316
+ "i",
317
+ "l"
318
+ ],
319
+ [
320
+ "i",
321
+ "z"
322
+ ],
323
+ [
324
+ "i",
325
+ "te"
326
+ ],
327
+ [
328
+ "i",
329
+ "ct"
330
+ ],
331
+ [
332
+ "k",
333
+ "en"
334
+ ],
335
+ [
336
+ "o",
337
+ "cab"
338
+ ],
339
+ [
340
+ "r",
341
+ "e"
342
+ ],
343
+ [
344
+ "r",
345
+ "ict"
346
+ ],
347
+ [
348
+ "s",
349
+ "en"
350
+ ],
351
+ [
352
+ "s",
353
+ "cr"
354
+ ],
355
+ [
356
+ "t",
357
+ "en"
358
+ ],
359
+ [
360
+ "t",
361
+ "his"
362
+ ],
363
+ [
364
+ "u",
365
+ "re"
366
+ ],
367
+ [
368
+ "v",
369
+ "ocab"
370
+ ],
371
+ [
372
+ "st",
373
+ "om"
374
+ ],
375
+ [
376
+ "st",
377
+ "rict"
378
+ ],
379
+ [
380
+ "te",
381
+ "st"
382
+ ],
383
+ [
384
+ "ar",
385
+ "ch"
386
+ ],
387
+ [
388
+ "ct",
389
+ "ure"
390
+ ],
391
+ [
392
+ "er",
393
+ "ate"
394
+ ],
395
+ [
396
+ "to",
397
+ "ken"
398
+ ],
399
+ [
400
+ "an",
401
+ "d"
402
+ ],
403
+ [
404
+ "at",
405
+ "ch"
406
+ ],
407
+ [
408
+ "bu",
409
+ "il"
410
+ ],
411
+ [
412
+ "cl",
413
+ "ear"
414
+ ],
415
+ [
416
+ "cu",
417
+ "stom"
418
+ ],
419
+ [
420
+ "fr",
421
+ "om"
422
+ ],
423
+ [
424
+ "gg",
425
+ "ing"
426
+ ],
427
+ [
428
+ "gen",
429
+ "erate"
430
+ ],
431
+ [
432
+ "hu",
433
+ "gging"
434
+ ],
435
+ [
436
+ "iz",
437
+ "er"
438
+ ],
439
+ [
440
+ "ite",
441
+ "cture"
442
+ ],
443
+ [
444
+ "sen",
445
+ "ten"
446
+ ],
447
+ [
448
+ "scr",
449
+ "atch"
450
+ ],
451
+ [
452
+ "arch",
453
+ "itecture"
454
+ ],
455
+ [
456
+ "token",
457
+ "izer"
458
+ ],
459
+ [
460
+ "buil",
461
+ "ding"
462
+ ],
463
+ [
464
+ "senten",
465
+ "ce"
466
+ ]
467
+ ]
468
+ }
469
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "mask_token": "[MASK]",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "tokenizer_class": "TokenizersBackend",
9
+ "unk_token": "[UNK]"
10
+ }