vojtam commited on
Commit
d4f0de4
·
verified ·
1 Parent(s): 56738ed

Upload tokenizer.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.json +511 -0
tokenizer.json ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[PAD]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|endoftext|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "ByteLevel",
37
+ "add_prefix_space": false,
38
+ "trim_offsets": true,
39
+ "use_regex": true
40
+ },
41
+ "post_processor": {
42
+ "type": "ByteLevel",
43
+ "add_prefix_space": true,
44
+ "trim_offsets": false,
45
+ "use_regex": true
46
+ },
47
+ "decoder": {
48
+ "type": "ByteLevel",
49
+ "add_prefix_space": true,
50
+ "trim_offsets": true,
51
+ "use_regex": true
52
+ },
53
+ "model": {
54
+ "type": "BPE",
55
+ "dropout": null,
56
+ "unk_token": null,
57
+ "continuing_subword_prefix": "",
58
+ "end_of_word_suffix": "",
59
+ "fuse_unk": false,
60
+ "byte_fallback": false,
61
+ "ignore_merges": false,
62
+ "vocab": {
63
+ "<unk>": 0,
64
+ "[PAD]": 1,
65
+ "<|endoftext|>": 2,
66
+ "AA": 3,
67
+ "TT": 4,
68
+ "TG": 5,
69
+ "CA": 6,
70
+ "CC": 7,
71
+ "TA": 8,
72
+ "GG": 9,
73
+ "TC": 10,
74
+ "GA": 11,
75
+ "AAA": 12,
76
+ "GC": 13,
77
+ "TAA": 14,
78
+ "TTTT": 15,
79
+ "TCA": 16,
80
+ "TGA": 17,
81
+ "TTA": 18,
82
+ "GAA": 19,
83
+ "TCC": 20,
84
+ "CAA": 21,
85
+ "CTG": 22,
86
+ "CTT": 23,
87
+ "GTG": 24,
88
+ "GTT": 25,
89
+ "GCA": 26,
90
+ "GGA": 27,
91
+ "CCA": 28,
92
+ "GTA": 29,
93
+ "GCC": 30,
94
+ "CTA": 31,
95
+ "AAAA": 32,
96
+ "TAAA": 33,
97
+ "CTC": 34,
98
+ "GTC": 35,
99
+ "TGTG": 36,
100
+ "TATT": 37,
101
+ "CACA": 38,
102
+ "GAAA": 39,
103
+ "TATA": 40,
104
+ "TCTT": 41,
105
+ "TGTT": 42,
106
+ "CAAA": 43,
107
+ "GAGA": 44,
108
+ "CATT": 45,
109
+ "TGAA": 46,
110
+ "CAGG": 47,
111
+ "TCTG": 48,
112
+ "CAGA": 49,
113
+ "TCAA": 50,
114
+ "GGAA": 51,
115
+ "TAAAA": 52,
116
+ "CTGA": 53,
117
+ "GCTT": 54,
118
+ "GTGA": 55,
119
+ "GCTG": 56,
120
+ "CTCA": 57,
121
+ "CCTT": 58,
122
+ "A": 59,
123
+ "T": 60,
124
+ "G": 61,
125
+ "C": 62,
126
+ "▁": 63
127
+ },
128
+ "merges": [
129
+ [
130
+ "A",
131
+ "A"
132
+ ],
133
+ [
134
+ "T",
135
+ "T"
136
+ ],
137
+ [
138
+ "T",
139
+ "G"
140
+ ],
141
+ [
142
+ "C",
143
+ "A"
144
+ ],
145
+ [
146
+ "C",
147
+ "C"
148
+ ],
149
+ [
150
+ "T",
151
+ "A"
152
+ ],
153
+ [
154
+ "G",
155
+ "G"
156
+ ],
157
+ [
158
+ "T",
159
+ "C"
160
+ ],
161
+ [
162
+ "G",
163
+ "A"
164
+ ],
165
+ [
166
+ "A",
167
+ "AA"
168
+ ],
169
+ [
170
+ "AA",
171
+ "A"
172
+ ],
173
+ [
174
+ "G",
175
+ "C"
176
+ ],
177
+ [
178
+ "T",
179
+ "AA"
180
+ ],
181
+ [
182
+ "TA",
183
+ "A"
184
+ ],
185
+ [
186
+ "TT",
187
+ "TT"
188
+ ],
189
+ [
190
+ "T",
191
+ "CA"
192
+ ],
193
+ [
194
+ "TC",
195
+ "A"
196
+ ],
197
+ [
198
+ "T",
199
+ "GA"
200
+ ],
201
+ [
202
+ "TG",
203
+ "A"
204
+ ],
205
+ [
206
+ "T",
207
+ "TA"
208
+ ],
209
+ [
210
+ "TT",
211
+ "A"
212
+ ],
213
+ [
214
+ "G",
215
+ "AA"
216
+ ],
217
+ [
218
+ "GA",
219
+ "A"
220
+ ],
221
+ [
222
+ "T",
223
+ "CC"
224
+ ],
225
+ [
226
+ "TC",
227
+ "C"
228
+ ],
229
+ [
230
+ "C",
231
+ "AA"
232
+ ],
233
+ [
234
+ "CA",
235
+ "A"
236
+ ],
237
+ [
238
+ "C",
239
+ "TG"
240
+ ],
241
+ [
242
+ "C",
243
+ "TT"
244
+ ],
245
+ [
246
+ "G",
247
+ "TG"
248
+ ],
249
+ [
250
+ "G",
251
+ "TT"
252
+ ],
253
+ [
254
+ "G",
255
+ "CA"
256
+ ],
257
+ [
258
+ "GC",
259
+ "A"
260
+ ],
261
+ [
262
+ "G",
263
+ "GA"
264
+ ],
265
+ [
266
+ "GG",
267
+ "A"
268
+ ],
269
+ [
270
+ "C",
271
+ "CA"
272
+ ],
273
+ [
274
+ "CC",
275
+ "A"
276
+ ],
277
+ [
278
+ "G",
279
+ "TA"
280
+ ],
281
+ [
282
+ "G",
283
+ "CC"
284
+ ],
285
+ [
286
+ "GC",
287
+ "C"
288
+ ],
289
+ [
290
+ "C",
291
+ "TA"
292
+ ],
293
+ [
294
+ "A",
295
+ "AAA"
296
+ ],
297
+ [
298
+ "AA",
299
+ "AA"
300
+ ],
301
+ [
302
+ "AAA",
303
+ "A"
304
+ ],
305
+ [
306
+ "T",
307
+ "AAA"
308
+ ],
309
+ [
310
+ "TA",
311
+ "AA"
312
+ ],
313
+ [
314
+ "TAA",
315
+ "A"
316
+ ],
317
+ [
318
+ "C",
319
+ "TC"
320
+ ],
321
+ [
322
+ "G",
323
+ "TC"
324
+ ],
325
+ [
326
+ "T",
327
+ "GTG"
328
+ ],
329
+ [
330
+ "TG",
331
+ "TG"
332
+ ],
333
+ [
334
+ "TA",
335
+ "TT"
336
+ ],
337
+ [
338
+ "CA",
339
+ "CA"
340
+ ],
341
+ [
342
+ "G",
343
+ "AAA"
344
+ ],
345
+ [
346
+ "GA",
347
+ "AA"
348
+ ],
349
+ [
350
+ "GAA",
351
+ "A"
352
+ ],
353
+ [
354
+ "TA",
355
+ "TA"
356
+ ],
357
+ [
358
+ "T",
359
+ "CTT"
360
+ ],
361
+ [
362
+ "TC",
363
+ "TT"
364
+ ],
365
+ [
366
+ "T",
367
+ "GTT"
368
+ ],
369
+ [
370
+ "TG",
371
+ "TT"
372
+ ],
373
+ [
374
+ "C",
375
+ "AAA"
376
+ ],
377
+ [
378
+ "CA",
379
+ "AA"
380
+ ],
381
+ [
382
+ "CAA",
383
+ "A"
384
+ ],
385
+ [
386
+ "GA",
387
+ "GA"
388
+ ],
389
+ [
390
+ "CA",
391
+ "TT"
392
+ ],
393
+ [
394
+ "T",
395
+ "GAA"
396
+ ],
397
+ [
398
+ "TG",
399
+ "AA"
400
+ ],
401
+ [
402
+ "TGA",
403
+ "A"
404
+ ],
405
+ [
406
+ "CA",
407
+ "GG"
408
+ ],
409
+ [
410
+ "T",
411
+ "CTG"
412
+ ],
413
+ [
414
+ "TC",
415
+ "TG"
416
+ ],
417
+ [
418
+ "CA",
419
+ "GA"
420
+ ],
421
+ [
422
+ "T",
423
+ "CAA"
424
+ ],
425
+ [
426
+ "TC",
427
+ "AA"
428
+ ],
429
+ [
430
+ "TCA",
431
+ "A"
432
+ ],
433
+ [
434
+ "G",
435
+ "GAA"
436
+ ],
437
+ [
438
+ "GG",
439
+ "AA"
440
+ ],
441
+ [
442
+ "GGA",
443
+ "A"
444
+ ],
445
+ [
446
+ "T",
447
+ "AAAA"
448
+ ],
449
+ [
450
+ "TA",
451
+ "AAA"
452
+ ],
453
+ [
454
+ "TAA",
455
+ "AA"
456
+ ],
457
+ [
458
+ "TAAA",
459
+ "A"
460
+ ],
461
+ [
462
+ "C",
463
+ "TGA"
464
+ ],
465
+ [
466
+ "CTG",
467
+ "A"
468
+ ],
469
+ [
470
+ "G",
471
+ "CTT"
472
+ ],
473
+ [
474
+ "GC",
475
+ "TT"
476
+ ],
477
+ [
478
+ "G",
479
+ "TGA"
480
+ ],
481
+ [
482
+ "GTG",
483
+ "A"
484
+ ],
485
+ [
486
+ "G",
487
+ "CTG"
488
+ ],
489
+ [
490
+ "GC",
491
+ "TG"
492
+ ],
493
+ [
494
+ "C",
495
+ "TCA"
496
+ ],
497
+ [
498
+ "CTC",
499
+ "A"
500
+ ],
501
+ [
502
+ "C",
503
+ "CTT"
504
+ ],
505
+ [
506
+ "CC",
507
+ "TT"
508
+ ]
509
+ ]
510
+ }
511
+ }