GoshKolotyan commited on
Commit
da7dd17
·
verified ·
1 Parent(s): f52b2ec

Initial upload of perovskite tokenizer - tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +717 -0
tokenizer.json ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[CLS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "NFD"
54
+ },
55
+ "pre_tokenizer": {
56
+ "type": "Whitespace"
57
+ },
58
+ "post_processor": {
59
+ "type": "TemplateProcessing",
60
+ "single": [
61
+ {
62
+ "SpecialToken": {
63
+ "id": "[CLS]",
64
+ "type_id": 0
65
+ }
66
+ },
67
+ {
68
+ "Sequence": {
69
+ "id": "A",
70
+ "type_id": 0
71
+ }
72
+ },
73
+ {
74
+ "SpecialToken": {
75
+ "id": "[SEP]",
76
+ "type_id": 0
77
+ }
78
+ }
79
+ ],
80
+ "pair": [
81
+ {
82
+ "SpecialToken": {
83
+ "id": "[CLS]",
84
+ "type_id": 0
85
+ }
86
+ },
87
+ {
88
+ "Sequence": {
89
+ "id": "A",
90
+ "type_id": 0
91
+ }
92
+ },
93
+ {
94
+ "SpecialToken": {
95
+ "id": "[SEP]",
96
+ "type_id": 0
97
+ }
98
+ },
99
+ {
100
+ "Sequence": {
101
+ "id": "B",
102
+ "type_id": 1
103
+ }
104
+ },
105
+ {
106
+ "SpecialToken": {
107
+ "id": "[SEP]",
108
+ "type_id": 1
109
+ }
110
+ }
111
+ ],
112
+ "special_tokens": {
113
+ "[CLS]": {
114
+ "id": "[CLS]",
115
+ "ids": [
116
+ 2
117
+ ],
118
+ "tokens": [
119
+ "[CLS]"
120
+ ]
121
+ },
122
+ "[SEP]": {
123
+ "id": "[SEP]",
124
+ "ids": [
125
+ 3
126
+ ],
127
+ "tokens": [
128
+ "[SEP]"
129
+ ]
130
+ }
131
+ }
132
+ },
133
+ "decoder": null,
134
+ "model": {
135
+ "type": "BPE",
136
+ "dropout": null,
137
+ "unk_token": null,
138
+ "continuing_subword_prefix": "##",
139
+ "end_of_word_suffix": null,
140
+ "fuse_unk": false,
141
+ "byte_fallback": false,
142
+ "ignore_merges": false,
143
+ "vocab": {
144
+ "[PAD]": 0,
145
+ "[UNK]": 1,
146
+ "[CLS]": 2,
147
+ "[SEP]": 3,
148
+ "[MASK]": 4,
149
+ ".": 5,
150
+ "0": 6,
151
+ "1": 7,
152
+ "2": 8,
153
+ "3": 9,
154
+ "4": 10,
155
+ "5": 11,
156
+ "6": 12,
157
+ "7": 13,
158
+ "8": 14,
159
+ "9": 15,
160
+ "A": 16,
161
+ "B": 17,
162
+ "C": 18,
163
+ "D": 19,
164
+ "F": 20,
165
+ "G": 21,
166
+ "H": 22,
167
+ "I": 23,
168
+ "K": 24,
169
+ "L": 25,
170
+ "M": 26,
171
+ "N": 27,
172
+ "P": 28,
173
+ "R": 29,
174
+ "S": 30,
175
+ "T": 31,
176
+ "Y": 32,
177
+ "Z": 33,
178
+ "a": 34,
179
+ "b": 35,
180
+ "d": 36,
181
+ "e": 37,
182
+ "g": 38,
183
+ "i": 39,
184
+ "l": 40,
185
+ "n": 41,
186
+ "o": 42,
187
+ "r": 43,
188
+ "s": 44,
189
+ "u": 45,
190
+ "##A": 46,
191
+ "##a": 47,
192
+ "##3": 48,
193
+ "##7": 49,
194
+ "##o": 50,
195
+ "##2": 51,
196
+ "##l": 52,
197
+ "##M": 53,
198
+ "##n": 54,
199
+ "##4": 55,
200
+ "##b": 56,
201
+ "##9": 57,
202
+ "##s": 58,
203
+ "##g": 59,
204
+ "##6": 60,
205
+ "##8": 61,
206
+ "##e": 62,
207
+ "##r": 63,
208
+ "##5": 64,
209
+ "##d": 65,
210
+ "##1": 66,
211
+ "##u": 67,
212
+ "##i": 68,
213
+ "01": 69,
214
+ "In": 70,
215
+ "Na": 71,
216
+ "Yb": 72,
217
+ "Br": 73,
218
+ "Cl": 74,
219
+ "Se": 75,
220
+ "Te": 76,
221
+ "Ba": 77,
222
+ "Cs": 78,
223
+ "DM": 79,
224
+ "FA": 80,
225
+ "Hg": 81,
226
+ "La": 82,
227
+ "Li": 83,
228
+ "MA": 84,
229
+ "Rb": 85,
230
+ "Sr": 86,
231
+ "Tl": 87,
232
+ "DMA": 88,
233
+ "Ag": 89,
234
+ "Au": 90,
235
+ "Bi": 91,
236
+ "Co": 92,
237
+ "Cd": 93,
238
+ "Cu": 94,
239
+ "Fe": 95,
240
+ "Ga": 96,
241
+ "Ge": 97,
242
+ "Mn": 98,
243
+ "Mg": 99,
244
+ "Nb": 100,
245
+ "Ni": 101,
246
+ "Pb": 102,
247
+ "Pd": 103,
248
+ "Sn": 104,
249
+ "Sb": 105,
250
+ "Tb": 106,
251
+ "Ti": 107,
252
+ "Zn": 108,
253
+ "03": 109,
254
+ "09": 110,
255
+ "06": 111,
256
+ "12": 112,
257
+ "18": 113,
258
+ "15": 114,
259
+ "82": 115,
260
+ "88": 116,
261
+ "85": 117,
262
+ "94": 118,
263
+ "91": 119,
264
+ "97": 120,
265
+ "07": 121,
266
+ "02": 122,
267
+ "04": 123,
268
+ "08": 124,
269
+ "05": 125,
270
+ "13": 126,
271
+ "17": 127,
272
+ "14": 128,
273
+ "19": 129,
274
+ "16": 130,
275
+ "11": 131,
276
+ "83": 132,
277
+ "87": 133,
278
+ "84": 134,
279
+ "89": 135,
280
+ "86": 136,
281
+ "81": 137,
282
+ "93": 138,
283
+ "92": 139,
284
+ "96": 140,
285
+ "98": 141,
286
+ "95": 142,
287
+ "79": 143,
288
+ "99": 144,
289
+ "27": 145,
290
+ "24": 146,
291
+ "21": 147,
292
+ "33": 148,
293
+ "39": 149,
294
+ "36": 150,
295
+ "43": 151,
296
+ "42": 152,
297
+ "49": 153,
298
+ "46": 154,
299
+ "48": 155,
300
+ "45": 156,
301
+ "57": 157,
302
+ "52": 158,
303
+ "54": 159,
304
+ "58": 160,
305
+ "55": 161,
306
+ "51": 162,
307
+ "67": 163,
308
+ "64": 164,
309
+ "61": 165,
310
+ "73": 166,
311
+ "76": 167,
312
+ "37": 168
313
+ },
314
+ "merges": [
315
+ [
316
+ "0",
317
+ "##1"
318
+ ],
319
+ [
320
+ "I",
321
+ "##n"
322
+ ],
323
+ [
324
+ "N",
325
+ "##a"
326
+ ],
327
+ [
328
+ "Y",
329
+ "##b"
330
+ ],
331
+ [
332
+ "B",
333
+ "##r"
334
+ ],
335
+ [
336
+ "C",
337
+ "##l"
338
+ ],
339
+ [
340
+ "S",
341
+ "##e"
342
+ ],
343
+ [
344
+ "T",
345
+ "##e"
346
+ ],
347
+ [
348
+ "B",
349
+ "##a"
350
+ ],
351
+ [
352
+ "C",
353
+ "##s"
354
+ ],
355
+ [
356
+ "D",
357
+ "##M"
358
+ ],
359
+ [
360
+ "F",
361
+ "##A"
362
+ ],
363
+ [
364
+ "H",
365
+ "##g"
366
+ ],
367
+ [
368
+ "L",
369
+ "##a"
370
+ ],
371
+ [
372
+ "L",
373
+ "##i"
374
+ ],
375
+ [
376
+ "M",
377
+ "##A"
378
+ ],
379
+ [
380
+ "R",
381
+ "##b"
382
+ ],
383
+ [
384
+ "S",
385
+ "##r"
386
+ ],
387
+ [
388
+ "T",
389
+ "##l"
390
+ ],
391
+ [
392
+ "DM",
393
+ "##A"
394
+ ],
395
+ [
396
+ "A",
397
+ "##g"
398
+ ],
399
+ [
400
+ "A",
401
+ "##u"
402
+ ],
403
+ [
404
+ "B",
405
+ "##i"
406
+ ],
407
+ [
408
+ "C",
409
+ "##o"
410
+ ],
411
+ [
412
+ "C",
413
+ "##d"
414
+ ],
415
+ [
416
+ "C",
417
+ "##u"
418
+ ],
419
+ [
420
+ "F",
421
+ "##e"
422
+ ],
423
+ [
424
+ "G",
425
+ "##a"
426
+ ],
427
+ [
428
+ "G",
429
+ "##e"
430
+ ],
431
+ [
432
+ "M",
433
+ "##n"
434
+ ],
435
+ [
436
+ "M",
437
+ "##g"
438
+ ],
439
+ [
440
+ "N",
441
+ "##b"
442
+ ],
443
+ [
444
+ "N",
445
+ "##i"
446
+ ],
447
+ [
448
+ "P",
449
+ "##b"
450
+ ],
451
+ [
452
+ "P",
453
+ "##d"
454
+ ],
455
+ [
456
+ "S",
457
+ "##n"
458
+ ],
459
+ [
460
+ "S",
461
+ "##b"
462
+ ],
463
+ [
464
+ "T",
465
+ "##b"
466
+ ],
467
+ [
468
+ "T",
469
+ "##i"
470
+ ],
471
+ [
472
+ "Z",
473
+ "##n"
474
+ ],
475
+ [
476
+ "0",
477
+ "##3"
478
+ ],
479
+ [
480
+ "0",
481
+ "##9"
482
+ ],
483
+ [
484
+ "0",
485
+ "##6"
486
+ ],
487
+ [
488
+ "1",
489
+ "##2"
490
+ ],
491
+ [
492
+ "1",
493
+ "##8"
494
+ ],
495
+ [
496
+ "1",
497
+ "##5"
498
+ ],
499
+ [
500
+ "8",
501
+ "##2"
502
+ ],
503
+ [
504
+ "8",
505
+ "##8"
506
+ ],
507
+ [
508
+ "8",
509
+ "##5"
510
+ ],
511
+ [
512
+ "9",
513
+ "##4"
514
+ ],
515
+ [
516
+ "9",
517
+ "##1"
518
+ ],
519
+ [
520
+ "9",
521
+ "##7"
522
+ ],
523
+ [
524
+ "0",
525
+ "##7"
526
+ ],
527
+ [
528
+ "0",
529
+ "##2"
530
+ ],
531
+ [
532
+ "0",
533
+ "##4"
534
+ ],
535
+ [
536
+ "0",
537
+ "##8"
538
+ ],
539
+ [
540
+ "0",
541
+ "##5"
542
+ ],
543
+ [
544
+ "1",
545
+ "##3"
546
+ ],
547
+ [
548
+ "1",
549
+ "##7"
550
+ ],
551
+ [
552
+ "1",
553
+ "##4"
554
+ ],
555
+ [
556
+ "1",
557
+ "##9"
558
+ ],
559
+ [
560
+ "1",
561
+ "##6"
562
+ ],
563
+ [
564
+ "1",
565
+ "##1"
566
+ ],
567
+ [
568
+ "8",
569
+ "##3"
570
+ ],
571
+ [
572
+ "8",
573
+ "##7"
574
+ ],
575
+ [
576
+ "8",
577
+ "##4"
578
+ ],
579
+ [
580
+ "8",
581
+ "##9"
582
+ ],
583
+ [
584
+ "8",
585
+ "##6"
586
+ ],
587
+ [
588
+ "8",
589
+ "##1"
590
+ ],
591
+ [
592
+ "9",
593
+ "##3"
594
+ ],
595
+ [
596
+ "9",
597
+ "##2"
598
+ ],
599
+ [
600
+ "9",
601
+ "##6"
602
+ ],
603
+ [
604
+ "9",
605
+ "##8"
606
+ ],
607
+ [
608
+ "9",
609
+ "##5"
610
+ ],
611
+ [
612
+ "7",
613
+ "##9"
614
+ ],
615
+ [
616
+ "9",
617
+ "##9"
618
+ ],
619
+ [
620
+ "2",
621
+ "##7"
622
+ ],
623
+ [
624
+ "2",
625
+ "##4"
626
+ ],
627
+ [
628
+ "2",
629
+ "##1"
630
+ ],
631
+ [
632
+ "3",
633
+ "##3"
634
+ ],
635
+ [
636
+ "3",
637
+ "##9"
638
+ ],
639
+ [
640
+ "3",
641
+ "##6"
642
+ ],
643
+ [
644
+ "4",
645
+ "##3"
646
+ ],
647
+ [
648
+ "4",
649
+ "##2"
650
+ ],
651
+ [
652
+ "4",
653
+ "##9"
654
+ ],
655
+ [
656
+ "4",
657
+ "##6"
658
+ ],
659
+ [
660
+ "4",
661
+ "##8"
662
+ ],
663
+ [
664
+ "4",
665
+ "##5"
666
+ ],
667
+ [
668
+ "5",
669
+ "##7"
670
+ ],
671
+ [
672
+ "5",
673
+ "##2"
674
+ ],
675
+ [
676
+ "5",
677
+ "##4"
678
+ ],
679
+ [
680
+ "5",
681
+ "##8"
682
+ ],
683
+ [
684
+ "5",
685
+ "##5"
686
+ ],
687
+ [
688
+ "5",
689
+ "##1"
690
+ ],
691
+ [
692
+ "6",
693
+ "##7"
694
+ ],
695
+ [
696
+ "6",
697
+ "##4"
698
+ ],
699
+ [
700
+ "6",
701
+ "##1"
702
+ ],
703
+ [
704
+ "7",
705
+ "##3"
706
+ ],
707
+ [
708
+ "7",
709
+ "##6"
710
+ ],
711
+ [
712
+ "3",
713
+ "##7"
714
+ ]
715
+ ]
716
+ }
717
+ }