aaronfeller committed on
Commit
ee2b9b6
·
verified ·
1 Parent(s): eff0d52

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +37 -0
  2. tokenizer.json +550 -0
  3. tokenizer_config.json +53 -0
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[CLS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": {
54
+ "type": "Split",
55
+ "pattern": {
56
+ "Regex": "(\\[[^\\]]+]|C\\(=N\\)N|CCC\\(C\\)|\\(CCCN\\)|NC\\(=O\\)|C\\(C\\)=O|=C\\(N\\)N|N=C\\(N\\)|NC\\(=N\\)|C\\(=O\\)C|CS\\(=O\\)|OC\\(=O\\)|C\\(=O\\)c|c\\(=O\\)n|C\\(=O\\)O|C\\(N\\)=O|cc\\(Br\\)|CC\\(=O\\)|C\\(=O\\)N|ccc\\(C\\)|ccc\\(F\\)|c\\(=O\\)|C\\(=N\\)|c\\(O\\)c|NC\\(C\\)|n\\(C\\)c|CC\\(O\\)|cc\\(N\\)|CC\\(C\\)|cc\\(C\\)|C\\(=O\\)|cc\\(O\\)|c\\(N\\)c|c\\(Cl\\)|C\\(N\\)N|N\\(C\\)C|NC\\(N\\)|=C\\(N\\)|C\\(O\\)C|c\\(OC\\)|\\(C#N\\)|C\\(C\\)C|CC\\(N\\)|C\\(C\\)N|c\\(CO\\)|c\\(Br\\)|\\(CCO\\)|C\\(CC\\)|S\\(=O\\)|c\\(C\\)c|\\(=N\\)|c\\(O\\)|\\(Br\\)|\\(CS\\)|c\\(C\\)|\\(CC\\)|c\\(I\\)|C\\(C\\)|N\\(C\\)|C\\(O\\)|C\\(I\\)|C\\(F\\)|\\(Cl\\)|n\\(C\\)|\\(OC\\)|\\(=O\\)|c\\(F\\)|CCCN\\)|\\(=S\\)|c\\(N\\)|\\(CO\\)|C\\(N\\)|\\(C\\)|ccccc|\\(S\\)|\\(F\\)|\\(O\\)|C#N\\)|CCO\\)|\\(N\\)|C\\(=N|\\(I\\)|CSSC|=N\\)|CC=O|CCCO|Cl\\)|CCNO|=O\\)|CCSC|\\(=N|CO\\)|CCNC|CCCC|=S\\)|CN=C|CCCS|cccc|CCCN|Br\\)|cccn|CS\\)|C=CC|OC\\)|CC=C|cnn|=NC|COC|OCC|\\(O|CCS|CNc|#Cc|=CC|ccn|C=C|CSc|ccc|NCc|CCO|N=C|cnc|I\\)|CCc|OCc|CCl|ccs|COc|CCn|CSC|SCC|NCC|CCN|CNC|C#C|C=O|CNO|CCC|SSC|C#N|O=C|NOC|S\\)|csc|ncc|C\\)|N\\)|\\(C|ncn|F\\)|O\\)|N#C|nnc|CSS|cco|Cl|NC|nc|co|CS|CO|no|cc|CN|cn|SS|OC|\\)|SN|nn|CC|#C|NO|=S|NS|cs|=C|Oc|=O|oc|Nc|Cc|=N|NN|C=|C#|\\(|SC|sc|Br|N#|#N|p|O|I|N|C|s|=|c|B|S|F|n|P|#|o)"
57
+ },
58
+ "behavior": "Isolated",
59
+ "invert": false
60
+ },
61
+ "post_processor": {
62
+ "type": "TemplateProcessing",
63
+ "single": [
64
+ {
65
+ "SpecialToken": {
66
+ "id": "[CLS]",
67
+ "type_id": 0
68
+ }
69
+ },
70
+ {
71
+ "Sequence": {
72
+ "id": "A",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "SpecialToken": {
78
+ "id": "[SEP]",
79
+ "type_id": 0
80
+ }
81
+ }
82
+ ],
83
+ "pair": [
84
+ {
85
+ "SpecialToken": {
86
+ "id": "[CLS]",
87
+ "type_id": 0
88
+ }
89
+ },
90
+ {
91
+ "Sequence": {
92
+ "id": "A",
93
+ "type_id": 0
94
+ }
95
+ },
96
+ {
97
+ "SpecialToken": {
98
+ "id": "[SEP]",
99
+ "type_id": 0
100
+ }
101
+ },
102
+ {
103
+ "Sequence": {
104
+ "id": "B",
105
+ "type_id": 1
106
+ }
107
+ },
108
+ {
109
+ "SpecialToken": {
110
+ "id": "[SEP]",
111
+ "type_id": 1
112
+ }
113
+ }
114
+ ],
115
+ "special_tokens": {
116
+ "[CLS]": {
117
+ "id": "[CLS]",
118
+ "ids": [
119
+ 2
120
+ ],
121
+ "tokens": [
122
+ "[CLS]"
123
+ ]
124
+ },
125
+ "[SEP]": {
126
+ "id": "[SEP]",
127
+ "ids": [
128
+ 3
129
+ ],
130
+ "tokens": [
131
+ "[SEP]"
132
+ ]
133
+ }
134
+ }
135
+ },
136
+ "decoder": null,
137
+ "model": {
138
+ "type": "WordPiece",
139
+ "unk_token": "[UNK]",
140
+ "continuing_subword_prefix": "##",
141
+ "max_input_chars_per_word": 100,
142
+ "vocab": {
143
+ "[PAD]": 0,
144
+ "[UNK]": 1,
145
+ "[CLS]": 2,
146
+ "[SEP]": 3,
147
+ "[MASK]": 4,
148
+ ":": 5,
149
+ "%11": 6,
150
+ "-": 7,
151
+ "[As]": 8,
152
+ "[pH]": 9,
153
+ "[Po]": 10,
154
+ "[Ra]": 11,
155
+ "[3H]": 12,
156
+ "[S-]": 13,
157
+ "8": 14,
158
+ "%21": 15,
159
+ "[CH-]": 16,
160
+ "[IH]": 17,
161
+ "P": 18,
162
+ "[SeH]": 19,
163
+ "[O]": 20,
164
+ "4": 21,
165
+ "/": 22,
166
+ "[N-]": 23,
167
+ "[129Xe]": 24,
168
+ "[Cl+3]": 25,
169
+ "3": 26,
170
+ "[C@@]": 27,
171
+ "[11CH3]": 28,
172
+ "[13C]": 29,
173
+ "[Sn+]": 30,
174
+ "[P@@]": 31,
175
+ "[Ge]": 32,
176
+ "[BH3-]": 33,
177
+ "[123I]": 34,
178
+ "[14CH2]": 35,
179
+ "[Al-]": 36,
180
+ "[Si]": 37,
181
+ "[S@]": 38,
182
+ "[W]": 39,
183
+ "=": 40,
184
+ "%19": 41,
185
+ "Cl": 42,
186
+ "[Cl+2]": 43,
187
+ "%14": 44,
188
+ "[Al]": 45,
189
+ "9": 46,
190
+ "[B-]": 47,
191
+ "[Cl+]": 48,
192
+ "[TlH2]": 49,
193
+ "[NH2+]": 50,
194
+ "[11CH]": 51,
195
+ "[SnH]": 52,
196
+ "[SiH3]": 53,
197
+ "[Sn]": 54,
198
+ "[11C]": 55,
199
+ "S": 56,
200
+ "[SiH2]": 57,
201
+ "%18": 58,
202
+ "[BH-]": 59,
203
+ "[Ru]": 60,
204
+ "%10": 61,
205
+ "[V]": 62,
206
+ "[o+]": 63,
207
+ "[O+]": 64,
208
+ "c": 65,
209
+ "[I-]": 66,
210
+ "[C@@H]": 67,
211
+ "n": 68,
212
+ "2": 69,
213
+ "[Se-]": 70,
214
+ "[N+]": 71,
215
+ "N": 72,
216
+ "s": 73,
217
+ "[PH+]": 74,
218
+ "[C@]": 75,
219
+ "[N@]": 76,
220
+ "[C+]": 77,
221
+ "[s+]": 78,
222
+ "[N@+]": 79,
223
+ "[125I]": 80,
224
+ "[cH-]": 81,
225
+ "[Th]": 82,
226
+ "C": 83,
227
+ "[Sb]": 84,
228
+ "5": 85,
229
+ "[c-]": 86,
230
+ "#": 87,
231
+ "[Ca]": 88,
232
+ "%16": 89,
233
+ "[Tl]": 90,
234
+ "[18F]": 91,
235
+ "[223Ra]": 92,
236
+ "[BH2-]": 93,
237
+ "[O-]": 94,
238
+ "[Bi]": 95,
239
+ "[te]": 96,
240
+ "Br": 97,
241
+ "[Cr]": 98,
242
+ "[N@@]": 99,
243
+ "[Hg]": 100,
244
+ "[S@+]": 101,
245
+ "\\": 102,
246
+ "[n+]": 103,
247
+ "%15": 104,
248
+ "[123Te]": 105,
249
+ "[C-]": 106,
250
+ "1": 107,
251
+ "[NH+]": 108,
252
+ "[I+]": 109,
253
+ "[CH]": 110,
254
+ "%13": 111,
255
+ "[Pb]": 112,
256
+ "[14C]": 113,
257
+ "[2H]": 114,
258
+ "[P@]": 115,
259
+ "[OH+]": 116,
260
+ ")": 117,
261
+ "[Tc]": 118,
262
+ "[se+]": 119,
263
+ "[NH-]": 120,
264
+ "[nH]": 121,
265
+ "B": 122,
266
+ "[CH2]": 123,
267
+ "[P+]": 124,
268
+ "[se]": 125,
269
+ "[In]": 126,
270
+ "[Te]": 127,
271
+ "[Se+]": 128,
272
+ "%12": 129,
273
+ "[S+]": 130,
274
+ "o": 131,
275
+ "[C]": 132,
276
+ "[N@@+]": 133,
277
+ "[n-]": 134,
278
+ "6": 135,
279
+ "[S@@]": 136,
280
+ "[nH+]": 137,
281
+ "[Si+]": 138,
282
+ "[PH]": 139,
283
+ "[Hg+]": 140,
284
+ "[C@H]": 141,
285
+ "[Ga]": 142,
286
+ "[S@@+]": 143,
287
+ "[NH3+]": 144,
288
+ "[SiH]": 145,
289
+ "[11c]": 146,
290
+ "%20": 147,
291
+ "%17": 148,
292
+ "(": 149,
293
+ "O": 150,
294
+ "[IH2]": 151,
295
+ "[As+]": 152,
296
+ "F": 153,
297
+ "[CH2-]": 154,
298
+ "[Se]": 155,
299
+ "[c+]": 156,
300
+ "%23": 157,
301
+ "[SH]": 158,
302
+ "I": 159,
303
+ "7": 160,
304
+ "%22": 161,
305
+ "[Os]": 162,
306
+ "[OH]": 163,
307
+ "p": 164,
308
+ "[P@+]": 165,
309
+ "[Ag+]": 166,
310
+ "[Ag-4]": 167,
311
+ "[Ag]": 168,
312
+ "[Al-3]": 169,
313
+ "[AsH3]": 170,
314
+ "[AsH]": 171,
315
+ "[At]": 172,
316
+ "[B@-]": 173,
317
+ "[B@@-]": 174,
318
+ "[B]": 175,
319
+ "[Ba]": 176,
320
+ "[Br+2]": 177,
321
+ "[BrH]": 178,
322
+ "[Br]": 179,
323
+ "[CH3]": 180,
324
+ "[CaH2]": 181,
325
+ "[Cs]": 182,
326
+ "[FH]": 183,
327
+ "[F]": 184,
328
+ "[H]": 185,
329
+ "[He]": 186,
330
+ "[I+2]": 187,
331
+ "[I+3]": 188,
332
+ "[I]": 189,
333
+ "[K]": 190,
334
+ "[Kr]": 191,
335
+ "[Li+]": 192,
336
+ "[LiH]": 193,
337
+ "[MgH2]": 194,
338
+ "[Mg]": 195,
339
+ "[NH3]": 196,
340
+ "[N]": 197,
341
+ "[Na]": 198,
342
+ "[OH2]": 199,
343
+ "[P@@+]": 200,
344
+ "[PH2]": 201,
345
+ "[P]": 202,
346
+ "[Rb]": 203,
347
+ "[SH+]": 204,
348
+ "[SH2]": 205,
349
+ "[S]": 206,
350
+ "[Se-2]": 207,
351
+ "[SeH2]": 208,
352
+ "[Si@]": 209,
353
+ "[SrH2]": 210,
354
+ "[TeH]": 211,
355
+ "[Xe]": 212,
356
+ "[Zn+2]": 213,
357
+ "[Zn-2]": 214,
358
+ "[Zn]": 215,
359
+ "[n]": 216,
360
+ "[te+]": 217,
361
+ "=O": 218,
362
+ "CC": 219,
363
+ "NC": 220,
364
+ "CO": 221,
365
+ "cc": 222,
366
+ "CCC": 223,
367
+ "CCCC": 224,
368
+ "ccc": 225,
369
+ "CCN": 226,
370
+ "CCCN": 227,
371
+ "CN": 228,
372
+ "CNC": 229,
373
+ "cccc": 230,
374
+ "ccccc": 231,
375
+ "N)": 232,
376
+ "(N)": 233,
377
+ "=O)": 234,
378
+ "(=O)": 235,
379
+ "C(=O)": 236,
380
+ "C(=O)N": 237,
381
+ "O)": 238,
382
+ "(C": 239,
383
+ "(C)": 240,
384
+ "C(C)": 241,
385
+ "C(C)C": 242,
386
+ "CC(=O)": 243,
387
+ "C(=O)O": 244,
388
+ "C(=O)C": 245,
389
+ "C(N)": 246,
390
+ "CC(N)": 247,
391
+ "C(N)=O": 248,
392
+ "CO)": 249,
393
+ "(CO)": 250,
394
+ "CC(C)": 251,
395
+ "CS": 252,
396
+ "=N": 253,
397
+ "CCNC": 254,
398
+ "NC(=O)": 255,
399
+ "=N)": 256,
400
+ "(=N)": 257,
401
+ "C(=N)": 258,
402
+ "CC=O": 259,
403
+ "CCCN)": 260,
404
+ "(CCCN)": 261,
405
+ "NC(=N)": 262,
406
+ "Br)": 263,
407
+ "(Br)": 264,
408
+ "F)": 265,
409
+ "(F)": 266,
410
+ "S)": 267,
411
+ "(S)": 268,
412
+ "C)": 269,
413
+ "(O)": 270,
414
+ "CCS": 271,
415
+ "CCCS": 272,
416
+ "CCSC": 273,
417
+ "cn": 274,
418
+ "ccn": 275,
419
+ "cccn": 276,
420
+ "CSC": 277,
421
+ "=C": 278,
422
+ "CCO": 279,
423
+ "(O": 280,
424
+ "(=N": 281,
425
+ "C(=N": 282,
426
+ "c(O)": 283,
427
+ "OC": 284,
428
+ "SCC": 285,
429
+ "ccc(F)": 286,
430
+ "S(=O)": 287,
431
+ "O=C": 288,
432
+ "CCc": 289,
433
+ "OC(=O)": 290,
434
+ "C#": 291,
435
+ "Cc": 292,
436
+ "C=C": 293,
437
+ "C=": 294,
438
+ "#N": 295,
439
+ "C#N": 296,
440
+ "ccs": 297,
441
+ "NO": 298,
442
+ "C(O)": 299,
443
+ "csc": 300,
444
+ "ccc(C)": 301,
445
+ "cc(Br)": 302,
446
+ "ncn": 303,
447
+ "CCNO": 304,
448
+ "CCCO": 305,
449
+ "CSS": 306,
450
+ "CSSC": 307,
451
+ "=CC": 308,
452
+ "I)": 309,
453
+ "(I)": 310,
454
+ "CNO": 311,
455
+ "N(C)": 312,
456
+ "N(C)C": 313,
457
+ "C(N)N": 314,
458
+ "NOC": 315,
459
+ "C(C)=O": 316,
460
+ "#C": 317,
461
+ "cco": 318,
462
+ "NS": 319,
463
+ "SN": 320,
464
+ "c(=O)n": 321,
465
+ "=S)": 322,
466
+ "(=S)": 323,
467
+ "c(N)c": 324,
468
+ "N=C": 325,
469
+ "SC": 326,
470
+ "SSC": 327,
471
+ "CCC(C)": 328,
472
+ "c(=O)": 329,
473
+ "C#N)": 330,
474
+ "(C#N)": 331,
475
+ "SS": 332,
476
+ "=S": 333,
477
+ "oc": 334,
478
+ "co": 335,
479
+ "no": 336,
480
+ "N#": 337,
481
+ "N#C": 338,
482
+ "nc": 339,
483
+ "sc": 340,
484
+ "C(=N)N": 341,
485
+ "C=O": 342,
486
+ "c(F)": 343,
487
+ "C(F)": 344,
488
+ "c(I)": 345,
489
+ "C(I)": 346,
490
+ "cnn": 347,
491
+ "cc(N)": 348,
492
+ "NC(N)": 349,
493
+ "OC)": 350,
494
+ "(OC)": 351,
495
+ "c(OC)": 352,
496
+ "c(Br)": 353,
497
+ "c(N)": 354,
498
+ "cc(O)": 355,
499
+ "CS)": 356,
500
+ "(CS)": 357,
501
+ "Oc": 358,
502
+ "cnc": 359,
503
+ "Cl)": 360,
504
+ "(Cl)": 361,
505
+ "c(Cl)": 362,
506
+ "c(O)c": 363,
507
+ "NCC": 364,
508
+ "COC": 365,
509
+ "OCC": 366,
510
+ "Nc": 367,
511
+ "ncc": 368,
512
+ "cc(C)": 369,
513
+ "nn": 370,
514
+ "cs": 371,
515
+ "c(C)c": 372,
516
+ "COc": 373,
517
+ "C(=O)c": 374,
518
+ "c(C)": 375,
519
+ "(CC)": 376,
520
+ "NCc": 377,
521
+ "nnc": 378,
522
+ "C(O)C": 379,
523
+ "=C(N)": 380,
524
+ "C=CC": 381,
525
+ "=C(N)N": 382,
526
+ "N=C(N)": 383,
527
+ "OCc": 384,
528
+ "CC=C": 385,
529
+ "CCl": 386,
530
+ "CCn": 387,
531
+ "CNc": 388,
532
+ "CC(O)": 389,
533
+ "NN": 390,
534
+ "CSc": 391,
535
+ "NC(C)": 392,
536
+ "CS(=O)": 393,
537
+ "C(CC)": 394,
538
+ "C#C": 395,
539
+ "C(C)N": 396,
540
+ "CCO)": 397,
541
+ "(CCO)": 398,
542
+ "CN=C": 399,
543
+ "n(C)": 400,
544
+ "n(C)c": 401,
545
+ "c(CO)": 402,
546
+ "#Cc": 403,
547
+ "=NC": 404
548
+ }
549
+ }
550
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "extra_special_tokens": {},
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 768,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "tokenizer_class": "PreTrainedTokenizer",
52
+ "unk_token": "[UNK]"
53
+ }