zluvolyote committed on
Commit
ef1fe96
·
1 Parent(s): a7a5ad2

add tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +1 -1
  2. tokenizer.json +509 -24
  3. tokenizer_config.json +1 -1
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
 
1
+ {"mask_token": "[MASK]"}
tokenizer.json CHANGED
@@ -5,7 +5,7 @@
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
- "content": "[PAD]",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
@@ -14,7 +14,7 @@
14
  },
15
  {
16
  "id": 1,
17
- "content": "[UNK]",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
@@ -50,15 +50,17 @@
50
  }
51
  ],
52
  "normalizer": {
53
- "type": "BertNormalizer",
54
- "clean_text": true,
55
- "handle_chinese_chars": true,
56
- "strip_accents": null,
57
- "lowercase": true
58
- },
59
- "pre_tokenizer": {
60
- "type": "BertPreTokenizer"
 
61
  },
 
62
  "post_processor": {
63
  "type": "TemplateProcessing",
64
  "single": [
@@ -145,23 +147,506 @@
145
  "continuing_subword_prefix": "##",
146
  "max_input_chars_per_word": 100,
147
  "vocab": {
148
- "[PAD]": 0,
149
- "[UNK]": 1,
150
  "[CLS]": 2,
151
  "[SEP]": 3,
152
  "[MASK]": 4,
153
- "1": 5,
154
- "2": 6,
155
- "3": 7,
156
- "4": 8,
157
- "5": 9,
158
- "6": 10,
159
- "##4": 11,
160
- "##6": 12,
161
- "##5": 13,
162
- "##2": 14,
163
- "##1": 15,
164
- "##3": 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  }
166
  }
167
  }
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
+ "content": "[UNK]",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
 
14
  },
15
  {
16
  "id": 1,
17
+ "content": "[PAD]",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
 
50
  }
51
  ],
52
  "normalizer": {
53
+ "type": "Sequence",
54
+ "normalizers": [
55
+ {
56
+ "type": "NFD"
57
+ },
58
+ {
59
+ "type": "StripAccents"
60
+ }
61
+ ]
62
  },
63
+ "pre_tokenizer": null,
64
  "post_processor": {
65
  "type": "TemplateProcessing",
66
  "single": [
 
147
  "continuing_subword_prefix": "##",
148
  "max_input_chars_per_word": 100,
149
  "vocab": {
150
+ "[UNK]": 0,
151
+ "[PAD]": 1,
152
  "[CLS]": 2,
153
  "[SEP]": 3,
154
  "[MASK]": 4,
155
+ " ": 5,
156
+ "1": 6,
157
+ "2": 7,
158
+ "3": 8,
159
+ "4": 9,
160
+ "5": 10,
161
+ "6": 11,
162
+ "##1": 12,
163
+ "## ": 13,
164
+ "##4": 14,
165
+ "##2": 15,
166
+ "##3": 16,
167
+ "##5": 17,
168
+ "##6": 18,
169
+ "##11": 19,
170
+ "##12": 20,
171
+ "##1 ": 21,
172
+ "##22": 22,
173
+ "##11 ": 23,
174
+ "##2 ": 24,
175
+ "##13": 25,
176
+ "##12 ": 26,
177
+ "##21 ": 27,
178
+ "##14": 28,
179
+ "##211": 29,
180
+ "##3 ": 30,
181
+ "##1111 ": 31,
182
+ "##22 ": 32,
183
+ "##212": 33,
184
+ "##4 ": 34,
185
+ "##13 ": 35,
186
+ "##31 ": 36,
187
+ "##1211 ": 37,
188
+ "##2111 ": 38,
189
+ "##1112 ": 39,
190
+ "##1121 ": 40,
191
+ "##23": 41,
192
+ "##311": 42,
193
+ "##14 ": 43,
194
+ "##15": 44,
195
+ "##24": 45,
196
+ "##1122 ": 46,
197
+ "##32 ": 47,
198
+ "##1221 ": 48,
199
+ "##2211 ": 49,
200
+ "##2121 ": 50,
201
+ "##41 ": 51,
202
+ "##2112 ": 52,
203
+ "##23 ": 53,
204
+ "##1212 ": 54,
205
+ "##32": 55,
206
+ "##16": 56,
207
+ "##411": 57,
208
+ "##213": 58,
209
+ "##5 ": 59,
210
+ "##121 ": 60,
211
+ "##42 ": 61,
212
+ "##2212 ": 62,
213
+ "##2221 ": 63,
214
+ "##1131 ": 64,
215
+ "##1311 ": 65,
216
+ "##1113 ": 66,
217
+ "##1222 ": 67,
218
+ "##2122 ": 68,
219
+ "##3111 ": 69,
220
+ "##24 ": 70,
221
+ "##33": 71,
222
+ "##42": 72,
223
+ "##6 ": 73,
224
+ "##214": 74,
225
+ "##221 ": 75,
226
+ "##122 ": 76,
227
+ "##2222 ": 77,
228
+ "##51 ": 78,
229
+ "##1114 ": 79,
230
+ "##33 ": 80,
231
+ "##15 ": 81,
232
+ "##25": 82,
233
+ "##1141 ": 83,
234
+ "##1213 ": 84,
235
+ "##1312 ": 85,
236
+ "##4111 ": 86,
237
+ "##1132 ": 87,
238
+ "##2131 ": 88,
239
+ "##1411 ": 89,
240
+ "##1321 ": 90,
241
+ "##2113 ": 91,
242
+ "##2311 ": 92,
243
+ "##3112 ": 93,
244
+ "##3211 ": 94,
245
+ "##1123 ": 95,
246
+ "##1231 ": 96,
247
+ "##3121 ": 97,
248
+ "##34": 98,
249
+ "##43": 99,
250
+ "##222 ": 100,
251
+ "##61 ": 101,
252
+ "##511": 102,
253
+ "##313": 103,
254
+ "##16 ": 104,
255
+ "##52 ": 105,
256
+ "##26": 106,
257
+ "##44": 107,
258
+ "##1142 ": 108,
259
+ "##1421 ": 109,
260
+ "##25 ": 110,
261
+ "##1111 1111 ": 111,
262
+ "##1214 ": 112,
263
+ "##4211 ": 113,
264
+ "##2213 ": 114,
265
+ "##4121 ": 115,
266
+ "##2114 ": 116,
267
+ "##1412 ": 117,
268
+ "##43 ": 118,
269
+ "##4112 ": 119,
270
+ "##34 ": 120,
271
+ "##3122 ": 121,
272
+ "##2132 ": 122,
273
+ "##2231 ": 123,
274
+ "##3212 ": 124,
275
+ "##2123 ": 125,
276
+ "##2411 ": 126,
277
+ "##2141 ": 127,
278
+ "##1232 ": 128,
279
+ "##1223 ": 129,
280
+ "##3221 ": 130,
281
+ "##2321 ": 131,
282
+ "##1322 ": 132,
283
+ "##1241 ": 133,
284
+ "##1124 ": 134,
285
+ "##2312 ": 135,
286
+ "##52": 136,
287
+ "##62 ": 137,
288
+ "##611": 138,
289
+ "##314": 139,
290
+ "##215": 140,
291
+ "##44 ": 141,
292
+ "##312": 142,
293
+ "##26 ": 143,
294
+ "##413": 144,
295
+ "##1422 ": 145,
296
+ "##2421 ": 146,
297
+ "##3113 ": 147,
298
+ "##4221 ": 148,
299
+ "##4212 ": 149,
300
+ "##1133 ": 150,
301
+ "##2142 ": 151,
302
+ "##2124 ": 152,
303
+ "##3222 ": 153,
304
+ "##4122 ": 154,
305
+ "##2223 ": 155,
306
+ "##2322 ": 156,
307
+ "##2214 ": 157,
308
+ "##3311 ": 158,
309
+ "##2241 ": 159,
310
+ "##2232 ": 160,
311
+ "##1242 ": 161,
312
+ "##5111 ": 162,
313
+ "##1224 ": 163,
314
+ "##1331 ": 164,
315
+ "##3131 ": 165,
316
+ "##62": 166,
317
+ "##1511 ": 167,
318
+ "##2412 ": 168,
319
+ "##1115 ": 169,
320
+ "##1313 ": 170,
321
+ "##35": 171,
322
+ "##1151 ": 172,
323
+ "##216": 173,
324
+ "##1111 1112 ": 174,
325
+ "##53": 175,
326
+ "##1111 1211 ": 176,
327
+ "##1111 1121 ": 177,
328
+ "##414": 178,
329
+ "##1111 2111 ": 179,
330
+ "##223 ": 180,
331
+ "##412": 181,
332
+ "##241 ": 182,
333
+ "##4222 ": 183,
334
+ "##53 ": 184,
335
+ "##232 ": 185,
336
+ "##2422 ": 186,
337
+ "##2242 ": 187,
338
+ "##1332 ": 188,
339
+ "##45": 189,
340
+ "##2224 ": 190,
341
+ "##3114 ": 191,
342
+ "##1161 ": 192,
343
+ "##1611 ": 193,
344
+ "##1323 ": 194,
345
+ "##1251 ": 195,
346
+ "##6111 ": 196,
347
+ "##1431 ": 197,
348
+ "##1211 1111 ": 198,
349
+ "##3213 ": 199,
350
+ "##2511 ": 200,
351
+ "##1116 ": 201,
352
+ "##1143 ": 202,
353
+ "##2133 ": 203,
354
+ "##2111 1111 ": 204,
355
+ "##3312 ": 205,
356
+ "##1134 ": 206,
357
+ "##1112 1111 ": 207,
358
+ "##4113 ": 208,
359
+ "##1314 ": 209,
360
+ "##5121 ": 210,
361
+ "##36": 211,
362
+ "##1125 ": 212,
363
+ "##3141 ": 213,
364
+ "##3321 ": 214,
365
+ "##5211 ": 215,
366
+ "##1413 ": 216,
367
+ "##1121 1111 ": 217,
368
+ "##1521 ": 218,
369
+ "##2331 ": 219,
370
+ "##3411 ": 220,
371
+ "##4131 ": 221,
372
+ "##3123 ": 222,
373
+ "##4311 ": 223,
374
+ "##1233 ": 224,
375
+ "##54": 225,
376
+ "##5112 ": 226,
377
+ "##1152 ": 227,
378
+ "##2115 ": 228,
379
+ "##35 ": 229,
380
+ "##3132 ": 230,
381
+ "##2151 ": 231,
382
+ "##1341 ": 232,
383
+ "##2313 ": 233,
384
+ "##1215 ": 234,
385
+ "##63": 235,
386
+ "##1512 ": 236,
387
+ "##46": 237,
388
+ "##224 ": 238,
389
+ "##3231 ": 239,
390
+ "##242 ": 240,
391
+ "##63 ": 241,
392
+ "##2111 1211 ": 242,
393
+ "##1252 ": 243,
394
+ "##2111 2111 ": 244,
395
+ "##2611 ": 245,
396
+ "##1121 1121 ": 246,
397
+ "##54 ": 247,
398
+ "##1441 ": 248,
399
+ "##2116 ": 249,
400
+ "##1112 1112 ": 250,
401
+ "##2161 ": 251,
402
+ "##2112 1111 ": 252,
403
+ "##5221 ": 253,
404
+ "##1621 ": 254,
405
+ "##2512 ": 255,
406
+ "##45 ": 256,
407
+ "##1144 ": 257,
408
+ "##1221 1111 ": 258,
409
+ "##64": 259,
410
+ "##131 ": 260,
411
+ "##2314 ": 261,
412
+ "##1261 ": 262,
413
+ "##1112 1211 ": 263,
414
+ "##1162 ": 264,
415
+ "##1126 ": 265,
416
+ "##1414 ": 266,
417
+ "##2152 ": 267,
418
+ "##151 ": 268,
419
+ "##36 ": 269,
420
+ "##3142 ": 270,
421
+ "##6211 ": 271,
422
+ "##3214 ": 272,
423
+ "##4114 ": 273,
424
+ "##1225 ": 274,
425
+ "##2413 ": 275,
426
+ "##3322 ": 276,
427
+ "##2121 1111 ": 277,
428
+ "##2125 ": 278,
429
+ "##2233 ": 279,
430
+ "##1112 1121 ": 280,
431
+ "##6121 ": 281,
432
+ "##4321 ": 282,
433
+ "##2251 ": 283,
434
+ "##2521 ": 284,
435
+ "##6112 ": 285,
436
+ "##1324 ": 286,
437
+ "##1211 1211 ": 287,
438
+ "##4213 ": 288,
439
+ "##2211 1111 ": 289,
440
+ "##1112 2111 ": 290,
441
+ "##2332 ": 291,
442
+ "##4411 ": 292,
443
+ "##1216 ": 293,
444
+ "##2143 ": 294,
445
+ "##1342 ": 295,
446
+ "##2134 ": 296,
447
+ "##3421 ": 297,
448
+ "##1612 ": 298,
449
+ "##4312 ": 299,
450
+ "##3223 ": 300,
451
+ "##2215 ": 301,
452
+ "##5212 ": 302,
453
+ "##1211 2111 ": 303,
454
+ "##4141 ": 304,
455
+ "##3124 ": 305,
456
+ "##1122 1111 ": 306,
457
+ "##4123 ": 307,
458
+ "##1522 ": 308,
459
+ "##1211 1121 ": 309,
460
+ "##1212 1111 ": 310,
461
+ "##1432 ": 311,
462
+ "##3232 ": 312,
463
+ "111": 313,
464
+ "##5122 ": 314,
465
+ "##1423 ": 315,
466
+ "##2431 ": 316,
467
+ "##2111 1121 ": 317,
468
+ "##3241 ": 318,
469
+ "##4132 ": 319,
470
+ "##4231 ": 320,
471
+ "##1234 ": 321,
472
+ "##1111 2112 ": 322,
473
+ "##1243 ": 323,
474
+ "##2111 1112 ": 324,
475
+ "##1111 2211 ": 325,
476
+ "##64 ": 326,
477
+ "##1111 2121 ": 327,
478
+ "##3412 ": 328,
479
+ "##1121 1112 ": 329,
480
+ "##2162 ": 330,
481
+ "##1211 1112 ": 331,
482
+ "##2323 ": 332,
483
+ "##2341 ": 333,
484
+ "##1121 2111 ": 334,
485
+ "##21": 335,
486
+ "##1111 1221 ": 336,
487
+ "##1111 1212 ": 337,
488
+ "##1111 1122 ": 338,
489
+ "##161 ": 339,
490
+ "##4421 ": 340,
491
+ "##46 ": 341,
492
+ "##1121 1211 ": 342,
493
+ "##233 ": 343,
494
+ "##1262 ": 344,
495
+ "##2261 ": 345,
496
+ "##2126 ": 346,
497
+ "##3313 ": 347,
498
+ "##1244 ": 348,
499
+ "##3331 ": 349,
500
+ "##2612 ": 350,
501
+ "##2225 ": 351,
502
+ "##512": 352,
503
+ "##6221 ": 353,
504
+ "##514": 354,
505
+ "##2252 ": 355,
506
+ "##152 ": 356,
507
+ "##2522 ": 357,
508
+ "##1211 1221 ": 358,
509
+ "##4412 ": 359,
510
+ "##6212 ": 360,
511
+ "##3133 ": 361,
512
+ "##1442 ": 362,
513
+ "##4214 ": 363,
514
+ "##2621 ": 364,
515
+ "##2112 1112 ": 365,
516
+ "##5222 ": 366,
517
+ "##3111 1111 ": 367,
518
+ "##4142 ": 368,
519
+ "##1226 ": 369,
520
+ "##2144 ": 370,
521
+ "##2216 ": 371,
522
+ "12": 372,
523
+ "##1333 ": 373,
524
+ "##4223 ": 374,
525
+ "##4241 ": 375,
526
+ "##132 ": 376,
527
+ "##4322 ": 377,
528
+ "##1131 1111 ": 378,
529
+ "##2211 2111 ": 379,
530
+ "##1211 1122 ": 380,
531
+ "##1424 ": 381,
532
+ "##6122 ": 382,
533
+ "##1311 1111 ": 383,
534
+ "##55": 384,
535
+ "##2342 ": 385,
536
+ "##2112 2111 ": 386,
537
+ "##1622 ": 387,
538
+ "##3224 ": 388,
539
+ "##1153 ": 389,
540
+ "##1531 ": 390,
541
+ "##3422 ": 391,
542
+ "##2441 ": 392,
543
+ "##1113 1111 ": 393,
544
+ "##4124 ": 394,
545
+ "##1121 2121 ": 395,
546
+ "##2432 ": 396,
547
+ "##1212 1211 ": 397,
548
+ "##1212 2111 ": 398,
549
+ "##1121 2211 ": 399,
550
+ "##2243 ": 400,
551
+ "##614": 401,
552
+ "##5311 ": 402,
553
+ "211": 403,
554
+ "##1221 1112 ": 404,
555
+ "##1212 1112 ": 405,
556
+ "##1122 1112 ": 406,
557
+ "##3242 ": 407,
558
+ "##612": 408,
559
+ "##2414 ": 409,
560
+ "##2211 1211 ": 410,
561
+ "##3115 ": 411,
562
+ "##2234 ": 412,
563
+ "##2121 2111 ": 413,
564
+ "##1122 1211 ": 414,
565
+ "##1122 2111 ": 415,
566
+ "##1221 2111 ": 416,
567
+ "##2324 ": 417,
568
+ "##2112 1211 ": 418,
569
+ "##1122 1121 ": 419,
570
+ "##1111 1131 ": 420,
571
+ "##2423 ": 421,
572
+ "##1351 ": 422,
573
+ "##1212 1121 ": 423,
574
+ "##2211 1112 ": 424,
575
+ "##1111": 425,
576
+ "##1221 1121 ": 426,
577
+ "##2121 1112 ": 427,
578
+ "##1222 1111 ": 428,
579
+ "##3511 ": 429,
580
+ "##2112 1121 ": 430,
581
+ "##4232 ": 431,
582
+ "##162 ": 432,
583
+ "##2122 1111 ": 433,
584
+ "##2226 ": 434,
585
+ "##55 ": 435,
586
+ "##2221 1111 ": 436,
587
+ "##2121 1121 ": 437,
588
+ "##324": 438,
589
+ "##1221 1211 ": 439,
590
+ "##5113 ": 440,
591
+ "##324 ": 441,
592
+ "##5131 ": 442,
593
+ "##225 ": 443,
594
+ "##1513 ": 444,
595
+ "##2121 1211 ": 445,
596
+ "##1135 ": 446,
597
+ "##6222 ": 447,
598
+ "##1111 2212 ": 448,
599
+ "##2211 1121 ": 449,
600
+ "##2262 ": 450,
601
+ "##2424 ": 451,
602
+ "##4224 ": 452,
603
+ "##3151 ": 453,
604
+ "##1111 1113 ": 454,
605
+ "##4422 ": 455,
606
+ "##4242 ": 456,
607
+ "##1315 ": 457,
608
+ "##65": 458,
609
+ "##2442 ": 459,
610
+ "##1311 1112 ": 460,
611
+ "##2111 2112 ": 461,
612
+ "##2622 ": 462,
613
+ "##1212 1221 ": 463,
614
+ "22": 464,
615
+ "##1111 3111 ": 465,
616
+ "##1111 2221 ": 466,
617
+ "##1211 2211 ": 467,
618
+ "##3314 ": 468,
619
+ "1111 ": 469,
620
+ "##2244 ": 470,
621
+ "##2111 2211 ": 471,
622
+ "##1211 2121 ": 472,
623
+ "##2111 1212 ": 473,
624
+ "##2111 1221 ": 474,
625
+ "##1112 2211 ": 475,
626
+ "##1112 1221 ": 476,
627
+ "##1111 1311 ": 477,
628
+ "##3413 ": 478,
629
+ "##1163 ": 479,
630
+ "##1111 1222 ": 480,
631
+ "##1211 2112 ": 481,
632
+ "##1112 2112 ": 482,
633
+ "##234 ": 483,
634
+ "##4511 ": 484,
635
+ "##3611 ": 485,
636
+ "##2111 1122 ": 486,
637
+ "##2111 2121 ": 487,
638
+ "##1121 1122 ": 488,
639
+ "##6311 ": 489,
640
+ "##1211 1212 ": 490,
641
+ "##66": 491,
642
+ "##1433 ": 492,
643
+ "##1121 1221 ": 493,
644
+ "##1112 1122 ": 494,
645
+ "##2333 ": 495,
646
+ "##2531 ": 496,
647
+ "##1121 1212 ": 497,
648
+ "##1111 2122 ": 498,
649
+ "##1112 1212 ": 499
650
  }
651
  }
652
  }
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer"}
 
1
+ {"tokenizer_class": "PreTrainedTokenizerFast"}