alexshah committed on
Commit
67620f3
·
verified ·
1 Parent(s): 2c57c6a

Upload tokenizer.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.json +70 -59
tokenizer.json CHANGED
@@ -2,11 +2,18 @@
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
- "max_length": 510,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
9
- "padding": null,
 
 
 
 
 
 
 
10
  "added_tokens": [
11
  {
12
  "id": 5139,
@@ -2343,69 +2350,73 @@
2343
  ]
2344
  },
2345
  "post_processor": {
2346
- "type": "Sequence",
2347
- "processors": [
2348
  {
2349
- "type": "ByteLevel",
2350
- "add_prefix_space": true,
2351
- "trim_offsets": false,
2352
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2353
  },
2354
  {
2355
- "type": "TemplateProcessing",
2356
- "single": [
2357
- {
2358
- "SpecialToken": {
2359
- "id": "<|begin_of_text|>",
2360
- "type_id": 0
2361
- }
2362
- },
2363
- {
2364
- "Sequence": {
2365
- "id": "A",
2366
- "type_id": 0
2367
- }
2368
- }
 
 
 
 
 
 
 
 
 
2369
  ],
2370
- "pair": [
2371
- {
2372
- "SpecialToken": {
2373
- "id": "<|begin_of_text|>",
2374
- "type_id": 0
2375
- }
2376
- },
2377
- {
2378
- "Sequence": {
2379
- "id": "A",
2380
- "type_id": 0
2381
- }
2382
- },
2383
- {
2384
- "SpecialToken": {
2385
- "id": "<|begin_of_text|>",
2386
- "type_id": 1
2387
- }
2388
- },
2389
- {
2390
- "Sequence": {
2391
- "id": "B",
2392
- "type_id": 1
2393
- }
2394
- }
2395
  ],
2396
- "special_tokens": {
2397
- "<|begin_of_text|>": {
2398
- "id": "<|begin_of_text|>",
2399
- "ids": [
2400
- 128000
2401
- ],
2402
- "tokens": [
2403
- "<|begin_of_text|>"
2404
- ]
2405
- }
2406
- }
2407
  }
2408
- ]
2409
  },
2410
  "decoder": {
2411
  "type": "ByteLevel",
 
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
+ "max_length": 512,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
9
+ "padding": {
10
+ "strategy": "BatchLongest",
11
+ "direction": "Left",
12
+ "pad_to_multiple_of": null,
13
+ "pad_id": 128000,
14
+ "pad_type_id": 0,
15
+ "pad_token": "[PAD]"
16
+ },
17
  "added_tokens": [
18
  {
19
  "id": 5139,
 
2350
  ]
2351
  },
2352
  "post_processor": {
2353
+ "type": "TemplateProcessing",
2354
+ "single": [
2355
  {
2356
+ "SpecialToken": {
2357
+ "id": "<|begin_of_text|>",
2358
+ "type_id": 0
2359
+ }
2360
+ },
2361
+ {
2362
+ "Sequence": {
2363
+ "id": "A",
2364
+ "type_id": 0
2365
+ }
2366
+ },
2367
+ {
2368
+ "SpecialToken": {
2369
+ "id": "<|end_of_text|>",
2370
+ "type_id": 0
2371
+ }
2372
+ }
2373
+ ],
2374
+ "pair": [
2375
+ {
2376
+ "SpecialToken": {
2377
+ "id": "<|begin_of_text|>",
2378
+ "type_id": 0
2379
+ }
2380
  },
2381
  {
2382
+ "Sequence": {
2383
+ "id": "A",
2384
+ "type_id": 0
2385
+ }
2386
+ },
2387
+ {
2388
+ "SpecialToken": {
2389
+ "id": "<|end_of_text|>",
2390
+ "type_id": 0
2391
+ }
2392
+ },
2393
+ {
2394
+ "Sequence": {
2395
+ "id": "B",
2396
+ "type_id": 1
2397
+ }
2398
+ }
2399
+ ],
2400
+ "special_tokens": {
2401
+ "<|begin_of_text|>": {
2402
+ "id": "<|begin_of_text|>",
2403
+ "ids": [
2404
+ 101497
2405
  ],
2406
+ "tokens": [
2407
+ "<|begin_of_text|>"
2408
+ ]
2409
+ },
2410
+ "<|end_of_text|>": {
2411
+ "id": "<|end_of_text|>",
2412
+ "ids": [
2413
+ 57368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2414
  ],
2415
+ "tokens": [
2416
+ "<|end_of_text|>"
2417
+ ]
 
 
 
 
 
 
 
 
2418
  }
2419
+ }
2420
  },
2421
  "decoder": {
2422
  "type": "ByteLevel",