deepnet committed on
Commit
a849302
·
verified ·
1 Parent(s): 1ed739c

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +63 -4
  2. tokenizer_config.json +1 -1
tokenizer.json CHANGED
@@ -2334,10 +2334,69 @@
2334
  ]
2335
  },
2336
  "post_processor": {
2337
- "type": "ByteLevel",
2338
- "add_prefix_space": true,
2339
- "trim_offsets": false,
2340
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2341
  },
2342
  "decoder": {
2343
  "type": "ByteLevel",
 
2334
  ]
2335
  },
2336
  "post_processor": {
2337
+ "type": "Sequence",
2338
+ "processors": [
2339
+ {
2340
+ "type": "ByteLevel",
2341
+ "add_prefix_space": true,
2342
+ "trim_offsets": false,
2343
+ "use_regex": true
2344
+ },
2345
+ {
2346
+ "type": "TemplateProcessing",
2347
+ "single": [
2348
+ {
2349
+ "SpecialToken": {
2350
+ "id": "<|begin_of_text|>",
2351
+ "type_id": 0
2352
+ }
2353
+ },
2354
+ {
2355
+ "Sequence": {
2356
+ "id": "A",
2357
+ "type_id": 0
2358
+ }
2359
+ }
2360
+ ],
2361
+ "pair": [
2362
+ {
2363
+ "SpecialToken": {
2364
+ "id": "<|begin_of_text|>",
2365
+ "type_id": 0
2366
+ }
2367
+ },
2368
+ {
2369
+ "Sequence": {
2370
+ "id": "A",
2371
+ "type_id": 0
2372
+ }
2373
+ },
2374
+ {
2375
+ "SpecialToken": {
2376
+ "id": "<|begin_of_text|>",
2377
+ "type_id": 1
2378
+ }
2379
+ },
2380
+ {
2381
+ "Sequence": {
2382
+ "id": "B",
2383
+ "type_id": 1
2384
+ }
2385
+ }
2386
+ ],
2387
+ "special_tokens": {
2388
+ "<|begin_of_text|>": {
2389
+ "id": "<|begin_of_text|>",
2390
+ "ids": [
2391
+ 128000
2392
+ ],
2393
+ "tokens": [
2394
+ "<|begin_of_text|>"
2395
+ ]
2396
+ }
2397
+ }
2398
+ }
2399
+ ]
2400
  },
2401
  "decoder": {
2402
  "type": "ByteLevel",
tokenizer_config.json CHANGED
@@ -2050,7 +2050,7 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|end_of_text|>",
2056
  "max_length": 2048,
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|end_of_text|>",
2056
  "max_length": 2048,