Upload tokenizer

#2
opened by Xenova (HF Staff)
Files changed (2) hide show
  1. tokenizer.json +53 -5
  2. tokenizer_config.json +1 -1
tokenizer.json CHANGED
@@ -7370,14 +7370,62 @@
7370
  "pre_tokenizer": {
7371
  "type": "Metaspace",
7372
  "replacement": "▁",
7373
- "prepend_scheme": "always",
7374
  "split": false
7375
  },
7376
  "post_processor": {
7377
- "type": "ByteLevel",
7378
- "add_prefix_space": true,
7379
- "trim_offsets": false,
7380
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7381
  },
7382
  "decoder": {
7383
  "type": "Sequence",
 
7370
  "pre_tokenizer": {
7371
  "type": "Metaspace",
7372
  "replacement": "▁",
7373
+ "prepend_scheme": "first",
7374
  "split": false
7375
  },
7376
  "post_processor": {
7377
+ "type": "TemplateProcessing",
7378
+ "single": [
7379
+ {
7380
+ "SpecialToken": {
7381
+ "id": "<|begin▁of▁sentence|>",
7382
+ "type_id": 0
7383
+ }
7384
+ },
7385
+ {
7386
+ "Sequence": {
7387
+ "id": "A",
7388
+ "type_id": 0
7389
+ }
7390
+ }
7391
+ ],
7392
+ "pair": [
7393
+ {
7394
+ "SpecialToken": {
7395
+ "id": "<|begin▁of▁sentence|>",
7396
+ "type_id": 0
7397
+ }
7398
+ },
7399
+ {
7400
+ "Sequence": {
7401
+ "id": "A",
7402
+ "type_id": 0
7403
+ }
7404
+ },
7405
+ {
7406
+ "SpecialToken": {
7407
+ "id": "<|begin▁of▁sentence|>",
7408
+ "type_id": 1
7409
+ }
7410
+ },
7411
+ {
7412
+ "Sequence": {
7413
+ "id": "B",
7414
+ "type_id": 1
7415
+ }
7416
+ }
7417
+ ],
7418
+ "special_tokens": {
7419
+ "<|begin▁of▁sentence|>": {
7420
+ "id": "<|begin▁of▁sentence|>",
7421
+ "ids": [
7422
+ 0
7423
+ ],
7424
+ "tokens": [
7425
+ "<|begin▁of▁sentence|>"
7426
+ ]
7427
+ }
7428
+ }
7429
  },
7430
  "decoder": {
7431
  "type": "Sequence",
tokenizer_config.json CHANGED
@@ -4,7 +4,7 @@
4
  "bos_token": "<|begin▁of▁sentence|>",
5
  "clean_up_tokenization_spaces": false,
6
  "eos_token": "<|end▁of▁sentence|>",
7
- "is_local": false,
8
  "model_max_length": 131072,
9
  "pad_token": "<|end▁of▁sentence|>",
10
  "sp_model_kwargs": {},
 
4
  "bos_token": "<|begin▁of▁sentence|>",
5
  "clean_up_tokenization_spaces": false,
6
  "eos_token": "<|end▁of▁sentence|>",
7
+ "is_local": true,
8
  "model_max_length": 131072,
9
  "pad_token": "<|end▁of▁sentence|>",
10
  "sp_model_kwargs": {},