tampakwill committed on
Commit 568fb9d · verified · 1 Parent(s): bf15648

Upgrade to 50k Vocab - Support Al-Qur'an & Indo (No More Broken Bytes)

Files changed (4)
  1. merges.txt +0 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +7 -8
  4. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,13 +1,12 @@
 {
+  "add_prefix_space": false,
   "backend": "tokenizers",
-  "bos_token": "<s>",
-  "eos_token": "</s>",
-  "extra_special_tokens": [
-    "### User:",
-    "### AWA:"
-  ],
+  "bos_token": "[BOS]",
+  "eos_token": "[EOS]",
+  "errors": "replace",
+  "is_local": true,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
-  "tokenizer_class": "TokenizersBackend",
-  "unk_token": "[UNK]"
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
 }
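
The new config switches the tokenizer class to GPT2Tokenizer (byte-level BPE) with errors="replace", so every UTF-8 byte maps to a known token rather than [UNK]. A minimal sanity-check sketch follows; the repo id is a placeholder (this commit page does not name the repository), and loading via transformers' AutoTokenizer is an assumption based on the tokenizer_class field:

# Minimal sketch for sanity-checking the upgraded tokenizer.
# The repo id below is hypothetical, not taken from this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tampakwill/model")  # placeholder repo id

print(len(tokenizer))                            # expected: ~50k vocab entries
print(tokenizer.bos_token, tokenizer.eos_token,
      tokenizer.pad_token, tokenizer.unk_token)  # [BOS] [EOS] [PAD] <|endoftext|>

# Byte-level BPE covers all UTF-8 bytes, so Arabic (including Qur'anic
# diacritics) and Indonesian text should round-trip without broken bytes.
for text in ["بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", "Selamat pagi, apa kabar semuanya?"]:
    ids = tokenizer.encode(text, add_special_tokens=False)
    assert tokenizer.decode(ids, clean_up_tokenization_spaces=False) == text
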
vocab.json ADDED
The diff for this file is too large to render. See raw diff