summerstars committed on
Commit
d908648
·
verified ·
1 Parent(s): fcdb41e

Upload tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +26 -7
tokenizer.json CHANGED
@@ -184,7 +184,17 @@
184
  "special": true
185
  }
186
  ],
187
- "normalizer": null,
 
 
 
 
 
 
 
 
 
 
188
  "pre_tokenizer": {
189
  "type": "ByteLevel",
190
  "add_prefix_space": false,
@@ -192,10 +202,19 @@
192
  "use_regex": true
193
  },
194
  "post_processor": {
195
- "type": "ByteLevel",
196
- "add_prefix_space": true,
197
- "trim_offsets": false,
198
- "use_regex": true
 
 
 
 
 
 
 
 
 
199
  },
200
  "decoder": {
201
  "type": "ByteLevel",
@@ -207,10 +226,10 @@
207
  "type": "BPE",
208
  "dropout": null,
209
  "unk_token": null,
210
- "continuing_subword_prefix": "",
211
  "end_of_word_suffix": "",
212
  "fuse_unk": false,
213
- "byte_fallback": false,
214
  "ignore_merges": false,
215
  "vocab": {
216
  "<|endoftext|>": 0,
 
184
  "special": true
185
  }
186
  ],
187
+ "normalizer": {
188
+ "type": "Sequence",
189
+ "normalizers": [
190
+ {
191
+ "type": "NFKC"
192
+ },
193
+ {
194
+ "type": "Lowercase"
195
+ }
196
+ ]
197
+ },
198
  "pre_tokenizer": {
199
  "type": "ByteLevel",
200
  "add_prefix_space": false,
 
202
  "use_regex": true
203
  },
204
  "post_processor": {
205
+ "type": "TemplateProcessing",
206
+ "single": "<|im_start|> $A <|im_end|>",
207
+ "pair": "<|im_start|> $A <|im_end|> <|im_start|> $B <|im_end|>",
208
+ "special_tokens": [
209
+ [
210
+ "<|im_start|>",
211
+ 1
212
+ ],
213
+ [
214
+ "<|im_end|>",
215
+ 2
216
+ ]
217
+ ]
218
  },
219
  "decoder": {
220
  "type": "ByteLevel",
 
226
  "type": "BPE",
227
  "dropout": null,
228
  "unk_token": null,
229
+ "continuing_subword_prefix": "##",
230
  "end_of_word_suffix": "",
231
  "fuse_unk": false,
232
+ "byte_fallback": true,
233
  "ignore_merges": false,
234
  "vocab": {
235
  "<|endoftext|>": 0,