Upload tokenizer.json
Browse files- tokenizer.json +26 -7
tokenizer.json
CHANGED
|
@@ -184,7 +184,17 @@
|
|
| 184 |
"special": true
|
| 185 |
}
|
| 186 |
],
|
| 187 |
-
"normalizer": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
"pre_tokenizer": {
|
| 189 |
"type": "ByteLevel",
|
| 190 |
"add_prefix_space": false,
|
|
@@ -192,10 +202,19 @@
|
|
| 192 |
"use_regex": true
|
| 193 |
},
|
| 194 |
"post_processor": {
|
| 195 |
-
"type": "
|
| 196 |
-
"
|
| 197 |
-
"
|
| 198 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
},
|
| 200 |
"decoder": {
|
| 201 |
"type": "ByteLevel",
|
|
@@ -207,10 +226,10 @@
|
|
| 207 |
"type": "BPE",
|
| 208 |
"dropout": null,
|
| 209 |
"unk_token": null,
|
| 210 |
-
"continuing_subword_prefix": "",
|
| 211 |
"end_of_word_suffix": "",
|
| 212 |
"fuse_unk": false,
|
| 213 |
-
"byte_fallback": false,
|
| 214 |
"ignore_merges": false,
|
| 215 |
"vocab": {
|
| 216 |
"<|endoftext|>": 0,
|
|
|
|
| 184 |
"special": true
|
| 185 |
}
|
| 186 |
],
|
| 187 |
+
"normalizer": {
|
| 188 |
+
"type": "Sequence",
|
| 189 |
+
"normalizers": [
|
| 190 |
+
{
|
| 191 |
+
"type": "NFKC"
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"type": "Lowercase"
|
| 195 |
+
}
|
| 196 |
+
]
|
| 197 |
+
},
|
| 198 |
"pre_tokenizer": {
|
| 199 |
"type": "ByteLevel",
|
| 200 |
"add_prefix_space": false,
|
|
|
|
| 202 |
"use_regex": true
|
| 203 |
},
|
| 204 |
"post_processor": {
|
| 205 |
+
"type": "TemplateProcessing",
|
| 206 |
+
"single": "<|im_start|> $A <|im_end|>",
|
| 207 |
+
"pair": "<|im_start|> $A <|im_end|> <|im_start|> $B <|im_end|>",
|
| 208 |
+
"special_tokens": [
|
| 209 |
+
[
|
| 210 |
+
"<|im_start|>",
|
| 211 |
+
1
|
| 212 |
+
],
|
| 213 |
+
[
|
| 214 |
+
"<|im_end|>",
|
| 215 |
+
2
|
| 216 |
+
]
|
| 217 |
+
]
|
| 218 |
},
|
| 219 |
"decoder": {
|
| 220 |
"type": "ByteLevel",
|
|
|
|
| 226 |
"type": "BPE",
|
| 227 |
"dropout": null,
|
| 228 |
"unk_token": null,
|
| 229 |
+
"continuing_subword_prefix": "##",
|
| 230 |
"end_of_word_suffix": "",
|
| 231 |
"fuse_unk": false,
|
| 232 |
+
"byte_fallback": true,
|
| 233 |
"ignore_merges": false,
|
| 234 |
"vocab": {
|
| 235 |
"<|endoftext|>": 0,
|