Upload tokenizer
#2
by Xenova HF Staff - opened
- tokenizer.json +53 -5
- tokenizer_config.json +1 -1
tokenizer.json
CHANGED
|
@@ -7370,14 +7370,62 @@
|
|
| 7370 |
"pre_tokenizer": {
|
| 7371 |
"type": "Metaspace",
|
| 7372 |
"replacement": "▁",
|
| 7373 |
-
"prepend_scheme": "
|
| 7374 |
"split": false
|
| 7375 |
},
|
| 7376 |
"post_processor": {
|
| 7377 |
-
"type": "
|
| 7378 |
-
"
|
| 7379 |
-
|
| 7380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7381 |
},
|
| 7382 |
"decoder": {
|
| 7383 |
"type": "Sequence",
|
|
|
|
| 7370 |
"pre_tokenizer": {
|
| 7371 |
"type": "Metaspace",
|
| 7372 |
"replacement": "▁",
|
| 7373 |
+
"prepend_scheme": "first",
|
| 7374 |
"split": false
|
| 7375 |
},
|
| 7376 |
"post_processor": {
|
| 7377 |
+
"type": "TemplateProcessing",
|
| 7378 |
+
"single": [
|
| 7379 |
+
{
|
| 7380 |
+
"SpecialToken": {
|
| 7381 |
+
"id": "<|begin▁of▁sentence|>",
|
| 7382 |
+
"type_id": 0
|
| 7383 |
+
}
|
| 7384 |
+
},
|
| 7385 |
+
{
|
| 7386 |
+
"Sequence": {
|
| 7387 |
+
"id": "A",
|
| 7388 |
+
"type_id": 0
|
| 7389 |
+
}
|
| 7390 |
+
}
|
| 7391 |
+
],
|
| 7392 |
+
"pair": [
|
| 7393 |
+
{
|
| 7394 |
+
"SpecialToken": {
|
| 7395 |
+
"id": "<|begin▁of▁sentence|>",
|
| 7396 |
+
"type_id": 0
|
| 7397 |
+
}
|
| 7398 |
+
},
|
| 7399 |
+
{
|
| 7400 |
+
"Sequence": {
|
| 7401 |
+
"id": "A",
|
| 7402 |
+
"type_id": 0
|
| 7403 |
+
}
|
| 7404 |
+
},
|
| 7405 |
+
{
|
| 7406 |
+
"SpecialToken": {
|
| 7407 |
+
"id": "<|begin▁of▁sentence|>",
|
| 7408 |
+
"type_id": 1
|
| 7409 |
+
}
|
| 7410 |
+
},
|
| 7411 |
+
{
|
| 7412 |
+
"Sequence": {
|
| 7413 |
+
"id": "B",
|
| 7414 |
+
"type_id": 1
|
| 7415 |
+
}
|
| 7416 |
+
}
|
| 7417 |
+
],
|
| 7418 |
+
"special_tokens": {
|
| 7419 |
+
"<|begin▁of▁sentence|>": {
|
| 7420 |
+
"id": "<|begin▁of▁sentence|>",
|
| 7421 |
+
"ids": [
|
| 7422 |
+
0
|
| 7423 |
+
],
|
| 7424 |
+
"tokens": [
|
| 7425 |
+
"<|begin▁of▁sentence|>"
|
| 7426 |
+
]
|
| 7427 |
+
}
|
| 7428 |
+
}
|
| 7429 |
},
|
| 7430 |
"decoder": {
|
| 7431 |
"type": "Sequence",
|
tokenizer_config.json
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
"bos_token": "<|begin▁of▁sentence|>",
|
| 5 |
"clean_up_tokenization_spaces": false,
|
| 6 |
"eos_token": "<|end▁of▁sentence|>",
|
| 7 |
-
"is_local":
|
| 8 |
"model_max_length": 131072,
|
| 9 |
"pad_token": "<|end▁of▁sentence|>",
|
| 10 |
"sp_model_kwargs": {},
|
|
|
|
| 4 |
"bos_token": "<|begin▁of▁sentence|>",
|
| 5 |
"clean_up_tokenization_spaces": false,
|
| 6 |
"eos_token": "<|end▁of▁sentence|>",
|
| 7 |
+
"is_local": true,
|
| 8 |
"model_max_length": 131072,
|
| 9 |
"pad_token": "<|end▁of▁sentence|>",
|
| 10 |
"sp_model_kwargs": {},
|