Upload tokenizer
Browse files
- special_tokens_map.json +1 -1
- tokenizer.json +7 -28
- tokenizer_config.json +3 -3
special_tokens_map.json
CHANGED
|
@@ -16,7 +16,7 @@
|
|
| 16 |
"rstrip": false,
|
| 17 |
"single_word": false
|
| 18 |
},
|
| 19 |
-
"pad_token": "<
|
| 20 |
"unk_token": {
|
| 21 |
"content": "<unk>",
|
| 22 |
"lstrip": false,
|
|
|
|
| 16 |
"rstrip": false,
|
| 17 |
"single_word": false
|
| 18 |
},
|
| 19 |
+
"pad_token": "<unk>",
|
| 20 |
"unk_token": {
|
| 21 |
"content": "<unk>",
|
| 22 |
"lstrip": false,
|
tokenizer.json
CHANGED
|
@@ -1,6 +1,11 @@
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
-
"truncation": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"padding": null,
|
| 5 |
"added_tokens": [
|
| 6 |
{
|
|
@@ -404,12 +409,6 @@
|
|
| 404 |
"id": "A",
|
| 405 |
"type_id": 0
|
| 406 |
}
|
| 407 |
-
},
|
| 408 |
-
{
|
| 409 |
-
"SpecialToken": {
|
| 410 |
-
"id": "<|endoftext|>",
|
| 411 |
-
"type_id": 0
|
| 412 |
-
}
|
| 413 |
}
|
| 414 |
],
|
| 415 |
"pair": [
|
|
@@ -425,12 +424,6 @@
|
|
| 425 |
"type_id": 0
|
| 426 |
}
|
| 427 |
},
|
| 428 |
-
{
|
| 429 |
-
"SpecialToken": {
|
| 430 |
-
"id": "<|endoftext|>",
|
| 431 |
-
"type_id": 0
|
| 432 |
-
}
|
| 433 |
-
},
|
| 434 |
{
|
| 435 |
"SpecialToken": {
|
| 436 |
"id": "<s>",
|
|
@@ -442,12 +435,6 @@
|
|
| 442 |
"id": "B",
|
| 443 |
"type_id": 1
|
| 444 |
}
|
| 445 |
-
},
|
| 446 |
-
{
|
| 447 |
-
"SpecialToken": {
|
| 448 |
-
"id": "<|endoftext|>",
|
| 449 |
-
"type_id": 1
|
| 450 |
-
}
|
| 451 |
}
|
| 452 |
],
|
| 453 |
"special_tokens": {
|
|
@@ -459,15 +446,6 @@
|
|
| 459 |
"tokens": [
|
| 460 |
"<s>"
|
| 461 |
]
|
| 462 |
-
},
|
| 463 |
-
"<|endoftext|>": {
|
| 464 |
-
"id": "<|endoftext|>",
|
| 465 |
-
"ids": [
|
| 466 |
-
32000
|
| 467 |
-
],
|
| 468 |
-
"tokens": [
|
| 469 |
-
"<|endoftext|>"
|
| 470 |
-
]
|
| 471 |
}
|
| 472 |
}
|
| 473 |
},
|
|
@@ -503,6 +481,7 @@
|
|
| 503 |
"end_of_word_suffix": null,
|
| 504 |
"fuse_unk": true,
|
| 505 |
"byte_fallback": true,
|
|
|
|
| 506 |
"vocab": {
|
| 507 |
"<unk>": 0,
|
| 508 |
"<s>": 1,
|
|
|
|
| 1 |
{
|
| 2 |
"version": "1.0",
|
| 3 |
+
"truncation": {
|
| 4 |
+
"direction": "Right",
|
| 5 |
+
"max_length": 2048,
|
| 6 |
+
"strategy": "LongestFirst",
|
| 7 |
+
"stride": 0
|
| 8 |
+
},
|
| 9 |
"padding": null,
|
| 10 |
"added_tokens": [
|
| 11 |
{
|
|
|
|
| 409 |
"id": "A",
|
| 410 |
"type_id": 0
|
| 411 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
}
|
| 413 |
],
|
| 414 |
"pair": [
|
|
|
|
| 424 |
"type_id": 0
|
| 425 |
}
|
| 426 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
{
|
| 428 |
"SpecialToken": {
|
| 429 |
"id": "<s>",
|
|
|
|
| 435 |
"id": "B",
|
| 436 |
"type_id": 1
|
| 437 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
}
|
| 439 |
],
|
| 440 |
"special_tokens": {
|
|
|
|
| 446 |
"tokens": [
|
| 447 |
"<s>"
|
| 448 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
}
|
| 450 |
}
|
| 451 |
},
|
|
|
|
| 481 |
"end_of_word_suffix": null,
|
| 482 |
"fuse_unk": true,
|
| 483 |
"byte_fallback": true,
|
| 484 |
+
"ignore_merges": false,
|
| 485 |
"vocab": {
|
| 486 |
"<unk>": 0,
|
| 487 |
"<s>": 1,
|
tokenizer_config.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"add_bos_token": true,
|
| 3 |
-
"add_eos_token": true,
|
| 4 |
"added_tokens_decoder": {
|
| 5 |
"0": {
|
| 6 |
"content": "<unk>",
|
|
@@ -340,8 +340,8 @@
|
|
| 340 |
"eos_token": "<|endoftext|>",
|
| 341 |
"legacy": false,
|
| 342 |
"model_max_length": 4096,
|
| 343 |
-
"pad_token": "<
|
| 344 |
-
"padding_side": "left",
|
| 345 |
"sp_model_kwargs": {},
|
| 346 |
"tokenizer_class": "LlamaTokenizer",
|
| 347 |
"unk_token": "<unk>",
|
|
|
|
| 1 |
{
|
| 2 |
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
"added_tokens_decoder": {
|
| 5 |
"0": {
|
| 6 |
"content": "<unk>",
|
|
|
|
| 340 |
"eos_token": "<|endoftext|>",
|
| 341 |
"legacy": false,
|
| 342 |
"model_max_length": 4096,
|
| 343 |
+
"pad_token": "<unk>",
|
| 344 |
+
"padding_side": "right",
|
| 345 |
"sp_model_kwargs": {},
|
| 346 |
"tokenizer_class": "LlamaTokenizer",
|
| 347 |
"unk_token": "<unk>",
|