Commit
·
26d1356
1
Parent(s):
b369210
removed unused special tokens
Browse files
- fix_tokens.py +10 -10
- tokenizer.json +20 -20
- tokenizer_config.json +10 -10
- vocab.json +10 -10
fix_tokens.py
CHANGED
|
@@ -79,16 +79,16 @@ DESIRED_MAPPING = [
|
|
| 79 |
SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
|
| 80 |
SpecialToken(id=100264, content="<|im_start|>", special=True),
|
| 81 |
SpecialToken(id=100265, content="<|im_end|>", special=True),
|
| 82 |
-
SpecialToken(id=100266, content="
|
| 83 |
-
SpecialToken(id=100267, content="
|
| 84 |
-
SpecialToken(id=100268, content="<
|
| 85 |
-
SpecialToken(id=100269, content="</
|
| 86 |
-
SpecialToken(id=100270, content="
|
| 87 |
-
SpecialToken(id=100271, content="
|
| 88 |
-
SpecialToken(id=100272, content="
|
| 89 |
-
SpecialToken(id=100273, content="
|
| 90 |
-
SpecialToken(id=100274, content="
|
| 91 |
-
SpecialToken(id=100275, content="
|
| 92 |
SpecialToken(id=100276, content="<|endofprompt|>", special=True),
|
| 93 |
SpecialToken(
|
| 94 |
id=100277,
|
|
|
|
| 79 |
SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
|
| 80 |
SpecialToken(id=100264, content="<|im_start|>", special=True),
|
| 81 |
SpecialToken(id=100265, content="<|im_end|>", special=True),
|
| 82 |
+
SpecialToken(id=100266, content="<functions>"),
|
| 83 |
+
SpecialToken(id=100267, content="</functions>"),
|
| 84 |
+
SpecialToken(id=100268, content="<function_calls>"),
|
| 85 |
+
SpecialToken(id=100269, content="</function_calls>"),
|
| 86 |
+
SpecialToken(id=100270, content="<|extra_id_1|>"),
|
| 87 |
+
SpecialToken(id=100271, content="<|extra_id_2|>"),
|
| 88 |
+
SpecialToken(id=100272, content="<|extra_id_3|>"),
|
| 89 |
+
SpecialToken(id=100273, content="<|extra_id_4|>"),
|
| 90 |
+
SpecialToken(id=100274, content="<|extra_id_5|>"),
|
| 91 |
+
SpecialToken(id=100275, content="<|extra_id_6|>"),
|
| 92 |
SpecialToken(id=100276, content="<|endofprompt|>", special=True),
|
| 93 |
SpecialToken(
|
| 94 |
id=100277,
|
tokenizer.json
CHANGED
|
@@ -95,7 +95,7 @@
|
|
| 95 |
},
|
| 96 |
{
|
| 97 |
"id": 100266,
|
| 98 |
-
"content": "
|
| 99 |
"lstrip": false,
|
| 100 |
"normalized": false,
|
| 101 |
"rstrip": false,
|
|
@@ -104,7 +104,7 @@
|
|
| 104 |
},
|
| 105 |
{
|
| 106 |
"id": 100267,
|
| 107 |
-
"content": "
|
| 108 |
"lstrip": false,
|
| 109 |
"normalized": false,
|
| 110 |
"rstrip": false,
|
|
@@ -113,7 +113,7 @@
|
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"id": 100268,
|
| 116 |
-
"content": "<
|
| 117 |
"lstrip": false,
|
| 118 |
"normalized": false,
|
| 119 |
"rstrip": false,
|
|
@@ -122,7 +122,7 @@
|
|
| 122 |
},
|
| 123 |
{
|
| 124 |
"id": 100269,
|
| 125 |
-
"content": "</
|
| 126 |
"lstrip": false,
|
| 127 |
"normalized": false,
|
| 128 |
"rstrip": false,
|
|
@@ -131,7 +131,7 @@
|
|
| 131 |
},
|
| 132 |
{
|
| 133 |
"id": 100270,
|
| 134 |
-
"content": "
|
| 135 |
"lstrip": false,
|
| 136 |
"normalized": false,
|
| 137 |
"rstrip": false,
|
|
@@ -140,7 +140,7 @@
|
|
| 140 |
},
|
| 141 |
{
|
| 142 |
"id": 100271,
|
| 143 |
-
"content": "
|
| 144 |
"lstrip": false,
|
| 145 |
"normalized": false,
|
| 146 |
"rstrip": false,
|
|
@@ -149,7 +149,7 @@
|
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"id": 100272,
|
| 152 |
-
"content": "
|
| 153 |
"lstrip": false,
|
| 154 |
"normalized": false,
|
| 155 |
"rstrip": false,
|
|
@@ -158,7 +158,7 @@
|
|
| 158 |
},
|
| 159 |
{
|
| 160 |
"id": 100273,
|
| 161 |
-
"content": "
|
| 162 |
"lstrip": false,
|
| 163 |
"normalized": false,
|
| 164 |
"rstrip": false,
|
|
@@ -167,7 +167,7 @@
|
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"id": 100274,
|
| 170 |
-
"content": "
|
| 171 |
"lstrip": false,
|
| 172 |
"normalized": false,
|
| 173 |
"rstrip": false,
|
|
@@ -176,7 +176,7 @@
|
|
| 176 |
},
|
| 177 |
{
|
| 178 |
"id": 100275,
|
| 179 |
-
"content": "
|
| 180 |
"lstrip": false,
|
| 181 |
"normalized": false,
|
| 182 |
"rstrip": false,
|
|
@@ -100495,6 +100495,10 @@
|
|
| 100495 |
".WaitFor": 100253,
|
| 100496 |
"Ġdaycare": 100254,
|
| 100497 |
"ĠConveyor": 100255,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100498 |
"<|extra_id_0|>": 100256,
|
| 100499 |
"<|endoftext|>": 100257,
|
| 100500 |
"<|fim_prefix|>": 100258,
|
|
@@ -100505,16 +100509,12 @@
|
|
| 100505 |
"|||IP_ADDRESS|||": 100263,
|
| 100506 |
"<|im_start|>": 100264,
|
| 100507 |
"<|im_end|>": 100265,
|
| 100508 |
-
"<|extra_id_1|>": 100266,
|
| 100509 |
-
"<|extra_id_2|>": 100267,
|
| 100510 |
-
"
|
| 100511 |
-
"
|
| 100512 |
-
"
|
| 100513 |
-
"
|
| 100514 |
-
"<function_calls>": 100272,
|
| 100515 |
-
"</function_calls>": 100273,
|
| 100516 |
-
"<answer>": 100274,
|
| 100517 |
-
"</answer>": 100275,
|
| 100518 |
"<|endofprompt|>": 100276,
|
| 100519 |
"<|pad|>": 100277
|
| 100520 |
},
|
|
|
|
| 95 |
},
|
| 96 |
{
|
| 97 |
"id": 100266,
|
| 98 |
+
"content": "<functions>",
|
| 99 |
"lstrip": false,
|
| 100 |
"normalized": false,
|
| 101 |
"rstrip": false,
|
|
|
|
| 104 |
},
|
| 105 |
{
|
| 106 |
"id": 100267,
|
| 107 |
+
"content": "</functions>",
|
| 108 |
"lstrip": false,
|
| 109 |
"normalized": false,
|
| 110 |
"rstrip": false,
|
|
|
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"id": 100268,
|
| 116 |
+
"content": "<function_calls>",
|
| 117 |
"lstrip": false,
|
| 118 |
"normalized": false,
|
| 119 |
"rstrip": false,
|
|
|
|
| 122 |
},
|
| 123 |
{
|
| 124 |
"id": 100269,
|
| 125 |
+
"content": "</function_calls>",
|
| 126 |
"lstrip": false,
|
| 127 |
"normalized": false,
|
| 128 |
"rstrip": false,
|
|
|
|
| 131 |
},
|
| 132 |
{
|
| 133 |
"id": 100270,
|
| 134 |
+
"content": "<|extra_id_1|>",
|
| 135 |
"lstrip": false,
|
| 136 |
"normalized": false,
|
| 137 |
"rstrip": false,
|
|
|
|
| 140 |
},
|
| 141 |
{
|
| 142 |
"id": 100271,
|
| 143 |
+
"content": "<|extra_id_2|>",
|
| 144 |
"lstrip": false,
|
| 145 |
"normalized": false,
|
| 146 |
"rstrip": false,
|
|
|
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"id": 100272,
|
| 152 |
+
"content": "<|extra_id_3|>",
|
| 153 |
"lstrip": false,
|
| 154 |
"normalized": false,
|
| 155 |
"rstrip": false,
|
|
|
|
| 158 |
},
|
| 159 |
{
|
| 160 |
"id": 100273,
|
| 161 |
+
"content": "<|extra_id_4|>",
|
| 162 |
"lstrip": false,
|
| 163 |
"normalized": false,
|
| 164 |
"rstrip": false,
|
|
|
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"id": 100274,
|
| 170 |
+
"content": "<|extra_id_5|>",
|
| 171 |
"lstrip": false,
|
| 172 |
"normalized": false,
|
| 173 |
"rstrip": false,
|
|
|
|
| 176 |
},
|
| 177 |
{
|
| 178 |
"id": 100275,
|
| 179 |
+
"content": "<|extra_id_6|>",
|
| 180 |
"lstrip": false,
|
| 181 |
"normalized": false,
|
| 182 |
"rstrip": false,
|
|
|
|
| 100495 |
".WaitFor": 100253,
|
| 100496 |
"Ġdaycare": 100254,
|
| 100497 |
"ĠConveyor": 100255,
|
| 100498 |
+
"<functions>": 100266,
|
| 100499 |
+
"</functions>": 100267,
|
| 100500 |
+
"<function_calls>": 100268,
|
| 100501 |
+
"</function_calls>": 100269,
|
| 100502 |
"<|extra_id_0|>": 100256,
|
| 100503 |
"<|endoftext|>": 100257,
|
| 100504 |
"<|fim_prefix|>": 100258,
|
|
|
|
| 100509 |
"|||IP_ADDRESS|||": 100263,
|
| 100510 |
"<|im_start|>": 100264,
|
| 100511 |
"<|im_end|>": 100265,
|
| 100512 |
+
"<|extra_id_1|>": 100270,
|
| 100513 |
+
"<|extra_id_2|>": 100271,
|
| 100514 |
+
"<|extra_id_3|>": 100272,
|
| 100515 |
+
"<|extra_id_4|>": 100273,
|
| 100516 |
+
"<|extra_id_5|>": 100274,
|
| 100517 |
+
"<|extra_id_6|>": 100275,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100518 |
"<|endofprompt|>": 100276,
|
| 100519 |
"<|pad|>": 100277
|
| 100520 |
},
|
tokenizer_config.json
CHANGED
|
@@ -82,7 +82,7 @@
|
|
| 82 |
"special": true
|
| 83 |
},
|
| 84 |
"100266": {
|
| 85 |
-
"content": "
|
| 86 |
"lstrip": false,
|
| 87 |
"normalized": false,
|
| 88 |
"rstrip": false,
|
|
@@ -90,7 +90,7 @@
|
|
| 90 |
"special": false
|
| 91 |
},
|
| 92 |
"100267": {
|
| 93 |
-
"content": "
|
| 94 |
"lstrip": false,
|
| 95 |
"normalized": false,
|
| 96 |
"rstrip": false,
|
|
@@ -98,7 +98,7 @@
|
|
| 98 |
"special": false
|
| 99 |
},
|
| 100 |
"100268": {
|
| 101 |
-
"content": "<
|
| 102 |
"lstrip": false,
|
| 103 |
"normalized": false,
|
| 104 |
"rstrip": false,
|
|
@@ -106,7 +106,7 @@
|
|
| 106 |
"special": false
|
| 107 |
},
|
| 108 |
"100269": {
|
| 109 |
-
"content": "</
|
| 110 |
"lstrip": false,
|
| 111 |
"normalized": false,
|
| 112 |
"rstrip": false,
|
|
@@ -114,7 +114,7 @@
|
|
| 114 |
"special": false
|
| 115 |
},
|
| 116 |
"100270": {
|
| 117 |
-
"content": "
|
| 118 |
"lstrip": false,
|
| 119 |
"normalized": false,
|
| 120 |
"rstrip": false,
|
|
@@ -122,7 +122,7 @@
|
|
| 122 |
"special": false
|
| 123 |
},
|
| 124 |
"100271": {
|
| 125 |
-
"content": "
|
| 126 |
"lstrip": false,
|
| 127 |
"normalized": false,
|
| 128 |
"rstrip": false,
|
|
@@ -130,7 +130,7 @@
|
|
| 130 |
"special": false
|
| 131 |
},
|
| 132 |
"100272": {
|
| 133 |
-
"content": "
|
| 134 |
"lstrip": false,
|
| 135 |
"normalized": false,
|
| 136 |
"rstrip": false,
|
|
@@ -138,7 +138,7 @@
|
|
| 138 |
"special": false
|
| 139 |
},
|
| 140 |
"100273": {
|
| 141 |
-
"content": "
|
| 142 |
"lstrip": false,
|
| 143 |
"normalized": false,
|
| 144 |
"rstrip": false,
|
|
@@ -146,7 +146,7 @@
|
|
| 146 |
"special": false
|
| 147 |
},
|
| 148 |
"100274": {
|
| 149 |
-
"content": "
|
| 150 |
"lstrip": false,
|
| 151 |
"normalized": false,
|
| 152 |
"rstrip": false,
|
|
@@ -154,7 +154,7 @@
|
|
| 154 |
"special": false
|
| 155 |
},
|
| 156 |
"100275": {
|
| 157 |
-
"content": "
|
| 158 |
"lstrip": false,
|
| 159 |
"normalized": false,
|
| 160 |
"rstrip": false,
|
|
|
|
| 82 |
"special": true
|
| 83 |
},
|
| 84 |
"100266": {
|
| 85 |
+
"content": "<functions>",
|
| 86 |
"lstrip": false,
|
| 87 |
"normalized": false,
|
| 88 |
"rstrip": false,
|
|
|
|
| 90 |
"special": false
|
| 91 |
},
|
| 92 |
"100267": {
|
| 93 |
+
"content": "</functions>",
|
| 94 |
"lstrip": false,
|
| 95 |
"normalized": false,
|
| 96 |
"rstrip": false,
|
|
|
|
| 98 |
"special": false
|
| 99 |
},
|
| 100 |
"100268": {
|
| 101 |
+
"content": "<function_calls>",
|
| 102 |
"lstrip": false,
|
| 103 |
"normalized": false,
|
| 104 |
"rstrip": false,
|
|
|
|
| 106 |
"special": false
|
| 107 |
},
|
| 108 |
"100269": {
|
| 109 |
+
"content": "</function_calls>",
|
| 110 |
"lstrip": false,
|
| 111 |
"normalized": false,
|
| 112 |
"rstrip": false,
|
|
|
|
| 114 |
"special": false
|
| 115 |
},
|
| 116 |
"100270": {
|
| 117 |
+
"content": "<|extra_id_1|>",
|
| 118 |
"lstrip": false,
|
| 119 |
"normalized": false,
|
| 120 |
"rstrip": false,
|
|
|
|
| 122 |
"special": false
|
| 123 |
},
|
| 124 |
"100271": {
|
| 125 |
+
"content": "<|extra_id_2|>",
|
| 126 |
"lstrip": false,
|
| 127 |
"normalized": false,
|
| 128 |
"rstrip": false,
|
|
|
|
| 130 |
"special": false
|
| 131 |
},
|
| 132 |
"100272": {
|
| 133 |
+
"content": "<|extra_id_3|>",
|
| 134 |
"lstrip": false,
|
| 135 |
"normalized": false,
|
| 136 |
"rstrip": false,
|
|
|
|
| 138 |
"special": false
|
| 139 |
},
|
| 140 |
"100273": {
|
| 141 |
+
"content": "<|extra_id_4|>",
|
| 142 |
"lstrip": false,
|
| 143 |
"normalized": false,
|
| 144 |
"rstrip": false,
|
|
|
|
| 146 |
"special": false
|
| 147 |
},
|
| 148 |
"100274": {
|
| 149 |
+
"content": "<|extra_id_5|>",
|
| 150 |
"lstrip": false,
|
| 151 |
"normalized": false,
|
| 152 |
"rstrip": false,
|
|
|
|
| 154 |
"special": false
|
| 155 |
},
|
| 156 |
"100275": {
|
| 157 |
+
"content": "<|extra_id_6|>",
|
| 158 |
"lstrip": false,
|
| 159 |
"normalized": false,
|
| 160 |
"rstrip": false,
|
vocab.json
CHANGED
|
@@ -100255,6 +100255,10 @@
|
|
| 100255 |
".WaitFor": 100253,
|
| 100256 |
"Ġdaycare": 100254,
|
| 100257 |
"ĠConveyor": 100255,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100258 |
"<|extra_id_0|>": 100256,
|
| 100259 |
"<|endoftext|>": 100257,
|
| 100260 |
"<|fim_prefix|>": 100258,
|
|
@@ -100265,16 +100269,12 @@
|
|
| 100265 |
"|||IP_ADDRESS|||": 100263,
|
| 100266 |
"<|im_start|>": 100264,
|
| 100267 |
"<|im_end|>": 100265,
|
| 100268 |
-
"<|extra_id_1|>": 100266,
|
| 100269 |
-
"<|extra_id_2|>": 100267,
|
| 100270 |
-
"
|
| 100271 |
-
"
|
| 100272 |
-
"
|
| 100273 |
-
"
|
| 100274 |
-
"<function_calls>": 100272,
|
| 100275 |
-
"</function_calls>": 100273,
|
| 100276 |
-
"<answer>": 100274,
|
| 100277 |
-
"</answer>": 100275,
|
| 100278 |
"<|endofprompt|>": 100276,
|
| 100279 |
"<|pad|>": 100277
|
| 100280 |
}
|
|
|
|
| 100255 |
".WaitFor": 100253,
|
| 100256 |
"Ġdaycare": 100254,
|
| 100257 |
"ĠConveyor": 100255,
|
| 100258 |
+
"<functions>": 100266,
|
| 100259 |
+
"</functions>": 100267,
|
| 100260 |
+
"<function_calls>": 100268,
|
| 100261 |
+
"</function_calls>": 100269,
|
| 100262 |
"<|extra_id_0|>": 100256,
|
| 100263 |
"<|endoftext|>": 100257,
|
| 100264 |
"<|fim_prefix|>": 100258,
|
|
|
|
| 100269 |
"|||IP_ADDRESS|||": 100263,
|
| 100270 |
"<|im_start|>": 100264,
|
| 100271 |
"<|im_end|>": 100265,
|
| 100272 |
+
"<|extra_id_1|>": 100270,
|
| 100273 |
+
"<|extra_id_2|>": 100271,
|
| 100274 |
+
"<|extra_id_3|>": 100272,
|
| 100275 |
+
"<|extra_id_4|>": 100273,
|
| 100276 |
+
"<|extra_id_5|>": 100274,
|
| 100277 |
+
"<|extra_id_6|>": 100275,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100278 |
"<|endofprompt|>": 100276,
|
| 100279 |
"<|pad|>": 100277
|
| 100280 |
}
|