saumyamalik committed on
Commit
26d1356
·
1 Parent(s): b369210

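removed unused special tokens (<think>, </think>, <answer>, </answer>), replaced them with <|extra_id_3|>–<|extra_id_6|> placeholders, and renumbered the remaining tokens in IDs 100266–100275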

Browse files
Files changed (4) hide show
  1. fix_tokens.py +10 -10
  2. tokenizer.json +20 -20
  3. tokenizer_config.json +10 -10
  4. vocab.json +10 -10
fix_tokens.py CHANGED
@@ -79,16 +79,16 @@ DESIRED_MAPPING = [
79
  SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
80
  SpecialToken(id=100264, content="<|im_start|>", special=True),
81
  SpecialToken(id=100265, content="<|im_end|>", special=True),
82
- SpecialToken(id=100266, content="<|extra_id_1|>"),
83
- SpecialToken(id=100267, content="<|extra_id_2|>"),
84
- SpecialToken(id=100268, content="<think>"),
85
- SpecialToken(id=100269, content="</think>"),
86
- SpecialToken(id=100270, content="<functions>"),
87
- SpecialToken(id=100271, content="</functions>"),
88
- SpecialToken(id=100272, content="<function_calls>"),
89
- SpecialToken(id=100273, content="</function_calls>"),
90
- SpecialToken(id=100274, content="<answer>"),
91
- SpecialToken(id=100275, content="</answer>"),
92
  SpecialToken(id=100276, content="<|endofprompt|>", special=True),
93
  SpecialToken(
94
  id=100277,
 
79
  SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
80
  SpecialToken(id=100264, content="<|im_start|>", special=True),
81
  SpecialToken(id=100265, content="<|im_end|>", special=True),
82
+ SpecialToken(id=100266, content="<functions>"),
83
+ SpecialToken(id=100267, content="</functions>"),
84
+ SpecialToken(id=100268, content="<function_calls>"),
85
+ SpecialToken(id=100269, content="</function_calls>"),
86
+ SpecialToken(id=100270, content="<|extra_id_1|>"),
87
+ SpecialToken(id=100271, content="<|extra_id_2|>"),
88
+ SpecialToken(id=100272, content="<|extra_id_3|>"),
89
+ SpecialToken(id=100273, content="<|extra_id_4|>"),
90
+ SpecialToken(id=100274, content="<|extra_id_5|>"),
91
+ SpecialToken(id=100275, content="<|extra_id_6|>"),
92
  SpecialToken(id=100276, content="<|endofprompt|>", special=True),
93
  SpecialToken(
94
  id=100277,
tokenizer.json CHANGED
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 100266,
98
- "content": "<|extra_id_1|>",
99
  "lstrip": false,
100
  "normalized": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 100267,
107
- "content": "<|extra_id_2|>",
108
  "lstrip": false,
109
  "normalized": false,
110
  "rstrip": false,
@@ -113,7 +113,7 @@
113
  },
114
  {
115
  "id": 100268,
116
- "content": "<think>",
117
  "lstrip": false,
118
  "normalized": false,
119
  "rstrip": false,
@@ -122,7 +122,7 @@
122
  },
123
  {
124
  "id": 100269,
125
- "content": "</think>",
126
  "lstrip": false,
127
  "normalized": false,
128
  "rstrip": false,
@@ -131,7 +131,7 @@
131
  },
132
  {
133
  "id": 100270,
134
- "content": "<functions>",
135
  "lstrip": false,
136
  "normalized": false,
137
  "rstrip": false,
@@ -140,7 +140,7 @@
140
  },
141
  {
142
  "id": 100271,
143
- "content": "</functions>",
144
  "lstrip": false,
145
  "normalized": false,
146
  "rstrip": false,
@@ -149,7 +149,7 @@
149
  },
150
  {
151
  "id": 100272,
152
- "content": "<function_calls>",
153
  "lstrip": false,
154
  "normalized": false,
155
  "rstrip": false,
@@ -158,7 +158,7 @@
158
  },
159
  {
160
  "id": 100273,
161
- "content": "</function_calls>",
162
  "lstrip": false,
163
  "normalized": false,
164
  "rstrip": false,
@@ -167,7 +167,7 @@
167
  },
168
  {
169
  "id": 100274,
170
- "content": "<answer>",
171
  "lstrip": false,
172
  "normalized": false,
173
  "rstrip": false,
@@ -176,7 +176,7 @@
176
  },
177
  {
178
  "id": 100275,
179
- "content": "</answer>",
180
  "lstrip": false,
181
  "normalized": false,
182
  "rstrip": false,
@@ -100495,6 +100495,10 @@
100495
  ".WaitFor": 100253,
100496
  "Ġdaycare": 100254,
100497
  "ĠConveyor": 100255,
 
 
 
 
100498
  "<|extra_id_0|>": 100256,
100499
  "<|endoftext|>": 100257,
100500
  "<|fim_prefix|>": 100258,
@@ -100505,16 +100509,12 @@
100505
  "|||IP_ADDRESS|||": 100263,
100506
  "<|im_start|>": 100264,
100507
  "<|im_end|>": 100265,
100508
- "<|extra_id_1|>": 100266,
100509
- "<|extra_id_2|>": 100267,
100510
- "<think>": 100268,
100511
- "</think>": 100269,
100512
- "<functions>": 100270,
100513
- "</functions>": 100271,
100514
- "<function_calls>": 100272,
100515
- "</function_calls>": 100273,
100516
- "<answer>": 100274,
100517
- "</answer>": 100275,
100518
  "<|endofprompt|>": 100276,
100519
  "<|pad|>": 100277
100520
  },
 
95
  },
96
  {
97
  "id": 100266,
98
+ "content": "<functions>",
99
  "lstrip": false,
100
  "normalized": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 100267,
107
+ "content": "</functions>",
108
  "lstrip": false,
109
  "normalized": false,
110
  "rstrip": false,
 
113
  },
114
  {
115
  "id": 100268,
116
+ "content": "<function_calls>",
117
  "lstrip": false,
118
  "normalized": false,
119
  "rstrip": false,
 
122
  },
123
  {
124
  "id": 100269,
125
+ "content": "</function_calls>",
126
  "lstrip": false,
127
  "normalized": false,
128
  "rstrip": false,
 
131
  },
132
  {
133
  "id": 100270,
134
+ "content": "<|extra_id_1|>",
135
  "lstrip": false,
136
  "normalized": false,
137
  "rstrip": false,
 
140
  },
141
  {
142
  "id": 100271,
143
+ "content": "<|extra_id_2|>",
144
  "lstrip": false,
145
  "normalized": false,
146
  "rstrip": false,
 
149
  },
150
  {
151
  "id": 100272,
152
+ "content": "<|extra_id_3|>",
153
  "lstrip": false,
154
  "normalized": false,
155
  "rstrip": false,
 
158
  },
159
  {
160
  "id": 100273,
161
+ "content": "<|extra_id_4|>",
162
  "lstrip": false,
163
  "normalized": false,
164
  "rstrip": false,
 
167
  },
168
  {
169
  "id": 100274,
170
+ "content": "<|extra_id_5|>",
171
  "lstrip": false,
172
  "normalized": false,
173
  "rstrip": false,
 
176
  },
177
  {
178
  "id": 100275,
179
+ "content": "<|extra_id_6|>",
180
  "lstrip": false,
181
  "normalized": false,
182
  "rstrip": false,
 
100495
  ".WaitFor": 100253,
100496
  "Ġdaycare": 100254,
100497
  "ĠConveyor": 100255,
100498
+ "<functions>": 100266,
100499
+ "</functions>": 100267,
100500
+ "<function_calls>": 100268,
100501
+ "</function_calls>": 100269,
100502
  "<|extra_id_0|>": 100256,
100503
  "<|endoftext|>": 100257,
100504
  "<|fim_prefix|>": 100258,
 
100509
  "|||IP_ADDRESS|||": 100263,
100510
  "<|im_start|>": 100264,
100511
  "<|im_end|>": 100265,
100512
+ "<|extra_id_1|>": 100270,
100513
+ "<|extra_id_2|>": 100271,
100514
+ "<|extra_id_3|>": 100272,
100515
+ "<|extra_id_4|>": 100273,
100516
+ "<|extra_id_5|>": 100274,
100517
+ "<|extra_id_6|>": 100275,
 
 
 
 
100518
  "<|endofprompt|>": 100276,
100519
  "<|pad|>": 100277
100520
  },
tokenizer_config.json CHANGED
@@ -82,7 +82,7 @@
82
  "special": true
83
  },
84
  "100266": {
85
- "content": "<|extra_id_1|>",
86
  "lstrip": false,
87
  "normalized": false,
88
  "rstrip": false,
@@ -90,7 +90,7 @@
90
  "special": false
91
  },
92
  "100267": {
93
- "content": "<|extra_id_2|>",
94
  "lstrip": false,
95
  "normalized": false,
96
  "rstrip": false,
@@ -98,7 +98,7 @@
98
  "special": false
99
  },
100
  "100268": {
101
- "content": "<think>",
102
  "lstrip": false,
103
  "normalized": false,
104
  "rstrip": false,
@@ -106,7 +106,7 @@
106
  "special": false
107
  },
108
  "100269": {
109
- "content": "</think>",
110
  "lstrip": false,
111
  "normalized": false,
112
  "rstrip": false,
@@ -114,7 +114,7 @@
114
  "special": false
115
  },
116
  "100270": {
117
- "content": "<functions>",
118
  "lstrip": false,
119
  "normalized": false,
120
  "rstrip": false,
@@ -122,7 +122,7 @@
122
  "special": false
123
  },
124
  "100271": {
125
- "content": "</functions>",
126
  "lstrip": false,
127
  "normalized": false,
128
  "rstrip": false,
@@ -130,7 +130,7 @@
130
  "special": false
131
  },
132
  "100272": {
133
- "content": "<function_calls>",
134
  "lstrip": false,
135
  "normalized": false,
136
  "rstrip": false,
@@ -138,7 +138,7 @@
138
  "special": false
139
  },
140
  "100273": {
141
- "content": "</function_calls>",
142
  "lstrip": false,
143
  "normalized": false,
144
  "rstrip": false,
@@ -146,7 +146,7 @@
146
  "special": false
147
  },
148
  "100274": {
149
- "content": "<answer>",
150
  "lstrip": false,
151
  "normalized": false,
152
  "rstrip": false,
@@ -154,7 +154,7 @@
154
  "special": false
155
  },
156
  "100275": {
157
- "content": "</answer>",
158
  "lstrip": false,
159
  "normalized": false,
160
  "rstrip": false,
 
82
  "special": true
83
  },
84
  "100266": {
85
+ "content": "<functions>",
86
  "lstrip": false,
87
  "normalized": false,
88
  "rstrip": false,
 
90
  "special": false
91
  },
92
  "100267": {
93
+ "content": "</functions>",
94
  "lstrip": false,
95
  "normalized": false,
96
  "rstrip": false,
 
98
  "special": false
99
  },
100
  "100268": {
101
+ "content": "<function_calls>",
102
  "lstrip": false,
103
  "normalized": false,
104
  "rstrip": false,
 
106
  "special": false
107
  },
108
  "100269": {
109
+ "content": "</function_calls>",
110
  "lstrip": false,
111
  "normalized": false,
112
  "rstrip": false,
 
114
  "special": false
115
  },
116
  "100270": {
117
+ "content": "<|extra_id_1|>",
118
  "lstrip": false,
119
  "normalized": false,
120
  "rstrip": false,
 
122
  "special": false
123
  },
124
  "100271": {
125
+ "content": "<|extra_id_2|>",
126
  "lstrip": false,
127
  "normalized": false,
128
  "rstrip": false,
 
130
  "special": false
131
  },
132
  "100272": {
133
+ "content": "<|extra_id_3|>",
134
  "lstrip": false,
135
  "normalized": false,
136
  "rstrip": false,
 
138
  "special": false
139
  },
140
  "100273": {
141
+ "content": "<|extra_id_4|>",
142
  "lstrip": false,
143
  "normalized": false,
144
  "rstrip": false,
 
146
  "special": false
147
  },
148
  "100274": {
149
+ "content": "<|extra_id_5|>",
150
  "lstrip": false,
151
  "normalized": false,
152
  "rstrip": false,
 
154
  "special": false
155
  },
156
  "100275": {
157
+ "content": "<|extra_id_6|>",
158
  "lstrip": false,
159
  "normalized": false,
160
  "rstrip": false,
vocab.json CHANGED
@@ -100255,6 +100255,10 @@
100255
  ".WaitFor": 100253,
100256
  "Ġdaycare": 100254,
100257
  "ĠConveyor": 100255,
 
 
 
 
100258
  "<|extra_id_0|>": 100256,
100259
  "<|endoftext|>": 100257,
100260
  "<|fim_prefix|>": 100258,
@@ -100265,16 +100269,12 @@
100265
  "|||IP_ADDRESS|||": 100263,
100266
  "<|im_start|>": 100264,
100267
  "<|im_end|>": 100265,
100268
- "<|extra_id_1|>": 100266,
100269
- "<|extra_id_2|>": 100267,
100270
- "<think>": 100268,
100271
- "</think>": 100269,
100272
- "<functions>": 100270,
100273
- "</functions>": 100271,
100274
- "<function_calls>": 100272,
100275
- "</function_calls>": 100273,
100276
- "<answer>": 100274,
100277
- "</answer>": 100275,
100278
  "<|endofprompt|>": 100276,
100279
  "<|pad|>": 100277
100280
  }
 
100255
  ".WaitFor": 100253,
100256
  "Ġdaycare": 100254,
100257
  "ĠConveyor": 100255,
100258
+ "<functions>": 100266,
100259
+ "</functions>": 100267,
100260
+ "<function_calls>": 100268,
100261
+ "</function_calls>": 100269,
100262
  "<|extra_id_0|>": 100256,
100263
  "<|endoftext|>": 100257,
100264
  "<|fim_prefix|>": 100258,
 
100269
  "|||IP_ADDRESS|||": 100263,
100270
  "<|im_start|>": 100264,
100271
  "<|im_end|>": 100265,
100272
+ "<|extra_id_1|>": 100270,
100273
+ "<|extra_id_2|>": 100271,
100274
+ "<|extra_id_3|>": 100272,
100275
+ "<|extra_id_4|>": 100273,
100276
+ "<|extra_id_5|>": 100274,
100277
+ "<|extra_id_6|>": 100275,
 
 
 
 
100278
  "<|endofprompt|>": 100276,
100279
  "<|pad|>": 100277
100280
  }