sujithatz committed on
Commit c9fbe9a · verified · 1 Parent(s): 540ea00

Upload tokenizer

special_tokens_map.json CHANGED
@@ -13,13 +13,7 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": {
-    "content": "<|placeholder6|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": true,
-    "single_word": false
-  },
+  "pad_token": "<unk>",
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {
@@ -155,6 +160,12 @@
           "id": "A",
           "type_id": 0
         }
+      },
+      {
+        "SpecialToken": {
+          "id": "<|endoftext|>",
+          "type_id": 0
+        }
       }
     ],
     "pair": [
@@ -164,14 +175,36 @@
           "type_id": 0
         }
       },
+      {
+        "SpecialToken": {
+          "id": "<|endoftext|>",
+          "type_id": 0
+        }
+      },
       {
         "Sequence": {
           "id": "B",
           "type_id": 1
         }
+      },
+      {
+        "SpecialToken": {
+          "id": "<|endoftext|>",
+          "type_id": 1
+        }
       }
     ],
-    "special_tokens": {}
+    "special_tokens": {
+      "<|endoftext|>": {
+        "id": "<|endoftext|>",
+        "ids": [
+          32000
+        ],
+        "tokens": [
+          "<|endoftext|>"
+        ]
+      }
+    }
   },
   "decoder": {
     "type": "Sequence",
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "add_bos_token": false,
-  "add_eos_token": false,
+  "add_eos_token": true,
   "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
@@ -117,12 +117,12 @@
     }
   },
   "bos_token": "<s>",
-  "chat_template": "{% if 'role' in messages[0] %}{% for message in messages %}{% if message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% else %}{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% else %}{% for message in messages %}{% if message['about'] == 'human' %}{{'<|user|>\n' + message['news'] + '<|end|>\n'}}{% elif message['about'] == 'gpt' %}{{'<|assistant|>\n' + message['news'] + '<|end|>\n'}}{% else %}{{'<|' + message['about'] + '|>\n' + message['news'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% endif %}",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "legacy": false,
-  "model_max_length": 131072,
-  "pad_token": "<|placeholder6|>",
+  "model_max_length": 4096,
+  "pad_token": "<unk>",
   "padding_side": "left",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",