TinyPixel committed on
Commit
25283f4
·
verified ·
1 Parent(s): 6477424

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +39 -14
  2. tokenizer.json +48 -9
  3. tokenizer_config.json +31 -10
special_tokens_map.json CHANGED
@@ -1,4 +1,41 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
  "content": "<|endoftext|>",
4
  "lstrip": false,
@@ -6,20 +43,8 @@
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
- "eos_token": {
10
- "content": "<|im_end|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "<PAD>",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
  "unk_token": {
24
  "content": "<|endoftext|>",
25
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|START_OF_TURN_TOKEN|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<PAD>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|USER_TOKEN|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<|CHATBOT_TOKEN|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<|END_OF_TURN_TOKEN|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ ],
39
  "bos_token": {
40
  "content": "<|endoftext|>",
41
  "lstrip": false,
 
43
  "rstrip": false,
44
  "single_word": false
45
  },
46
+ "eos_token": "<|END_OF_TURN_TOKEN|>",
47
+ "pad_token": "<PAD>",
 
 
 
 
 
 
 
 
 
 
 
 
48
  "unk_token": {
49
  "content": "<|endoftext|>",
50
  "lstrip": false,
tokenizer.json CHANGED
@@ -230,12 +230,12 @@
230
  },
231
  {
232
  "id": 50277,
233
- "content": "<|im_start|>",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
237
- "normalized": true,
238
- "special": false
239
  },
240
  {
241
  "id": 50278,
@@ -243,12 +243,30 @@
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
246
- "normalized": true,
247
  "special": true
248
  },
249
  {
250
  "id": 50279,
251
- "content": "<|im_end|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  "single_word": false,
253
  "lstrip": false,
254
  "rstrip": false,
@@ -266,10 +284,30 @@
266
  "use_regex": true
267
  },
268
  "post_processor": {
269
- "type": "ByteLevel",
270
- "add_prefix_space": false,
271
- "trim_offsets": true,
272
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  },
274
  "decoder": {
275
  "type": "ByteLevel",
@@ -285,6 +323,7 @@
285
  "end_of_word_suffix": null,
286
  "fuse_unk": false,
287
  "byte_fallback": false,
 
288
  "vocab": {
289
  "<|endoftext|>": 0,
290
  "<|padding|>": 1,
 
230
  },
231
  {
232
  "id": 50277,
233
+ "content": "<|START_OF_TURN_TOKEN|>",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
237
+ "normalized": false,
238
+ "special": true
239
  },
240
  {
241
  "id": 50278,
 
243
  "single_word": false,
244
  "lstrip": false,
245
  "rstrip": false,
246
+ "normalized": false,
247
  "special": true
248
  },
249
  {
250
  "id": 50279,
251
+ "content": "<|USER_TOKEN|>",
252
+ "single_word": false,
253
+ "lstrip": false,
254
+ "rstrip": false,
255
+ "normalized": false,
256
+ "special": true
257
+ },
258
+ {
259
+ "id": 50280,
260
+ "content": "<|CHATBOT_TOKEN|>",
261
+ "single_word": false,
262
+ "lstrip": false,
263
+ "rstrip": false,
264
+ "normalized": false,
265
+ "special": true
266
+ },
267
+ {
268
+ "id": 50281,
269
+ "content": "<|END_OF_TURN_TOKEN|>",
270
  "single_word": false,
271
  "lstrip": false,
272
  "rstrip": false,
 
284
  "use_regex": true
285
  },
286
  "post_processor": {
287
+ "type": "TemplateProcessing",
288
+ "single": [
289
+ {
290
+ "Sequence": {
291
+ "id": "A",
292
+ "type_id": 0
293
+ }
294
+ }
295
+ ],
296
+ "pair": [
297
+ {
298
+ "Sequence": {
299
+ "id": "A",
300
+ "type_id": 0
301
+ }
302
+ },
303
+ {
304
+ "Sequence": {
305
+ "id": "B",
306
+ "type_id": 1
307
+ }
308
+ }
309
+ ],
310
+ "special_tokens": {}
311
  },
312
  "decoder": {
313
  "type": "ByteLevel",
 
323
  "end_of_word_suffix": null,
324
  "fuse_unk": false,
325
  "byte_fallback": false,
326
+ "ignore_merges": false,
327
  "vocab": {
328
  "<|endoftext|>": 0,
329
  "<|padding|>": 1,
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
1
  {
 
 
2
  "add_prefix_space": false,
3
  "added_tokens_decoder": {
4
  "0": {
@@ -202,23 +204,39 @@
202
  "special": false
203
  },
204
  "50277": {
205
- "content": "<|im_start|>",
206
  "lstrip": false,
207
- "normalized": true,
208
  "rstrip": false,
209
  "single_word": false,
210
- "special": false
211
  },
212
  "50278": {
213
  "content": "<PAD>",
214
  "lstrip": false,
215
- "normalized": true,
216
  "rstrip": false,
217
  "single_word": false,
218
  "special": true
219
  },
220
  "50279": {
221
- "content": "<|im_end|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  "lstrip": false,
223
  "normalized": false,
224
  "rstrip": false,
@@ -226,15 +244,18 @@
226
  "special": true
227
  }
228
  },
 
 
 
 
 
 
 
229
  "bos_token": "<|endoftext|>",
230
  "clean_up_tokenization_spaces": true,
231
- "eos_token": "<|im_end|>",
232
- "max_length": 1024,
233
  "model_max_length": 1000000000000000019884624838656,
234
  "pad_token": "<PAD>",
235
- "stride": 0,
236
  "tokenizer_class": "GPTNeoXTokenizer",
237
- "truncation_side": "right",
238
- "truncation_strategy": "longest_first",
239
  "unk_token": "<|endoftext|>"
240
  }
 
1
  {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
6
  "0": {
 
204
  "special": false
205
  },
206
  "50277": {
207
+ "content": "<|START_OF_TURN_TOKEN|>",
208
  "lstrip": false,
209
+ "normalized": false,
210
  "rstrip": false,
211
  "single_word": false,
212
+ "special": true
213
  },
214
  "50278": {
215
  "content": "<PAD>",
216
  "lstrip": false,
217
+ "normalized": false,
218
  "rstrip": false,
219
  "single_word": false,
220
  "special": true
221
  },
222
  "50279": {
223
+ "content": "<|USER_TOKEN|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "50280": {
231
+ "content": "<|CHATBOT_TOKEN|>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "50281": {
239
+ "content": "<|END_OF_TURN_TOKEN|>",
240
  "lstrip": false,
241
  "normalized": false,
242
  "rstrip": false,
 
244
  "special": true
245
  }
246
  },
247
+ "additional_special_tokens": [
248
+ "<|START_OF_TURN_TOKEN|>",
249
+ "<PAD>",
250
+ "<|USER_TOKEN|>",
251
+ "<|CHATBOT_TOKEN|>",
252
+ "<|END_OF_TURN_TOKEN|>"
253
+ ],
254
  "bos_token": "<|endoftext|>",
255
  "clean_up_tokenization_spaces": true,
256
+ "eos_token": "<|END_OF_TURN_TOKEN|>",
 
257
  "model_max_length": 1000000000000000019884624838656,
258
  "pad_token": "<PAD>",
 
259
  "tokenizer_class": "GPTNeoXTokenizer",
 
 
260
  "unk_token": "<|endoftext|>"
261
  }