edpowers committed (verified)
Commit 852cb62 · 1 parent: 1243046

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +15 -19
  2. tokenizer.json +6 -6
  3. tokenizer_config.json +11 -14
special_tokens_map.json CHANGED
@@ -1,23 +1,19 @@
  {
- "additional_special_tokens": [
- {
- "content": "<|im_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
- ],
- "bos_token": "<|im_start|>",
- "eos_token": "<|im_end|>",
- "pad_token": "<|im_end|>",
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
  "unk_token": {
  "content": "<unk>",
  "lstrip": false,
tokenizer.json CHANGED
@@ -32,21 +32,21 @@
  },
  {
  "id": 32000,
- "content": "<|im_start|>",
+ "content": "<new_token1>",
  "single_word": false,
  "lstrip": false,
  "rstrip": false,
- "normalized": false,
- "special": true
+ "normalized": true,
+ "special": false
  },
  {
  "id": 32001,
- "content": "<|im_end|>",
+ "content": "<new_token2>",
  "single_word": false,
  "lstrip": false,
  "rstrip": false,
- "normalized": false,
- "special": true
+ "normalized": true,
+ "special": false
  }
  ],
  "normalizer": {
tokenizer_config.json CHANGED
@@ -27,33 +27,30 @@
  "special": true
  },
  "32000": {
- "content": "<|im_start|>",
+ "content": "<new_token1>",
  "lstrip": false,
- "normalized": false,
+ "normalized": true,
  "rstrip": false,
  "single_word": false,
- "special": true
+ "special": false
  },
  "32001": {
- "content": "<|im_end|>",
+ "content": "<new_token2>",
  "lstrip": false,
- "normalized": false,
+ "normalized": true,
  "rstrip": false,
  "single_word": false,
- "special": true
+ "special": false
  }
  },
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>"
- ],
- "bos_token": "<|im_start|>",
- "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "additional_special_tokens": [],
+ "bos_token": "<s>",
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
+ "eos_token": "</s>",
  "legacy": true,
  "model_max_length": 1000000000000000019884624838656,
- "pad_token": "<|im_end|>",
+ "pad_token": "</s>",
  "sp_model_kwargs": {},
  "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",