amdbook committed on
Commit
2c3e10f
·
verified ·
1 Parent(s): b36791f

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|assistant|>": 32001,
3
+ "<|endoftext|>": 32000,
4
+ "<|end|>": 32007,
5
+ "<|placeholder1|>": 32002,
6
+ "<|placeholder2|>": 32003,
7
+ "<|placeholder3|>": 32004,
8
+ "<|placeholder4|>": 32005,
9
+ "<|placeholder5|>": 32008,
10
+ "<|placeholder6|>": 32009,
11
+ "<|system|>": 32006,
12
+ "<|user|>": 32010
13
+ }
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": "<unk>",
 
 
 
 
 
 
17
  "unk_token": {
18
  "content": "<unk>",
19
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
  "unk_token": {
24
  "content": "<unk>",
25
  "lstrip": false,
tokenizer.json CHANGED
@@ -155,12 +155,6 @@
155
  "id": "A",
156
  "type_id": 0
157
  }
158
- },
159
- {
160
- "SpecialToken": {
161
- "id": "<|endoftext|>",
162
- "type_id": 0
163
- }
164
  }
165
  ],
166
  "pair": [
@@ -170,36 +164,14 @@
170
  "type_id": 0
171
  }
172
  },
173
- {
174
- "SpecialToken": {
175
- "id": "<|endoftext|>",
176
- "type_id": 0
177
- }
178
- },
179
  {
180
  "Sequence": {
181
  "id": "B",
182
  "type_id": 1
183
  }
184
- },
185
- {
186
- "SpecialToken": {
187
- "id": "<|endoftext|>",
188
- "type_id": 1
189
- }
190
  }
191
  ],
192
- "special_tokens": {
193
- "<|endoftext|>": {
194
- "id": "<|endoftext|>",
195
- "ids": [
196
- 32000
197
- ],
198
- "tokens": [
199
- "<|endoftext|>"
200
- ]
201
- }
202
- }
203
  },
204
  "decoder": {
205
  "type": "Sequence",
 
155
  "id": "A",
156
  "type_id": 0
157
  }
 
 
 
 
 
 
158
  }
159
  ],
160
  "pair": [
 
164
  "type_id": 0
165
  }
166
  },
 
 
 
 
 
 
167
  {
168
  "Sequence": {
169
  "id": "B",
170
  "type_id": 1
171
  }
 
 
 
 
 
 
172
  }
173
  ],
174
+ "special_tokens": {}
 
 
 
 
 
 
 
 
 
 
175
  },
176
  "decoder": {
177
  "type": "Sequence",
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json CHANGED
@@ -1,7 +1,6 @@
1
  {
2
  "add_bos_token": false,
3
- "add_eos_token": true,
4
- "add_prefix_space": null,
5
  "added_tokens_decoder": {
6
  "0": {
7
  "content": "<unk>",
@@ -120,10 +119,9 @@
120
  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
- "legacy": false,
124
  "model_max_length": 4096,
125
- "pad_token": "<unk>",
126
- "padding_side": "right",
127
  "sp_model_kwargs": {},
128
  "tokenizer_class": "LlamaTokenizer",
129
  "unk_token": "<unk>",
 
1
  {
2
  "add_bos_token": false,
3
+ "add_eos_token": false,
 
4
  "added_tokens_decoder": {
5
  "0": {
6
  "content": "<unk>",
 
119
  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
120
  "clean_up_tokenization_spaces": false,
121
  "eos_token": "<|endoftext|>",
 
122
  "model_max_length": 4096,
123
+ "pad_token": "<|endoftext|>",
124
+ "padding_side": "left",
125
  "sp_model_kwargs": {},
126
  "tokenizer_class": "LlamaTokenizer",
127
  "unk_token": "<unk>",