BRUNOKRISTI committed
Commit 92c8df2 · verified · 1 parent: ecb9245

Upload tokenizer
special_tokens_map.json CHANGED
@@ -13,7 +13,13 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<unk>",
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 512,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
+  "truncation": null,
   "padding": null,
   "added_tokens": [
     {
@@ -160,12 +155,6 @@
           "id": "A",
           "type_id": 0
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "<|endoftext|>",
-          "type_id": 0
-        }
       }
     ],
     "pair": [
@@ -175,36 +164,14 @@
           "type_id": 0
         }
       },
-      {
-        "SpecialToken": {
-          "id": "<|endoftext|>",
-          "type_id": 0
-        }
-      },
       {
         "Sequence": {
           "id": "B",
           "type_id": 1
         }
-      },
-      {
-        "SpecialToken": {
-          "id": "<|endoftext|>",
-          "type_id": 1
-        }
       }
     ],
-    "special_tokens": {
-      "<|endoftext|>": {
-        "id": "<|endoftext|>",
-        "ids": [
-          32000
-        ],
-        "tokens": [
-          "<|endoftext|>"
-        ]
-      }
-    }
+    "special_tokens": {}
   },
   "decoder": {
     "type": "Sequence",
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "add_bos_token": false,
-  "add_eos_token": true,
+  "add_eos_token": false,
   "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
@@ -122,7 +122,7 @@
   "eos_token": "<|endoftext|>",
   "legacy": false,
   "model_max_length": 4096,
-  "pad_token": "<unk>",
+  "pad_token": "<|endoftext|>",
   "padding_side": "left",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",