lfhe committed
Commit 59f80c3 · 1 Parent(s): 917c80b
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-base_model: microsoft/Phi-4-mini-instruct
+base_model: microsoft/Phi-3-medium-128k-instruct
 library_name: peft
 ---
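The model card now points the adapter at microsoft/Phi-3-medium-128k-instruct. A minimal loading sketch under that assumption (the adapter repo id below is a placeholder, not something this commit specifies):

```python
# Hedged sketch: attach this LoRA adapter to the new base model with PEFT.
# "user/phi3-medium-adapter" is a placeholder repo id, not part of this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "microsoft/Phi-3-medium-128k-instruct"  # base_model from the updated card
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(base_id)

# Attach the adapter weights tracked in this repository.
model = PeftModel.from_pretrained(base, "user/phi3-medium-adapter")
model.eval()
```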
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "microsoft/Phi-4-mini-instruct",
+  "base_model_name_or_path": "microsoft/Phi-3-medium-128k-instruct",
   "bias": "none",
   "corda_config": null,
   "eva_config": null,
@@ -13,9 +13,9 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 8,
+  "lora_alpha": 16,
   "lora_bias": false,
-  "lora_dropout": 0.05,
+  "lora_dropout": 0.25,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
@@ -24,8 +24,6 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "down_proj",
-    "gate_up_proj",
     "o_proj",
     "qkv_proj"
   ],
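The new adapter doubles lora_alpha (8 → 16), raises lora_dropout (0.05 → 0.25), and drops the MLP projections (down_proj, gate_up_proj), so only the attention projections are adapted. A sketch of an equivalent LoraConfig; the LoRA rank r is not shown in this diff, so it is left at PEFT's default here:

```python
from peft import LoraConfig, TaskType

# Approximate reconstruction of the updated adapter_config.json; `r` is assumed, not shown above.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    base_model_name_or_path="microsoft/Phi-3-medium-128k-instruct",
    lora_alpha=16,
    lora_dropout=0.25,
    bias="none",
    target_modules=["o_proj", "qkv_proj"],  # down_proj / gate_up_proj no longer targeted
)
```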
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:219f86143308bbf5a40521635145ad144d4da92a2d9db5710ffd6d68c0cf2b00
-size 46171456
+oid sha256:e211ae9f7ba6800d50e6b895986a46b5aec0979c030527053c9e70baf1197d8d
+size 29512680
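The adapter weights shrink from roughly 46 MB to 29.5 MB, consistent with fewer target modules. A small check, if you want to confirm a downloaded copy matches the new LFS pointer (the local filename is an assumption):

```python
import hashlib

# Values copied from the LFS pointer above.
expected_oid = "e211ae9f7ba6800d50e6b895986a46b5aec0979c030527053c9e70baf1197d8d"
expected_size = 29512680

with open("adapter_model.safetensors", "rb") as f:
    blob = f.read()

assert len(blob) == expected_size
assert hashlib.sha256(blob).hexdigest() == expected_oid
```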
added_tokens.json CHANGED
@@ -1,12 +1,13 @@
 {
-  "<|/tool_call|>": 200026,
-  "<|/tool|>": 200024,
-  "<|assistant|>": 200019,
-  "<|end|>": 200020,
-  "<|system|>": 200022,
-  "<|tag|>": 200028,
-  "<|tool_call|>": 200025,
-  "<|tool_response|>": 200027,
-  "<|tool|>": 200023,
-  "<|user|>": 200021
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
 }
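The added tokens move from Phi-4-mini's 200000-range ids to Phi-3's 32000-range ids. A quick sanity check against the base model's tokenizer, assuming it matches the files in this commit:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")

# Ids taken from the added_tokens.json above.
expected = {"<|endoftext|>": 32000, "<|assistant|>": 32001, "<|end|>": 32007, "<|user|>": 32010}
for token, token_id in expected.items():
    assert tok.convert_tokens_to_ids(token) == token_id, token
```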
cl100k_base.tiktoken DELETED
The diff for this file is too large to render. See raw diff
 
merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -9,7 +9,7 @@
     }
   ],
   "bos_token": {
-    "content": "<|endoftext|>",
+    "content": "<s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
@@ -30,7 +30,7 @@
     "single_word": false
   },
   "unk_token": {
-    "content": "<|endoftext|>",
+    "content": "<unk>",
    "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenization_phi3_small.py DELETED
@@ -1,338 +0,0 @@
-# Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py
-import os
-from typing import Collection, List, Optional, Dict, Set, Tuple, Union
-
-from functools import cached_property
-
-import base64
-import requests
-
-from transformers import PreTrainedTokenizer, AddedToken, AutoConfig
-from transformers.models.auto.tokenization_auto import get_tokenizer_config
-import tiktoken
-
-
-"""
-This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base")
-with a few additional special tokens to support the ChatML format.
-
-TODO(bapatra): Right now, I do not save the special tokens to the vocab file.
-Maybe in the future, that would be useful? Can add that support later.
-
-"""
-
-def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    with open(tiktoken_bpe_file, "rb") as f:
-        contents = f.read()
-    return {
-        base64.b64decode(token): int(rank)
-        for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
-
-# On the megatron codebase, we pad vocabularies to ensure matrix multiplication is fast.
-# this in turn causes some indices to be empty. We account for these empty indices by adding
-# dummy tokens to the tokenizer.
-
-EFFECTIVE_PADDED_VOCAB_SIZE = 100352
-ACTUAL_VOCAB_SIZE = 100276
-
-
-DUMMY_TOKENS = {
-    f"<|dummy_id_{11 + offset}|>": 100276 + offset
-    for offset in range(1, EFFECTIVE_PADDED_VOCAB_SIZE - ACTUAL_VOCAB_SIZE)
-}
-
-SPECIAL_TOKENS = {
-    # tiktoken.get_encoding("cl100k_base")._special_tokens
-    '<|endoftext|>': 100257,
-    '<|fim_prefix|>': 100258,
-    '<|fim_middle|>': 100259,
-    '<|fim_suffix|>': 100260,
-    # Special tokens for post-training
-    "<|system|>": 100261,
-    "<|user|>": 100262,
-    "<|assistant|>": 100263,
-    # Dummy unused tokens
-    "<|dummy_id_0|>": 100264,
-    "<|dummy_id_1|>": 100265,
-    # Special tokens for post-training continued
-    "<|end|>": 100266,
-    # Some dummy tokens, so that tokenization is contiguous and does not cause issues
-    # Note that the 100256th token of tiktoken.get_encoding("cl100k_base") does not
-    # actually map to anything. So we use a dummy token here.
-    "<|dummy_id_2|>": 100256,
-    # Likewise, tokens from 100267 to 100275 are also unused
-    "<|dummy_id_3|>": 100267,
-    "<|dummy_id_4|>": 100268,
-    "<|dummy_id_5|>": 100269,
-    "<|dummy_id_6|>": 100270,
-    "<|dummy_id_7|>": 100271,
-    "<|dummy_id_8|>": 100272,
-    "<|dummy_id_9|>": 100273,
-    "<|dummy_id_10|>": 100274,
-    "<|dummy_id_11|>": 100275,
-    # The final end of prompt token
-    # (unused, but present as a part of tiktoken.get_encoding("cl100k_base")._special_tokens)
-    '<|endofprompt|>': 100276,
-    # Dummy tokens to account for padding of the tokenizer
-    # We pad to ensure tensor cores are used for vocab multiplication
-    **DUMMY_TOKENS
-}
-
-class Phi3SmallTokenizer(PreTrainedTokenizer):
-    vocab_files_names = {
-        "vocab_file": "cl100k_base.tiktoken"
-    }
-
-    model_input_names: List[str] = ["input_ids", "attention_mask"]
-    padding_side = "left"
-
-    def __init__(
-        self,
-        vocab_file: Optional[str] = None,
-        errors: str = "replace",
-        **kwargs
-    ) -> None:
-        # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
-        # if the token is present in `self.special_tokens``. Hence instantiating it here.
-        # The way Qwen gets around this is by checking against SPECIAL_TOKENS
-        # But I think it's better to check against the objects own `special_tokens`
-        # in case we eventually want to allow the tokenizer to have special tokens.
-        self.special_tokens = SPECIAL_TOKENS
-
-        super().__init__(**kwargs)
-        self.errors = errors
-
-        try:
-            base = tiktoken.get_encoding("cl100k_base")
-            # This deals with the scenario where user has restricted internet access
-            # and thus fails to download the tokenizer file from https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
-            # It is assumed that user should be able to access files on huggingface hub.
-        except requests.RequestException:
-            import hashlib
-            from transformers.utils import cached_file
-            cached_tokenizer_path = cached_file(
-                "microsoft/Phi-3-small-8k-instruct",
-                "cl100k_base.tiktoken",
-                _raise_exceptions_for_gated_repo=False,
-                _raise_exceptions_for_missing_entries=False,
-                _raise_exceptions_for_connection_errors=False
-            )
-            tiktoken_cache_dir = os.path.dirname(cached_tokenizer_path)
-            tiktoken_cache_path = os.path.join(
-                tiktoken_cache_dir,
-                hashlib.sha1("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken".encode()).hexdigest()
-            )
-            if not os.path.exists(tiktoken_cache_path):
-                os.rename(cached_tokenizer_path, tiktoken_cache_path)
-            os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
-            base = tiktoken.get_encoding("cl100k_base")
-
-        if vocab_file is None:
-            self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
-        else:
-            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
-
-        self.pat_str = base._pat_str
-
-        enc = tiktoken.Encoding(
-            name="phi3small",
-            pat_str=self.pat_str,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        self.tokenizer = enc
-
-        self.decoder: Dict[int, bytes] = {
-            v: k for k, v in self.mergeable_ranks.items()
-        }
-        self.decoder.update({v: k for k, v in self.special_tokens.items()})
-
-        self.eod_id = self.tokenizer.eot_token
-        self._eos_token = self._convert_id_to_token(self.eod_id)
-
-        # Setting the bos_token to be the same as the eos_token
-        # Note that this is **not** the correct thing to do, and is done
-        # just so that some of the downstream libraries do not break.
-        self._bos_token = self._eos_token
-
-        # Assign the special tokens to class variables
-        self.system_id = self.special_tokens["<|system|>"]
-        self.user_id = self.special_tokens["<|user|>"]
-        self.assistant_id = self.special_tokens["<|assistant|>"]
-        self.end_id = self.special_tokens["<|end|>"]
-
-    @cached_property
-    def dummy_token_indices(self) -> List[int]:
-        # There are some additional special tokens in the cl100k_base tokenizer
-        # that we do not use. Hence, we also consider them to be dummy tokens.
-        additional_tokens = [
-            "<|fim_prefix|>",
-            "<|fim_middle|>",
-            "<|fim_suffix|>",
-            "<|endofprompt|>"
-        ]
-        dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
-        dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
-        return sorted(dummy_token_indices)
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        del state["tokenizer"]
-        return state
-
-    def __setstate__(self, state):
-        self.__dict__ = state
-        enc = tiktoken.Encoding(
-            name="cl100k_im",
-            pat_str=self.pat_str,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        self.tokenizer = enc
-
-    def __len__(self):
-        return self.tokenizer.n_vocab
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
-        *init_inputs,
-        **kwargs,
-    ):
-        cls_kwargs = kwargs
-        # First try to load from the tokenization config if it exists
-        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
-        if tokenization_config:
-            cls_kwargs = {
-                **tokenization_config,
-                **cls_kwargs
-            }
-        else:
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-            cls_kwargs["model_max_length"] = config.max_position_embeddings
-        return cls(**cls_kwargs)
-
-    def get_vocab(self) -> Dict[Union[str, bytes], int]:
-        return {**self.mergeable_ranks, **self.special_tokens}
-
-    def convert_tokens_to_ids(
-        self,
-        tokens: Union[bytes, str, List[Union[bytes, str]]]
-    ) -> Union[int, List[int]]:
-        ids = []
-        if isinstance(tokens, (str, bytes)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.mergeable_ranks.get(tokens)
-        ids: List[int] = []
-        for token in tokens:
-            ids.append(self.convert_tokens_to_ids(token))
-        return ids
-
-    def _add_tokens(
-        self,
-        new_tokens: Union[List[str], List[AddedToken]],
-        special_tokens: bool = False,
-    ) -> int:
-        if not special_tokens and new_tokens:
-            raise ValueError("Only special tokens can be added to this tokenizer")
-        for token in new_tokens:
-            surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in self.special_tokens:
-                raise ValueError(
-                    "For now, we do not support unknown special tokens\n"
-                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
-                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
-                    "And finally, we can re-construct the enc object back\n"
-                )
-        return 0
-
-    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
-        with open(file_path, "w") as f:
-            for token, rank in self.mergeable_ranks.items():
-                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
-                f.write(line)
-        return (file_path,)
-
-    def tokenize(
-        self,
-        text: str,
-        allowed_special: Union[Set, str] = "all",
-        disallowed_special: Union[Collection, str] = (),
-        **kwargs
-    ) -> List[Union[bytes, str]]:
-        tokens: List[Union[bytes, str]] = []
-        for token_id in self.tokenizer.encode(
-            text, allowed_special=allowed_special, disallowed_special=disallowed_special
-        ):
-            tokens.append(self.decoder[token_id])
-        return tokens
-
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
-        """
-        Converts a sequence of tokens in a single string.
-        """
-        text = ""
-        temp = b""
-        for t in tokens:
-            if isinstance(t, str):
-                if temp:
-                    text += temp.decode("utf-8", errors=self.errors)
-                    temp = b""
-                text += t
-            elif isinstance(t, bytes):
-                temp += t
-            else:
-                raise TypeError("token should only be of type types or str")
-        if temp:
-            text += temp.decode("utf-8", errors=self.errors)
-        return text
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.n_vocab
-
-    @property
-    def eos_token_id(self) -> int:
-        return self.eod_id
-
-    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
-        """Converts an id to a token, special tokens included"""
-        if index in self.decoder:
-            return self.decoder[index]
-        raise ValueError("unknown ids")
-
-    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
-        """Converts a token to an id using the vocab, special tokens included"""
-        if token in self.special_tokens:
-            return self.special_tokens[token]
-        if token in self.mergeable_ranks:
-            return self.mergeable_ranks[token]
-        raise ValueError("unknown token")
-
-    def _tokenize(self, text: str, **kwargs):
-        """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
-        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
-        Do NOT take care of added tokens.
-        """
-        raise NotImplementedError
-
-    def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        errors: str = None,
-        **kwargs,
-    ) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if skip_special_tokens:
-            token_ids = [i for i in token_ids if i < self.eod_id]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
-
-
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:03694739c3ccc766544b2ecc80a899498ffebd3f34419475ef2e2995c7210fd7
-size 15524096
+oid sha256:2923f15e986925cfb5e017bc9acbe2e24add5218d2b44558e1283fe76bb6df04
+size 3620658
tokenizer_config.json CHANGED
@@ -1,98 +1,114 @@
 {
   "add_bos_token": false,
   "add_eos_token": false,
-  "add_prefix_space": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
-    "199999": {
-      "content": "<|endoftext|>",
+    "0": {
+      "content": "<unk>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200018": {
-      "content": "<|endofprompt|>",
+    "1": {
+      "content": "<s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200019": {
-      "content": "<|assistant|>",
+    "2": {
+      "content": "</s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
-    "200020": {
-      "content": "<|end|>",
+    "32000": {
+      "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200021": {
-      "content": "<|user|>",
+    "32001": {
+      "content": "<|assistant|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
       "special": true
     },
-    "200022": {
-      "content": "<|system|>",
+    "32002": {
+      "content": "<|placeholder1|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
       "special": true
     },
-    "200023": {
-      "content": "<|tool|>",
+    "32003": {
+      "content": "<|placeholder2|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200024": {
-      "content": "<|/tool|>",
+    "32004": {
+      "content": "<|placeholder3|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200025": {
-      "content": "<|tool_call|>",
+    "32005": {
+      "content": "<|placeholder4|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200026": {
-      "content": "<|/tool_call|>",
+    "32006": {
+      "content": "<|system|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200027": {
-      "content": "<|tool_response|>",
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
     },
-    "200028": {
-      "content": "<|tag|>",
+    "32010": {
+      "content": "<|user|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
@@ -103,15 +119,18 @@
   "additional_special_tokens": [
     "<|end|>"
   ],
-  "bos_token": "<|endoftext|>",
-  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "extra_special_tokens": {},
+  "legacy": false,
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
+  "sp_model_kwargs": {},
   "split_special_tokens": false,
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
 }
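The chat template also switches to Phi-3's plain user/assistant format (no system/tool branches). A sketch of how it renders a short exchange, using the template text from the diff above:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")
messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi, how can I help?"},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
# Per the template: "<|user|>\nHello!<|end|>\n<|assistant|>\nHi, how can I help?<|end|>\n"
print(prompt)
```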
vocab.json DELETED
The diff for this file is too large to render. See raw diff