HyperAccel committed on
Commit
0256d5b
·
verified ·
1 Parent(s): 576780f

Upload tokenizer from kimi_k2

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{#-
  Kimi K2 chat template.

  Renders a conversation into the Kimi <|im_*|> wire format: optional tool
  declarations, a default system prompt when the first message is not a
  system message, per-message role headers, assistant tool-call sections,
  tool results, and an optional trailing generation prompt.

  render_content(msg): emits msg['content'] either as a plain string or,
  for list-form content, item by item — image items become a media
  placeholder span, text items emit their 'text' field.
-#}
{% macro render_content(msg) -%}
{%- set c = msg.get('content') -%}
{%- if c is string -%}
{{ c }}
{%- elif c is not none -%}
{% for content in c -%}
{% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
<|media_start|>image<|media_content|><|media_pad|><|media_end|>
{% else -%}
{{ content['text'] }}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- endmacro %}


{#- Tool schemas are serialized compactly (no spaces) into a system span. -#}
{%- if tools -%}
<|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
{%- endif -%}
{% for message in messages %}
{%- if loop.first and messages[0]['role'] != 'system' -%}
<|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>
{% endif %}

{#- Role header: the optional per-message 'name' overrides the role label. -#}
{%- set role_name = message.get('name') or message['role'] -%}
{%- if message['role'] == 'user' -%}
<|im_user|>{{role_name}}<|im_middle|>
{%- elif message['role'] == 'assistant' -%}
<|im_assistant|>{{role_name}}<|im_middle|>
{%- else -%}
<|im_system|>{{role_name}}<|im_middle|>
{%- endif -%}

{%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
{{render_content(message)}}<|tool_calls_section_begin|>
{%- for tool_call in message['tool_calls'] -%}
{%- set formatted_id = tool_call['id'] -%}
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
{%- endfor -%}
<|tool_calls_section_end|>
{%- elif message['role'] == 'tool' -%}
{%- set tool_call_id = message.tool_call_id -%}
## Return of {{ tool_call_id }}
{{render_content(message)}}
{%- elif message['content'] is not none -%}
{{render_content(message)}}
{%- endif -%}
<|im_end|>
{%- endfor -%}
{%- if add_generation_prompt -%}
<|im_assistant|>assistant<|im_middle|>
{%- endif -%}
tiktoken.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
3
+ size 2795286
tokenization_kimi.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ from tiktoken.load import load_tiktoken_bpe
17
+ from tokenizers import AddedToken, pre_tokenizers, Regex
18
+ from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.convert_slow_tokenizer import bytes_to_unicode
20
+ from typing import Any
21
+
22
+
23
+ logger = getLogger(__name__)
24
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
25
+
26
+
27
class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence
            classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to
            be this token instead. When `None`, no unknown-token ID is resolved.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            The token used for padding, for example when batching sequences of different lengths. When `None`,
            no padding-token ID is resolved.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
        added_tokens_decoder (`dict`, *optional*):
            Mapping of token ID to `AddedToken`, normally loaded from tokenizer_config.json; used to name the
            reserved special-token ID range.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    # Mapping from special-token string to its token ID, filled in __init__.
    special_tokens: Dict[str, int]

    # Number of token IDs reserved directly after the base BPE vocabulary for special tokens.
    num_reserved_special_tokens = 256

    # tiktoken split regex (pcre2 syntax, e.g. [..&&[^..]] character-class intersection).
    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken, None] = None,
        pad_token: Union[str, AddedToken, None] = None,
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        assert os.path.isfile(vocab_file), vocab_file

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|start_header_id|>",
                "<|end_header_id|>",
                "[EOT]",
                "<|im_system|>",
                "<|im_middle|>",
            ]

        # Map special-token ID -> string. `added_tokens_decoder` normally comes from
        # tokenizer_config.json; guard against None (iterating None raised TypeError before).
        if added_tokens_decoder is None:
            special_tokens_mapping: Dict[int, str] = {}
        else:
            special_tokens_mapping = {
                i: added_tokens_decoder[i].content for i in added_tokens_decoder
            }

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        # Reserve a fixed window of IDs after the base vocabulary; unnamed slots get a
        # "<|reserved_token_i|>" placeholder so every reserved ID has a unique string.
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(
                num_base_tokens, num_base_tokens + self.num_reserved_special_tokens
            )
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info("Reloaded tiktoken model from %s", vocab_file)

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]
        logger.info(
            "#words: %d - BOS ID: %d - EOS ID: %d",
            self.n_words, self.bos_id, self.eos_id,
        )

        # pad/unk default to None; only resolve IDs for tokens that were actually
        # supplied (the original lookup of str(None) raised KeyError unconditionally).
        self.pad_id: Optional[int] = (
            self.special_tokens[str(pad_token)] if pad_token is not None else None
        )
        self.unk_id: Optional[int] = (
            self.special_tokens[str(unk_token)] if unk_token is not None else None
        )

        # GPT-2-style byte<->unicode tables used to expose raw token bytes as
        # printable strings (see convert_tokens_to_string).
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # id -> printable token string.
        # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
        self.decoder: Dict[int, str] = {}
        for i in range(self.n_words):
            self.decoder[i] = ''.join(
                self.byte_encoder[ord(char)]
                for char in self.model.decode_single_token_bytes(i).decode('latin-1')
            )

        # printable token string -> id; direct inversion (every id is in decoder, so
        # the original second "if i in self.decoder" pass was redundant).
        self.encoder: Dict[str, int] = {
            token: idx for idx, token in self.decoder.items()
        }

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            added_tokens_decoder=added_tokens_decoder,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self,
        text: str,
        allow_special_tokens: bool = True,
        **kwargs,
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.
            allow_special_tokens (bool): When True, special-token strings inside
                `text` are encoded to their special IDs; when False they are
                tokenized as ordinary text.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, we should call super().encode because there is a lot of
        # code to handle those args. super().encode finally will call _tokenize and
        # _convert_token_to_id.
        # NOTE: our encode method is not compatible with the super().encode method,
        # e.g. split_special_tokens' default is True in our encode method.
        if kwargs:
            logger.warning("Calling super().encode with %s", kwargs)
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        texts = self.pre_tokenizer_process(text)

        all_substrs = []
        for chunk in texts:  # renamed from `text` — no longer shadows the parameter
            substrs = (
                substr
                for i in range(0, len(chunk), TIKTOKEN_MAX_ENCODE_CHARS)
                for substr in self._split_whitespaces_or_nonwhitespaces(
                    chunk[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
                )
            )
            all_substrs.extend(substrs)

        t: List[int] = []
        for substr in all_substrs:
            if allow_special_tokens:
                # Special-token strings in the input map to their special IDs.
                t.extend(self.model.encode(substr, allowed_special="all"))
            else:
                # Special-token strings are treated as plain text.
                t.extend(self.model.encode(substr, disallowed_special=()))

        return t

    def decode(
        self,
        token_ids: Union[int, List[int]],
        **kwargs,
    ) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (int or List[int]): The token ID or list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are other args, we should call super().decode because there is a lot of
        # code to handle those args. super().decode finally will call
        # convert_tokens_to_string and _convert_id_to_token.
        if kwargs:
            return super().decode(token_ids, **kwargs)

        # Accept a bare ID for convenience.
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than
        `max_consecutive_slice_len` consecutive whitespaces or consecutive
        non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                # Character class flipped: run length restarts (no split needed).
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    # Run exceeded the cap: emit everything before this character.
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    def pre_tokenizer_process(self, text: str) -> List[str]:
        """
        Pre-tokenizes the input text into a list of chunks.
        This method is used to split the input text into smaller chunks for
        internal processing. The base implementation is the identity split.
        """
        return [text]

    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
    @property
    def vocab_size(self) -> int:
        """Total number of tokens, including the reserved special range."""
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        """Return the printable-token-string -> ID mapping."""
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize by encoding to IDs, then mapping each ID to its printable string."""
        return [self.decoder[t] for t in self.encode(text)]

    def _convert_token_to_id(self, token: str) -> int:
        """Map a printable token string to its ID, falling back to the unk ID."""
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        """Map an ID to its printable token string (None for out-of-range IDs)."""
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """No-op: tiktoken output needs no post-decoding cleanup."""
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join printable token strings and decode them back through the byte table."""
        text = ''.join(tokens)
        return bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the tiktoken model file into `save_directory`.

        Raises:
            ValueError: if `save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Skip the copy when saving onto the same file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def apply_chat_template(
        self, conversation, tools: Optional[list[dict]] = None,
        tokenize: bool = False,
        add_generation_prompt: bool = True,
        **kwargs
    ):
        """
        Render a conversation with the chat template, after deep-sorting the
        tool schemas' dict keys so the serialized tool declaration is stable.
        """
        tools = deep_sort_dict(tools)
        return super().apply_chat_template(
            conversation,
            tools=tools,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )
341
+
342
+
343
def deep_sort_dict(obj: Any) -> Any:
    """Return a copy of *obj* with every nested dict's keys in sorted order.

    Lists are rebuilt with each element processed recursively; any other
    value is passed through unchanged.
    """
    if isinstance(obj, dict):
        ordered = {}
        for key in sorted(obj):
            ordered[key] = deep_sort_dict(obj[key])
        return ordered
    if isinstance(obj, list):
        return [deep_sort_dict(element) for element in obj]
    return obj
349
+
tokenizer_config.json ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "163584": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "163585": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "163586": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "163587": {
28
+ "content": "<|im_user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "163588": {
36
+ "content": "<|im_assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "163590": {
44
+ "content": "<|start_header_id|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "163591": {
52
+ "content": "<|end_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "163593": {
60
+ "content": "[EOT]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "163594": {
68
+ "content": "<|im_system|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "163595": {
76
+ "content": "<|tool_calls_section_begin|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "163596": {
84
+ "content": "<|tool_calls_section_end|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "163597": {
92
+ "content": "<|tool_call_begin|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "163598": {
100
+ "content": "<|tool_call_argument_begin|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "163599": {
108
+ "content": "<|tool_call_end|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "163601": {
116
+ "content": "<|im_middle|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "163838": {
124
+ "content": "[UNK]",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "163839": {
132
+ "content": "[PAD]",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ }
139
+ },
140
+ "auto_map": {
141
+ "AutoTokenizer": [
142
+ "tokenization_kimi.TikTokenTokenizer",
143
+ null
144
+ ]
145
+ },
146
+ "backend": "custom",
147
+ "bos_token": "[BOS]",
148
+ "clean_up_tokenization_spaces": false,
149
+ "eos_token": "[EOS]",
150
+ "extra_special_tokens": [
151
+ "<|im_end|>",
152
+ "<|im_user|>",
153
+ "<|im_assistant|>",
154
+ "<|start_header_id|>",
155
+ "<|end_header_id|>",
156
+ "[EOT]",
157
+ "<|im_system|>",
158
+ "<|im_middle|>"
159
+ ],
160
+ "is_local": false,
161
+ "model_max_length": 1000000000000000019884624838656,
162
+ "pad_token": "[PAD]",
163
+ "tokenizer_class": "TikTokenTokenizer",
164
+ "unk_token": "[UNK]"
165
+ }