nntoan209 commited on
Commit
039cabd
·
verified ·
1 Parent(s): 0554cf5

Upload tokenizer

Browse files
Files changed (5) hide show
  1. README.md +199 -0
  2. chat_template.jinja +97 -0
  3. tiktoken.model +3 -0
  4. tokenization_kimi.py +349 -0
  5. tokenizer_config.json +181 -0
README.md ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+ This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
chat_template.jinja ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro render_content(msg) -%}
2
+ {%- set c = msg.get('content') -%}
3
+ {%- if c is string -%}
4
+ {{ c }}
5
+ {%- elif c is not none -%}
6
+ {% for content in c -%}
7
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
8
+ <|media_start|>image<|media_content|><|media_pad|><|media_end|>
9
+ {% else -%}
10
+ {{ content['text'] }}
11
+ {%- endif -%}
12
+ {%- endfor -%}
13
+ {%- endif -%}
14
+ {%- endmacro -%}
15
+
16
+ {% macro set_roles(message) -%}
17
+ {%- set role_name = message.get('name') or message['role'] -%}
18
+ {%- if message['role'] == 'user' -%}
19
+ <|im_user|>{{role_name}}<|im_middle|>
20
+ {%- elif message['role'] == 'assistant' -%}
21
+ <|im_assistant|>{{role_name}}<|im_middle|>
22
+ {%- else -%}
23
+ <|im_system|>{{role_name}}<|im_middle|>
24
+ {%- endif -%}
25
+ {%- endmacro -%}
26
+
27
+
28
+ {%- macro render_toolcalls(message) -%}
29
+ <|tool_calls_section_begin|>
30
+ {%- for tool_call in message['tool_calls'] -%}
31
+ {%- set formatted_id = tool_call['id'] -%}
32
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
33
+ {%- endfor -%}
34
+ <|tool_calls_section_end|>
35
+ {%- endmacro -%}
36
+
37
+
38
+ {# Find last non-tool-call assisitant message #}
39
+ {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
40
+ {%- for idx in range(messages|length-1, -1, -1) -%}
41
+ {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
42
+ {%- set ns.last_non_tool_call_assistant_msg = idx -%}
43
+ {%- break -%}
44
+ {%- endif -%}
45
+ {%- endfor -%}
46
+
47
+ {# split all messages into history & suffix, reasoning_content in suffix should be reserved.#}
48
+ {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
49
+ {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
50
+
51
+ {%- if tools -%}
52
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
53
+ {%- endif -%}
54
+
55
+ {%- if messages|length == 0 or messages[0]['role'] != 'system' -%}
56
+ <|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>
57
+ {%- endif -%}
58
+
59
+ {%- for message in hist_msgs -%}
60
+ {{set_roles(message)}}
61
+ {%- if message['role'] == 'assistant' -%}
62
+ <think></think>{{render_content(message)}}
63
+ {%- if message.get('tool_calls') -%}
64
+ {{render_toolcalls(message)}}
65
+ {%- endif -%}
66
+ {%- elif message['role'] == 'tool' -%}
67
+ {%- set tool_call_id = message.tool_call_id -%}
68
+ ## Return of {{ tool_call_id }}
69
+ {{render_content(message)}}
70
+ {%- elif message['content'] is not none -%}
71
+ {{render_content(message)}}
72
+ {%- endif -%}
73
+ <|im_end|>
74
+ {%- endfor -%}
75
+
76
+ {%- for message in suffix_msgs -%}
77
+ {{set_roles(message)}}
78
+ {%- if message['role'] == 'assistant' -%}
79
+ {%- set rc = message.get('reasoning_content', '') -%}
80
+ <think>{{rc}}</think>{{render_content(message)}}
81
+ {%- if message.get('tool_calls') -%}
82
+ {{render_toolcalls(message)}}
83
+ {%- endif -%}
84
+ {%- elif message['role'] == 'tool' -%}
85
+ {%- set tool_call_id = message.tool_call_id -%}
86
+ ## Return of {{ tool_call_id }}
87
+ {{render_content(message)}}
88
+ {%- elif message['content'] is not none -%}
89
+ {{render_content(message)}}
90
+ {%- endif -%}
91
+ <|im_end|>
92
+ {%- endfor -%}
93
+
94
+
95
+ {%- if add_generation_prompt -%}
96
+ <|im_assistant|>assistant<|im_middle|>
97
+ {%- endif -%}
tiktoken.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
3
+ size 2795286
tokenization_kimi.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ from tiktoken.load import load_tiktoken_bpe
17
+ from tokenizers import AddedToken, pre_tokenizers, Regex
18
+ from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.convert_slow_tokenizer import bytes_to_unicode
20
+ from typing import Any
21
+
22
+
23
+ logger = getLogger(__name__)
24
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
25
+
26
+
27
class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See
    megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of
    the main methods. Users should refer to this superclass for more
    information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning of sequence token that was used during pretraining.
            Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
            The unknown token. A token that is not in the vocabulary cannot be
            converted to an ID and is set to be this token instead.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
            The token used for padding, for example when batching sequences of
            different lengths.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as
            `special`, meaning that they will be skipped when decoding if
            `skip_special_tokens` is set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    # Name -> id table for the reserved special-token bank; filled in __init__.
    special_tokens: Dict[str, int]

    # Number of ids reserved after the base BPE vocabulary for special tokens.
    num_reserved_special_tokens = 256

    # Pre-tokenization split pattern (tiktoken / fancy-regex syntax; note the
    # `&&` character-class intersections, which plain `re` does not support).
    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )
+ )
71
+
72
+ def __init__(
73
+ self,
74
+ vocab_file,
75
+ bos_token: Union[str, AddedToken]="[BOS]",
76
+ eos_token: Union[str, AddedToken]="[EOS]",
77
+ unk_token: Union[str, AddedToken, None]=None,
78
+ pad_token: Union[str, AddedToken, None]=None,
79
+ additional_special_tokens: List[str]=None,
80
+ added_tokens_decoder: Optional[dict] = None,
81
+ **kwargs,
82
+ ):
83
+ assert os.path.isfile(vocab_file), vocab_file
84
+
85
+ if additional_special_tokens is None:
86
+ additional_special_tokens = [
87
+ "<|im_end|>",
88
+ "<|im_user|>",
89
+ "<|im_assistant|>",
90
+ "<|start_header_id|>",
91
+ "<|end_header_id|>",
92
+ "[EOT]",
93
+ "<|im_system|>",
94
+ "<|im_middle|>",
95
+ ]
96
+
97
+ special_tokens_mapping = {
98
+ i: added_tokens_decoder[i].content for i in added_tokens_decoder
99
+ }
100
+
101
+ self.vocab_file = vocab_file
102
+ mergeable_ranks = load_tiktoken_bpe(vocab_file)
103
+ num_base_tokens = len(mergeable_ranks)
104
+ self.special_tokens = {
105
+ special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
106
+ for i in range(
107
+ num_base_tokens, num_base_tokens + self.num_reserved_special_tokens
108
+ )
109
+ }
110
+
111
+
112
+
113
+ self.model = tiktoken.Encoding(
114
+ name=Path(vocab_file).name,
115
+ pat_str=self.pat_str,
116
+ mergeable_ranks=mergeable_ranks,
117
+ special_tokens=self.special_tokens,
118
+ )
119
+ logger.info(f"Reloaded tiktoken model from {vocab_file}")
120
+
121
+ self.n_words: int = self.model.n_vocab
122
+ # BOS / EOS token IDs
123
+ self.bos_id: int = self.special_tokens[str(bos_token)]
124
+ self.eos_id: int = self.special_tokens[str(eos_token)]
125
+ logger.info(
126
+ f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
127
+ )
128
+
129
+ self.pad_id: int = self.special_tokens[str(pad_token)]
130
+ self.unk_id: int = self.special_tokens[str(unk_token)]
131
+
132
+ self.byte_encoder = bytes_to_unicode()
133
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
134
+
135
+ self.decoder = {}
136
+ for i in range(self.n_words):
137
+ # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
138
+ decoding = ''.join([
139
+ self.byte_encoder[ord(char)] for char in
140
+ self.model.decode_single_token_bytes(i).decode('latin-1')
141
+ ])
142
+ self.decoder[i] = decoding
143
+
144
+ self.encoder = {}
145
+ for i in range(self.n_words):
146
+ if i in self.decoder:
147
+ self.encoder[self.decoder[i]] = i
148
+
149
+ super().__init__(
150
+ bos_token=bos_token,
151
+ eos_token=eos_token,
152
+ unk_token=unk_token,
153
+ pad_token=pad_token,
154
+ additional_special_tokens=additional_special_tokens,
155
+ added_tokens_decoder=added_tokens_decoder,
156
+ **kwargs,
157
+ )
158
+ self.all_special_ids_set = set(self.all_special_ids)
159
+
160
+ def encode(
161
+ self,
162
+ text: str,
163
+ allow_special_tokens: bool = True,
164
+ **kwargs
165
+ ) -> List[int]:
166
+ """
167
+ Encodes a string into a list of token IDs.
168
+
169
+ Args:
170
+ text (str): The input string to be encoded.
171
+
172
+ Returns:
173
+ list[int]: A list of token IDs.
174
+ """
175
+ # If there are other args, we should call super().encode because there are a lot of code
176
+ # to handle those args. supper().encode finally will call _tokenize and _convert_token_to_id.
177
+ # NOTE: our encode method is not compatible with the super().encode method,
178
+ # e.g. split_special_tokens' default is True in our encode method.
179
+ if len(kwargs) > 0:
180
+ logger.warning( f"Calling super().encode with {kwargs}" )
181
+ return super().encode(text, **kwargs)
182
+
183
+ assert type(text) is str
184
+
185
+ # The tiktoken tokenizer can handle <=400k chars without
186
+ # pyo3_runtime.PanicException.
187
+ TIKTOKEN_MAX_ENCODE_CHARS = 400_000
188
+
189
+ # https://github.com/openai/tiktoken/issues/195
190
+ # Here we iterate over subsequences and split if we exceed the limit
191
+ # of max consecutive non-whitespace or whitespace characters.
192
+ MAX_NO_WHITESPACES_CHARS = 25_000
193
+
194
+ texts = self.pre_tokenizer_process(text)
195
+
196
+ all_substrs = []
197
+ for text in texts:
198
+ substrs = (
199
+ substr
200
+ for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
201
+ for substr in self._split_whitespaces_or_nonwhitespaces(
202
+ text[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
203
+ )
204
+ )
205
+ all_substrs.extend(substrs)
206
+
207
+ t: List[int] = []
208
+ for substr in all_substrs:
209
+ if allow_special_tokens:
210
+ t.extend(
211
+ # we should consider special token as a common token
212
+ self.model.encode(
213
+ substr,
214
+ allowed_special="all",
215
+ )
216
+ )
217
+ else:
218
+ t.extend(
219
+ # we should consider special token as a common token
220
+ self.model.encode(
221
+ substr,
222
+ disallowed_special=(),
223
+ )
224
+ )
225
+
226
+ return t
227
+
228
+ def decode(
229
+ self,
230
+ token_ids: Union[int, List[int]],
231
+ **kwargs
232
+ ) -> str:
233
+ """
234
+ Decodes a list of token IDs into a string.
235
+
236
+ Args:
237
+ token_ids (List[int]): The list of token IDs to be decoded.
238
+
239
+ Returns:
240
+ str: The decoded string.
241
+ """
242
+ # If there are other args, we should call super().decode because there are a lot of code
243
+ # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token.
244
+ if len(kwargs) > 0:
245
+ return super().decode(token_ids, **kwargs)
246
+
247
+ if type(token_ids) is int:
248
+ token_ids = [token_ids]
249
+
250
+ return self.model.decode(cast(List[int], token_ids))
251
+
252
+ @staticmethod
253
+ def _split_whitespaces_or_nonwhitespaces(
254
+ s: str, max_consecutive_slice_len: int
255
+ ) -> Iterator[str]:
256
+ """
257
+ Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
258
+ consecutive whitespaces or consecutive non-whitespaces.
259
+ """
260
+ current_slice_len = 0
261
+ current_slice_is_space = s[0].isspace() if len(s) > 0 else False
262
+ slice_start = 0
263
+
264
+ for i in range(len(s)):
265
+ is_now_space = s[i].isspace()
266
+
267
+ if current_slice_is_space ^ is_now_space:
268
+ current_slice_len = 1
269
+ current_slice_is_space = is_now_space
270
+ else:
271
+ current_slice_len += 1
272
+ if current_slice_len > max_consecutive_slice_len:
273
+ yield s[slice_start:i]
274
+ slice_start = i
275
+ current_slice_len = 1
276
+ yield s[slice_start:]
277
+
278
+ def pre_tokenizer_process(self, text: str) -> List[str]:
279
+ """
280
+ pre-tokenizes the input text into a list of tokens.
281
+ This method is used to split the input text into smaller chunks for internal processing.
282
+ """
283
+ return [text]
284
+
285
+
286
+ """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
287
+ @property
288
+ def vocab_size(self) -> int:
289
+ return self.n_words
290
+
291
+ def get_vocab(self) -> Dict[str, int]:
292
+ return self.encoder
293
+
294
+ def _tokenize(self, text: str, **kwargs) -> List[str]:
295
+ return [
296
+ self.decoder[t]
297
+ for t in self.encode(text)
298
+ ]
299
+
300
+ def _convert_token_to_id(self, token: str) -> int:
301
+ return self.encoder.get(token, self.unk_id)
302
+
303
+ def _convert_id_to_token(self, index: int) -> str:
304
+ return self.decoder.get(index)
305
+
306
+ @staticmethod
307
+ def clean_up_tokenization(out_string: str) -> str:
308
+ return out_string
309
+
310
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
311
+ text = ''.join(tokens)
312
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
313
+ return text
314
+
315
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
316
+ if not os.path.isdir(save_directory):
317
+ raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
318
+ out_vocab_file = os.path.join(
319
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
320
+ )
321
+
322
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
323
+ copyfile(self.vocab_file, out_vocab_file)
324
+
325
+ return (out_vocab_file,)
326
+
327
+
328
+
329
+ def apply_chat_template(
330
+ self, conversation, tools: Optional[list[dict]] = None,
331
+ tokenize: bool = False,
332
+ add_generation_prompt: bool = True,
333
+ **kwargs
334
+ ):
335
+ tools = deep_sort_dict(tools)
336
+ return super().apply_chat_template(conversation,
337
+ tools=tools,
338
+ tokenize=tokenize,
339
+ add_generation_prompt=add_generation_prompt,
340
+ **kwargs)
341
+
342
+
343
def deep_sort_dict(obj: Any) -> Any:
    """Recursively sort every dict in `obj` by key.

    Lists are walked in order with their elements sorted recursively; any
    other value (including None) is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: deep_sort_dict(value) for key, value in sorted(obj.items())}
    if isinstance(obj, list):
        return [deep_sort_dict(element) for element in obj]
    return obj
349
+
tokenizer_config.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "163584": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "163585": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "163586": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "163587": {
28
+ "content": "<|im_user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "163588": {
36
+ "content": "<|im_assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "163590": {
44
+ "content": "<|start_header_id|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "163591": {
52
+ "content": "<|end_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "163593": {
60
+ "content": "[EOT]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "163594": {
68
+ "content": "<|im_system|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "163595": {
76
+ "content": "<|tool_calls_section_begin|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "163596": {
84
+ "content": "<|tool_calls_section_end|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "163597": {
92
+ "content": "<|tool_call_begin|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "163598": {
100
+ "content": "<|tool_call_argument_begin|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "163599": {
108
+ "content": "<|tool_call_end|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "163601": {
116
+ "content": "<|im_middle|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "163606": {
124
+ "content": "<think>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "163607": {
132
+ "content": "</think>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "163838": {
140
+ "content": "[UNK]",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "163839": {
148
+ "content": "[PAD]",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ }
155
+ },
156
+ "auto_map": {
157
+ "AutoTokenizer": [
158
+ "tokenization_kimi.TikTokenTokenizer",
159
+ null
160
+ ]
161
+ },
162
+ "backend": "custom",
163
+ "bos_token": "[BOS]",
164
+ "clean_up_tokenization_spaces": false,
165
+ "eos_token": "[EOS]",
166
+ "extra_special_tokens": [
167
+ "<|im_end|>",
168
+ "<|im_user|>",
169
+ "<|im_assistant|>",
170
+ "<|start_header_id|>",
171
+ "<|end_header_id|>",
172
+ "[EOT]",
173
+ "<|im_system|>",
174
+ "<|im_middle|>"
175
+ ],
176
+ "is_local": false,
177
+ "model_max_length": 1000000000000000019884624838656,
178
+ "pad_token": "[PAD]",
179
+ "tokenizer_class": "TikTokenTokenizer",
180
+ "unk_token": "[UNK]"
181
+ }