bigmoyan committed on
Commit
81bcaaa
·
1 Parent(s): 2755962

use-fast-tokenizer (#38)

Browse files

- use fast tokenizer, fix transformers v5 inference issues (100231dd6b04cec5fff2b2f649754f6b760e9476)
- remove slow tokenizer (fce056ba284610b40a8fad680b2492fe0494df1d)

modeling_deepseek.py CHANGED
@@ -44,7 +44,11 @@ from transformers.utils import (add_start_docstrings,
44
  is_flash_attn_2_available,
45
  is_flash_attn_greater_or_equal_2_10, logging,
46
  replace_return_docstrings)
47
- from transformers.utils.import_utils import is_torch_fx_available
 
 
 
 
48
 
49
  from .configuration_deepseek import DeepseekV3Config
50
 
 
44
  is_flash_attn_2_available,
45
  is_flash_attn_greater_or_equal_2_10, logging,
46
  replace_return_docstrings)
47
+ try:
48
+ from transformers.utils.import_utils import is_torch_fx_available
49
+ except ImportError:
50
+ def is_torch_fx_available() -> bool:
51
+ return hasattr(torch, "fx")
52
 
53
  from .configuration_deepseek import DeepseekV3Config
54
 
modeling_kimi_k25.py CHANGED
@@ -64,6 +64,7 @@ from transformers.models.llava.modeling_llava import \
64
  from transformers.utils import is_flash_attn_2_available
65
 
66
  from .configuration_kimi_k25 import KimiK25Config
 
67
  from .modeling_deepseek import DeepseekV3ForCausalLM
68
 
69
  # Flash attention imports
@@ -245,6 +246,39 @@ def get_1d_sincos_pos_embed(embed_dim, t_size, cls_token=False):
245
  axis=0)
246
  return pos_embed
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
  class Learnable2DInterpPosEmbDivided_fixed(nn.Module):
250
 
@@ -636,6 +670,7 @@ class MoonViT3dPretrainedModel(PreTrainedModel):
636
  model_type = 'moonvit3d'
637
  _no_split_modules = ['PackingTransformer']
638
  _supports_flash_attn_2 = True
 
639
  _supports_sdpa = True
640
 
641
  def __init__(self, config, *inputs, **kwargs):
@@ -772,6 +807,7 @@ class KimiK25PreTrainedModel(PreTrainedModel):
772
  ]
773
  _skip_keys_device_placement = "past_key_values"
774
  _supports_flash_attn_2 = True
 
775
  _supports_sdpa = False
776
 
777
  def _init_weights(self, module):
@@ -872,9 +908,10 @@ class KimiK25ForConditionalGeneration(KimiK25PreTrainedModel):
872
 
873
  def get_decoder(self):
874
  return self.language_model.get_decoder()
875
-
876
- def tie_weights(self):
877
- return self.language_model.tie_weights()
 
878
 
879
  def resize_token_embeddings(self,
880
  new_num_tokens: int | None = None,
@@ -1100,42 +1137,43 @@ class KimiK25ForConditionalGeneration(KimiK25PreTrainedModel):
1100
  # generation with cache
1101
  elif (past_key_values is not None and pixel_values is not None
1102
  and input_ids.shape[1] == 1):
1103
- # Retrieve the first layer to inspect the logits and mask out the hidden states
1104
- # that are set to 0
1105
- first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
1106
-
1107
- # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
1108
- batch_index, non_attended_tokens = torch.where(
1109
- first_layer_past_key_value.float().sum(-2) == 0)
1110
-
1111
- # Get the target length
1112
- target_length = input_ids.shape[1]
1113
- past_length = first_layer_past_key_value.shape[-1]
1114
-
1115
- extended_attention_mask = torch.ones(
1116
- (attention_mask.shape[0], past_length),
1117
- dtype=attention_mask.dtype,
1118
- device=attention_mask.device,
1119
- )
1120
-
1121
- # Filter out only the tokens that can be un-attended, this can happen
1122
- # if one uses Llava + Fused modules where the cache on the
1123
- # first iteration is already big enough, or if one passes custom cache
1124
- valid_indices = non_attended_tokens < extended_attention_mask.size(
1125
- -1)
1126
- new_batch_index = batch_index[valid_indices]
1127
- new_non_attended_tokens = non_attended_tokens[valid_indices]
1128
-
1129
- # Zero-out the places where we don't need to attend
1130
- extended_attention_mask[new_batch_index,
1131
- new_non_attended_tokens] = 0
1132
-
1133
- attention_mask = torch.cat(
1134
- (extended_attention_mask, attention_mask[:,
1135
- -target_length:]),
1136
- dim=1)
1137
- position_ids = torch.sum(attention_mask,
1138
- dim=1).unsqueeze(-1) - 1
 
1139
 
1140
  outputs = self.language_model(
1141
  attention_mask=attention_mask,
@@ -1228,6 +1266,13 @@ class KimiK25ForConditionalGeneration(KimiK25PreTrainedModel):
1228
  if past_key_values:
1229
  position_ids = position_ids[:, -input_ids.shape[1]:]
1230
 
 
 
 
 
 
 
 
1231
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1232
  if inputs_embeds is not None and past_key_values is None:
1233
  model_inputs = {"inputs_embeds": inputs_embeds}
 
64
  from transformers.utils import is_flash_attn_2_available
65
 
66
  from .configuration_kimi_k25 import KimiK25Config
67
+ from .configuration_deepseek import DeepseekV3Config
68
  from .modeling_deepseek import DeepseekV3ForCausalLM
69
 
70
  # Flash attention imports
 
246
  axis=0)
247
  return pos_embed
248
 
249
def _first_layer_key_first_token_vector(past_key_values):
    """Return layer 0's cached keys indexed at 0 along the last axis.

    Used for LLaVA-style cache masking. Assuming keys are laid out as
    ``[batch, heads, seq, head_dim]`` (TODO confirm against the attention
    implementation), the result has shape ``[batch, heads, seq]``.

    Three cache layouts are understood:

    * legacy ``tuple``/``list`` of per-layer ``(key, value)`` pairs;
    * cache objects exposing ``.layers`` whose entries carry ``.keys``;
    * cache objects exposing a per-layer ``.key_cache`` list (layout used by
      older ``Cache`` releases -- see the Transformers cache documentation).

    The cache kind is detected structurally rather than with ``isinstance`` so
    the helper does not depend on which cache classes the module imports.

    Args:
        past_key_values: A legacy cache or a cache object as described above.

    Returns:
        The sliced key tensor, or ``None`` when a cache object holds no usable
        layer-0 keys (no layers yet, keys unset, empty, or fewer than 4 dims).
    """
    if isinstance(past_key_values, (tuple, list)):
        # Legacy format: index layer 0, then the key tensor of the pair.
        return past_key_values[0][0][:, :, :, 0]

    layers = getattr(past_key_values, "layers", None)
    if layers is not None:
        keys = getattr(layers[0], "keys", None) if len(layers) > 0 else None
    else:
        # ``key_cache`` is only consulted when ``layers`` is absent, so newer
        # caches never touch the older attribute.
        key_cache = getattr(past_key_values, "key_cache", None)
        if key_cache is None:
            # Not a recognised cache object: fall back to plain subscripting.
            return past_key_values[0][0][:, :, :, 0]
        keys = key_cache[0] if len(key_cache) > 0 else None

    # Placeholders (``None``, non-tensors, empty or low-rank tensors) mean the
    # layer has nothing to inspect yet.
    if keys is None or getattr(keys, "ndim", 0) < 4 or keys.numel() == 0:
        return None
    return keys[:, :, :, 0]
264
+
265
+
266
def _first_layer_past_seq_length(past_key_values):
    """Return the layer-0 KV cache sequence length, or ``None`` if unknown.

    This is a best-effort probe: any expected lookup or shape failure yields
    ``None`` so the caller can fall back to another source for the length.

    Args:
        past_key_values: Either a cache object offering
            ``get_seq_length(layer_idx)`` or a legacy sequence of per-layer
            ``(key, value)`` pairs whose keys are BHSD tensors
            (``shape[2] == seq_len``).

    Returns:
        The sequence length as an ``int``, or ``None`` when it cannot be
        determined.
    """
    # Cache objects are recognised by their ``get_seq_length`` method; legacy
    # tuples/lists do not have one.
    get_seq_length = getattr(past_key_values, "get_seq_length", None)
    if callable(get_seq_length):
        try:
            return int(get_seq_length(0))
        except (AttributeError, IndexError, KeyError, TypeError, ValueError,
                NotImplementedError):
            # Empty or partially initialised cache: length is not available.
            return None

    try:
        first_keys = past_key_values[0][0]
    except (IndexError, KeyError, TypeError):
        return None
    # Keys need at least three dims for ``shape[2]`` to be the sequence axis.
    if first_keys is None or getattr(first_keys, "ndim", 0) < 3:
        return None
    return int(first_keys.shape[2])
281
+
282
 
283
  class Learnable2DInterpPosEmbDivided_fixed(nn.Module):
284
 
 
670
  model_type = 'moonvit3d'
671
  _no_split_modules = ['PackingTransformer']
672
  _supports_flash_attn_2 = True
673
+ _supports_flash_attn = True
674
  _supports_sdpa = True
675
 
676
  def __init__(self, config, *inputs, **kwargs):
 
807
  ]
808
  _skip_keys_device_placement = "past_key_values"
809
  _supports_flash_attn_2 = True
810
+ _supports_flash_attn = True
811
  _supports_sdpa = False
812
 
813
  def _init_weights(self, module):
 
908
 
909
  def get_decoder(self):
910
  return self.language_model.get_decoder()
911
+
912
+ def tie_weights(self, *args, **kwargs):
913
+ # Transformers >=5 calls ``tie_weights`` with extra arguments (``missing_keys`` / ``recompute_mapping``);
+ # accept anything and forward it unchanged. Only the text backbone (``self.language_model``) is asked to tie.
914
+ return self.language_model.tie_weights(*args, **kwargs)
915
 
916
  def resize_token_embeddings(self,
917
  new_num_tokens: int | None = None,
 
1137
  # generation with cache
1138
  elif (past_key_values is not None and pixel_values is not None
1139
  and input_ids.shape[1] == 1):
1140
+ first_layer_past_key_value = _first_layer_key_first_token_vector(
1141
+ past_key_values)
1142
+ if first_layer_past_key_value is not None:
1143
+ # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
1144
+ batch_index, non_attended_tokens = torch.where(
1145
+ first_layer_past_key_value.float().sum(-2) == 0)
1146
+
1147
+ # Get the target length
1148
+ target_length = input_ids.shape[1]
1149
+ past_length = _first_layer_past_seq_length(past_key_values)
1150
+ if past_length is None:
1151
+ past_length = int(first_layer_past_key_value.shape[-1])
1152
+
1153
+ extended_attention_mask = torch.ones(
1154
+ (attention_mask.shape[0], past_length),
1155
+ dtype=attention_mask.dtype,
1156
+ device=attention_mask.device,
1157
+ )
1158
+
1159
+ # Filter out only the tokens that can be un-attended, this can happen
1160
+ # if one uses Llava + Fused modules where the cache on the
1161
+ # first iteration is already big enough, or if one passes custom cache
1162
+ valid_indices = non_attended_tokens < extended_attention_mask.size(
1163
+ -1)
1164
+ new_batch_index = batch_index[valid_indices]
1165
+ new_non_attended_tokens = non_attended_tokens[valid_indices]
1166
+
1167
+ # Zero-out the places where we don't need to attend
1168
+ extended_attention_mask[new_batch_index,
1169
+ new_non_attended_tokens] = 0
1170
+
1171
+ attention_mask = torch.cat(
1172
+ (extended_attention_mask, attention_mask[:,
1173
+ -target_length:]),
1174
+ dim=1)
1175
+ position_ids = torch.sum(attention_mask,
1176
+ dim=1).unsqueeze(-1) - 1
1177
 
1178
  outputs = self.language_model(
1179
  attention_mask=attention_mask,
 
1266
  if past_key_values:
1267
  position_ids = position_ids[:, -input_ids.shape[1]:]
1268
 
1269
+ # Generation (especially transformers v5) may supply ``position_ids`` for the full sequence while
1270
+ # ``input_ids`` here is only the new suffix (e.g. length 1). RoPE must index with the current step length.
1271
+ if past_key_values is not None and position_ids is not None:
1272
+ cur_len = input_ids.shape[1]
1273
+ if position_ids.shape[-1] > cur_len:
1274
+ position_ids = position_ids[..., -cur_len:]
1275
+
1276
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1277
  if inputs_embeds is not None and past_key_values is None:
1278
  model_inputs = {"inputs_embeds": inputs_embeds}
tokenization_kimi.py DELETED
@@ -1,353 +0,0 @@
1
- import os
2
- from collections import OrderedDict
3
- from logging import getLogger
4
- from pathlib import Path
5
- from shutil import copyfile
6
- from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
7
-
8
- import tiktoken
9
- from tiktoken.load import load_tiktoken_bpe
10
- from tokenizers import AddedToken
11
- from transformers.convert_slow_tokenizer import bytes_to_unicode
12
- from transformers.tokenization_utils import PreTrainedTokenizer
13
-
14
- from .tool_declaration_ts import encode_tools_to_typescript_style
15
-
16
- logger = getLogger(__name__)
17
- VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
18
-
19
-
20
- class TikTokenTokenizer(PreTrainedTokenizer):
21
- """
22
- Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
23
-
24
- This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
25
- this superclass for more information regarding those methods.
26
-
27
- Args:
28
- vocab_file (`str`):
29
- The path to the Tiktoken model file.
30
- bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>",`):
31
- The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
32
- eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
33
- The end of sequence token.
34
- unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
35
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
36
- token instead. The second to last item in special_tokens.
37
- pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
38
- The token used for padding, for example when batching sequences of different lengths.
39
- additional_special_tokens (list of `str`, *optional*):
40
- A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
41
- skipped when decoding if `skip_special_tokens` is set to `True`.
42
- """
43
-
44
- vocab_files_names = VOCAB_FILES_NAMES
45
-
46
- model_input_names = ["input_ids", "attention_mask"]
47
-
48
- special_tokens: Dict[str, int]
49
-
50
- num_reserved_special_tokens = 256
51
-
52
- pat_str = "|".join([
53
- r"""[\p{Han}]+""",
54
- r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
55
- r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
56
- r"""\p{N}{1,3}""",
57
- r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
58
- r"""\s*[\r\n]+""",
59
- r"""\s+(?!\S)""",
60
- r"""\s+""",
61
- ])
62
-
63
- def __init__(
64
- self,
65
- vocab_file,
66
- bos_token: Union[str, AddedToken] = "[BOS]",
67
- eos_token: Union[str, AddedToken] = "[EOS]",
68
- unk_token: Union[str, AddedToken, None] = None,
69
- pad_token: Union[str, AddedToken, None] = None,
70
- additional_special_tokens: List[str] = None,
71
- added_tokens_decoder: Optional[dict] = None,
72
- **kwargs,
73
- ):
74
- assert os.path.isfile(vocab_file), vocab_file
75
-
76
- if additional_special_tokens is None:
77
- additional_special_tokens = [
78
- "<|im_end|>",
79
- "<|im_user|>",
80
- "<|im_assistant|>",
81
- "<|start_header_id|>",
82
- "<|end_header_id|>",
83
- "[EOT]",
84
- "<|im_system|>",
85
- "<|im_middle|>",
86
- ]
87
-
88
- if added_tokens_decoder:
89
- special_tokens_mapping = {
90
- i: added_tokens_decoder[i].content
91
- for i in added_tokens_decoder
92
- }
93
- else:
94
- special_tokens_mapping = {}
95
-
96
- self.vocab_file = vocab_file
97
- mergeable_ranks = load_tiktoken_bpe(vocab_file)
98
- num_base_tokens = len(mergeable_ranks)
99
- self.special_tokens = {
100
- special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
101
- for i in range(num_base_tokens, num_base_tokens +
102
- self.num_reserved_special_tokens)
103
- }
104
-
105
- self.model = tiktoken.Encoding(
106
- name=Path(vocab_file).name,
107
- pat_str=self.pat_str,
108
- mergeable_ranks=mergeable_ranks,
109
- special_tokens=self.special_tokens,
110
- )
111
- logger.info(f"Reloaded tiktoken model from {vocab_file}")
112
-
113
- self.n_words: int = self.model.n_vocab
114
- # BOS / EOS token IDs
115
- self.bos_id: int = self.special_tokens[str(bos_token)]
116
- self.eos_id: int = self.special_tokens[str(eos_token)]
117
- logger.info(
118
- f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
119
- )
120
-
121
- self.pad_id: int = self.special_tokens[str(pad_token)]
122
- self.unk_id: int = self.special_tokens[str(unk_token)]
123
-
124
- self.byte_encoder = bytes_to_unicode()
125
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
126
-
127
- self.decoder = {}
128
- for i in range(self.n_words):
129
- # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
130
- decoding = ''.join([
131
- self.byte_encoder[ord(char)] for char in
132
- self.model.decode_single_token_bytes(i).decode('latin-1')
133
- ])
134
- self.decoder[i] = decoding
135
-
136
- self.encoder = {}
137
- for i in range(self.n_words):
138
- if i in self.decoder:
139
- self.encoder[self.decoder[i]] = i
140
-
141
- self._token_config_cache = OrderedDict()
142
- self._cache_max_size = 128
143
-
144
- super().__init__(
145
- bos_token=bos_token,
146
- eos_token=eos_token,
147
- unk_token=unk_token,
148
- pad_token=pad_token,
149
- additional_special_tokens=additional_special_tokens,
150
- added_tokens_decoder=added_tokens_decoder,
151
- **kwargs,
152
- )
153
- self.all_special_ids_set = set(self.all_special_ids)
154
-
155
- def encode(self,
156
- text: str,
157
- allow_special_tokens: bool = True,
158
- **kwargs) -> List[int]:
159
- """
160
- Encodes a string into a list of token IDs.
161
-
162
- Args:
163
- text (str): The input string to be encoded.
164
-
165
- Returns:
166
- list[int]: A list of token IDs.
167
- """
168
- # If there are other args, we should call super().encode because there are a lot of code
169
- # to handle those args. supper().encode finally will call _tokenize and _convert_token_to_id.
170
- # NOTE: our encode method is not compatible with the super().encode method,
171
- # e.g. split_special_tokens' default is True in our encode method.
172
- if len(kwargs) > 0:
173
- logger.warning(f"Calling super().encode with {kwargs}")
174
- return super().encode(text, **kwargs)
175
-
176
- assert type(text) is str
177
-
178
- # The tiktoken tokenizer can handle <=400k chars without
179
- # pyo3_runtime.PanicException.
180
- TIKTOKEN_MAX_ENCODE_CHARS = 400_000
181
-
182
- # https://github.com/openai/tiktoken/issues/195
183
- # Here we iterate over subsequences and split if we exceed the limit
184
- # of max consecutive non-whitespace or whitespace characters.
185
- MAX_NO_WHITESPACES_CHARS = 25_000
186
-
187
- texts = self.pre_tokenizer_process(text)
188
-
189
- all_substrs = []
190
- for text in texts:
191
- substrs = (
192
- substr for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
193
- for substr in self._split_whitespaces_or_nonwhitespaces(
194
- text[i:i +
195
- TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS))
196
- all_substrs.extend(substrs)
197
-
198
- t: List[int] = []
199
- for substr in all_substrs:
200
- if allow_special_tokens:
201
- t.extend(
202
- # we should consider special token as a common token
203
- self.model.encode(
204
- substr,
205
- allowed_special="all",
206
- ))
207
- else:
208
- t.extend(
209
- # we should consider special token as a common token
210
- self.model.encode(
211
- substr,
212
- disallowed_special=(),
213
- ))
214
-
215
- return t
216
-
217
- def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
218
- """
219
- Decodes a list of token IDs into a string.
220
-
221
- Args:
222
- token_ids (List[int]): The list of token IDs to be decoded.
223
-
224
- Returns:
225
- str: The decoded string.
226
- """
227
- # If there are other args, we should call super().decode because there are a lot of code
228
- # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token.
229
- if len(kwargs) > 0:
230
- return super().decode(token_ids, **kwargs)
231
-
232
- if type(token_ids) is int:
233
- token_ids = [token_ids]
234
-
235
- return self.model.decode(cast(List[int], token_ids))
236
-
237
- @staticmethod
238
- def _split_whitespaces_or_nonwhitespaces(
239
- s: str, max_consecutive_slice_len: int) -> Iterator[str]:
240
- """
241
- Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
242
- consecutive whitespaces or consecutive non-whitespaces.
243
- """
244
- current_slice_len = 0
245
- current_slice_is_space = s[0].isspace() if len(s) > 0 else False
246
- slice_start = 0
247
-
248
- for i in range(len(s)):
249
- is_now_space = s[i].isspace()
250
-
251
- if current_slice_is_space ^ is_now_space:
252
- current_slice_len = 1
253
- current_slice_is_space = is_now_space
254
- else:
255
- current_slice_len += 1
256
- if current_slice_len > max_consecutive_slice_len:
257
- yield s[slice_start:i]
258
- slice_start = i
259
- current_slice_len = 1
260
- yield s[slice_start:]
261
-
262
- def pre_tokenizer_process(self, text: str) -> List[str]:
263
- """
264
- pre-tokenizes the input text into a list of tokens.
265
- This method is used to split the input text into smaller chunks for internal processing.
266
- """
267
- return [text]
268
-
269
- """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
270
-
271
- @property
272
- def vocab_size(self) -> int:
273
- return self.n_words
274
-
275
- def get_vocab(self) -> Dict[str, int]:
276
- return self.encoder
277
-
278
- def _tokenize(self, text: str, **kwargs) -> List[str]:
279
- return [self.decoder[t] for t in self.encode(text)]
280
-
281
- def _convert_token_to_id(self, token: str) -> int:
282
- return self.encoder.get(token, self.unk_id)
283
-
284
- def _convert_id_to_token(self, index: int) -> str:
285
- return self.decoder.get(index)
286
-
287
- @staticmethod
288
- def clean_up_tokenization(out_string: str) -> str:
289
- return out_string
290
-
291
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
292
- text = ''.join(tokens)
293
- text = bytearray([self.byte_decoder[c]
294
- for c in text]).decode('utf-8', 'replace')
295
- return text
296
-
297
- def save_vocabulary(self,
298
- save_directory: str,
299
- filename_prefix: Optional[str] = None) -> Tuple[str]:
300
- if not os.path.isdir(save_directory):
301
- raise ValueError(
302
- f"vocabulary path ({save_directory}) should be a directory")
303
- out_vocab_file = os.path.join(
304
- save_directory,
305
- (filename_prefix + "-" if filename_prefix else "") +
306
- VOCAB_FILES_NAMES["vocab_file"])
307
-
308
- if os.path.abspath(self.vocab_file) != os.path.abspath(
309
- out_vocab_file) and os.path.isfile(self.vocab_file):
310
- copyfile(self.vocab_file, out_vocab_file)
311
-
312
- return (out_vocab_file, )
313
-
314
- def apply_chat_template(self,
315
- conversation,
316
- tools: Optional[list[dict]] = None,
317
- tokenize: bool = False,
318
- add_generation_prompt: bool = True,
319
- thinking: bool = True,
320
- preserve_thinking: bool = False,
321
- **kwargs):
322
-
323
- tools = deep_sort_dict(tools)
324
-
325
- # Convert tools to TypeScript style string if tools are provided
326
- tools_ts_str = None
327
- if tools:
328
- try:
329
- tools_ts_str = encode_tools_to_typescript_style(tools)
330
-
331
- except Exception as e:
332
- print(f"Failed to convert tools to TypeScript style: {e}")
333
- tools_ts_str = None
334
-
335
- # Store the TypeScript string in kwargs so it can be accessed by the template
336
- if tools_ts_str is not None:
337
- kwargs['tools_ts_str'] = tools_ts_str
338
- return super().apply_chat_template(
339
- conversation,
340
- tools=tools,
341
- tokenize=tokenize,
342
- add_generation_prompt=add_generation_prompt,
343
- thinking=thinking,
344
- preserve_thinking=preserve_thinking,
345
- **kwargs)
346
-
347
-
348
- def deep_sort_dict(obj: Any) -> Any:
349
- if isinstance(obj, dict):
350
- return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
351
- if isinstance(obj, list):
352
- return [deep_sort_dict(item) for item in obj]
353
- return obj
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenization_kimi_fast.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+
4
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
5
+
6
+ from .tool_declaration_ts import encode_tools_to_typescript_style
7
+
8
+
9
class TikTokenTokenizerFast(PreTrainedTokenizerFast):
    """Fast tokenizer backed by ``tokenizer.json``, with TypeScript-style tools.

    The ``tiktoken.model`` file is tracked only so that
    :meth:`save_vocabulary` can copy it next to the saved ``tokenizer.json``.
    :meth:`apply_chat_template` renders ``tools`` into a TypeScript-style
    declaration string and hands it to the template as ``tools_ts_str``.
    """

    vocab_files_names = {
        "tokenizer_file": "tokenizer.json",
        "vocab_file": "tiktoken.model",
    }
    model_input_names = ["input_ids", "attention_mask"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        """Load the tokenizer, recording the requested location as ``model_root``.

        ``model_root`` is forwarded to ``__init__`` so it can look for
        ``tokenizer.json`` / ``tiktoken.model`` under the original path.
        """
        kwargs["model_root"] = str(pretrained_model_name_or_path)
        return super().from_pretrained(pretrained_model_name_or_path, *inputs,
                                       **kwargs)

    def __init__(
        self,
        tokenizer_file=None,
        vocab_file=None,
        model_root=None,
        bos_token="[BOS]",
        eos_token="[EOS]",
        unk_token="[UNK]",
        pad_token="[PAD]",
        **kwargs,
    ):
        """Build the tokenizer from ``tokenizer.json``.

        Args:
            tokenizer_file: Path to ``tokenizer.json``. Used when the files
                are not found under ``model_root``.
            vocab_file: Path to ``tiktoken.model``. Optional on the
                ``tokenizer_file`` fallback path.
            model_root: Directory expected to contain both files. It takes
                precedence when both files exist there.
            bos_token, eos_token, unk_token, pad_token: Special tokens
                forwarded to the base class.
            **kwargs: Forwarded to the base class.

        Raises:
            ValueError: If no usable ``tokenizer.json`` can be located.
        """
        root_tokenizer_file = root_vocab_file = None
        if model_root is not None:
            root_tokenizer_file = os.path.join(model_root, "tokenizer.json")
            root_vocab_file = os.path.join(model_root, "tiktoken.model")

        if (root_tokenizer_file is not None
                and os.path.isfile(root_tokenizer_file)
                and os.path.isfile(root_vocab_file)):
            # Preferred layout: both files live under the requested root.
            tokenizer_file, vocab_file = root_tokenizer_file, root_vocab_file
            tokenizer_dir = model_root
        elif tokenizer_file is not None and os.path.isfile(tokenizer_file):
            # ``model_root`` may not be a local directory (e.g. when it is a
            # Hub repo id), so fall back to the explicitly supplied paths
            # instead of failing.
            tokenizer_dir = os.path.dirname(os.path.abspath(tokenizer_file))
        elif model_root is None:
            raise ValueError("model_root is required")
        else:
            raise ValueError(f"Missing tokenizer files under: {model_root}")

        self._tokenizer_dir = tokenizer_dir
        super().__init__(
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )
        self.vocab_file = vocab_file

    @property
    def vocab_size(self) -> int:
        """Return the vocabulary size reported by the backend tokenizer."""
        return self.backend_tokenizer.get_vocab_size()

    def _sort_tools(self, tools):
        """Recursively sort dict keys in ``tools`` for deterministic output."""
        if isinstance(tools, dict):
            return {k: self._sort_tools(v) for k, v in sorted(tools.items())}
        if isinstance(tools, list):
            return [self._sort_tools(item) for item in tools]
        return tools

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: Optional[str] = None) -> tuple:
        """Write ``tokenizer.json`` (and ``tiktoken.model`` if available).

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix, joined to file names with ``-``.

        Returns:
            Tuple of written paths: ``tokenizer.json`` first, then
            ``tiktoken.model`` when the source file exists.

        Raises:
            ValueError: If ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(
                f"Vocabulary path ({save_directory}) should be a directory")

        prefix = filename_prefix + "-" if filename_prefix else ""

        # Save tokenizer.json
        tokenizer_file = os.path.join(save_directory,
                                      prefix + "tokenizer.json")
        self.backend_tokenizer.save(tokenizer_file)

        # Also copy tiktoken.model if available
        vocab_files = []
        if self.vocab_file and os.path.isfile(self.vocab_file):
            vocab_file = os.path.join(save_directory,
                                      prefix + "tiktoken.model")
            # Skip the copy when saving back onto the source file.
            if os.path.abspath(self.vocab_file) != os.path.abspath(vocab_file):
                import shutil
                shutil.copy(self.vocab_file, vocab_file)
            vocab_files.append(vocab_file)

        return (tokenizer_file, ) + tuple(vocab_files)

    def apply_chat_template(self,
                            conversation,
                            tools=None,
                            tokenize=False,
                            add_generation_prompt=True,
                            thinking: bool = True,
                            preserve_thinking: bool = False,
                            **kwargs):
        """Apply the chat template with TypeScript tools support.

        ``tools`` are deep-sorted, then rendered to a TypeScript-style string
        that is passed to the template as ``tools_ts_str``. If the rendering
        fails, a warning is logged and the template is applied without it.
        All other arguments are forwarded to the base implementation.
        """
        tools = self._sort_tools(tools)

        # Convert tools to TypeScript style string if tools are provided
        tools_ts_str = None
        if tools:
            try:
                tools_ts_str = encode_tools_to_typescript_style(tools)
            except Exception as e:
                # Best effort: the template can still run without the string.
                import logging
                logging.getLogger(__name__).warning(
                    "Failed to convert tools to TypeScript style: %s", e)
                tools_ts_str = None

        # Store the TypeScript string in kwargs so it can be accessed by the template
        if tools_ts_str is not None:
            kwargs['tools_ts_str'] = tools_ts_str
        return super().apply_chat_template(
            conversation,
            tools=tools,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            thinking=thinking,
            preserve_thinking=preserve_thinking,
            **kwargs)
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ec7040095cadc25269b917f95ba026e1b2b7b2e5c0540ce0a9afe8afb06d2e
3
+ size 19591764
tokenizer_config.json CHANGED
@@ -205,12 +205,12 @@
205
  "extra_special_tokens": {},
206
  "model_max_length": 1000000000000000019884624838656,
207
  "pad_token": "[PAD]",
208
- "tokenizer_class": "TikTokenTokenizer",
209
  "unk_token": "[UNK]",
 
210
  "auto_map": {
211
  "AutoTokenizer": [
212
- "tokenization_kimi.TikTokenTokenizer",
213
- null
214
  ]
215
  }
216
  }
 
205
  "extra_special_tokens": {},
206
  "model_max_length": 1000000000000000019884624838656,
207
  "pad_token": "[PAD]",
 
208
  "unk_token": "[UNK]",
209
+ "tokenizer_class": "TikTokenTokenizerFast",
210
  "auto_map": {
211
  "AutoTokenizer": [
212
+ null,
213
+ "tokenization_kimi_fast.TikTokenTokenizerFast"
214
  ]
215
  }
216
  }