Instructions to use moonshotai/Kimi-K2.6 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use moonshotai/Kimi-K2.6 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# trust_remote_code=True lets Transformers run the custom Python code shipped in the model repository.
pipe = pipeline("image-text-to-text", model="moonshotai/Kimi-K2.6", trust_remote_code=True)

# A single user turn: one image (referenced by URL) followed by a question about it.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]

# Keep and print the result: a bare `pipe(...)` expression shows nothing when run as a script.
result = pipe(text=messages)
print(result)

# Load model directly
from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained("moonshotai/Kimi-K2.6", trust_remote_code=True)
model = AutoModel.from_pretrained("moonshotai/Kimi-K2.6", trust_remote_code=True)

# A single user turn: one image (referenced by URL) followed by a question about it.
image_part = {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}
text_part = {"type": "text", "text": "What animal is on the candy?"}
messages = [{"role": "user", "content": [image_part, text_part]}]

# Render the chat template to tensors, then move them to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Slice off the prompt tokens so that only the newly generated tokens are decoded.
prompt_length = inputs["input_ids"].shape[-1]
print(processor.decode(outputs[0][prompt_length:]))

Inference
HuggingChat
Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use moonshotai/Kimi-K2.6 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server (the curl request below expects it at localhost:8000):
vllm serve "moonshotai/Kimi-K2.6"
# Call the server using curl (OpenAI-compatible API).
# The request sends one user message made of a text prompt and an image URL:
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "moonshotai/Kimi-K2.6",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/moonshotai/Kimi-K2.6

SGLang

How to use moonshotai/Kimi-K2.6 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server, bound to 0.0.0.0 on port 30000:
python3 -m sglang.launch_server \
    --model-path "moonshotai/Kimi-K2.6" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API).
# The request sends one user message made of a text prompt and an image URL:
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "moonshotai/Kimi-K2.6",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest container.
# The local Hugging Face cache is mounted into the container and port 30000 is published.
# NOTE: "<secret>" is a placeholder -- presumably to be replaced with your own Hugging Face token.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "moonshotai/Kimi-K2.6" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API).
# The request sends one user message made of a text prompt and an image URL:
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "moonshotai/Kimi-K2.6",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use moonshotai/Kimi-K2.6 with Docker Model Runner:
```
docker model run hf.co/moonshotai/Kimi-K2.6
```

bigmoyan committed on May 11

Commit

81bcaaa

1 Parent(s): 2755962

use-fast-tokenizer (#38)

Browse files

- use fast tokenizer, fix transformers v5 inference issues (100231dd6b04cec5fff2b2f649754f6b760e9476)
- remove slow tokenizer (fce056ba284610b40a8fad680b2492fe0494df1d)

Files changed (6) hide show

modeling_deepseek.py +5 -1
modeling_kimi_k25.py +84 -39
tokenization_kimi.py +0 -353
tokenization_kimi_fast.py +124 -0
tokenizer.json +3 -0
tokenizer_config.json +3 -3

modeling_deepseek.py CHANGED Viewed

@@ -44,7 +44,11 @@ from transformers.utils import (add_start_docstrings,
                                 is_flash_attn_2_available,
                                 is_flash_attn_greater_or_equal_2_10, logging,
                                 replace_return_docstrings)
-from transformers.utils.import_utils import is_torch_fx_available
 from .configuration_deepseek import DeepseekV3Config

                                 is_flash_attn_2_available,
                                 is_flash_attn_greater_or_equal_2_10, logging,
                                 replace_return_docstrings)
+try:
+    from transformers.utils.import_utils import is_torch_fx_available
+except ImportError:
+    def is_torch_fx_available() -> bool:
+        return hasattr(torch, "fx")
 from .configuration_deepseek import DeepseekV3Config

modeling_kimi_k25.py CHANGED Viewed

@@ -64,6 +64,7 @@ from transformers.models.llava.modeling_llava import \
 from transformers.utils import is_flash_attn_2_available
 from .configuration_kimi_k25 import KimiK25Config
 from .modeling_deepseek import DeepseekV3ForCausalLM
 # Flash attention imports
@@ -245,6 +246,39 @@ def get_1d_sincos_pos_embed(embed_dim, t_size, cls_token=False):
                                    axis=0)
     return pos_embed
 class Learnable2DInterpPosEmbDivided_fixed(nn.Module):
@@ -636,6 +670,7 @@ class MoonViT3dPretrainedModel(PreTrainedModel):
     model_type = 'moonvit3d'
     _no_split_modules = ['PackingTransformer']
     _supports_flash_attn_2 = True
     _supports_sdpa = True
     def __init__(self, config, *inputs, **kwargs):
@@ -772,6 +807,7 @@ class KimiK25PreTrainedModel(PreTrainedModel):
     ]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = False
     def _init_weights(self, module):
@@ -872,9 +908,10 @@ class KimiK25ForConditionalGeneration(KimiK25PreTrainedModel):
     def get_decoder(self):
         return self.language_model.get_decoder()
-    def tie_weights(self):
-        return self.language_model.tie_weights()
     def resize_token_embeddings(self,
                                 new_num_tokens: int | None = None,
@@ -1100,42 +1137,43 @@ class KimiK25ForConditionalGeneration(KimiK25PreTrainedModel):
             # generation with cache
             elif (past_key_values is not None and pixel_values is not None
                   and input_ids.shape[1] == 1):
-                # Retrieve the first layer to inspect the logits and mask out the hidden states
-                # that are set to 0
-                first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
-                # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-                batch_index, non_attended_tokens = torch.where(
-                    first_layer_past_key_value.float().sum(-2) == 0)
-                # Get the target length
-                target_length = input_ids.shape[1]
-                past_length = first_layer_past_key_value.shape[-1]
-                extended_attention_mask = torch.ones(
-                    (attention_mask.shape[0], past_length),
-                    dtype=attention_mask.dtype,
-                    device=attention_mask.device,
-                )
-                # Filter out only the tokens that can be un-attended, this can happen
-                # if one uses Llava + Fused modules where the cache on the
-                # first iteration is already big enough, or if one passes custom cache
-                valid_indices = non_attended_tokens < extended_attention_mask.size(
-                    -1)
-                new_batch_index = batch_index[valid_indices]
-                new_non_attended_tokens = non_attended_tokens[valid_indices]
-                # Zero-out the places where we don't need to attend
-                extended_attention_mask[new_batch_index,
-                                        new_non_attended_tokens] = 0
-                attention_mask = torch.cat(
-                    (extended_attention_mask, attention_mask[:,
-                                                             -target_length:]),
-                    dim=1)
-                position_ids = torch.sum(attention_mask,
-                                         dim=1).unsqueeze(-1) - 1
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -1228,6 +1266,13 @@ class KimiK25ForConditionalGeneration(KimiK25PreTrainedModel):
             if past_key_values:
                 position_ids = position_ids[:, -input_ids.shape[1]:]
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}

 from transformers.utils import is_flash_attn_2_available
 from .configuration_kimi_k25 import KimiK25Config
+from .configuration_deepseek import DeepseekV3Config
 from .modeling_deepseek import DeepseekV3ForCausalLM
 # Flash attention imports
                                    axis=0)
     return pos_embed
+def _first_layer_key_first_token_vector(past_key_values):
+    """``past_key_values[0][0][..., 0]`` for LLaVA-style cache masking (shape ``[batch, heads, seq]``).
+    Legacy caches are ``list`` of ``(key, value)`` per layer. Transformers v4.36+ / v5 use ``Cache`` (e.g.
+    ``DynamicCache``) with per-layer ``.keys`` tensors instead of subscripting ``[0][0]``.
+    """
+    if isinstance(past_key_values, Cache):
+        layers = getattr(past_key_values, "layers", None) or []
+        if not layers:
+            return None
+        layer0 = layers[0]
+        keys = getattr(layer0, "keys", None)
+        if keys is None or keys.numel() == 0 or keys.ndim < 4:
+            return None
+        return keys[:, :, :, 0]
+    return past_key_values[0][0][:, :, :, 0]
+def _first_layer_past_seq_length(past_key_values):
+    """Layer-0 KV cache sequence length (BHSD keys: ``shape[2] == seq_len``).
+    """
+    if isinstance(past_key_values, Cache):
+        try:
+            return int(past_key_values.get_seq_length(0))
+        except Exception:
+            return None
+    try:
+        k0 = past_key_values[0][0]
+        if k0 is None or k0.ndim < 3:
+            return None
+        return int(k0.shape[2])
+    except Exception:
+        return None
 class Learnable2DInterpPosEmbDivided_fixed(nn.Module):
     model_type = 'moonvit3d'
     _no_split_modules = ['PackingTransformer']
     _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_sdpa = True
     def __init__(self, config, *inputs, **kwargs):
     ]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_sdpa = False
     def _init_weights(self, module):
     def get_decoder(self):
         return self.language_model.get_decoder()
+    def tie_weights(self, *args, **kwargs):
+        # Transformers >=5 passes ``missing_keys`` / ``recompute_mapping``; forward for the text backbone only.
+        return self.language_model.tie_weights(*args, **kwargs)
     def resize_token_embeddings(self,
                                 new_num_tokens: int | None = None,
             # generation with cache
             elif (past_key_values is not None and pixel_values is not None
                   and input_ids.shape[1] == 1):
+                first_layer_past_key_value = _first_layer_key_first_token_vector(
+                    past_key_values)
+                if first_layer_past_key_value is not None:
+                    # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                    batch_index, non_attended_tokens = torch.where(
+                        first_layer_past_key_value.float().sum(-2) == 0)
+                    # Get the target length
+                    target_length = input_ids.shape[1]
+                    past_length = _first_layer_past_seq_length(past_key_values)
+                    if past_length is None:
+                        past_length = int(first_layer_past_key_value.shape[-1])
+                    extended_attention_mask = torch.ones(
+                        (attention_mask.shape[0], past_length),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    # Filter out only the tokens that can be un-attended, this can happen
+                    # if one uses Llava + Fused modules where the cache on the
+                    # first iteration is already big enough, or if one passes custom cache
+                    valid_indices = non_attended_tokens < extended_attention_mask.size(
+                        -1)
+                    new_batch_index = batch_index[valid_indices]
+                    new_non_attended_tokens = non_attended_tokens[valid_indices]
+                    # Zero-out the places where we don't need to attend
+                    extended_attention_mask[new_batch_index,
+                                            new_non_attended_tokens] = 0
+                    attention_mask = torch.cat(
+                        (extended_attention_mask, attention_mask[:,
+                                                                 -target_length:]),
+                        dim=1)
+                    position_ids = torch.sum(attention_mask,
+                                             dim=1).unsqueeze(-1) - 1
         outputs = self.language_model(
             attention_mask=attention_mask,
             if past_key_values:
                 position_ids = position_ids[:, -input_ids.shape[1]:]
+        # Generation (especially transformers v5) may supply ``position_ids`` for the full sequence while
+        # ``input_ids`` here is only the new suffix (e.g. length 1). RoPE must index with the current step length.
+        if past_key_values is not None and position_ids is not None:
+            cur_len = input_ids.shape[1]
+            if position_ids.shape[-1] > cur_len:
+                position_ids = position_ids[..., -cur_len:]
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}

tokenization_kimi.py DELETED Viewed

@@ -1,353 +0,0 @@
-import os
-from collections import OrderedDict
-from logging import getLogger
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
-import tiktoken
-from tiktoken.load import load_tiktoken_bpe
-from tokenizers import AddedToken
-from transformers.convert_slow_tokenizer import bytes_to_unicode
-from transformers.tokenization_utils import PreTrainedTokenizer
-from .tool_declaration_ts import encode_tools_to_typescript_style
-logger = getLogger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
-class TikTokenTokenizer(PreTrainedTokenizer):
-    """
-    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-    Args:
-        vocab_file (`str`):
-            The path to the Tiktoken model file.
-        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>",`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
-            The end of sequence token.
-        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead. The second to last item in special_tokens.
-        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        additional_special_tokens (list of `str`, *optional*):
-            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
-            skipped when decoding if `skip_special_tokens` is set to `True`.
-    """
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    special_tokens: Dict[str, int]
-    num_reserved_special_tokens = 256
-    pat_str = "|".join([
-        r"""[\p{Han}]+""",
-        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
-        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
-        r"""\p{N}{1,3}""",
-        r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
-        r"""\s*[\r\n]+""",
-        r"""\s+(?!\S)""",
-        r"""\s+""",
-    ])
-    def __init__(
-        self,
-        vocab_file,
-        bos_token: Union[str, AddedToken] = "[BOS]",
-        eos_token: Union[str, AddedToken] = "[EOS]",
-        unk_token: Union[str, AddedToken, None] = None,
-        pad_token: Union[str, AddedToken, None] = None,
-        additional_special_tokens: List[str] = None,
-        added_tokens_decoder: Optional[dict] = None,
-        **kwargs,
-    ):
-        assert os.path.isfile(vocab_file), vocab_file
-        if additional_special_tokens is None:
-            additional_special_tokens = [
-                "<|im_end|>",
-                "<|im_user|>",
-                "<|im_assistant|>",
-                "<|start_header_id|>",
-                "<|end_header_id|>",
-                "[EOT]",
-                "<|im_system|>",
-                "<|im_middle|>",
-            ]
-        if added_tokens_decoder:
-            special_tokens_mapping = {
-                i: added_tokens_decoder[i].content
-                for i in added_tokens_decoder
-            }
-        else:
-            special_tokens_mapping = {}
-        self.vocab_file = vocab_file
-        mergeable_ranks = load_tiktoken_bpe(vocab_file)
-        num_base_tokens = len(mergeable_ranks)
-        self.special_tokens = {
-            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
-            for i in range(num_base_tokens, num_base_tokens +
-                           self.num_reserved_special_tokens)
-        }
-        self.model = tiktoken.Encoding(
-            name=Path(vocab_file).name,
-            pat_str=self.pat_str,
-            mergeable_ranks=mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        logger.info(f"Reloaded tiktoken model from {vocab_file}")
-        self.n_words: int = self.model.n_vocab
-        # BOS / EOS token IDs
-        self.bos_id: int = self.special_tokens[str(bos_token)]
-        self.eos_id: int = self.special_tokens[str(eos_token)]
-        logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
-        )
-        self.pad_id: int = self.special_tokens[str(pad_token)]
-        self.unk_id: int = self.special_tokens[str(unk_token)]
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        self.decoder = {}
-        for i in range(self.n_words):
-            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
-            decoding = ''.join([
-                self.byte_encoder[ord(char)] for char in
-                self.model.decode_single_token_bytes(i).decode('latin-1')
-            ])
-            self.decoder[i] = decoding
-        self.encoder = {}
-        for i in range(self.n_words):
-            if i in self.decoder:
-                self.encoder[self.decoder[i]] = i
-        self._token_config_cache = OrderedDict()
-        self._cache_max_size = 128
-        super().__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            additional_special_tokens=additional_special_tokens,
-            added_tokens_decoder=added_tokens_decoder,
-            **kwargs,
-        )
-        self.all_special_ids_set = set(self.all_special_ids)
-    def encode(self,
-               text: str,
-               allow_special_tokens: bool = True,
-               **kwargs) -> List[int]:
-        """
-        Encodes a string into a list of token IDs.
-        Args:
-            text (str): The input string to be encoded.
-        Returns:
-            list[int]: A list of token IDs.
-        """
-        # If there are other args, we should call super().encode because there are a lot of code
-        # to handle those args. supper().encode finally will call _tokenize and _convert_token_to_id.
-        # NOTE: our encode method is not compatible with the super().encode method,
-        #   e.g. split_special_tokens' default is True in our encode method.
-        if len(kwargs) > 0:
-            logger.warning(f"Calling super().encode with {kwargs}")
-            return super().encode(text, **kwargs)
-        assert type(text) is str
-        # The tiktoken tokenizer can handle <=400k chars without
-        # pyo3_runtime.PanicException.
-        TIKTOKEN_MAX_ENCODE_CHARS = 400_000
-        # https://github.com/openai/tiktoken/issues/195
-        # Here we iterate over subsequences and split if we exceed the limit
-        # of max consecutive non-whitespace or whitespace characters.
-        MAX_NO_WHITESPACES_CHARS = 25_000
-        texts = self.pre_tokenizer_process(text)
-        all_substrs = []
-        for text in texts:
-            substrs = (
-                substr for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
-                for substr in self._split_whitespaces_or_nonwhitespaces(
-                    text[i:i +
-                         TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS))
-            all_substrs.extend(substrs)
-        t: List[int] = []
-        for substr in all_substrs:
-            if allow_special_tokens:
-                t.extend(
-                    # we should consider special token as a common token
-                    self.model.encode(
-                        substr,
-                        allowed_special="all",
-                    ))
-            else:
-                t.extend(
-                    # we should consider special token as a common token
-                    self.model.encode(
-                        substr,
-                        disallowed_special=(),
-                    ))
-        return t
-    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
-        """
-        Decodes a list of token IDs into a string.
-        Args:
-            token_ids (List[int]): The list of token IDs to be decoded.
-        Returns:
-            str: The decoded string.
-        """
-        # If there are other args, we should call super().decode because there are a lot of code
-        # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token.
-        if len(kwargs) > 0:
-            return super().decode(token_ids, **kwargs)
-        if type(token_ids) is int:
-            token_ids = [token_ids]
-        return self.model.decode(cast(List[int], token_ids))
-    @staticmethod
-    def _split_whitespaces_or_nonwhitespaces(
-            s: str, max_consecutive_slice_len: int) -> Iterator[str]:
-        """
-        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
-        consecutive whitespaces or consecutive non-whitespaces.
-        """
-        current_slice_len = 0
-        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
-        slice_start = 0
-        for i in range(len(s)):
-            is_now_space = s[i].isspace()
-            if current_slice_is_space ^ is_now_space:
-                current_slice_len = 1
-                current_slice_is_space = is_now_space
-            else:
-                current_slice_len += 1
-                if current_slice_len > max_consecutive_slice_len:
-                    yield s[slice_start:i]
-                    slice_start = i
-                    current_slice_len = 1
-        yield s[slice_start:]
-    def pre_tokenizer_process(self, text: str) -> List[str]:
-        """
-        pre-tokenizes the input text into a list of tokens.
-        This method is used to split the input text into smaller chunks for internal processing.
-        """
-        return [text]
-    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
-    @property
-    def vocab_size(self) -> int:
-        return self.n_words
-    def get_vocab(self) -> Dict[str, int]:
-        return self.encoder
-    def _tokenize(self, text: str, **kwargs) -> List[str]:
-        return [self.decoder[t] for t in self.encode(text)]
-    def _convert_token_to_id(self, token: str) -> int:
-        return self.encoder.get(token, self.unk_id)
-    def _convert_id_to_token(self, index: int) -> str:
-        return self.decoder.get(index)
-    @staticmethod
-    def clean_up_tokenization(out_string: str) -> str:
-        return out_string
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        text = ''.join(tokens)
-        text = bytearray([self.byte_decoder[c]
-                          for c in text]).decode('utf-8', 'replace')
-        return text
-    def save_vocabulary(self,
-                        save_directory: str,
-                        filename_prefix: Optional[str] = None) -> Tuple[str]:
-        if not os.path.isdir(save_directory):
-            raise ValueError(
-                f"vocabulary path ({save_directory}) should be a directory")
-        out_vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "") +
-            VOCAB_FILES_NAMES["vocab_file"])
-        if os.path.abspath(self.vocab_file) != os.path.abspath(
-                out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        return (out_vocab_file, )
-    def apply_chat_template(self,
-                            conversation,
-                            tools: Optional[list[dict]] = None,
-                            tokenize: bool = False,
-                            add_generation_prompt: bool = True,
-                            thinking: bool = True,
-                            preserve_thinking: bool = False,
-                            **kwargs):
-        tools = deep_sort_dict(tools)
-        # Convert tools to TypeScript style string if tools are provided
-        tools_ts_str = None
-        if tools:
-            try:
-                tools_ts_str = encode_tools_to_typescript_style(tools)
-            except Exception as e:
-                print(f"Failed to convert tools to TypeScript style: {e}")
-                tools_ts_str = None
-        # Store the TypeScript string in kwargs so it can be accessed by the template
-        if tools_ts_str is not None:
-            kwargs['tools_ts_str'] = tools_ts_str
-        return super().apply_chat_template(
-            conversation,
-            tools=tools,
-            tokenize=tokenize,
-            add_generation_prompt=add_generation_prompt,
-            thinking=thinking,
-            preserve_thinking=preserve_thinking,
-            **kwargs)
-def deep_sort_dict(obj: Any) -> Any:
-    if isinstance(obj, dict):
-        return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
-    if isinstance(obj, list):
-        return [deep_sort_dict(item) for item in obj]
-    return obj

tokenization_kimi_fast.py ADDED Viewed

	@@ -0,0 +1,124 @@

+import os
+from typing import Optional
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from .tool_declaration_ts import encode_tools_to_typescript_style
+class TikTokenTokenizerFast(PreTrainedTokenizerFast):
+    """Fast tokenizer built from ``tokenizer.json`` with TypeScript-style tool rendering for chat templates.
+    Only ``tokenizer.json`` is handed to the base class; the ``tiktoken.model`` path is remembered so that
+    ``save_vocabulary`` can copy that file alongside the saved ``tokenizer.json``.
+    """
+    vocab_files_names = {
+        "tokenizer_file": "tokenizer.json",
+        "vocab_file": "tiktoken.model",
+    }
+    model_input_names = ["input_ids", "attention_mask"]
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        """Load the tokenizer, forwarding the requested name/path to ``__init__`` as ``model_root``."""
+        # we need to find tokenizer.json from original path for our custom tokenizer.
+        kwargs["model_root"] = str(pretrained_model_name_or_path)
+        return super().from_pretrained(pretrained_model_name_or_path, *inputs,
+                                       **kwargs)
+    def __init__(
+        self,
+        tokenizer_file=None,
+        vocab_file=None,
+        model_root=None,
+        bos_token="[BOS]",
+        eos_token="[EOS]",
+        unk_token="[UNK]",
+        pad_token="[PAD]",
+        **kwargs,
+    ):
+        """Create the tokenizer from ``tokenizer.json`` and remember where ``tiktoken.model`` lives.
+        Args:
+            tokenizer_file: Path to ``tokenizer.json``; used when ``model_root`` does not hold the files.
+            vocab_file: Path to ``tiktoken.model``; used when ``model_root`` does not hold the files.
+            model_root: Directory expected to contain ``tokenizer.json`` and ``tiktoken.model``.
+            bos_token, eos_token, unk_token, pad_token: Special tokens forwarded to the base class.
+            **kwargs: Extra arguments forwarded to the base class.
+        Raises:
+            ValueError: If no usable pair of tokenizer files can be located.
+        """
+        root_tokenizer_file = root_vocab_file = None
+        if model_root is not None:
+            root_tokenizer_file = os.path.join(model_root, "tokenizer.json")
+            root_vocab_file = os.path.join(model_root, "tiktoken.model")
+        if (root_tokenizer_file is not None
+                and os.path.isfile(root_tokenizer_file)
+                and os.path.isfile(root_vocab_file)):
+            # Local directory holding both files: same behaviour as before, ``model_root`` wins.
+            tokenizer_file, vocab_file = root_tokenizer_file, root_vocab_file
+            tokenizer_dir = model_root
+        elif (tokenizer_file and vocab_file
+              and os.path.isfile(tokenizer_file)
+              and os.path.isfile(vocab_file)):
+            # ``model_root`` is absent or is not a local directory with the files (``from_pretrained``
+            # stores the raw name, which may be a Hub repo id). Fall back to the explicit paths instead
+            # of discarding them -- presumably these are the files already resolved by the base-class
+            # ``from_pretrained``; confirm against the installed transformers version.
+            tokenizer_dir = os.path.dirname(tokenizer_file)
+        elif model_root is None:
+            raise ValueError("model_root is required")
+        else:
+            raise ValueError(f"Missing tokenizer files under: {model_root}")
+        self._tokenizer_dir = tokenizer_dir
+        super().__init__(
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+        self.vocab_file = vocab_file
+    @property
+    def vocab_size(self) -> int:
+        """Return the vocabulary size."""
+        return self.backend_tokenizer.get_vocab_size()
+    def _sort_tools(self, tools):
+        """Deep sort tools for deterministic output (dict keys are sorted recursively; list order is kept)."""
+        if isinstance(tools, dict):
+            return {k: self._sort_tools(v) for k, v in sorted(tools.items())}
+        if isinstance(tools, list):
+            return [self._sort_tools(item) for item in tools]
+        return tools
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: Optional[str] = None) -> tuple:
+        """Save ``tokenizer.json`` (and ``tiktoken.model`` when available) into ``save_directory``.
+        Returns a tuple of the written file paths, ``tokenizer.json`` first.
+        Raises ``ValueError`` if ``save_directory`` is not an existing directory.
+        """
+        if not os.path.isdir(save_directory):
+            raise ValueError(
+                f"Vocabulary path ({save_directory}) should be a directory")
+        # Save tokenizer.json
+        tokenizer_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "") +
+            "tokenizer.json")
+        self.backend_tokenizer.save(tokenizer_file)
+        # Also copy tiktoken.model if available
+        vocab_files = []
+        if self.vocab_file and os.path.isfile(self.vocab_file):
+            vocab_file = os.path.join(
+                save_directory,
+                (filename_prefix + "-" if filename_prefix else "") +
+                "tiktoken.model")
+            # Skip the copy when source and destination are the same file.
+            if os.path.abspath(self.vocab_file) != os.path.abspath(vocab_file):
+                import shutil
+                shutil.copy(self.vocab_file, vocab_file)
+            vocab_files.append(vocab_file)
+        return (tokenizer_file, ) + tuple(vocab_files)
+    def apply_chat_template(self,
+                            conversation,
+                            tools=None,
+                            tokenize=False,
+                            add_generation_prompt=True,
+                            thinking: bool = True,
+                            preserve_thinking: bool = False,
+                            **kwargs):
+        """Apply chat template with TypeScript tools support.
+        ``tools`` are deep-sorted, rendered to a TypeScript-style string, and exposed to the template as
+        ``tools_ts_str``; if rendering fails the template is still applied without that variable.
+        """
+        tools = self._sort_tools(tools)
+        # Convert tools to TypeScript style string if tools are provided
+        tools_ts_str = None
+        if tools:
+            try:
+                tools_ts_str = encode_tools_to_typescript_style(tools)
+            except Exception as e:
+                # Best effort: report the failure and continue without the TypeScript rendering.
+                print(f"Failed to convert tools to TypeScript style: {e}")
+                tools_ts_str = None
+        # Store the TypeScript string in kwargs so it can be accessed by the template
+        if tools_ts_str is not None:
+            kwargs['tools_ts_str'] = tools_ts_str
+        return super().apply_chat_template(
+            conversation,
+            tools=tools,
+            tokenize=tokenize,
+            add_generation_prompt=add_generation_prompt,
+            thinking=thinking,
+            preserve_thinking=preserve_thinking,
+            **kwargs)

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57ec7040095cadc25269b917f95ba026e1b2b7b2e5c0540ce0a9afe8afb06d2e
+size 19591764

tokenizer_config.json CHANGED Viewed

@@ -205,12 +205,12 @@
   "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
-  "tokenizer_class": "TikTokenTokenizer",
   "unk_token": "[UNK]",
   "auto_map": {
     "AutoTokenizer": [
-      "tokenization_kimi.TikTokenTokenizer",
-      null
     ]
   }
 }

   "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "unk_token": "[UNK]",
+  "tokenizer_class": "TikTokenTokenizerFast",
   "auto_map": {
     "AutoTokenizer": [
+      null,
+      "tokenization_kimi_fast.TikTokenTokenizerFast"
     ]
   }
 }