Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

chat_template.jinja +112 -0
config.json +55 -0
generation_config.json +7 -0
model.safetensors +3 -0
tiktoken.model +3 -0
tokenization_kimi.py +353 -0
tokenizer_config.json +214 -0
tool_declaration_ts.py +479 -0

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,112 @@

+{#- render_content(msg): emit the message content. A string is emitted as-is; for a list of parts, image/image_url parts become the media placeholder, video/video_url parts become the video placeholder, and every other part emits its 'text' field. A none content emits nothing. -#}
+{%- macro render_content(msg) -%}
+    {%- set c = msg.get('content') -%}
+    {%- if c is string -%}
+      {{ c }}
+    {%- elif c is not none -%}
+      {% for content in c -%}
+        {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
+          <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
+        {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
+          <|kimi_k25_video_placeholder|>
+        {% else -%}
+          {{ content['text'] }}
+        {%- endif -%}
+      {%- endfor -%}
+    {%- endif -%}
+{%- endmacro -%}
+{#- set_roles(message): emit the turn header. The opening tag is chosen by role (user, assistant, anything else is treated as system); the label is the message 'name' when set, otherwise the role; the header ends with the im_middle tag. -#}
+{% macro set_roles(message) -%}
+  {%- set role_name =  message.get('name') or  message['role'] -%}
+  {%- if message['role'] == 'user' -%}
+    <|im_user|>{{role_name}}<|im_middle|>
+  {%- elif message['role'] == 'assistant' -%}
+    <|im_assistant|>{{role_name}}<|im_middle|>
+  {%- else -%}
+    <|im_system|>{{role_name}}<|im_middle|>
+  {%- endif -%}
+{%- endmacro -%}
+{#- render_toolcalls(message): emit the tool-calls section. Each call is written as its id followed by its function arguments, verbatim when they are already a string and JSON-encoded otherwise. -#}
+{%- macro render_toolcalls(message) -%}
+  <|tool_calls_section_begin|>
+  {%- for tool_call in message['tool_calls'] -%}
+    {%- set formatted_id = tool_call['id'] -%}
+    <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
+  {%- endfor -%}
+  <|tool_calls_section_end|>
+{%- endmacro -%}
+{%- set preserve_thinking = preserve_thinking | default(false) -%}
+{# Find last non-tool-call assistant message. If preserve_thinking, keep -1 so hist is empty and all msgs use suffix (retain reasoning). #}
+{%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
+{%- if not preserve_thinking -%}
+{%- for idx in range(messages|length-1, -1, -1) -%}
+    {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
+        {%- set ns.last_non_tool_call_assistant_msg = idx -%}
+        {%- break -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- endif -%}
+{# Split all messages into history & suffix; reasoning content of suffix messages is preserved. #}
+{%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
+{%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
+{#- Tool declarations: use the pre-rendered tools_ts_str when it is provided, otherwise fall back to compact JSON. -#}
+{%- if tools -%}
+  {%- if tools_ts_str -%}
+    <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
+  {%- else -%}
+    <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
+  {%- endif -%}
+{%- endif -%}
+{#- History messages: assistant turns are always rendered with an empty think block. -#}
+{%- for message in hist_msgs -%}
+  {{set_roles(message)}}
+  {%- if message['role'] == 'assistant' -%}
+    <think></think>{{render_content(message)}}
+    {%- if message.get('tool_calls') -%}
+      {{render_toolcalls(message)}}
+    {%- endif -%}
+  {%- elif message['role'] == 'tool' -%}
+    {%- set tool_call_id = message.tool_call_id -%}
+    ## Return of {{ tool_call_id }}
+{{render_content(message)}}
+  {%- elif message['content'] is not none -%}
+    {{render_content(message)}}
+  {%- endif -%}
+  <|im_end|>
+{%- endfor -%}
+{#- Suffix messages: assistant turns keep their reasoning inside the think block, unless thinking is false and preserve_thinking is false. -#}
+{%- for message in suffix_msgs -%}
+  {{set_roles(message)}}
+  {%- if message['role'] == 'assistant' -%}
+    {%- if thinking is defined and thinking is false and preserve_thinking is false -%}
+    <think></think>{{render_content(message)}}
+    {%- else -%}
+    {%- set rc = message.get('reasoning', message.get('reasoning_content', '')) -%}
+    <think>{{rc}}</think>{{render_content(message)}}
+    {%- endif -%}
+    {%- if message.get('tool_calls') -%}
+     {{render_toolcalls(message)}}
+    {%- endif -%}
+  {%- elif message['role'] == 'tool' -%}
+    {%- set tool_call_id = message.tool_call_id -%}
+    ## Return of {{ tool_call_id }}
+{{render_content(message)}}
+  {%- elif message['content'] is not none -%}
+    {{render_content(message)}}
+  {%- endif -%}
+  <|im_end|>
+{%- endfor -%}
+{#- Generation prompt: open an assistant turn; the think block is closed immediately when thinking is false, otherwise it is left open. -#}
+{%- if add_generation_prompt -%}
+  <|im_assistant|>assistant<|im_middle|>
+  {%- if thinking is defined and thinking is false -%}
+  <think></think>
+  {%- else -%}
+  <think>
+  {%- endif -%}
+{%- endif -%}

config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "architectures": [
+    "DeepseekV3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "aux_loss_alpha": 0.001,
+  "bos_token_id": 163584,
+  "dtype": "float16",
+  "eos_token_id": 163585,
+  "ep_size": 1,
+  "first_k_dense_replace": 1,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11264,
+  "kv_lora_rank": 512,
+  "max_position_embeddings": 131072,
+  "model_type": "deepseek_v3",
+  "moe_intermediate_size": 1408,
+  "moe_layer_freq": 1,
+  "n_group": 1,
+  "n_routed_experts": 64,
+  "n_shared_experts": 2,
+  "norm_topk_prob": true,
+  "num_attention_heads": 16,
+  "num_experts_per_tok": 6,
+  "num_hidden_layers": 27,
+  "num_key_value_heads": 16,
+  "num_nextn_predict_layers": 1,
+  "num_shared_experts": 2,
+  "pad_token_id": 163839,
+  "pretraining_tp": 1,
+  "q_lora_rank": null,
+  "qk_head_dim": 192,
+  "qk_nope_head_dim": 128,
+  "qk_rope_head_dim": 64,
+  "rms_norm_eps": 1e-05,
+  "rope_interleave": true,
+  "rope_parameters": {
+    "rope_theta": 800000.0,
+    "rope_type": "default"
+  },
+  "routed_scaling_factor": 2.446,
+  "scoring_func": "sigmoid",
+  "seq_aux": true,
+  "tie_word_embeddings": false,
+  "topk_group": 1,
+  "topk_method": "noaux_tc",
+  "transformers_version": "5.8.1",
+  "use_cache": false,
+  "v_head_dim": 128,
+  "vocab_size": 163840
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 163584,
+  "eos_token_id": 163585,
+  "pad_token_id": 163839,
+  "transformers_version": "5.8.1"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0e0b6791300386330613a41b9fe4632ec4c99c7cd1ee51e9d9ec9a3523fa64c
+size 31920888072

tiktoken.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
+size 2795286

tokenization_kimi.py ADDED Viewed

	@@ -0,0 +1,353 @@

+import os
+from collections import OrderedDict
+from logging import getLogger
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
+import tiktoken
+from tiktoken.load import load_tiktoken_bpe
+from tokenizers import AddedToken
+from transformers.convert_slow_tokenizer import bytes_to_unicode
+from transformers.tokenization_utils import PreTrainedTokenizer
+from .tool_declaration_ts import encode_tools_to_typescript_style
+logger = getLogger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
+class TikTokenTokenizer(PreTrainedTokenizer):
+    """
+    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            The path to the Tiktoken model file.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>",`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
+            The end of sequence token.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead. The second to last item in special_tokens.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        additional_special_tokens (list of `str`, *optional*):
+            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
+            skipped when decoding if `skip_special_tokens` is set to `True`.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    special_tokens: Dict[str, int]
+    num_reserved_special_tokens = 256
+    pat_str = "|".join([
+        r"""[\p{Han}]+""",
+        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+        r"""\p{N}{1,3}""",
+        r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
+        r"""\s*[\r\n]+""",
+        r"""\s+(?!\S)""",
+        r"""\s+""",
+    ])
+    def __init__(
+        self,
+        vocab_file,
+        bos_token: Union[str, AddedToken] = "[BOS]",
+        eos_token: Union[str, AddedToken] = "[EOS]",
+        unk_token: Union[str, AddedToken, None] = None,
+        pad_token: Union[str, AddedToken, None] = None,
+        additional_special_tokens: List[str] = None,
+        added_tokens_decoder: Optional[dict] = None,
+        **kwargs,
+    ):
+        assert os.path.isfile(vocab_file), vocab_file
+        if additional_special_tokens is None:
+            additional_special_tokens = [
+                "<|im_end|>",
+                "<|im_user|>",
+                "<|im_assistant|>",
+                "<|start_header_id|>",
+                "<|end_header_id|>",
+                "[EOT]",
+                "<|im_system|>",
+                "<|im_middle|>",
+            ]
+        if added_tokens_decoder:
+            special_tokens_mapping = {
+                i: added_tokens_decoder[i].content
+                for i in added_tokens_decoder
+            }
+        else:
+            special_tokens_mapping = {}
+        self.vocab_file = vocab_file
+        mergeable_ranks = load_tiktoken_bpe(vocab_file)
+        num_base_tokens = len(mergeable_ranks)
+        self.special_tokens = {
+            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
+            for i in range(num_base_tokens, num_base_tokens +
+                           self.num_reserved_special_tokens)
+        }
+        self.model = tiktoken.Encoding(
+            name=Path(vocab_file).name,
+            pat_str=self.pat_str,
+            mergeable_ranks=mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        logger.info(f"Reloaded tiktoken model from {vocab_file}")
+        self.n_words: int = self.model.n_vocab
+        # BOS / EOS token IDs
+        self.bos_id: int = self.special_tokens[str(bos_token)]
+        self.eos_id: int = self.special_tokens[str(eos_token)]
+        logger.info(
+            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+        )
+        self.pad_id: int = self.special_tokens[str(pad_token)]
+        self.unk_id: int = self.special_tokens[str(unk_token)]
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        self.decoder = {}
+        for i in range(self.n_words):
+            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+            decoding = ''.join([
+                self.byte_encoder[ord(char)] for char in
+                self.model.decode_single_token_bytes(i).decode('latin-1')
+            ])
+            self.decoder[i] = decoding
+        self.encoder = {}
+        for i in range(self.n_words):
+            if i in self.decoder:
+                self.encoder[self.decoder[i]] = i
+        self._token_config_cache = OrderedDict()
+        self._cache_max_size = 128
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            additional_special_tokens=additional_special_tokens,
+            added_tokens_decoder=added_tokens_decoder,
+            **kwargs,
+        )
+        self.all_special_ids_set = set(self.all_special_ids)
+    def encode(self,
+               text: str,
+               allow_special_tokens: bool = True,
+               **kwargs) -> List[int]:
+        """
+        Encodes a string into a list of token IDs.
+        Args:
+            text (str): The input string to be encoded.
+        Returns:
+            list[int]: A list of token IDs.
+        """
+        # If there are other args, we should call super().encode because there are a lot of code
+        # to handle those args. supper().encode finally will call _tokenize and _convert_token_to_id.
+        # NOTE: our encode method is not compatible with the super().encode method,
+        #   e.g. split_special_tokens' default is True in our encode method.
+        if len(kwargs) > 0:
+            logger.warning(f"Calling super().encode with {kwargs}")
+            return super().encode(text, **kwargs)
+        assert type(text) is str
+        # The tiktoken tokenizer can handle <=400k chars without
+        # pyo3_runtime.PanicException.
+        TIKTOKEN_MAX_ENCODE_CHARS = 400_000
+        # https://github.com/openai/tiktoken/issues/195
+        # Here we iterate over subsequences and split if we exceed the limit
+        # of max consecutive non-whitespace or whitespace characters.
+        MAX_NO_WHITESPACES_CHARS = 25_000
+        texts = self.pre_tokenizer_process(text)
+        all_substrs = []
+        for text in texts:
+            substrs = (
+                substr for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
+                for substr in self._split_whitespaces_or_nonwhitespaces(
+                    text[i:i +
+                         TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS))
+            all_substrs.extend(substrs)
+        t: List[int] = []
+        for substr in all_substrs:
+            if allow_special_tokens:
+                t.extend(
+                    # we should consider special token as a common token
+                    self.model.encode(
+                        substr,
+                        allowed_special="all",
+                    ))
+            else:
+                t.extend(
+                    # we should consider special token as a common token
+                    self.model.encode(
+                        substr,
+                        disallowed_special=(),
+                    ))
+        return t
+    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
+        """
+        Decodes a list of token IDs into a string.
+        Args:
+            token_ids (List[int]): The list of token IDs to be decoded.
+        Returns:
+            str: The decoded string.
+        """
+        # If there are other args, we should call super().decode because there are a lot of code
+        # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token.
+        if len(kwargs) > 0:
+            return super().decode(token_ids, **kwargs)
+        if type(token_ids) is int:
+            token_ids = [token_ids]
+        return self.model.decode(cast(List[int], token_ids))
+    @staticmethod
+    def _split_whitespaces_or_nonwhitespaces(
+            s: str, max_consecutive_slice_len: int) -> Iterator[str]:
+        """
+        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+        consecutive whitespaces or consecutive non-whitespaces.
+        """
+        current_slice_len = 0
+        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+        slice_start = 0
+        for i in range(len(s)):
+            is_now_space = s[i].isspace()
+            if current_slice_is_space ^ is_now_space:
+                current_slice_len = 1
+                current_slice_is_space = is_now_space
+            else:
+                current_slice_len += 1
+                if current_slice_len > max_consecutive_slice_len:
+                    yield s[slice_start:i]
+                    slice_start = i
+                    current_slice_len = 1
+        yield s[slice_start:]
+    def pre_tokenizer_process(self, text: str) -> List[str]:
+        """
+        pre-tokenizes the input text into a list of tokens.
+        This method is used to split the input text into smaller chunks for internal processing.
+        """
+        return [text]
+    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
+    @property
+    def vocab_size(self) -> int:
+        return self.n_words
+    def get_vocab(self) -> Dict[str, int]:
+        return self.encoder
+    def _tokenize(self, text: str, **kwargs) -> List[str]:
+        return [self.decoder[t] for t in self.encode(text)]
+    def _convert_token_to_id(self, token: str) -> int:
+        return self.encoder.get(token, self.unk_id)
+    def _convert_id_to_token(self, index: int) -> str:
+        return self.decoder.get(index)
+    @staticmethod
+    def clean_up_tokenization(out_string: str) -> str:
+        return out_string
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        text = ''.join(tokens)
+        text = bytearray([self.byte_decoder[c]
+                          for c in text]).decode('utf-8', 'replace')
+        return text
+    def save_vocabulary(self,
+                        save_directory: str,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            raise ValueError(
+                f"vocabulary path ({save_directory}) should be a directory")
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "") +
+            VOCAB_FILES_NAMES["vocab_file"])
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+                out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        return (out_vocab_file, )
+    def apply_chat_template(self,
+                            conversation,
+                            tools: Optional[list[dict]] = None,
+                            tokenize: bool = False,
+                            add_generation_prompt: bool = True,
+                            thinking: bool = True,
+                            preserve_thinking: bool = False,
+                            **kwargs):
+        tools = deep_sort_dict(tools)
+        # Convert tools to TypeScript style string if tools are provided
+        tools_ts_str = None
+        if tools:
+            try:
+                tools_ts_str = encode_tools_to_typescript_style(tools)
+            except Exception as e:
+                print(f"Failed to convert tools to TypeScript style: {e}")
+                tools_ts_str = None
+        # Store the TypeScript string in kwargs so it can be accessed by the template
+        if tools_ts_str is not None:
+            kwargs['tools_ts_str'] = tools_ts_str
+        return super().apply_chat_template(
+            conversation,
+            tools=tools,
+            tokenize=tokenize,
+            add_generation_prompt=add_generation_prompt,
+            thinking=thinking,
+            preserve_thinking=preserve_thinking,
+            **kwargs)
+def deep_sort_dict(obj: Any) -> Any:
+    if isinstance(obj, dict):
+        return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
+    if isinstance(obj, list):
+        return [deep_sort_dict(item) for item in obj]
+    return obj

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,214 @@

+{
+  "added_tokens_decoder": {
+    "163584": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163585": {
+      "content": "[EOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163586": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163587": {
+      "content": "<|im_user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163588": {
+      "content": "<|im_assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163590": {
+      "content": "<|start_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163591": {
+      "content": "<|end_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163593": {
+      "content": "[EOT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163594": {
+      "content": "<|im_system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163595": {
+      "content": "<|tool_calls_section_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163596": {
+      "content": "<|tool_calls_section_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163597": {
+      "content": "<|tool_call_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163598": {
+      "content": "<|tool_call_argument_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163599": {
+      "content": "<|tool_call_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163601": {
+      "content": "<|im_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163602": {
+      "content": "<|media_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163603": {
+      "content": "<|media_content|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163604": {
+      "content": "<|media_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163605": {
+      "content": "<|media_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163606": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163607": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163838": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "163839": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_kimi.TikTokenTokenizer",
+      null
+    ]
+  },
+  "backend": "custom",
+  "bos_token": "[BOS]",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "[EOS]",
+  "extra_special_tokens": [
+    "<|im_end|>",
+    "<|im_user|>",
+    "<|im_assistant|>",
+    "<|start_header_id|>",
+    "<|end_header_id|>",
+    "[EOT]",
+    "<|im_system|>",
+    "<|im_middle|>"
+  ],
+  "is_local": false,
+  "local_files_only": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "tokenizer_class": "TikTokenTokenizer",
+  "unk_token": "[UNK]"
+}

tool_declaration_ts.py ADDED Viewed

	@@ -0,0 +1,479 @@

+"""
+Encode structured tool declaration to typescript style string.
+"""
+import dataclasses
+import json
+import logging
+from collections.abc import Sequence
+from typing import Any
+logger = logging.getLogger(__name__)
+_TS_INDENT = "  "
+_TS_FIELD_DELIMITER = ",\n"
+class _SchemaRegistry:
+    """Registry for schema definitions to handle $ref resolution"""
+    def __init__(self):
+        self.definitions = {}
+        self.has_self_ref = False
+    def register_definitions(self, defs: dict[str, Any]):
+        """Register schema definitions from $defs section"""
+        if not defs:
+            return
+        for def_name, def_schema in defs.items():
+            self.definitions[def_name] = def_schema
+    def resolve_ref(self, ref: str) -> dict[str, Any]:
+        """Resolve a reference to its schema definition"""
+        if ref == "#":
+            self.has_self_ref = True
+            return {"$self_ref": True}
+        elif ref.startswith("#/$defs/"):
+            def_name = ref.split("/")[-1]
+            if def_name not in self.definitions:
+                raise ValueError(f"Reference not found: {ref}")
+            return self.definitions[def_name]
+        else:
+            raise ValueError(f"Unsupported reference format: {ref}")
+def _format_description(description: str, indent: str = "") -> str:
+    return "\n".join([
+        f"{indent}// {line}" if line else ""
+        for line in description.split("\n")
+    ])
+class _BaseType:
+    description: str
+    constraints: dict[str, Any]
+    def __init__(
+            self,
+            extra_props: dict[str, Any],
+            *,
+            allowed_constraint_keys: Sequence[str] = (),
+    ):
+        self.description = extra_props.get("description", "")
+        self.constraints = {
+            k: v
+            for k, v in extra_props.items() if k in allowed_constraint_keys
+        }
+    def to_typescript_style(self, indent: str = "") -> str:
+        raise NotImplementedError
+    def format_docstring(self, indent: str) -> str:
+        lines = []
+        if self.description:
+            lines.append(_format_description(self.description, indent))
+        if self.constraints:
+            constraints_str = ", ".join(f"{k}: {v}" for k, v in sorted(
+                self.constraints.items(), key=lambda kv: kv[0]))
+            lines.append(f"{indent}// {constraints_str}")
+        return "".join(x + "\n" for x in lines)
+class _ParameterTypeScalar(_BaseType):
+    type: str
+    def __init__(self, type: str, extra_props: dict[str, Any] | None = None):
+        self.type = type
+        allowed_constraint_keys: list[str] = []
+        if self.type == "string":
+            allowed_constraint_keys = ["maxLength", "minLength", "pattern"]
+        elif self.type in ("number", "integer"):
+            allowed_constraint_keys = ["maximum", "minimum"]
+        super().__init__(extra_props or {},
+                         allowed_constraint_keys=allowed_constraint_keys)
+    def to_typescript_style(self, indent: str = "") -> str:
+        # Map integer to number in TypeScript
+        if self.type == "integer":
+            return "number"
+        return self.type
+class _ParameterTypeObject(_BaseType):
+    properties: list["_Parameter"]
+    additional_properties: Any | None = None
+    def __init__(self,
+                 json_schema_object: dict[str, Any],
+                 registry: _SchemaRegistry | None = None):
+        super().__init__(json_schema_object)
+        self.properties = []
+        self.additional_properties = None
+        if not json_schema_object:
+            return
+        if "$defs" in json_schema_object and registry:
+            registry.register_definitions(json_schema_object["$defs"])
+        self.additional_properties = json_schema_object.get(
+            "additionalProperties")
+        if isinstance(self.additional_properties, dict):
+            self.additional_properties = _parse_parameter_type(
+                self.additional_properties, registry)
+        if "properties" not in json_schema_object:
+            return
+        required_parameters = json_schema_object.get("required", [])
+        optional_parameters = set(
+            json_schema_object["properties"].keys()) - set(required_parameters)
+        self.properties = [
+            _Parameter(
+                name=name,
+                type=_parse_parameter_type(prop, registry),
+                optional=name in optional_parameters,
+                default=prop.get("default")
+                if isinstance(prop, dict) else None,
+            ) for name, prop in json_schema_object["properties"].items()
+        ]
+    def to_typescript_style(self, indent: str = "") -> str:
+        # sort by optional, make the required parameters first
+        parameters = [p for p in self.properties if not p.optional]
+        opt_params = [p for p in self.properties if p.optional]
+        parameters = sorted(parameters, key=lambda p: p.name)
+        parameters.extend(sorted(opt_params, key=lambda p: p.name))
+        param_strs = []
+        for p in parameters:
+            one = p.to_typescript_style(indent=indent + _TS_INDENT)
+            param_strs.append(one)
+        if self.additional_properties is not None:
+            ap_type_str = "any"
+            if self.additional_properties is True:
+                ap_type_str = "any"
+            elif self.additional_properties is False:
+                ap_type_str = "never"
+            elif isinstance(self.additional_properties, _ParameterType):
+                ap_type_str = self.additional_properties.to_typescript_style(
+                    indent=indent + _TS_INDENT)
+            else:
+                raise ValueError(
+                    f"Unknown additionalProperties: {self.additional_properties}"
+                )
+            param_strs.append(
+                f"{indent + _TS_INDENT}[k: string]: {ap_type_str}")
+        if not param_strs:
+            return "{}"
+        params_str = _TS_FIELD_DELIMITER.join(param_strs)
+        if params_str:
+            # add new line before and after
+            params_str = f"\n{params_str}\n"
+        # always wrap with object
+        return f"{{{params_str}{indent}}}"
+class _ParameterTypeArray(_BaseType):
+    item: "_ParameterType"
+    def __init__(self,
+                 json_schema_object: dict[str, Any],
+                 registry: _SchemaRegistry | None = None):
+        super().__init__(json_schema_object,
+                         allowed_constraint_keys=("minItems", "maxItems"))
+        if json_schema_object.get("items"):
+            self.item = _parse_parameter_type(json_schema_object["items"],
+                                              registry)
+        else:
+            self.item = _ParameterTypeScalar(type="any")
+    def to_typescript_style(self, indent: str = "") -> str:
+        item_docstring = self.item.format_docstring(indent + _TS_INDENT)
+        if item_docstring:
+            return ("Array<\n" + item_docstring + indent + _TS_INDENT +
+                    self.item.to_typescript_style(indent=indent + _TS_INDENT) +
+                    "\n" + indent + ">")
+        else:
+            return f"Array<{self.item.to_typescript_style(indent=indent)}>"
+class _ParameterTypeEnum(_BaseType):
+    # support scalar types only
+    enum: list[str | int | float | bool | None]
+    def __init__(self, json_schema_object: dict[str, Any]):
+        super().__init__(json_schema_object)
+        self.enum = json_schema_object["enum"]
+        # Validate enum values against declared type if present
+        if "type" in json_schema_object:
+            typ = json_schema_object["type"]
+            if isinstance(typ, list):
+                if len(typ) == 1:
+                    typ = typ[0]
+                elif len(typ) == 2:
+                    if "null" not in typ:
+                        raise ValueError(f"Enum type {typ} is not supported")
+                    else:
+                        typ = typ[0] if typ[0] != "null" else typ[1]
+                else:
+                    raise ValueError(f"Enum type {typ} is not supported")
+            for val in self.enum:
+                if val is None:
+                    continue
+                if typ == "string" and not isinstance(val, str):
+                    raise ValueError(f"Enum value {val} is not a string")
+                elif typ == "number" and not isinstance(val, (int, float)):
+                    raise ValueError(f"Enum value {val} is not a number")
+                elif typ == "integer" and not isinstance(val, int):
+                    raise ValueError(f"Enum value {val} is not an integer")
+                elif typ == "boolean" and not isinstance(val, bool):
+                    raise ValueError(f"Enum value {val} is not a boolean")
+    def to_typescript_style(self, indent: str = "") -> str:
+        return " | ".join(
+            [f'"{e}"' if isinstance(e, str) else str(e) for e in self.enum])
+class _ParameterTypeAnyOf(_BaseType):
+    types: list["_ParameterType"]
+    def __init__(
+        self,
+        json_schema_object: dict[str, Any],
+        registry: _SchemaRegistry | None = None,
+    ):
+        super().__init__(json_schema_object)
+        self.types = [
+            _parse_parameter_type(t, registry)
+            for t in json_schema_object["anyOf"]
+        ]
+    def to_typescript_style(self, indent: str = "") -> str:
+        return " | ".join(
+            [t.to_typescript_style(indent=indent) for t in self.types])
+class _ParameterTypeUnion(_BaseType):
+    types: list[str]
+    def __init__(self, json_schema_object: dict[str, Any]):
+        super().__init__(json_schema_object)
+        mapping = {
+            "string": "string",
+            "number": "number",
+            "integer": "number",
+            "boolean": "boolean",
+            "null": "null",
+            "object": "{}",
+            "array": "Array<any>",
+        }
+        self.types = [mapping[t] for t in json_schema_object["type"]]
+    def to_typescript_style(self, indent: str = "") -> str:
+        return " | ".join(self.types)
+class _ParameterTypeRef(_BaseType):
+    ref_name: str
+    is_self_ref: bool = False
+    def __init__(self, json_schema_object: dict[str, Any],
+                 registry: _SchemaRegistry):
+        super().__init__(json_schema_object)
+        ref = json_schema_object["$ref"]
+        resolved_schema = registry.resolve_ref(ref)
+        if resolved_schema.get("$self_ref", False):
+            self.ref_name = "parameters"
+            self.is_self_ref = True
+        else:
+            self.ref_name = ref.split("/")[-1]
+    def to_typescript_style(self, indent: str = "") -> str:
+        return self.ref_name
+# Union of the concrete parameter-type classes. Besides serving as an
+# annotation, it is passed to ``isinstance`` when checking parsed types.
+_ParameterType = (_ParameterTypeScalar
+                  | _ParameterTypeObject
+                  | _ParameterTypeArray
+                  | _ParameterTypeEnum
+                  | _ParameterTypeAnyOf
+                  | _ParameterTypeUnion
+                  | _ParameterTypeRef)
+@dataclasses.dataclass
+class _Parameter:
+    """
+    A parameter in a function, or a field in a object.
+    It consists of the type as well as the name.
+    """
+    type: _ParameterType
+    name: str = "_"
+    optional: bool = True
+    default: Any | None = None
+    @classmethod
+    def parse_extended(cls, attributes: dict[str, Any]) -> "_Parameter":
+        if not attributes:
+            raise ValueError("attributes is empty")
+        return cls(
+            name=attributes.get("name", "_"),
+            type=_parse_parameter_type(attributes),
+            optional=attributes.get("optional", False),
+            default=attributes.get("default"),
+        )
+    def to_typescript_style(self, indent: str = "") -> str:
+        comments = self.type.format_docstring(indent)
+        if self.default is not None:
+            default_repr = (json.dumps(self.default, ensure_ascii=False)
+                            if not isinstance(self.default, (int, float, bool))
+                            else repr(self.default))
+            comments += f"{indent}// Default: {default_repr}\n"
+        return (
+            comments +
+            f"{indent}{self.name}{'?' if self.optional else ''}: {self.type.to_typescript_style(indent=indent)}"
+        )
+def _parse_parameter_type(
+        json_schema_object: dict[str, Any] | bool,
+        registry: _SchemaRegistry | None = None) -> _ParameterType:
+    if isinstance(json_schema_object, bool):
+        if json_schema_object:
+            return _ParameterTypeScalar(type="any")
+        else:
+            logger.warning(
+                f"Warning: Boolean value {json_schema_object} is not supported, use null instead."
+            )
+            return _ParameterTypeScalar(type="null")
+    if "$ref" in json_schema_object and registry:
+        return _ParameterTypeRef(json_schema_object, registry)
+    if "anyOf" in json_schema_object:
+        return _ParameterTypeAnyOf(json_schema_object, registry)
+    elif "enum" in json_schema_object:
+        return _ParameterTypeEnum(json_schema_object)
+    elif "type" in json_schema_object:
+        typ = json_schema_object["type"]
+        if isinstance(typ, list):
+            return _ParameterTypeUnion(json_schema_object)
+        elif typ == "object":
+            return _ParameterTypeObject(json_schema_object, registry)
+        elif typ == "array":
+            return _ParameterTypeArray(json_schema_object, registry)
+        else:
+            return _ParameterTypeScalar(typ, json_schema_object)
+    elif json_schema_object == {}:
+        return _ParameterTypeScalar(type="any")
+    else:
+        raise ValueError(f"Invalid JSON Schema object: {json_schema_object}")
+def _openai_function_to_typescript_style(function: dict[str, Any], ) -> str:
+    """Convert OpenAI function definition (dict) to TypeScript style string."""
+    registry = _SchemaRegistry()
+    parameters = function.get("parameters") or {}
+    parsed = _ParameterTypeObject(parameters, registry)
+    interfaces = []
+    root_interface_name = None
+    if registry.has_self_ref:
+        root_interface_name = "parameters"
+        params_str = _TS_FIELD_DELIMITER.join([
+            p.to_typescript_style(indent=_TS_INDENT) for p in parsed.properties
+        ])
+        params_str = f"\n{params_str}\n" if params_str else ""
+        interface_def = f"interface {root_interface_name} {{{params_str}}}"
+        interfaces.append(interface_def)
+    definitions_copy = dict(registry.definitions)
+    for def_name, def_schema in definitions_copy.items():
+        obj_type = _parse_parameter_type(def_schema, registry)
+        params_str = obj_type.to_typescript_style()
+        description_part = ""
+        if obj_description := def_schema.get("description", ""):
+            description_part = _format_description(obj_description) + "\n"
+        interface_def = f"{description_part}interface {def_name} {params_str}"
+        interfaces.append(interface_def)
+    interface_str = "\n".join(interfaces)
+    function_name = function.get("name", "function")
+    if root_interface_name:
+        type_def = f"type {function_name} = (_: {root_interface_name}) => any;"
+    else:
+        params_str = parsed.to_typescript_style()
+        type_def = f"type {function_name} = (_: {params_str}) => any;"
+    description = function.get("description")
+    return "\n".join(
+        filter(
+            bool,
+            [
+                interface_str,
+                ((description and _format_description(description)) or ""),
+                type_def,
+            ],
+        ))
+def encode_tools_to_typescript_style(tools: list[dict[str, Any]], ) -> str:
+    """
+    Convert tools (list of dict) to TypeScript style string.
+    Supports OpenAI format: {"type": "function", "function": {...}}
+    Args:
+        tools: List of tool definitions in dict format
+    Returns:
+        TypeScript style string representation of the tools
+    """
+    if not tools:
+        return ""
+    functions = []
+    for tool in tools:
+        tool_type = tool.get("type")
+        if tool_type == "function":
+            func_def = tool.get("function", {})
+            if func_def:
+                functions.append(
+                    _openai_function_to_typescript_style(func_def))
+        else:
+            # Skip unsupported tool types (like "_plugin")
+            continue
+    if not functions:
+        return ""
+    functions_str = "\n".join(functions)
+    result = "# Tools\n\n"
+    if functions_str:
+        result += "## functions\nnamespace functions {\n"
+        result += functions_str + "\n"
+        result += "}\n"
+    return result