Clean repository before upload

Browse files

Files changed (9) hide show

.gitattributes +0 -35
chat_template.jinja +0 -112
config.json +0 -55
generation_config.json +0 -7
model.safetensors +0 -3
tiktoken.model +0 -3
tokenization_kimi.py +0 -353
tokenizer_config.json +0 -214
tool_declaration_ts.py +0 -479

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

chat_template.jinja DELETED Viewed

@@ -1,112 +0,0 @@
-{%- macro render_content(msg) -%}
-    {%- set c = msg.get('content') -%}
-    {%- if c is string -%}
-      {{ c }}
-    {%- elif c is not none -%}
-      {% for content in c -%}
-        {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
-          <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
-        {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
-          <|kimi_k25_video_placeholder|>
-        {% else -%}
-          {{ content['text'] }}
-        {%- endif -%}
-      {%- endfor -%}
-    {%- endif -%}
-{%- endmacro -%}
-{% macro set_roles(message) -%}
-  {%- set role_name =  message.get('name') or  message['role'] -%}
-  {%- if message['role'] == 'user' -%}
-    <|im_user|>{{role_name}}<|im_middle|>
-  {%- elif message['role'] == 'assistant' -%}
-    <|im_assistant|>{{role_name}}<|im_middle|>
-  {%- else -%}
-    <|im_system|>{{role_name}}<|im_middle|>
-  {%- endif -%}
-{%- endmacro -%}
-{%- macro render_toolcalls(message) -%}
-  <|tool_calls_section_begin|>
-  {%- for tool_call in message['tool_calls'] -%}
-    {%- set formatted_id = tool_call['id'] -%}
-    <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
-  {%- endfor -%}
-  <|tool_calls_section_end|>
-{%- endmacro -%}
-{%- set preserve_thinking = preserve_thinking | default(false) -%}
-{# Find last non-tool-call assistant message. If preserve_thinking, keep -1 so hist is empty and all msgs use suffix (retain reasoning). #}
-{%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
-{%- if not preserve_thinking -%}
-{%- for idx in range(messages|length-1, -1, -1) -%}
-    {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
-        {%- set ns.last_non_tool_call_assistant_msg = idx -%}
-        {%- break -%}
-    {%- endif -%}
-{%- endfor -%}
-{%- endif -%}
-{# Split all messages into history & suffix; reasoning_content in the suffix should be preserved. #}
-{%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
-{%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
-{%- if tools -%}
-  {%- if tools_ts_str -%}
-    <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
-  {%- else -%}
-    <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
-  {%- endif -%}
-{%- endif -%}
-{%- for message in hist_msgs -%}
-  {{set_roles(message)}}
-  {%- if message['role'] == 'assistant' -%}
-    <think></think>{{render_content(message)}}
-    {%- if message.get('tool_calls') -%}
-      {{render_toolcalls(message)}}
-    {%- endif -%}
-  {%- elif message['role'] == 'tool' -%}
-    {%- set tool_call_id = message.tool_call_id -%}
-    ## Return of {{ tool_call_id }}
-{{render_content(message)}}
-  {%- elif message['content'] is not none -%}
-    {{render_content(message)}}
-  {%- endif -%}
-  <|im_end|>
-{%- endfor -%}
-{%- for message in suffix_msgs -%}
-  {{set_roles(message)}}
-  {%- if message['role'] == 'assistant' -%}
-    {%- if thinking is defined and thinking is false and preserve_thinking is false -%}
-    <think></think>{{render_content(message)}}
-    {%- else -%}
-    {%- set rc = message.get('reasoning', message.get('reasoning_content', '')) -%}
-    <think>{{rc}}</think>{{render_content(message)}}
-    {%- endif -%}
-    {%- if message.get('tool_calls') -%}
-     {{render_toolcalls(message)}}
-    {%- endif -%}
-  {%- elif message['role'] == 'tool' -%}
-    {%- set tool_call_id = message.tool_call_id -%}
-    ## Return of {{ tool_call_id }}
-{{render_content(message)}}
-  {%- elif message['content'] is not none -%}
-    {{render_content(message)}}
-  {%- endif -%}
-  <|im_end|>
-{%- endfor -%}
-{%- if add_generation_prompt -%}
-  <|im_assistant|>assistant<|im_middle|>
-  {%- if thinking is defined and thinking is false -%}
-  <think></think>
-  {%- else -%}
-  <think>
-  {%- endif -%}
-{%- endif -%}

config.json DELETED Viewed

@@ -1,55 +0,0 @@
-{
-  "architectures": [
-    "DeepseekV3ForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "aux_loss_alpha": 0.001,
-  "bos_token_id": 163584,
-  "dtype": "float16",
-  "eos_token_id": 163585,
-  "ep_size": 1,
-  "first_k_dense_replace": 1,
-  "head_dim": 64,
-  "hidden_act": "silu",
-  "hidden_size": 2048,
-  "initializer_range": 0.02,
-  "intermediate_size": 11264,
-  "kv_lora_rank": 512,
-  "max_position_embeddings": 131072,
-  "model_type": "deepseek_v3",
-  "moe_intermediate_size": 1408,
-  "moe_layer_freq": 1,
-  "n_group": 1,
-  "n_routed_experts": 64,
-  "n_shared_experts": 2,
-  "norm_topk_prob": true,
-  "num_attention_heads": 16,
-  "num_experts_per_tok": 6,
-  "num_hidden_layers": 27,
-  "num_key_value_heads": 16,
-  "num_nextn_predict_layers": 1,
-  "num_shared_experts": 2,
-  "pad_token_id": 163839,
-  "pretraining_tp": 1,
-  "q_lora_rank": null,
-  "qk_head_dim": 192,
-  "qk_nope_head_dim": 128,
-  "qk_rope_head_dim": 64,
-  "rms_norm_eps": 1e-05,
-  "rope_interleave": true,
-  "rope_parameters": {
-    "rope_theta": 800000.0,
-    "rope_type": "default"
-  },
-  "routed_scaling_factor": 2.446,
-  "scoring_func": "sigmoid",
-  "seq_aux": true,
-  "tie_word_embeddings": false,
-  "topk_group": 1,
-  "topk_method": "noaux_tc",
-  "transformers_version": "5.8.1",
-  "use_cache": false,
-  "v_head_dim": 128,
-  "vocab_size": 163840
-}

generation_config.json DELETED Viewed

@@ -1,7 +0,0 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 163584,
-  "eos_token_id": 163585,
-  "pad_token_id": 163839,
-  "transformers_version": "5.8.1"
-}

model.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d0e0b6791300386330613a41b9fe4632ec4c99c7cd1ee51e9d9ec9a3523fa64c
-size 31920888072

tiktoken.model DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
-size 2795286

tokenization_kimi.py DELETED Viewed

@@ -1,353 +0,0 @@
-import os
-from collections import OrderedDict
-from logging import getLogger
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
-import tiktoken
-from tiktoken.load import load_tiktoken_bpe
-from tokenizers import AddedToken
-from transformers.convert_slow_tokenizer import bytes_to_unicode
-from transformers.tokenization_utils import PreTrainedTokenizer
-from .tool_declaration_ts import encode_tools_to_typescript_style
-logger = getLogger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
-class TikTokenTokenizer(PreTrainedTokenizer):
-    """
-    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-    Args:
-        vocab_file (`str`):
-            The path to the Tiktoken model file.
-        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
-            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
-        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
-            The end of sequence token.
-        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead. The second to last item in special_tokens.
-        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
-            The token used for padding, for example when batching sequences of different lengths.
-        additional_special_tokens (list of `str`, *optional*):
-            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
-            skipped when decoding if `skip_special_tokens` is set to `True`.
-    """
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    special_tokens: Dict[str, int]
-    num_reserved_special_tokens = 256
-    pat_str = "|".join([
-        r"""[\p{Han}]+""",
-        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
-        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
-        r"""\p{N}{1,3}""",
-        r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
-        r"""\s*[\r\n]+""",
-        r"""\s+(?!\S)""",
-        r"""\s+""",
-    ])
-    def __init__(
-        self,
-        vocab_file,
-        bos_token: Union[str, AddedToken] = "[BOS]",
-        eos_token: Union[str, AddedToken] = "[EOS]",
-        unk_token: Union[str, AddedToken, None] = None,
-        pad_token: Union[str, AddedToken, None] = None,
-        additional_special_tokens: List[str] = None,
-        added_tokens_decoder: Optional[dict] = None,
-        **kwargs,
-    ):
-        assert os.path.isfile(vocab_file), vocab_file
-        if additional_special_tokens is None:
-            additional_special_tokens = [
-                "<|im_end|>",
-                "<|im_user|>",
-                "<|im_assistant|>",
-                "<|start_header_id|>",
-                "<|end_header_id|>",
-                "[EOT]",
-                "<|im_system|>",
-                "<|im_middle|>",
-            ]
-        if added_tokens_decoder:
-            special_tokens_mapping = {
-                i: added_tokens_decoder[i].content
-                for i in added_tokens_decoder
-            }
-        else:
-            special_tokens_mapping = {}
-        self.vocab_file = vocab_file
-        mergeable_ranks = load_tiktoken_bpe(vocab_file)
-        num_base_tokens = len(mergeable_ranks)
-        self.special_tokens = {
-            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
-            for i in range(num_base_tokens, num_base_tokens +
-                           self.num_reserved_special_tokens)
-        }
-        self.model = tiktoken.Encoding(
-            name=Path(vocab_file).name,
-            pat_str=self.pat_str,
-            mergeable_ranks=mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        logger.info(f"Reloaded tiktoken model from {vocab_file}")
-        self.n_words: int = self.model.n_vocab
-        # BOS / EOS token IDs
-        self.bos_id: int = self.special_tokens[str(bos_token)]
-        self.eos_id: int = self.special_tokens[str(eos_token)]
-        logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
-        )
-        self.pad_id: int = self.special_tokens[str(pad_token)]
-        self.unk_id: int = self.special_tokens[str(unk_token)]
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        self.decoder = {}
-        for i in range(self.n_words):
-            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
-            decoding = ''.join([
-                self.byte_encoder[ord(char)] for char in
-                self.model.decode_single_token_bytes(i).decode('latin-1')
-            ])
-            self.decoder[i] = decoding
-        self.encoder = {}
-        for i in range(self.n_words):
-            if i in self.decoder:
-                self.encoder[self.decoder[i]] = i
-        self._token_config_cache = OrderedDict()
-        self._cache_max_size = 128
-        super().__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            additional_special_tokens=additional_special_tokens,
-            added_tokens_decoder=added_tokens_decoder,
-            **kwargs,
-        )
-        self.all_special_ids_set = set(self.all_special_ids)
-    def encode(self,
-               text: str,
-               allow_special_tokens: bool = True,
-               **kwargs) -> List[int]:
-        """
-        Encodes a string into a list of token IDs.
-        Args:
-            text (str): The input string to be encoded.
-        Returns:
-            list[int]: A list of token IDs.
-        """
-        # If there are other args, we should call super().encode because there is a lot of code
-        # to handle those args. super().encode will eventually call _tokenize and _convert_token_to_id.
-        # NOTE: our encode method is not compatible with the super().encode method,
-        #   e.g. split_special_tokens' default is True in our encode method.
-        if len(kwargs) > 0:
-            logger.warning(f"Calling super().encode with {kwargs}")
-            return super().encode(text, **kwargs)
-        assert type(text) is str
-        # The tiktoken tokenizer can handle <=400k chars without
-        # pyo3_runtime.PanicException.
-        TIKTOKEN_MAX_ENCODE_CHARS = 400_000
-        # https://github.com/openai/tiktoken/issues/195
-        # Here we iterate over subsequences and split if we exceed the limit
-        # of max consecutive non-whitespace or whitespace characters.
-        MAX_NO_WHITESPACES_CHARS = 25_000
-        texts = self.pre_tokenizer_process(text)
-        all_substrs = []
-        for text in texts:
-            substrs = (
-                substr for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
-                for substr in self._split_whitespaces_or_nonwhitespaces(
-                    text[i:i +
-                         TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS))
-            all_substrs.extend(substrs)
-        t: List[int] = []
-        for substr in all_substrs:
-            if allow_special_tokens:
-                t.extend(
-                    # special-token strings in the text are encoded as their special token IDs
-                    self.model.encode(
-                        substr,
-                        allowed_special="all",
-                    ))
-            else:
-                t.extend(
-                    # we should consider special token as a common token
-                    self.model.encode(
-                        substr,
-                        disallowed_special=(),
-                    ))
-        return t
-    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
-        """
-        Decodes a list of token IDs into a string.
-        Args:
-            token_ids (List[int]): The list of token IDs to be decoded.
-        Returns:
-            str: The decoded string.
-        """
-        # If there are other args, we should call super().decode because there is a lot of code
-        # to handle those args. super().decode will eventually call convert_tokens_to_string and _convert_id_to_token.
-        if len(kwargs) > 0:
-            return super().decode(token_ids, **kwargs)
-        if type(token_ids) is int:
-            token_ids = [token_ids]
-        return self.model.decode(cast(List[int], token_ids))
-    @staticmethod
-    def _split_whitespaces_or_nonwhitespaces(
-            s: str, max_consecutive_slice_len: int) -> Iterator[str]:
-        """
-        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
-        consecutive whitespaces or consecutive non-whitespaces.
-        """
-        current_slice_len = 0
-        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
-        slice_start = 0
-        for i in range(len(s)):
-            is_now_space = s[i].isspace()
-            if current_slice_is_space ^ is_now_space:
-                current_slice_len = 1
-                current_slice_is_space = is_now_space
-            else:
-                current_slice_len += 1
-                if current_slice_len > max_consecutive_slice_len:
-                    yield s[slice_start:i]
-                    slice_start = i
-                    current_slice_len = 1
-        yield s[slice_start:]
-    def pre_tokenizer_process(self, text: str) -> List[str]:
-        """
-        Pre-tokenizes the input text into a list of text chunks.
-        This method is used to split the input text into smaller chunks for internal processing.
-        """
-        return [text]
-    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
-    @property
-    def vocab_size(self) -> int:
-        return self.n_words
-    def get_vocab(self) -> Dict[str, int]:
-        return self.encoder
-    def _tokenize(self, text: str, **kwargs) -> List[str]:
-        return [self.decoder[t] for t in self.encode(text)]
-    def _convert_token_to_id(self, token: str) -> int:
-        return self.encoder.get(token, self.unk_id)
-    def _convert_id_to_token(self, index: int) -> str:
-        return self.decoder.get(index)
-    @staticmethod
-    def clean_up_tokenization(out_string: str) -> str:
-        return out_string
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        text = ''.join(tokens)
-        text = bytearray([self.byte_decoder[c]
-                          for c in text]).decode('utf-8', 'replace')
-        return text
-    def save_vocabulary(self,
-                        save_directory: str,
-                        filename_prefix: Optional[str] = None) -> Tuple[str]:
-        if not os.path.isdir(save_directory):
-            raise ValueError(
-                f"vocabulary path ({save_directory}) should be a directory")
-        out_vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "") +
-            VOCAB_FILES_NAMES["vocab_file"])
-        if os.path.abspath(self.vocab_file) != os.path.abspath(
-                out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        return (out_vocab_file, )
-    def apply_chat_template(self,
-                            conversation,
-                            tools: Optional[list[dict]] = None,
-                            tokenize: bool = False,
-                            add_generation_prompt: bool = True,
-                            thinking: bool = True,
-                            preserve_thinking: bool = False,
-                            **kwargs):
-        tools = deep_sort_dict(tools)
-        # Convert tools to TypeScript style string if tools are provided
-        tools_ts_str = None
-        if tools:
-            try:
-                tools_ts_str = encode_tools_to_typescript_style(tools)
-            except Exception as e:
-                print(f"Failed to convert tools to TypeScript style: {e}")
-                tools_ts_str = None
-        # Store the TypeScript string in kwargs so it can be accessed by the template
-        if tools_ts_str is not None:
-            kwargs['tools_ts_str'] = tools_ts_str
-        return super().apply_chat_template(
-            conversation,
-            tools=tools,
-            tokenize=tokenize,
-            add_generation_prompt=add_generation_prompt,
-            thinking=thinking,
-            preserve_thinking=preserve_thinking,
-            **kwargs)
-def deep_sort_dict(obj: Any) -> Any:
-    if isinstance(obj, dict):
-        return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
-    if isinstance(obj, list):
-        return [deep_sort_dict(item) for item in obj]
-    return obj

tokenizer_config.json DELETED Viewed

@@ -1,214 +0,0 @@
-{
-  "added_tokens_decoder": {
-    "163584": {
-      "content": "[BOS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163585": {
-      "content": "[EOS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163586": {
-      "content": "<|im_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163587": {
-      "content": "<|im_user|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163588": {
-      "content": "<|im_assistant|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163590": {
-      "content": "<|start_header_id|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163591": {
-      "content": "<|end_header_id|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163593": {
-      "content": "[EOT]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163594": {
-      "content": "<|im_system|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163595": {
-      "content": "<|tool_calls_section_begin|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "163596": {
-      "content": "<|tool_calls_section_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "163597": {
-      "content": "<|tool_call_begin|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "163598": {
-      "content": "<|tool_call_argument_begin|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "163599": {
-      "content": "<|tool_call_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "163601": {
-      "content": "<|im_middle|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163602": {
-      "content": "<|media_begin|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163603": {
-      "content": "<|media_content|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163604": {
-      "content": "<|media_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163605": {
-      "content": "<|media_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163606": {
-      "content": "<think>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "163607": {
-      "content": "</think>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "163838": {
-      "content": "[UNK]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "163839": {
-      "content": "[PAD]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "auto_map": {
-    "AutoTokenizer": [
-      "tokenization_kimi.TikTokenTokenizer",
-      null
-    ]
-  },
-  "backend": "custom",
-  "bos_token": "[BOS]",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "[EOS]",
-  "extra_special_tokens": [
-    "<|im_end|>",
-    "<|im_user|>",
-    "<|im_assistant|>",
-    "<|start_header_id|>",
-    "<|end_header_id|>",
-    "[EOT]",
-    "<|im_system|>",
-    "<|im_middle|>"
-  ],
-  "is_local": false,
-  "local_files_only": false,
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "[PAD]",
-  "tokenizer_class": "TikTokenTokenizer",
-  "unk_token": "[UNK]"
-}

tool_declaration_ts.py DELETED Viewed

@@ -1,479 +0,0 @@
-"""
-Encode structured tool declaration to typescript style string.
-"""
-import dataclasses
-import json
-import logging
-from collections.abc import Sequence
-from typing import Any
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# One level of indentation in the generated TypeScript-style text.
_TS_INDENT = "  "
# Separator placed between consecutive fields of a rendered object type.
_TS_FIELD_DELIMITER = ",\n"
-class _SchemaRegistry:
-    """Registry for schema definitions to handle $ref resolution"""
-    def __init__(self):
-        self.definitions = {}
-        self.has_self_ref = False
-    def register_definitions(self, defs: dict[str, Any]):
-        """Register schema definitions from $defs section"""
-        if not defs:
-            return
-        for def_name, def_schema in defs.items():
-            self.definitions[def_name] = def_schema
-    def resolve_ref(self, ref: str) -> dict[str, Any]:
-        """Resolve a reference to its schema definition"""
-        if ref == "#":
-            self.has_self_ref = True
-            return {"$self_ref": True}
-        elif ref.startswith("#/$defs/"):
-            def_name = ref.split("/")[-1]
-            if def_name not in self.definitions:
-                raise ValueError(f"Reference not found: {ref}")
-            return self.definitions[def_name]
-        else:
-            raise ValueError(f"Unsupported reference format: {ref}")
-def _format_description(description: str, indent: str = "") -> str:
-    return "\n".join([
-        f"{indent}// {line}" if line else ""
-        for line in description.split("\n")
-    ])
-class _BaseType:
-    description: str
-    constraints: dict[str, Any]
-    def __init__(
-            self,
-            extra_props: dict[str, Any],
-            *,
-            allowed_constraint_keys: Sequence[str] = (),
-    ):
-        self.description = extra_props.get("description", "")
-        self.constraints = {
-            k: v
-            for k, v in extra_props.items() if k in allowed_constraint_keys
-        }
-    def to_typescript_style(self, indent: str = "") -> str:
-        raise NotImplementedError
-    def format_docstring(self, indent: str) -> str:
-        lines = []
-        if self.description:
-            lines.append(_format_description(self.description, indent))
-        if self.constraints:
-            constraints_str = ", ".join(f"{k}: {v}" for k, v in sorted(
-                self.constraints.items(), key=lambda kv: kv[0]))
-            lines.append(f"{indent}// {constraints_str}")
-        return "".join(x + "\n" for x in lines)
class _ParameterTypeScalar(_BaseType):
    """A scalar type identified by its JSON Schema type name."""

    type: str

    def __init__(self, type: str, extra_props: dict[str, Any] | None = None):
        """Store ``type`` and keep the constraints that apply to it.

        Strings keep ``maxLength``/``minLength``/``pattern``; numbers and
        integers keep ``maximum``/``minimum``; other types keep none.
        """
        self.type = type
        if type == "string":
            constraint_keys = ["maxLength", "minLength", "pattern"]
        elif type in ("number", "integer"):
            constraint_keys = ["maximum", "minimum"]
        else:
            constraint_keys = []
        props = extra_props if extra_props else {}
        super().__init__(props, allowed_constraint_keys=constraint_keys)

    def to_typescript_style(self, indent: str = "") -> str:
        """Return the type name, with ``integer`` mapped to ``number``."""
        return "number" if self.type == "integer" else self.type
class _ParameterTypeObject(_BaseType):
    """An object type: named properties plus optional ``additionalProperties``."""

    properties: list["_Parameter"]
    additional_properties: Any | None = None

    def __init__(self,
                 json_schema_object: dict[str, Any],
                 registry: _SchemaRegistry | None = None):
        """Parse an object schema.

        Registers ``$defs`` with ``registry`` when both are present, parses a
        dict-valued ``additionalProperties`` into a type node (other values
        are stored as-is), and builds one ``_Parameter`` per property. A
        property is optional when its name is not listed in ``required``.
        """
        super().__init__(json_schema_object)
        self.properties = []
        self.additional_properties = None
        if not json_schema_object:
            return

        if registry and "$defs" in json_schema_object:
            registry.register_definitions(json_schema_object["$defs"])

        extra = json_schema_object.get("additionalProperties")
        if isinstance(extra, dict):
            extra = _parse_parameter_type(extra, registry)
        self.additional_properties = extra

        if "properties" not in json_schema_object:
            return
        schema_props = json_schema_object["properties"]
        required_names = set(json_schema_object.get("required", []))
        for prop_name, prop_schema in schema_props.items():
            prop_default = (prop_schema.get("default") if isinstance(
                prop_schema, dict) else None)
            self.properties.append(
                _Parameter(
                    name=prop_name,
                    type=_parse_parameter_type(prop_schema, registry),
                    optional=prop_name not in required_names,
                    default=prop_default,
                ))

    def to_typescript_style(self, indent: str = "") -> str:
        """Render the object as a brace-wrapped, comma-separated field list.

        Required fields come first, then optional ones; each group is sorted
        by name. ``additionalProperties`` adds a trailing ``[k: string]``
        entry. An object with no fields renders as ``{}``.

        Raises:
            ValueError: if ``additional_properties`` is neither a bool nor a
                parsed type node.
        """
        inner_indent = indent + _TS_INDENT

        def by_name(param):
            return param.name

        ordered = sorted((p for p in self.properties if not p.optional),
                         key=by_name)
        ordered += sorted((p for p in self.properties if p.optional),
                          key=by_name)
        fields = [p.to_typescript_style(indent=inner_indent) for p in ordered]

        extra = self.additional_properties
        if extra is not None:
            if extra is True:
                extra_ts = "any"
            elif extra is False:
                extra_ts = "never"
            elif isinstance(extra, _ParameterType):
                extra_ts = extra.to_typescript_style(indent=inner_indent)
            else:
                raise ValueError(
                    f"Unknown additionalProperties: {self.additional_properties}"
                )
            fields.append(f"{inner_indent}[k: string]: {extra_ts}")

        if not fields:
            return "{}"
        # Every rendered field is non-empty, so the joined body always gets
        # wrapped in newlines.
        body = _TS_FIELD_DELIMITER.join(fields)
        return "{" + "\n" + body + "\n" + indent + "}"
class _ParameterTypeArray(_BaseType):
    """An array type with a single item type."""

    item: "_ParameterType"

    def __init__(self,
                 json_schema_object: dict[str, Any],
                 registry: _SchemaRegistry | None = None):
        """Parse ``items`` into the item type; a missing or falsy ``items`` means ``any``.

        ``minItems`` and ``maxItems`` are kept as constraints.
        """
        super().__init__(json_schema_object,
                         allowed_constraint_keys=("minItems", "maxItems"))
        items_schema = json_schema_object.get("items")
        if items_schema:
            self.item = _parse_parameter_type(items_schema, registry)
        else:
            self.item = _ParameterTypeScalar(type="any")

    def to_typescript_style(self, indent: str = "") -> str:
        """Render as ``Array<...>``.

        When the item type has comment lines of its own, the item is placed
        on separate, further-indented lines below those comments; otherwise
        the item is rendered inline.
        """
        inner_indent = indent + _TS_INDENT
        item_comments = self.item.format_docstring(inner_indent)
        if not item_comments:
            return f"Array<{self.item.to_typescript_style(indent=indent)}>"
        item_ts = self.item.to_typescript_style(indent=inner_indent)
        return f"Array<\n{item_comments}{inner_indent}{item_ts}\n{indent}>"
class _ParameterTypeEnum(_BaseType):
    """An enum of scalar values, rendered as a union of literals."""

    # support scalar types only
    enum: list[str | int | float | bool | None]

    def __init__(self, json_schema_object: dict[str, Any]):
        """Store the ``enum`` members and validate them against ``type``.

        Raises:
            KeyError: if the schema has no ``enum`` key.
            ValueError: if ``type`` is an unsupported list form, or a
                non-null member does not match the declared type.
        """
        super().__init__(json_schema_object)
        self.enum = json_schema_object["enum"]
        # Validate enum values against declared type if present
        if "type" in json_schema_object:
            self._validate_members(json_schema_object["type"])

    def _validate_members(self, typ: Any) -> None:
        """Check every non-null member against the declared type ``typ``.

        ``typ`` may be a type name, a one-element list, or a two-element list
        in which one element is ``"null"`` (the other element is checked).
        """
        if isinstance(typ, list):
            if len(typ) == 1:
                typ = typ[0]
            elif len(typ) == 2 and "null" in typ:
                typ = typ[0] if typ[0] != "null" else typ[1]
            else:
                raise ValueError(f"Enum type {typ} is not supported")
        for val in self.enum:
            if val is None:
                continue
            # NOTE: bool is a subclass of int in Python, so True/False pass
            # the "number" and "integer" checks below.
            if typ == "string" and not isinstance(val, str):
                raise ValueError(f"Enum value {val} is not a string")
            elif typ == "number" and not isinstance(val, (int, float)):
                raise ValueError(f"Enum value {val} is not a number")
            elif typ == "integer" and not isinstance(val, int):
                raise ValueError(f"Enum value {val} is not an integer")
            elif typ == "boolean" and not isinstance(val, bool):
                raise ValueError(f"Enum value {val} is not a boolean")

    @staticmethod
    def _render_literal(value: Any) -> str:
        """Render one enum member as a TypeScript literal.

        Strings are JSON-quoted so embedded quotes, backslashes and control
        characters are escaped; booleans and ``None`` become ``true`` /
        ``false`` / ``null`` rather than their Python spellings. Numbers keep
        their ``str()`` form.
        """
        if value is None or isinstance(value, (str, bool)):
            return json.dumps(value, ensure_ascii=False)
        return str(value)

    def to_typescript_style(self, indent: str = "") -> str:
        """Return the members joined with `` | `` as a union of literals."""
        return " | ".join(self._render_literal(member) for member in self.enum)
class _ParameterTypeAnyOf(_BaseType):
    """A union built from the alternatives of an ``anyOf`` schema."""

    types: list["_ParameterType"]

    def __init__(
        self,
        json_schema_object: dict[str, Any],
        registry: _SchemaRegistry | None = None,
    ):
        """Parse every ``anyOf`` alternative, in order, into a type node."""
        super().__init__(json_schema_object)
        self.types = []
        for alternative in json_schema_object["anyOf"]:
            self.types.append(_parse_parameter_type(alternative, registry))

    def to_typescript_style(self, indent: str = "") -> str:
        """Render the alternatives joined with `` | ``."""
        rendered = []
        for alternative in self.types:
            rendered.append(alternative.to_typescript_style(indent=indent))
        return " | ".join(rendered)
class _ParameterTypeUnion(_BaseType):
    """A union declared through a list-valued ``type`` keyword."""

    types: list[str]

    def __init__(self, json_schema_object: dict[str, Any]):
        """Translate each listed JSON Schema type name to its rendered form.

        Raises:
            KeyError: if a listed type name is not in the translation table.
        """
        super().__init__(json_schema_object)
        ts_names = {
            "string": "string",
            "number": "number",
            "integer": "number",
            "boolean": "boolean",
            "null": "null",
            "object": "{}",
            "array": "Array<any>",
        }
        self.types = []
        for json_type in json_schema_object["type"]:
            self.types.append(ts_names[json_type])

    def to_typescript_style(self, indent: str = "") -> str:
        """Render the translated names joined with `` | ``."""
        return " | ".join(self.types)
class _ParameterTypeRef(_BaseType):
    """A ``$ref`` to a registered definition or to the root schema."""

    ref_name: str
    is_self_ref: bool = False

    def __init__(self, json_schema_object: dict[str, Any],
                 registry: _SchemaRegistry):
        """Resolve ``$ref`` through ``registry`` and remember the name to print.

        A root reference is printed as ``parameters``; any other reference is
        printed as the last ``/``-separated segment of the ``$ref`` string.
        Errors raised by ``registry.resolve_ref`` propagate.
        """
        super().__init__(json_schema_object)
        ref = json_schema_object["$ref"]
        if registry.resolve_ref(ref).get("$self_ref", False):
            self.is_self_ref = True
            self.ref_name = "parameters"
        else:
            self.ref_name = ref.rpartition("/")[2]

    def to_typescript_style(self, indent: str = "") -> str:
        """Return the referenced name."""
        return self.ref_name
# Union of every concrete parameter type node. Besides serving as an
# annotation it is passed to isinstance() in
# _ParameterTypeObject.to_typescript_style, so it must remain a runtime
# union object.
_ParameterType = (_ParameterTypeScalar
                  | _ParameterTypeObject
                  | _ParameterTypeArray
                  | _ParameterTypeEnum
                  | _ParameterTypeAnyOf
                  | _ParameterTypeUnion
                  | _ParameterTypeRef)
@dataclasses.dataclass
class _Parameter:
    """
    A named, typed slot: a function parameter or a field of an object.
    Carries the type node, the name, the optional flag and a default value.
    """
    type: _ParameterType
    name: str = "_"
    optional: bool = True
    default: Any | None = None

    @classmethod
    def parse_extended(cls, attributes: dict[str, Any]) -> "_Parameter":
        """Build a parameter from a schema dict that may also carry
        ``name``, ``optional`` and ``default`` keys.

        Raises:
            ValueError: if ``attributes`` is empty.
        """
        if not attributes:
            raise ValueError("attributes is empty")
        param_name = attributes.get("name", "_")
        param_type = _parse_parameter_type(attributes)
        is_optional = attributes.get("optional", False)
        return cls(
            type=param_type,
            name=param_name,
            optional=is_optional,
            default=attributes.get("default"),
        )

    def to_typescript_style(self, indent: str = "") -> str:
        """Render the type's comment lines, an optional ``// Default:`` line,
        then ``name: type`` (``name?: type`` when optional).

        Numeric and boolean defaults are shown with ``repr``; every other
        default is JSON-encoded without ASCII escaping.
        """
        pieces = [self.type.format_docstring(indent)]
        if self.default is not None:
            if isinstance(self.default, (int, float, bool)):
                shown_default = repr(self.default)
            else:
                shown_default = json.dumps(self.default, ensure_ascii=False)
            pieces.append(f"{indent}// Default: {shown_default}\n")
        optional_mark = "?" if self.optional else ""
        rendered_type = self.type.to_typescript_style(indent=indent)
        pieces.append(f"{indent}{self.name}{optional_mark}: {rendered_type}")
        return "".join(pieces)
def _parse_parameter_type(
        json_schema_object: dict[str, Any] | bool,
        registry: _SchemaRegistry | None = None) -> _ParameterType:
    """Turn a JSON Schema fragment into the matching type node.

    Keywords are checked in this order: boolean schema, ``$ref`` (only when a
    registry is given), ``anyOf``, ``enum``, ``type``. An empty dict maps to
    ``any``. A boolean ``True`` maps to ``any``; ``False`` logs a warning and
    maps to ``null``.

    Raises:
        ValueError: if none of the recognised keywords is present and the
            schema is not empty.
    """
    if isinstance(json_schema_object, bool):
        if not json_schema_object:
            logger.warning(
                f"Warning: Boolean value {json_schema_object} is not supported, use null instead."
            )
            return _ParameterTypeScalar(type="null")
        return _ParameterTypeScalar(type="any")

    if "$ref" in json_schema_object and registry:
        return _ParameterTypeRef(json_schema_object, registry)
    if "anyOf" in json_schema_object:
        return _ParameterTypeAnyOf(json_schema_object, registry)
    if "enum" in json_schema_object:
        return _ParameterTypeEnum(json_schema_object)

    if "type" in json_schema_object:
        declared = json_schema_object["type"]
        if isinstance(declared, list):
            return _ParameterTypeUnion(json_schema_object)
        if declared == "object":
            return _ParameterTypeObject(json_schema_object, registry)
        if declared == "array":
            return _ParameterTypeArray(json_schema_object, registry)
        return _ParameterTypeScalar(declared, json_schema_object)

    if json_schema_object == {}:
        return _ParameterTypeScalar(type="any")
    raise ValueError(f"Invalid JSON Schema object: {json_schema_object}")
def _openai_function_to_typescript_style(function: dict[str, Any]) -> str:
    """Convert OpenAI function definition (dict) to TypeScript style string.

    The result has up to three newline-separated parts: the interface
    declarations (a root ``parameters`` interface when the schema refers to
    itself, then one interface per registered definition), the function
    description as ``//`` comments, and a
    ``type <name> = (_: <params>) => any;`` line.

    Args:
        function: dict with optional ``name`` (defaults to ``"function"``),
            ``description`` and ``parameters`` (a JSON Schema object).

    Returns:
        The TypeScript-style declaration text.
    """
    registry = _SchemaRegistry()
    parameters = function.get("parameters") or {}
    parsed = _ParameterTypeObject(parameters, registry)

    # Parse and render the definitions *before* looking at has_self_ref: a
    # '"$ref": "#"' that occurs only inside a definition is discovered while
    # that definition is parsed. Checking the flag first would leave
    # `parameters` referenced without ever being declared.
    # Iterate over a copy because parsing may register further definitions.
    definition_interfaces = []
    for def_name, def_schema in dict(registry.definitions).items():
        obj_type = _parse_parameter_type(def_schema, registry)
        body = obj_type.to_typescript_style()
        description_part = ""
        if obj_description := def_schema.get("description", ""):
            description_part = _format_description(obj_description) + "\n"
        definition_interfaces.append(
            f"{description_part}interface {def_name} {body}")

    interfaces = []
    root_interface_name = None
    if registry.has_self_ref:
        # The root schema is referenced by name, so declare it as an
        # interface; it stays ahead of the definition interfaces.
        root_interface_name = "parameters"
        params_str = _TS_FIELD_DELIMITER.join([
            p.to_typescript_style(indent=_TS_INDENT) for p in parsed.properties
        ])
        params_str = f"\n{params_str}\n" if params_str else ""
        interfaces.append(f"interface {root_interface_name} {{{params_str}}}")
    interfaces.extend(definition_interfaces)
    interface_str = "\n".join(interfaces)

    function_name = function.get("name", "function")
    if root_interface_name:
        type_def = f"type {function_name} = (_: {root_interface_name}) => any;"
    else:
        params_str = parsed.to_typescript_style()
        type_def = f"type {function_name} = (_: {params_str}) => any;"

    description = function.get("description")
    description_str = _format_description(description) if description else ""
    # Drop the parts that turned out empty.
    return "\n".join(
        part for part in (interface_str, description_str, type_def) if part)
def encode_tools_to_typescript_style(tools: list[dict[str, Any]], ) -> str:
    """
    Render a list of tool definitions as a TypeScript style string.
    Only the OpenAI format is handled: {"type": "function", "function": {...}}
    Tools of any other type, or with a missing/empty "function" entry, are
    skipped.
    Args:
        tools: List of tool definitions in dict format
    Returns:
        The rendered text, or an empty string when nothing was rendered
    """
    if not tools:
        return ""
    rendered = [
        _openai_function_to_typescript_style(func_def) for tool in tools
        # Skip unsupported tool types (like "_plugin") and empty definitions
        if tool.get("type") == "function" and (
            func_def := tool.get("function", {}))
    ]
    if not rendered:
        return ""
    namespace_body = "\n".join(rendered)
    header = "# Tools\n\n"
    if not namespace_body:
        return header
    return (header + "## functions\nnamespace functions {\n" +
            namespace_body + "\n" + "}\n")