Files changed (5) hide show
  1. .gitignore +95 -0
  2. modeling_kimi_k25.py +0 -0
  3. requirements.txt +6 -0
  4. tokenization_kimi.py +368 -352
  5. tool_declaration_ts.py +500 -479
.gitignore ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Installer logs
32
+ pip-log.txt
33
+ pip-delete-this-directory.txt
34
+
35
+ # Unit test / coverage reports
36
+ htmlcov/
37
+ .tox/
38
+ .nox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ *.py,cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Jupyter Notebook
54
+ .ipynb_checkpoints
55
+
56
+ # IPython
57
+ profile_default/
58
+ ipython_config.py
59
+
60
+ # pyenv
61
+ .python-version
62
+
63
+ # Environment
64
+ .env
65
+ .venv
66
+ env/
67
+ venv/
68
+ ENV/
69
+ env.bak/
70
+ venv.bak/
71
+
72
+ # IDE
73
+ .idea/
74
+ .vscode/
75
+ *.swp
76
+ *.swo
77
+ *~
78
+
79
+ # Model weights and large files
80
+ *.bin
81
+ *.safetensors
82
+ *.gguf
83
+ *.pt
84
+ *.pth
85
+ *.ckpt
86
+ *.h5
87
+ model-*.json
88
+
89
+ # OS files
90
+ .DS_Store
91
+ Thumbs.db
92
+
93
+ # Logs
94
+ *.log
95
+ logs/
modeling_kimi_k25.py CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=4.57.1
3
+ tiktoken>=0.5.0
4
+ numpy>=1.24.0
5
+ Pillow>=9.0.0
6
+ pydantic>=2.0.0
tokenization_kimi.py CHANGED
@@ -1,352 +1,368 @@
1
- import os
2
- from collections import OrderedDict
3
- from logging import getLogger
4
- from pathlib import Path
5
- from shutil import copyfile
6
- from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
7
-
8
- import tiktoken
9
- from tiktoken.load import load_tiktoken_bpe
10
- from tokenizers import AddedToken
11
-
12
- from transformers.convert_slow_tokenizer import bytes_to_unicode
13
- from transformers.tokenization_utils import PreTrainedTokenizer
14
-
15
- from .tool_declaration_ts import encode_tools_to_typescript_style
16
-
17
- logger = getLogger(__name__)
18
- VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
19
-
20
-
21
- class TikTokenTokenizer(PreTrainedTokenizer):
22
- """
23
- Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
24
-
25
- This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
26
- this superclass for more information regarding those methods.
27
-
28
- Args:
29
- vocab_file (`str`):
30
- The path to the Tiktoken model file.
31
- bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>",`):
32
- The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
33
- eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
34
- The end of sequence token.
35
- unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
36
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
37
- token instead. The second to last item in special_tokens.
38
- pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
39
- The token used for padding, for example when batching sequences of different lengths.
40
- additional_special_tokens (list of `str`, *optional*):
41
- A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
42
- skipped when decoding if `skip_special_tokens` is set to `True`.
43
- """
44
-
45
- vocab_files_names = VOCAB_FILES_NAMES
46
-
47
- model_input_names = ["input_ids", "attention_mask"]
48
-
49
- special_tokens: Dict[str, int]
50
-
51
- num_reserved_special_tokens = 256
52
-
53
- pat_str = "|".join([
54
- r"""[\p{Han}]+""",
55
- r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
56
- r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
57
- r"""\p{N}{1,3}""",
58
- r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
59
- r"""\s*[\r\n]+""",
60
- r"""\s+(?!\S)""",
61
- r"""\s+""",
62
- ])
63
-
64
- def __init__(
65
- self,
66
- vocab_file,
67
- bos_token: Union[str, AddedToken] = "[BOS]",
68
- eos_token: Union[str, AddedToken] = "[EOS]",
69
- unk_token: Union[str, AddedToken, None] = None,
70
- pad_token: Union[str, AddedToken, None] = None,
71
- additional_special_tokens: List[str] = None,
72
- added_tokens_decoder: Optional[dict] = None,
73
- **kwargs,
74
- ):
75
- assert os.path.isfile(vocab_file), vocab_file
76
-
77
- if additional_special_tokens is None:
78
- additional_special_tokens = [
79
- "<|im_end|>",
80
- "<|im_user|>",
81
- "<|im_assistant|>",
82
- "<|start_header_id|>",
83
- "<|end_header_id|>",
84
- "[EOT]",
85
- "<|im_system|>",
86
- "<|im_middle|>",
87
- ]
88
-
89
- if added_tokens_decoder:
90
- special_tokens_mapping = {
91
- i: added_tokens_decoder[i].content
92
- for i in added_tokens_decoder
93
- }
94
- else:
95
- special_tokens_mapping = {}
96
-
97
- self.vocab_file = vocab_file
98
- mergeable_ranks = load_tiktoken_bpe(vocab_file)
99
- num_base_tokens = len(mergeable_ranks)
100
- self.special_tokens = {
101
- special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
102
- for i in range(num_base_tokens, num_base_tokens +
103
- self.num_reserved_special_tokens)
104
- }
105
-
106
- self.model = tiktoken.Encoding(
107
- name=Path(vocab_file).name,
108
- pat_str=self.pat_str,
109
- mergeable_ranks=mergeable_ranks,
110
- special_tokens=self.special_tokens,
111
- )
112
- logger.info(f"Reloaded tiktoken model from {vocab_file}")
113
-
114
- self.n_words: int = self.model.n_vocab
115
- # BOS / EOS token IDs
116
- self.bos_id: int = self.special_tokens[str(bos_token)]
117
- self.eos_id: int = self.special_tokens[str(eos_token)]
118
- logger.info(
119
- f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
120
- )
121
-
122
- self.pad_id: int = self.special_tokens[str(pad_token)]
123
- self.unk_id: int = self.special_tokens[str(unk_token)]
124
-
125
- self.byte_encoder = bytes_to_unicode()
126
- self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
127
-
128
- self.decoder = {}
129
- for i in range(self.n_words):
130
- # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
131
- decoding = ''.join([
132
- self.byte_encoder[ord(char)] for char in
133
- self.model.decode_single_token_bytes(i).decode('latin-1')
134
- ])
135
- self.decoder[i] = decoding
136
-
137
- self.encoder = {}
138
- for i in range(self.n_words):
139
- if i in self.decoder:
140
- self.encoder[self.decoder[i]] = i
141
-
142
- self._token_config_cache = OrderedDict()
143
- self._cache_max_size = 128
144
-
145
- super().__init__(
146
- bos_token=bos_token,
147
- eos_token=eos_token,
148
- unk_token=unk_token,
149
- pad_token=pad_token,
150
- additional_special_tokens=additional_special_tokens,
151
- added_tokens_decoder=added_tokens_decoder,
152
- **kwargs,
153
- )
154
- self.all_special_ids_set = set(self.all_special_ids)
155
-
156
- def encode(self,
157
- text: str,
158
- allow_special_tokens: bool = True,
159
- **kwargs) -> List[int]:
160
- """
161
- Encodes a string into a list of token IDs.
162
-
163
- Args:
164
- text (str): The input string to be encoded.
165
-
166
- Returns:
167
- list[int]: A list of token IDs.
168
- """
169
- # If there are other args, we should call super().encode because there are a lot of code
170
- # to handle those args. supper().encode finally will call _tokenize and _convert_token_to_id.
171
- # NOTE: our encode method is not compatible with the super().encode method,
172
- # e.g. split_special_tokens' default is True in our encode method.
173
- if len(kwargs) > 0:
174
- logger.warning(f"Calling super().encode with {kwargs}")
175
- return super().encode(text, **kwargs)
176
-
177
- assert type(text) is str
178
-
179
- # The tiktoken tokenizer can handle <=400k chars without
180
- # pyo3_runtime.PanicException.
181
- TIKTOKEN_MAX_ENCODE_CHARS = 400_000
182
-
183
- # https://github.com/openai/tiktoken/issues/195
184
- # Here we iterate over subsequences and split if we exceed the limit
185
- # of max consecutive non-whitespace or whitespace characters.
186
- MAX_NO_WHITESPACES_CHARS = 25_000
187
-
188
- texts = self.pre_tokenizer_process(text)
189
-
190
- all_substrs = []
191
- for text in texts:
192
- substrs = (
193
- substr for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
194
- for substr in self._split_whitespaces_or_nonwhitespaces(
195
- text[i:i +
196
- TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS))
197
- all_substrs.extend(substrs)
198
-
199
- t: List[int] = []
200
- for substr in all_substrs:
201
- if allow_special_tokens:
202
- t.extend(
203
- # we should consider special token as a common token
204
- self.model.encode(
205
- substr,
206
- allowed_special="all",
207
- ))
208
- else:
209
- t.extend(
210
- # we should consider special token as a common token
211
- self.model.encode(
212
- substr,
213
- disallowed_special=(),
214
- ))
215
-
216
- return t
217
-
218
- def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
219
- """
220
- Decodes a list of token IDs into a string.
221
-
222
- Args:
223
- token_ids (List[int]): The list of token IDs to be decoded.
224
-
225
- Returns:
226
- str: The decoded string.
227
- """
228
- # If there are other args, we should call super().decode because there are a lot of code
229
- # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token.
230
- if len(kwargs) > 0:
231
- return super().decode(token_ids, **kwargs)
232
-
233
- if type(token_ids) is int:
234
- token_ids = [token_ids]
235
-
236
- return self.model.decode(cast(List[int], token_ids))
237
-
238
- @staticmethod
239
- def _split_whitespaces_or_nonwhitespaces(
240
- s: str, max_consecutive_slice_len: int) -> Iterator[str]:
241
- """
242
- Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
243
- consecutive whitespaces or consecutive non-whitespaces.
244
- """
245
- current_slice_len = 0
246
- current_slice_is_space = s[0].isspace() if len(s) > 0 else False
247
- slice_start = 0
248
-
249
- for i in range(len(s)):
250
- is_now_space = s[i].isspace()
251
-
252
- if current_slice_is_space ^ is_now_space:
253
- current_slice_len = 1
254
- current_slice_is_space = is_now_space
255
- else:
256
- current_slice_len += 1
257
- if current_slice_len > max_consecutive_slice_len:
258
- yield s[slice_start:i]
259
- slice_start = i
260
- current_slice_len = 1
261
- yield s[slice_start:]
262
-
263
- def pre_tokenizer_process(self, text: str) -> List[str]:
264
- """
265
- pre-tokenizes the input text into a list of tokens.
266
- This method is used to split the input text into smaller chunks for internal processing.
267
- """
268
- return [text]
269
-
270
- """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
271
-
272
- @property
273
- def vocab_size(self) -> int:
274
- return self.n_words
275
-
276
- def get_vocab(self) -> Dict[str, int]:
277
- return self.encoder
278
-
279
- def _tokenize(self, text: str, **kwargs) -> List[str]:
280
- return [self.decoder[t] for t in self.encode(text)]
281
-
282
- def _convert_token_to_id(self, token: str) -> int:
283
- return self.encoder.get(token, self.unk_id)
284
-
285
- def _convert_id_to_token(self, index: int) -> str:
286
- return self.decoder.get(index)
287
-
288
- @staticmethod
289
- def clean_up_tokenization(out_string: str) -> str:
290
- return out_string
291
-
292
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
293
- text = ''.join(tokens)
294
- text = bytearray([self.byte_decoder[c]
295
- for c in text]).decode('utf-8', 'replace')
296
- return text
297
-
298
- def save_vocabulary(self,
299
- save_directory: str,
300
- filename_prefix: Optional[str] = None) -> Tuple[str]:
301
- if not os.path.isdir(save_directory):
302
- raise ValueError(
303
- f"vocabulary path ({save_directory}) should be a directory")
304
- out_vocab_file = os.path.join(
305
- save_directory,
306
- (filename_prefix + "-" if filename_prefix else "") +
307
- VOCAB_FILES_NAMES["vocab_file"])
308
-
309
- if os.path.abspath(self.vocab_file) != os.path.abspath(
310
- out_vocab_file) and os.path.isfile(self.vocab_file):
311
- copyfile(self.vocab_file, out_vocab_file)
312
-
313
- return (out_vocab_file, )
314
-
315
- def apply_chat_template(self,
316
- conversation,
317
- tools: Optional[list[dict]] = None,
318
- tokenize: bool = False,
319
- add_generation_prompt: bool = True,
320
- thinking: bool = True,
321
- **kwargs):
322
-
323
- tools = deep_sort_dict(tools)
324
-
325
- # Convert tools to TypeScript style string if tools are provided
326
- tools_ts_str = None
327
- if tools:
328
- try:
329
- tools_ts_str = encode_tools_to_typescript_style(tools)
330
-
331
- except Exception as e:
332
- print(f"Failed to convert tools to TypeScript style: {e}")
333
- tools_ts_str = None
334
-
335
- # Store the TypeScript string in kwargs so it can be accessed by the template
336
- if tools_ts_str is not None:
337
- kwargs['tools_ts_str'] = tools_ts_str
338
- return super().apply_chat_template(
339
- conversation,
340
- tools=tools,
341
- tokenize=tokenize,
342
- add_generation_prompt=add_generation_prompt,
343
- thinking=thinking,
344
- **kwargs)
345
-
346
-
347
- def deep_sort_dict(obj: Any) -> Any:
348
- if isinstance(obj, dict):
349
- return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
350
- if isinstance(obj, list):
351
- return [deep_sort_dict(item) for item in obj]
352
- return obj
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from collections import OrderedDict
3
+ from logging import getLogger
4
+ from pathlib import Path
5
+ from shutil import copyfile
6
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
7
+
8
+ import tiktoken
9
+ from tiktoken.load import load_tiktoken_bpe
10
+ from tokenizers import AddedToken
11
+
12
+ from transformers.convert_slow_tokenizer import bytes_to_unicode
13
+ from transformers.tokenization_utils import PreTrainedTokenizer
14
+
15
+ from .tool_declaration_ts import encode_tools_to_typescript_style
16
+
17
+ logger = getLogger(__name__)
18
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
19
+
20
+
21
class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead. Its string form must name one of the reserved special tokens.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            The token used for padding, for example when batching sequences of different lengths. Its string form
            must name one of the reserved special tokens.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    # Mapping of special-token string -> token id, built in __init__.
    special_tokens: Dict[str, int]

    # Number of token ids reserved after the base BPE vocabulary for specials.
    num_reserved_special_tokens = 256

    # Pre-tokenization regex handed to tiktoken: CJK runs, cased/uncased word
    # pieces with optional English contractions, 1-3 digit groups, punctuation,
    # and whitespace runs.
    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken, None] = None,
        pad_token: Union[str, AddedToken, None] = None,
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        assert os.path.isfile(vocab_file), vocab_file

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|start_header_id|>",
                "<|end_header_id|>",
                "[EOT]",
                "<|im_system|>",
                "<|im_middle|>",
            ]

        # id -> content mapping for specials supplied via the tokenizer config.
        if added_tokens_decoder:
            special_tokens_mapping = {
                i: added_tokens_decoder[i].content for i in added_tokens_decoder
            }
        else:
            special_tokens_mapping = {}

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        # Reserve a fixed window of ids after the base vocabulary; ids without
        # a configured name get a "<|reserved_token_i|>" placeholder.
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(
                num_base_tokens, num_base_tokens + self.num_reserved_special_tokens
            )
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

        # NOTE(review): str(None) == "None", so when pad/unk tokens are left
        # as None these lookups require a special literally named "None" —
        # otherwise they raise KeyError. Confirm against the shipped config.
        self.pad_id: int = self.special_tokens[str(pad_token)]
        self.unk_id: int = self.special_tokens[str(unk_token)]

        # GPT-2 style byte<->unicode tables used to expose a str-keyed vocab.
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        self.decoder = {}
        for i in range(self.n_words):
            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
            decoding = "".join(
                [
                    self.byte_encoder[ord(char)]
                    for char in self.model.decode_single_token_bytes(i).decode(
                        "latin-1"
                    )
                ]
            )
            self.decoder[i] = decoding

        self.encoder = {}
        for i in range(self.n_words):
            if i in self.decoder:
                self.encoder[self.decoder[i]] = i

        # Small bounded cache slot (LRU via OrderedDict) for token configs.
        self._token_config_cache = OrderedDict()
        self._cache_max_size = 128

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            added_tokens_decoder=added_tokens_decoder,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self, text: str, allow_special_tokens: bool = True, **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.
            allow_special_tokens (bool): When True, special-token strings
                appearing in `text` are encoded to their special ids;
                otherwise they are tokenized as plain text.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, we should call super().encode because there are a lot of code
        # to handle those args. super().encode finally will call _tokenize and _convert_token_to_id.
        # NOTE: our encode method is not compatible with the super().encode method,
        # e.g. split_special_tokens' default is True in our encode method.
        if len(kwargs) > 0:
            logger.warning(f"Calling super().encode with {kwargs}")
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        texts = self.pre_tokenizer_process(text)

        all_substrs = []
        for text in texts:
            substrs = (
                substr
                for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
                for substr in self._split_whitespaces_or_nonwhitespaces(
                    text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
                )
            )
            all_substrs.extend(substrs)

        t: List[int] = []
        for substr in all_substrs:
            if allow_special_tokens:
                t.extend(
                    # we should consider special token as a common token
                    self.model.encode(
                        substr,
                        allowed_special="all",
                    )
                )
            else:
                t.extend(
                    # we should consider special token as a common token
                    self.model.encode(
                        substr,
                        disallowed_special=(),
                    )
                )

        return t

    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are other args, we should call super().decode because there are a lot of code
        # to handle those args. super().decode finally will call convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        if type(token_ids) is int:
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                # Run type flipped (space <-> non-space): reset the counter.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    def pre_tokenizer_process(self, text: str) -> List[str]:
        """
        pre-tokenizes the input text into a list of tokens.
        This method is used to split the input text into smaller chunks for internal processing.
        """
        return [text]

    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """

    @property
    def vocab_size(self) -> int:
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        return [self.decoder[t] for t in self.encode(text)]

    def _convert_token_to_id(self, token: str) -> int:
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        # Returns None for out-of-range ids (dict.get default), hence Optional.
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        # Byte-level decoding already yields clean text; skip the superclass
        # heuristics on purpose.
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        text = "".join(tokens)
        # Map the unicode surrogate alphabet back to raw bytes, then decode.
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            "utf-8", "replace"
        )
        return text

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Copy the tiktoken model file into `save_directory` and return its path."""
        if not os.path.isdir(save_directory):
            raise ValueError(
                f"vocabulary path ({save_directory}) should be a directory"
            )
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def apply_chat_template(
        self,
        conversation,
        tools: Optional[list[dict]] = None,
        tokenize: bool = False,
        add_generation_prompt: bool = True,
        thinking: bool = True,
        **kwargs,
    ):
        """Render `conversation` through the chat template, passing tool
        declarations to the template both as structured dicts and as a
        TypeScript-style string (`tools_ts_str`)."""
        # Sort keys recursively so the rendered prompt is deterministic.
        tools = deep_sort_dict(tools)

        # Convert tools to TypeScript style string if tools are provided
        tools_ts_str = None
        if tools:
            try:
                tools_ts_str = encode_tools_to_typescript_style(tools)
            except Exception as e:
                # Use the module logger instead of print() so library users
                # control verbosity; rendering proceeds without the TS string.
                logger.warning(f"Failed to convert tools to TypeScript style: {e}")
                tools_ts_str = None

        # Store the TypeScript string in kwargs so it can be accessed by the template
        if tools_ts_str is not None:
            kwargs["tools_ts_str"] = tools_ts_str
        return super().apply_chat_template(
            conversation,
            tools=tools,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            thinking=thinking,
            **kwargs,
        )
361
+
362
+
363
+ def deep_sort_dict(obj: Any) -> Any:
364
+ if isinstance(obj, dict):
365
+ return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
366
+ if isinstance(obj, list):
367
+ return [deep_sort_dict(item) for item in obj]
368
+ return obj
tool_declaration_ts.py CHANGED
@@ -1,479 +1,500 @@
1
- """
2
- Encode structured tool declaration to typescript style string.
3
- """
4
- import dataclasses
5
- import json
6
- import logging
7
- from collections.abc import Sequence
8
- from typing import Any
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
- _TS_INDENT = " "
13
- _TS_FIELD_DELIMITER = ",\n"
14
-
15
-
16
- class _SchemaRegistry:
17
- """Registry for schema definitions to handle $ref resolution"""
18
-
19
- def __init__(self):
20
- self.definitions = {}
21
- self.has_self_ref = False
22
-
23
- def register_definitions(self, defs: dict[str, Any]):
24
- """Register schema definitions from $defs section"""
25
- if not defs:
26
- return
27
- for def_name, def_schema in defs.items():
28
- self.definitions[def_name] = def_schema
29
-
30
- def resolve_ref(self, ref: str) -> dict[str, Any]:
31
- """Resolve a reference to its schema definition"""
32
- if ref == "#":
33
- self.has_self_ref = True
34
- return {"$self_ref": True}
35
- elif ref.startswith("#/$defs/"):
36
- def_name = ref.split("/")[-1]
37
- if def_name not in self.definitions:
38
- raise ValueError(f"Reference not found: {ref}")
39
- return self.definitions[def_name]
40
- else:
41
- raise ValueError(f"Unsupported reference format: {ref}")
42
-
43
-
44
- def _format_description(description: str, indent: str = "") -> str:
45
- return "\n".join([
46
- f"{indent}// {line}" if line else ""
47
- for line in description.split("\n")
48
- ])
49
-
50
-
51
- class _BaseType:
52
- description: str
53
- constraints: dict[str, Any]
54
-
55
- def __init__(
56
- self,
57
- extra_props: dict[str, Any],
58
- *,
59
- allowed_constraint_keys: Sequence[str] = (),
60
- ):
61
- self.description = extra_props.get("description", "")
62
- self.constraints = {
63
- k: v
64
- for k, v in extra_props.items() if k in allowed_constraint_keys
65
- }
66
-
67
- def to_typescript_style(self, indent: str = "") -> str:
68
- raise NotImplementedError
69
-
70
- def format_docstring(self, indent: str) -> str:
71
- lines = []
72
- if self.description:
73
- lines.append(_format_description(self.description, indent))
74
- if self.constraints:
75
- constraints_str = ", ".join(f"{k}: {v}" for k, v in sorted(
76
- self.constraints.items(), key=lambda kv: kv[0]))
77
- lines.append(f"{indent}// {constraints_str}")
78
-
79
- return "".join(x + "\n" for x in lines)
80
-
81
-
82
- class _ParameterTypeScalar(_BaseType):
83
- type: str
84
-
85
- def __init__(self, type: str, extra_props: dict[str, Any] | None = None):
86
- self.type = type
87
-
88
- allowed_constraint_keys: list[str] = []
89
- if self.type == "string":
90
- allowed_constraint_keys = ["maxLength", "minLength", "pattern"]
91
- elif self.type in ("number", "integer"):
92
- allowed_constraint_keys = ["maximum", "minimum"]
93
-
94
- super().__init__(extra_props or {},
95
- allowed_constraint_keys=allowed_constraint_keys)
96
-
97
- def to_typescript_style(self, indent: str = "") -> str:
98
- # Map integer to number in TypeScript
99
- if self.type == "integer":
100
- return "number"
101
- return self.type
102
-
103
-
104
- class _ParameterTypeObject(_BaseType):
105
- properties: list["_Parameter"]
106
- additional_properties: Any | None = None
107
-
108
- def __init__(self,
109
- json_schema_object: dict[str, Any],
110
- registry: _SchemaRegistry | None = None):
111
- super().__init__(json_schema_object)
112
-
113
- self.properties = []
114
- self.additional_properties = None
115
-
116
- if not json_schema_object:
117
- return
118
-
119
- if "$defs" in json_schema_object and registry:
120
- registry.register_definitions(json_schema_object["$defs"])
121
-
122
- self.additional_properties = json_schema_object.get(
123
- "additionalProperties")
124
- if isinstance(self.additional_properties, dict):
125
- self.additional_properties = _parse_parameter_type(
126
- self.additional_properties, registry)
127
-
128
- if "properties" not in json_schema_object:
129
- return
130
-
131
- required_parameters = json_schema_object.get("required", [])
132
- optional_parameters = set(
133
- json_schema_object["properties"].keys()) - set(required_parameters)
134
-
135
- self.properties = [
136
- _Parameter(
137
- name=name,
138
- type=_parse_parameter_type(prop, registry),
139
- optional=name in optional_parameters,
140
- default=prop.get("default")
141
- if isinstance(prop, dict) else None,
142
- ) for name, prop in json_schema_object["properties"].items()
143
- ]
144
-
145
- def to_typescript_style(self, indent: str = "") -> str:
146
- # sort by optional, make the required parameters first
147
- parameters = [p for p in self.properties if not p.optional]
148
- opt_params = [p for p in self.properties if p.optional]
149
-
150
- parameters = sorted(parameters, key=lambda p: p.name)
151
- parameters.extend(sorted(opt_params, key=lambda p: p.name))
152
-
153
- param_strs = []
154
- for p in parameters:
155
- one = p.to_typescript_style(indent=indent + _TS_INDENT)
156
- param_strs.append(one)
157
-
158
- if self.additional_properties is not None:
159
- ap_type_str = "any"
160
- if self.additional_properties is True:
161
- ap_type_str = "any"
162
- elif self.additional_properties is False:
163
- ap_type_str = "never"
164
- elif isinstance(self.additional_properties, _ParameterType):
165
- ap_type_str = self.additional_properties.to_typescript_style(
166
- indent=indent + _TS_INDENT)
167
- else:
168
- raise ValueError(
169
- f"Unknown additionalProperties: {self.additional_properties}"
170
- )
171
- param_strs.append(
172
- f"{indent + _TS_INDENT}[k: string]: {ap_type_str}")
173
-
174
- if not param_strs:
175
- return "{}"
176
-
177
- params_str = _TS_FIELD_DELIMITER.join(param_strs)
178
- if params_str:
179
- # add new line before and after
180
- params_str = f"\n{params_str}\n"
181
- # always wrap with object
182
- return f"{{{params_str}{indent}}}"
183
-
184
-
185
- class _ParameterTypeArray(_BaseType):
186
- item: "_ParameterType"
187
-
188
- def __init__(self,
189
- json_schema_object: dict[str, Any],
190
- registry: _SchemaRegistry | None = None):
191
- super().__init__(json_schema_object,
192
- allowed_constraint_keys=("minItems", "maxItems"))
193
- if json_schema_object.get("items"):
194
- self.item = _parse_parameter_type(json_schema_object["items"],
195
- registry)
196
- else:
197
- self.item = _ParameterTypeScalar(type="any")
198
-
199
- def to_typescript_style(self, indent: str = "") -> str:
200
- item_docstring = self.item.format_docstring(indent + _TS_INDENT)
201
- if item_docstring:
202
- return ("Array<\n" + item_docstring + indent + _TS_INDENT +
203
- self.item.to_typescript_style(indent=indent + _TS_INDENT) +
204
- "\n" + indent + ">")
205
- else:
206
- return f"Array<{self.item.to_typescript_style(indent=indent)}>"
207
-
208
-
209
- class _ParameterTypeEnum(_BaseType):
210
- # support scalar types only
211
- enum: list[str | int | float | bool | None]
212
-
213
- def __init__(self, json_schema_object: dict[str, Any]):
214
- super().__init__(json_schema_object)
215
- self.enum = json_schema_object["enum"]
216
-
217
- # Validate enum values against declared type if present
218
- if "type" in json_schema_object:
219
- typ = json_schema_object["type"]
220
- if isinstance(typ, list):
221
- if len(typ) == 1:
222
- typ = typ[0]
223
- elif len(typ) == 2:
224
- if "null" not in typ:
225
- raise ValueError(f"Enum type {typ} is not supported")
226
- else:
227
- typ = typ[0] if typ[0] != "null" else typ[1]
228
- else:
229
- raise ValueError(f"Enum type {typ} is not supported")
230
- for val in self.enum:
231
- if val is None:
232
- continue
233
- if typ == "string" and not isinstance(val, str):
234
- raise ValueError(f"Enum value {val} is not a string")
235
- elif typ == "number" and not isinstance(val, (int, float)):
236
- raise ValueError(f"Enum value {val} is not a number")
237
- elif typ == "integer" and not isinstance(val, int):
238
- raise ValueError(f"Enum value {val} is not an integer")
239
- elif typ == "boolean" and not isinstance(val, bool):
240
- raise ValueError(f"Enum value {val} is not a boolean")
241
-
242
- def to_typescript_style(self, indent: str = "") -> str:
243
- return " | ".join(
244
- [f'"{e}"' if isinstance(e, str) else str(e) for e in self.enum])
245
-
246
-
247
- class _ParameterTypeAnyOf(_BaseType):
248
- types: list["_ParameterType"]
249
-
250
- def __init__(
251
- self,
252
- json_schema_object: dict[str, Any],
253
- registry: _SchemaRegistry | None = None,
254
- ):
255
- super().__init__(json_schema_object)
256
- self.types = [
257
- _parse_parameter_type(t, registry)
258
- for t in json_schema_object["anyOf"]
259
- ]
260
-
261
- def to_typescript_style(self, indent: str = "") -> str:
262
- return " | ".join(
263
- [t.to_typescript_style(indent=indent) for t in self.types])
264
-
265
-
266
- class _ParameterTypeUnion(_BaseType):
267
- types: list[str]
268
-
269
- def __init__(self, json_schema_object: dict[str, Any]):
270
- super().__init__(json_schema_object)
271
-
272
- mapping = {
273
- "string": "string",
274
- "number": "number",
275
- "integer": "number",
276
- "boolean": "boolean",
277
- "null": "null",
278
- "object": "{}",
279
- "array": "Array<any>",
280
- }
281
- self.types = [mapping[t] for t in json_schema_object["type"]]
282
-
283
- def to_typescript_style(self, indent: str = "") -> str:
284
- return " | ".join(self.types)
285
-
286
-
287
- class _ParameterTypeRef(_BaseType):
288
- ref_name: str
289
- is_self_ref: bool = False
290
-
291
- def __init__(self, json_schema_object: dict[str, Any],
292
- registry: _SchemaRegistry):
293
- super().__init__(json_schema_object)
294
-
295
- ref = json_schema_object["$ref"]
296
- resolved_schema = registry.resolve_ref(ref)
297
-
298
- if resolved_schema.get("$self_ref", False):
299
- self.ref_name = "parameters"
300
- self.is_self_ref = True
301
- else:
302
- self.ref_name = ref.split("/")[-1]
303
-
304
- def to_typescript_style(self, indent: str = "") -> str:
305
- return self.ref_name
306
-
307
-
308
- _ParameterType = (_ParameterTypeScalar
309
- | _ParameterTypeObject
310
- | _ParameterTypeArray
311
- | _ParameterTypeEnum
312
- | _ParameterTypeAnyOf
313
- | _ParameterTypeUnion
314
- | _ParameterTypeRef)
315
-
316
-
317
- @dataclasses.dataclass
318
- class _Parameter:
319
- """
320
- A parameter in a function, or a field in a object.
321
- It consists of the type as well as the name.
322
- """
323
-
324
- type: _ParameterType
325
- name: str = "_"
326
- optional: bool = True
327
- default: Any | None = None
328
-
329
- @classmethod
330
- def parse_extended(cls, attributes: dict[str, Any]) -> "_Parameter":
331
- if not attributes:
332
- raise ValueError("attributes is empty")
333
-
334
- return cls(
335
- name=attributes.get("name", "_"),
336
- type=_parse_parameter_type(attributes),
337
- optional=attributes.get("optional", False),
338
- default=attributes.get("default"),
339
- )
340
-
341
- def to_typescript_style(self, indent: str = "") -> str:
342
- comments = self.type.format_docstring(indent)
343
-
344
- if self.default is not None:
345
- default_repr = (json.dumps(self.default, ensure_ascii=False)
346
- if not isinstance(self.default, (int, float, bool))
347
- else repr(self.default))
348
- comments += f"{indent}// Default: {default_repr}\n"
349
-
350
- return (
351
- comments +
352
- f"{indent}{self.name}{'?' if self.optional else ''}: {self.type.to_typescript_style(indent=indent)}"
353
- )
354
-
355
-
356
- def _parse_parameter_type(
357
- json_schema_object: dict[str, Any] | bool,
358
- registry: _SchemaRegistry | None = None) -> _ParameterType:
359
- if isinstance(json_schema_object, bool):
360
- if json_schema_object:
361
- return _ParameterTypeScalar(type="any")
362
- else:
363
- logger.warning(
364
- f"Warning: Boolean value {json_schema_object} is not supported, use null instead."
365
- )
366
- return _ParameterTypeScalar(type="null")
367
-
368
- if "$ref" in json_schema_object and registry:
369
- return _ParameterTypeRef(json_schema_object, registry)
370
-
371
- if "anyOf" in json_schema_object:
372
- return _ParameterTypeAnyOf(json_schema_object, registry)
373
- elif "enum" in json_schema_object:
374
- return _ParameterTypeEnum(json_schema_object)
375
- elif "type" in json_schema_object:
376
- typ = json_schema_object["type"]
377
- if isinstance(typ, list):
378
- return _ParameterTypeUnion(json_schema_object)
379
- elif typ == "object":
380
- return _ParameterTypeObject(json_schema_object, registry)
381
- elif typ == "array":
382
- return _ParameterTypeArray(json_schema_object, registry)
383
- else:
384
- return _ParameterTypeScalar(typ, json_schema_object)
385
- elif json_schema_object == {}:
386
- return _ParameterTypeScalar(type="any")
387
- else:
388
- raise ValueError(f"Invalid JSON Schema object: {json_schema_object}")
389
-
390
-
391
- def _openai_function_to_typescript_style(function: dict[str, Any], ) -> str:
392
- """Convert OpenAI function definition (dict) to TypeScript style string."""
393
- registry = _SchemaRegistry()
394
- parameters = function.get("parameters") or {}
395
- parsed = _ParameterTypeObject(parameters, registry)
396
-
397
- interfaces = []
398
- root_interface_name = None
399
- if registry.has_self_ref:
400
- root_interface_name = "parameters"
401
- params_str = _TS_FIELD_DELIMITER.join([
402
- p.to_typescript_style(indent=_TS_INDENT) for p in parsed.properties
403
- ])
404
- params_str = f"\n{params_str}\n" if params_str else ""
405
- interface_def = f"interface {root_interface_name} {{{params_str}}}"
406
- interfaces.append(interface_def)
407
-
408
- definitions_copy = dict(registry.definitions)
409
- for def_name, def_schema in definitions_copy.items():
410
- obj_type = _parse_parameter_type(def_schema, registry)
411
- params_str = obj_type.to_typescript_style()
412
-
413
- description_part = ""
414
- if obj_description := def_schema.get("description", ""):
415
- description_part = _format_description(obj_description) + "\n"
416
-
417
- interface_def = f"{description_part}interface {def_name} {params_str}"
418
- interfaces.append(interface_def)
419
-
420
- interface_str = "\n".join(interfaces)
421
- function_name = function.get("name", "function")
422
- if root_interface_name:
423
- type_def = f"type {function_name} = (_: {root_interface_name}) => any;"
424
- else:
425
- params_str = parsed.to_typescript_style()
426
- type_def = f"type {function_name} = (_: {params_str}) => any;"
427
-
428
- description = function.get("description")
429
- return "\n".join(
430
- filter(
431
- bool,
432
- [
433
- interface_str,
434
- ((description and _format_description(description)) or ""),
435
- type_def,
436
- ],
437
- ))
438
-
439
-
440
- def encode_tools_to_typescript_style(tools: list[dict[str, Any]], ) -> str:
441
- """
442
- Convert tools (list of dict) to TypeScript style string.
443
-
444
- Supports OpenAI format: {"type": "function", "function": {...}}
445
-
446
- Args:
447
- tools: List of tool definitions in dict format
448
-
449
- Returns:
450
- TypeScript style string representation of the tools
451
- """
452
- if not tools:
453
- return ""
454
-
455
- functions = []
456
-
457
- for tool in tools:
458
- tool_type = tool.get("type")
459
- if tool_type == "function":
460
- func_def = tool.get("function", {})
461
- if func_def:
462
- functions.append(
463
- _openai_function_to_typescript_style(func_def))
464
- else:
465
- # Skip unsupported tool types (like "_plugin")
466
- continue
467
-
468
- if not functions:
469
- return ""
470
-
471
- functions_str = "\n".join(functions)
472
- result = "# Tools\n\n"
473
-
474
- if functions_str:
475
- result += "## functions\nnamespace functions {\n"
476
- result += functions_str + "\n"
477
- result += "}\n"
478
-
479
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Encode structured tool declaration to typescript style string.
3
+ """
4
+
5
+ import dataclasses
6
+ import json
7
+ import logging
8
+ from collections.abc import Sequence
9
+ from typing import Any
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ _TS_INDENT = " "
14
+ _TS_FIELD_DELIMITER = ",\n"
15
+
16
+
17
+ class _SchemaRegistry:
18
+ """Registry for schema definitions to handle $ref resolution"""
19
+
20
+ def __init__(self):
21
+ self.definitions = {}
22
+ self.has_self_ref = False
23
+
24
+ def register_definitions(self, defs: dict[str, Any]):
25
+ """Register schema definitions from $defs section"""
26
+ if not defs:
27
+ return
28
+ for def_name, def_schema in defs.items():
29
+ self.definitions[def_name] = def_schema
30
+
31
+ def resolve_ref(self, ref: str) -> dict[str, Any]:
32
+ """Resolve a reference to its schema definition"""
33
+ if ref == "#":
34
+ self.has_self_ref = True
35
+ return {"$self_ref": True}
36
+ elif ref.startswith("#/$defs/"):
37
+ def_name = ref.split("/")[-1]
38
+ if def_name not in self.definitions:
39
+ raise ValueError(f"Reference not found: {ref}")
40
+ return self.definitions[def_name]
41
+ else:
42
+ raise ValueError(f"Unsupported reference format: {ref}")
43
+
44
+
45
+ def _format_description(description: str, indent: str = "") -> str:
46
+ return "\n".join(
47
+ [f"{indent}// {line}" if line else "" for line in description.split("\n")]
48
+ )
49
+
50
+
51
+ class _BaseType:
52
+ description: str
53
+ constraints: dict[str, Any]
54
+
55
+ def __init__(
56
+ self,
57
+ extra_props: dict[str, Any],
58
+ *,
59
+ allowed_constraint_keys: Sequence[str] = (),
60
+ ):
61
+ self.description = extra_props.get("description", "")
62
+ self.constraints = {
63
+ k: v for k, v in extra_props.items() if k in allowed_constraint_keys
64
+ }
65
+
66
+ def to_typescript_style(self, indent: str = "") -> str:
67
+ raise NotImplementedError
68
+
69
+ def format_docstring(self, indent: str) -> str:
70
+ lines = []
71
+ if self.description:
72
+ lines.append(_format_description(self.description, indent))
73
+ if self.constraints:
74
+ constraints_str = ", ".join(
75
+ f"{k}: {v}"
76
+ for k, v in sorted(self.constraints.items(), key=lambda kv: kv[0])
77
+ )
78
+ lines.append(f"{indent}// {constraints_str}")
79
+
80
+ return "".join(x + "\n" for x in lines)
81
+
82
+
83
class _ParameterTypeScalar(_BaseType):
    """A scalar JSON-Schema type (string/number/integer/boolean/null/any)."""

    type: str

    # Constraint keywords surfaced as comments, keyed by scalar kind.
    _CONSTRAINT_KEYS = {
        "string": ["maxLength", "minLength", "pattern"],
        "number": ["maximum", "minimum"],
        "integer": ["maximum", "minimum"],
    }

    def __init__(self, type: str, extra_props: dict[str, Any] | None = None):
        self.type = type
        super().__init__(
            extra_props or {},
            allowed_constraint_keys=self._CONSTRAINT_KEYS.get(type, []),
        )

    def to_typescript_style(self, indent: str = "") -> str:
        # TypeScript has a single numeric type, so integer maps to number.
        return "number" if self.type == "integer" else self.type
104
+
105
+
106
class _ParameterTypeObject(_BaseType):
    """An object schema: named properties plus optional additionalProperties.

    Renders as a TypeScript object literal type; any $defs found here are
    registered with the schema registry for later $ref resolution.
    """

    properties: list["_Parameter"]
    additional_properties: Any | None = None

    def __init__(
        self,
        json_schema_object: dict[str, Any],
        registry: _SchemaRegistry | None = None,
    ):
        super().__init__(json_schema_object)

        self.properties = []
        self.additional_properties = None

        if not json_schema_object:
            return

        if "$defs" in json_schema_object and registry:
            registry.register_definitions(json_schema_object["$defs"])

        ap = json_schema_object.get("additionalProperties")
        if isinstance(ap, dict):
            # A schema-valued additionalProperties becomes a parsed type.
            ap = _parse_parameter_type(ap, registry)
        self.additional_properties = ap

        if "properties" not in json_schema_object:
            return

        required = set(json_schema_object.get("required", []))
        for prop_name, prop_schema in json_schema_object["properties"].items():
            default = (
                prop_schema.get("default") if isinstance(prop_schema, dict) else None
            )
            self.properties.append(
                _Parameter(
                    name=prop_name,
                    type=_parse_parameter_type(prop_schema, registry),
                    optional=prop_name not in required,
                    default=default,
                )
            )

    def to_typescript_style(self, indent: str = "") -> str:
        # Required parameters come first; each group is sorted by name.
        required_first = sorted(
            (p for p in self.properties if not p.optional), key=lambda p: p.name
        )
        optional_last = sorted(
            (p for p in self.properties if p.optional), key=lambda p: p.name
        )

        inner_indent = indent + _TS_INDENT
        rendered = [
            p.to_typescript_style(indent=inner_indent)
            for p in required_first + optional_last
        ]

        ap = self.additional_properties
        if ap is not None:
            if ap is True:
                ap_ts = "any"  # open object
            elif ap is False:
                ap_ts = "never"  # closed object
            elif isinstance(ap, _ParameterType):
                ap_ts = ap.to_typescript_style(indent=inner_indent)
            else:
                raise ValueError(f"Unknown additionalProperties: {ap}")
            rendered.append(f"{inner_indent}[k: string]: {ap_ts}")

        if not rendered:
            return "{}"

        body = _TS_FIELD_DELIMITER.join(rendered)
        # Multi-line object literal with the closing brace at the outer indent.
        return f"{{\n{body}\n{indent}}}"
188
+
189
+
190
class _ParameterTypeArray(_BaseType):
    """An array schema; the item schema defaults to `any` when unspecified."""

    item: "_ParameterType"

    def __init__(
        self,
        json_schema_object: dict[str, Any],
        registry: _SchemaRegistry | None = None,
    ):
        super().__init__(
            json_schema_object, allowed_constraint_keys=("minItems", "maxItems")
        )
        items_schema = json_schema_object.get("items")
        if items_schema:
            self.item = _parse_parameter_type(items_schema, registry)
        else:
            self.item = _ParameterTypeScalar(type="any")

    def to_typescript_style(self, indent: str = "") -> str:
        inner_indent = indent + _TS_INDENT
        item_docs = self.item.format_docstring(inner_indent)
        if not item_docs:
            return f"Array<{self.item.to_typescript_style(indent=indent)}>"
        # Multi-line form keeps the item's comment block attached to it.
        item_ts = self.item.to_typescript_style(indent=inner_indent)
        return f"Array<\n{item_docs}{inner_indent}{item_ts}\n{indent}>"
221
+
222
+
223
class _ParameterTypeEnum(_BaseType):
    """An enum schema (scalar members only), rendered as a TS literal union.

    Raises ValueError when members do not match the declared "type", or when
    the declared type is an unsupported list form.
    """

    # support scalar types only
    enum: list[str | int | float | bool | None]

    def __init__(self, json_schema_object: dict[str, Any]):
        super().__init__(json_schema_object)
        self.enum = json_schema_object["enum"]

        # Validate enum values against declared type if present.
        if "type" in json_schema_object:
            typ = json_schema_object["type"]
            if isinstance(typ, list):
                # Only a single type, optionally paired with "null", is supported.
                if len(typ) == 1:
                    typ = typ[0]
                elif len(typ) == 2:
                    if "null" not in typ:
                        raise ValueError(f"Enum type {typ} is not supported")
                    else:
                        typ = typ[0] if typ[0] != "null" else typ[1]
                else:
                    raise ValueError(f"Enum type {typ} is not supported")
            for val in self.enum:
                if val is None:
                    continue
                # bool is a subclass of int in Python, so it must be rejected
                # explicitly for the numeric JSON-Schema types.
                if typ == "string" and not isinstance(val, str):
                    raise ValueError(f"Enum value {val} is not a string")
                elif typ == "number" and (
                    isinstance(val, bool) or not isinstance(val, (int, float))
                ):
                    raise ValueError(f"Enum value {val} is not a number")
                elif typ == "integer" and (
                    isinstance(val, bool) or not isinstance(val, int)
                ):
                    raise ValueError(f"Enum value {val} is not an integer")
                elif typ == "boolean" and not isinstance(val, bool):
                    raise ValueError(f"Enum value {val} is not a boolean")

    def to_typescript_style(self, indent: str = "") -> str:
        # json.dumps renders members in their TypeScript spelling —
        # true/false/null instead of Python's True/False/None — and escapes
        # string members correctly.
        return " | ".join(json.dumps(e, ensure_ascii=False) for e in self.enum)
260
+
261
+
262
class _ParameterTypeAnyOf(_BaseType):
    """An anyOf schema, rendered as a TypeScript union of its branches."""

    types: list["_ParameterType"]

    def __init__(
        self,
        json_schema_object: dict[str, Any],
        registry: _SchemaRegistry | None = None,
    ):
        super().__init__(json_schema_object)
        self.types = []
        for subschema in json_schema_object["anyOf"]:
            self.types.append(_parse_parameter_type(subschema, registry))

    def to_typescript_style(self, indent: str = "") -> str:
        branches = (t.to_typescript_style(indent=indent) for t in self.types)
        return " | ".join(branches)
277
+
278
+
279
class _ParameterTypeUnion(_BaseType):
    """A schema whose "type" is a list of primitive type names.

    Raises KeyError for unrecognized primitive names.
    """

    types: list[str]

    # JSON-Schema primitive name -> TypeScript spelling.
    _TS_NAMES = {
        "string": "string",
        "number": "number",
        "integer": "number",
        "boolean": "boolean",
        "null": "null",
        "object": "{}",
        "array": "Array<any>",
    }

    def __init__(self, json_schema_object: dict[str, Any]):
        super().__init__(json_schema_object)
        self.types = [self._TS_NAMES[name] for name in json_schema_object["type"]]

    def to_typescript_style(self, indent: str = "") -> str:
        return " | ".join(self.types)
298
+
299
+
300
class _ParameterTypeRef(_BaseType):
    """A $ref to a named definition, or to the root schema itself."""

    ref_name: str
    is_self_ref: bool = False

    def __init__(self, json_schema_object: dict[str, Any], registry: _SchemaRegistry):
        super().__init__(json_schema_object)

        ref = json_schema_object["$ref"]
        target = registry.resolve_ref(ref)

        if target.get("$self_ref", False):
            # "#" points back at the root parameters object, which is
            # emitted as `interface parameters` by the caller.
            self.is_self_ref = True
            self.ref_name = "parameters"
        else:
            self.ref_name = ref.split("/")[-1]

    def to_typescript_style(self, indent: str = "") -> str:
        return self.ref_name
318
+
319
+
320
# Union of every concrete parameter-type class. Also usable as an
# isinstance() target (PEP 604 unions support isinstance on Python 3.10+).
_ParameterType = (
    _ParameterTypeScalar | _ParameterTypeObject | _ParameterTypeArray
    | _ParameterTypeEnum | _ParameterTypeAnyOf | _ParameterTypeUnion
    | _ParameterTypeRef
)
329
+
330
+
331
@dataclasses.dataclass
class _Parameter:
    """
    A parameter in a function, or a field in an object.

    Combines a parameter type with its name, optionality and default value;
    renders as `name?: type` preceded by comment lines (description,
    constraints, default).
    """

    type: _ParameterType
    name: str = "_"
    optional: bool = True
    default: Any | None = None

    @classmethod
    def parse_extended(cls, attributes: dict[str, Any]) -> "_Parameter":
        """Build a parameter from an extended attribute dict that carries
        name/optional/default alongside the JSON-Schema type keywords.

        Raises ValueError when `attributes` is empty.
        """
        if not attributes:
            raise ValueError("attributes is empty")

        return cls(
            name=attributes.get("name", "_"),
            type=_parse_parameter_type(attributes),
            optional=attributes.get("optional", False),
            default=attributes.get("default"),
        )

    def to_typescript_style(self, indent: str = "") -> str:
        comments = self.type.format_docstring(indent)

        if self.default is not None:
            # json.dumps renders every JSON value in its TypeScript spelling;
            # the previous repr() branch emitted Python-style True/False for
            # boolean defaults (ints/floats/strings are unchanged).
            default_repr = json.dumps(self.default, ensure_ascii=False)
            comments += f"{indent}// Default: {default_repr}\n"

        marker = "?" if self.optional else ""
        type_ts = self.type.to_typescript_style(indent=indent)
        return f"{comments}{indent}{self.name}{marker}: {type_ts}"
370
+
371
+
372
def _parse_parameter_type(
    json_schema_object: dict[str, Any] | bool, registry: _SchemaRegistry | None = None
) -> _ParameterType:
    """Dispatch a JSON-Schema node to the matching _ParameterType* class.

    Raises ValueError for schemas that fit none of the supported shapes.
    """
    # Boolean schemas: `true` accepts anything; `false` is unsupported and
    # degrades to null with a warning.
    if isinstance(json_schema_object, bool):
        if not json_schema_object:
            logger.warning(
                f"Warning: Boolean value {json_schema_object} is not supported, use null instead."
            )
            return _ParameterTypeScalar(type="null")
        return _ParameterTypeScalar(type="any")

    # $ref takes precedence, but only when a registry is available.
    if "$ref" in json_schema_object and registry:
        return _ParameterTypeRef(json_schema_object, registry)

    if "anyOf" in json_schema_object:
        return _ParameterTypeAnyOf(json_schema_object, registry)
    if "enum" in json_schema_object:
        return _ParameterTypeEnum(json_schema_object)
    if "type" in json_schema_object:
        typ = json_schema_object["type"]
        if isinstance(typ, list):
            return _ParameterTypeUnion(json_schema_object)
        if typ == "object":
            return _ParameterTypeObject(json_schema_object, registry)
        if typ == "array":
            return _ParameterTypeArray(json_schema_object, registry)
        return _ParameterTypeScalar(typ, json_schema_object)
    # A completely empty schema means "anything".
    if json_schema_object == {}:
        return _ParameterTypeScalar(type="any")
    raise ValueError(f"Invalid JSON Schema object: {json_schema_object}")
405
+
406
+
407
def _openai_function_to_typescript_style(
    function: dict[str, Any],
) -> str:
    """Convert an OpenAI function definition (dict) to a TypeScript-style
    declaration: optional interface blocks for $defs / the self-referencing
    root, a `//` description, and a `type name = (_: params) => any;` line."""
    registry = _SchemaRegistry()
    root_schema = function.get("parameters") or {}
    root_obj = _ParameterTypeObject(root_schema, registry)

    interfaces: list[str] = []
    root_interface_name = None

    # When the schema references itself ("#"), emit the root object as a
    # named interface so the self-reference has something to point at.
    if registry.has_self_ref:
        root_interface_name = "parameters"
        fields = _TS_FIELD_DELIMITER.join(
            p.to_typescript_style(indent=_TS_INDENT) for p in root_obj.properties
        )
        if fields:
            fields = f"\n{fields}\n"
        interfaces.append(f"interface {root_interface_name} {{{fields}}}")

    # Iterate over a snapshot: parsing a definition may register further
    # definitions with the registry.
    for def_name, def_schema in dict(registry.definitions).items():
        body = _parse_parameter_type(def_schema, registry).to_typescript_style()
        header = ""
        if obj_description := def_schema.get("description", ""):
            header = _format_description(obj_description) + "\n"
        interfaces.append(f"{header}interface {def_name} {body}")

    interface_str = "\n".join(interfaces)

    # TypeScript identifiers cannot contain '-' or ' '; sanitize the name.
    raw_function_name = function.get("name", "function")
    function_name = raw_function_name.replace("-", "_").replace(" ", "_")
    if root_interface_name:
        type_def = f"type {function_name} = (_: {root_interface_name}) => any;"
    else:
        type_def = f"type {function_name} = (_: {root_obj.to_typescript_style()}) => any;"

    description = function.get("description")
    sections = [
        interface_str,
        ((description and _format_description(description)) or ""),
        type_def,
    ]
    return "\n".join(filter(bool, sections))
458
+
459
+
460
def encode_tools_to_typescript_style(
    tools: list[dict[str, Any]],
) -> str:
    """
    Convert tools (list of dict) to TypeScript style string.

    Supports OpenAI format: {"type": "function", "function": {...}}

    Args:
        tools: List of tool definitions in dict format

    Returns:
        TypeScript style string representation of the tools
    """
    if not tools:
        return ""

    declarations = []
    for tool in tools:
        # Only OpenAI "function" tools are supported; any other type
        # (e.g. "_plugin") is skipped.
        if tool.get("type") != "function":
            continue
        func_def = tool.get("function", {})
        if func_def:
            declarations.append(_openai_function_to_typescript_style(func_def))

    if not declarations:
        return ""

    body = "\n".join(declarations)
    result = "# Tools\n\n"
    if body:
        result += "## functions\nnamespace functions {\n" + body + "\n}\n"
    return result