Bittoby1040 committed on
Commit
aedf7b8
·
verified ·
1 Parent(s): 93f7ec0

Upload folder using huggingface_hub

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro render_content(msg) -%}
2
+ {%- set c = msg.get('content') -%}
3
+ {%- if c is string -%}
4
+ {{ c }}
5
+ {%- elif c is not none -%}
6
+ {% for content in c -%}
7
+ {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
8
+ <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
9
+ {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
10
+ <|kimi_k25_video_placeholder|>
11
+ {% else -%}
12
+ {{ content['text'] }}
13
+ {%- endif -%}
14
+ {%- endfor -%}
15
+ {%- endif -%}
16
+ {%- endmacro -%}
17
+
18
+ {% macro set_roles(message) -%}
19
+ {%- set role_name = message.get('name') or message['role'] -%}
20
+ {%- if message['role'] == 'user' -%}
21
+ <|im_user|>{{role_name}}<|im_middle|>
22
+ {%- elif message['role'] == 'assistant' -%}
23
+ <|im_assistant|>{{role_name}}<|im_middle|>
24
+ {%- else -%}
25
+ <|im_system|>{{role_name}}<|im_middle|>
26
+ {%- endif -%}
27
+ {%- endmacro -%}
28
+
29
+
30
+ {%- macro render_toolcalls(message) -%}
31
+ <|tool_calls_section_begin|>
32
+ {%- for tool_call in message['tool_calls'] -%}
33
+ {%- set formatted_id = tool_call['id'] -%}
34
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
35
+ {%- endfor -%}
36
+ <|tool_calls_section_end|>
37
+ {%- endmacro -%}
38
+
39
+
40
+ {%- set preserve_thinking = preserve_thinking | default(false) -%}
41
+ {# Find last non-tool-call assistant message. If preserve_thinking, keep -1 so hist is empty and all msgs use suffix (retain reasoning). #}
42
+ {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
43
+ {%- if not preserve_thinking -%}
44
+ {%- for idx in range(messages|length-1, -1, -1) -%}
45
+ {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
46
+ {%- set ns.last_non_tool_call_assistant_msg = idx -%}
47
+ {%- break -%}
48
+ {%- endif -%}
49
+ {%- endfor -%}
50
+ {%- endif -%}
51
+
52
+ {# Split all messages into history and suffix; reasoning_content in the suffix messages should be preserved. #}
53
+ {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
54
+ {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
55
+
56
+ {%- if tools -%}
57
+ {%- if tools_ts_str -%}
58
+ <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
59
+ {%- else -%}
60
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
61
+ {%- endif -%}
62
+ {%- endif -%}
63
+
64
+
65
+ {%- for message in hist_msgs -%}
66
+ {{set_roles(message)}}
67
+ {%- if message['role'] == 'assistant' -%}
68
+ <think></think>{{render_content(message)}}
69
+ {%- if message.get('tool_calls') -%}
70
+ {{render_toolcalls(message)}}
71
+ {%- endif -%}
72
+ {%- elif message['role'] == 'tool' -%}
73
+ {%- set tool_call_id = message.tool_call_id -%}
74
+ ## Return of {{ tool_call_id }}
75
+ {{render_content(message)}}
76
+ {%- elif message['content'] is not none -%}
77
+ {{render_content(message)}}
78
+ {%- endif -%}
79
+ <|im_end|>
80
+ {%- endfor -%}
81
+
82
+ {%- for message in suffix_msgs -%}
83
+ {{set_roles(message)}}
84
+ {%- if message['role'] == 'assistant' -%}
85
+ {%- if thinking is defined and thinking is false and preserve_thinking is false -%}
86
+ <think></think>{{render_content(message)}}
87
+ {%- else -%}
88
{#- Prefer 'reasoning', then 'reasoning_content'; a missing or null value renders as empty text instead of the literal "None". -#}
{%- set rc = message.get('reasoning') or message.get('reasoning_content') or '' -%}
89
+ <think>{{rc}}</think>{{render_content(message)}}
90
+ {%- endif -%}
91
+ {%- if message.get('tool_calls') -%}
92
+ {{render_toolcalls(message)}}
93
+ {%- endif -%}
94
+ {%- elif message['role'] == 'tool' -%}
95
+ {%- set tool_call_id = message.tool_call_id -%}
96
+ ## Return of {{ tool_call_id }}
97
+ {{render_content(message)}}
98
+ {%- elif message['content'] is not none -%}
99
+ {{render_content(message)}}
100
+ {%- endif -%}
101
+ <|im_end|>
102
+ {%- endfor -%}
103
+
104
+
105
+ {%- if add_generation_prompt -%}
106
+ <|im_assistant|>assistant<|im_middle|>
107
+ {%- if thinking is defined and thinking is false -%}
108
+ <think></think>
109
+ {%- else -%}
110
+ <think>
111
+ {%- endif -%}
112
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeepseekV3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "aux_loss_alpha": 0.001,
8
+ "bos_token_id": 163584,
9
+ "dtype": "float16",
10
+ "eos_token_id": 163585,
11
+ "ep_size": 1,
12
+ "first_k_dense_replace": 1,
13
+ "head_dim": 64,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 2048,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 11264,
18
+ "kv_lora_rank": 512,
19
+ "max_position_embeddings": 131072,
20
+ "model_type": "deepseek_v3",
21
+ "moe_intermediate_size": 1408,
22
+ "moe_layer_freq": 1,
23
+ "n_group": 1,
24
+ "n_routed_experts": 64,
25
+ "n_shared_experts": 2,
26
+ "norm_topk_prob": true,
27
+ "num_attention_heads": 16,
28
+ "num_experts_per_tok": 6,
29
+ "num_hidden_layers": 27,
30
+ "num_key_value_heads": 16,
31
+ "num_nextn_predict_layers": 1,
32
+ "num_shared_experts": 2,
33
+ "pad_token_id": 163839,
34
+ "pretraining_tp": 1,
35
+ "q_lora_rank": null,
36
+ "qk_head_dim": 192,
37
+ "qk_nope_head_dim": 128,
38
+ "qk_rope_head_dim": 64,
39
+ "rms_norm_eps": 1e-05,
40
+ "rope_interleave": true,
41
+ "rope_parameters": {
42
+ "rope_theta": 800000.0,
43
+ "rope_type": "default"
44
+ },
45
+ "routed_scaling_factor": 2.446,
46
+ "scoring_func": "sigmoid",
47
+ "seq_aux": true,
48
+ "tie_word_embeddings": false,
49
+ "topk_group": 1,
50
+ "topk_method": "noaux_tc",
51
+ "transformers_version": "5.8.1",
52
+ "use_cache": false,
53
+ "v_head_dim": 128,
54
+ "vocab_size": 163840
55
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 163584,
4
+ "eos_token_id": 163585,
5
+ "pad_token_id": 163839,
6
+ "transformers_version": "5.8.1"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e0b6791300386330613a41b9fe4632ec4c99c7cd1ee51e9d9ec9a3523fa64c
3
+ size 31920888072
tiktoken.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
3
+ size 2795286
tokenization_kimi.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from collections import OrderedDict
3
+ from logging import getLogger
4
+ from pathlib import Path
5
+ from shutil import copyfile
6
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast
7
+
8
+ import tiktoken
9
+ from tiktoken.load import load_tiktoken_bpe
10
+ from tokenizers import AddedToken
11
+ from transformers.convert_slow_tokenizer import bytes_to_unicode
12
+ from transformers.tokenization_utils import PreTrainedTokenizer
13
+
14
+ from .tool_declaration_ts import encode_tools_to_typescript_style
15
+
16
+ logger = getLogger(__name__)
17
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
18
+
19
+
20
class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning of sequence token. It must be one of the names in the reserved special-token range.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token. It must be one of the names in the reserved special-token range.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. When omitted, `unk_id` is `None`.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `None`):
            The token used for padding, for example when batching sequences of different lengths. When omitted,
            `pad_id` is `None`.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
        added_tokens_decoder (`dict`, *optional*):
            Maps token IDs to objects whose `.content` is the token text. IDs inside the reserved range take their
            name from this mapping; every other reserved ID is named `<|reserved_token_{id}|>`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    special_tokens: Dict[str, int]

    # Number of IDs placed directly after the BPE vocabulary for special tokens.
    num_reserved_special_tokens = 256

    # Pre-tokenization regex handed to tiktoken; alternatives are tried in order.
    pat_str = "|".join([
        r"""[\p{Han}]+""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
        r"""\p{N}{1,3}""",
        r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
        r"""\s*[\r\n]+""",
        r"""\s+(?!\S)""",
        r"""\s+""",
    ])

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken, None] = None,
        pad_token: Union[str, AddedToken, None] = None,
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        """
        Load the tiktoken BPE file, build the special-token table and the
        byte-level token/ID lookup tables, then initialise the base tokenizer.

        Raises:
            AssertionError: if `vocab_file` is not an existing file.
            KeyError: if `bos_token`, `eos_token`, or a provided `unk_token` /
                `pad_token` is not a name in the reserved special-token range.
        """
        assert os.path.isfile(vocab_file), vocab_file

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|start_header_id|>",
                "<|end_header_id|>",
                "[EOT]",
                "<|im_system|>",
                "<|im_middle|>",
            ]

        if added_tokens_decoder:
            special_tokens_mapping = {
                i: added_tokens_decoder[i].content
                for i in added_tokens_decoder
            }
        else:
            special_tokens_mapping = {}

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        # Special tokens occupy the IDs immediately after the BPE vocabulary.
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(num_base_tokens, num_base_tokens +
                           self.num_reserved_special_tokens)
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

        # pad/unk default to None in the signature; looking up str(None)
        # would raise KeyError('None'), so only resolve tokens actually given.
        self.pad_id: Optional[int] = (self.special_tokens[str(pad_token)]
                                      if pad_token is not None else None)
        self.unk_id: Optional[int] = (self.special_tokens[str(unk_token)]
                                      if unk_token is not None else None)

        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # ID -> printable token string, one mapped character per raw byte.
        # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
        self.decoder = {}
        for i in range(self.n_words):
            self.decoder[i] = ''.join(
                self.byte_encoder[byte]
                for byte in self.model.decode_single_token_bytes(i))

        # Inverse table; on duplicate strings the highest ID wins.
        self.encoder = {}
        for i in range(self.n_words):
            self.encoder[self.decoder[i]] = i

        self._token_config_cache = OrderedDict()
        self._cache_max_size = 128

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            added_tokens_decoder=added_tokens_decoder,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(self,
               text: str,
               allow_special_tokens: bool = True,
               **kwargs) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.
            allow_special_tokens (bool): When True, special-token text found in
                `text` is encoded as the special token; when False it is
                encoded as ordinary text. Ignored when extra `kwargs` are given.
            **kwargs: If any are present the call is delegated to
                `super().encode(text, **kwargs)`.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, we should call super().encode because there is a lot of code
        # to handle those args. super().encode will finally call _tokenize and _convert_token_to_id.
        # NOTE: our encode method is not compatible with the super().encode method,
        # e.g. split_special_tokens' default is True in our encode method.
        if len(kwargs) > 0:
            logger.warning(f"Calling super().encode with {kwargs}")
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        all_substrs: List[str] = []
        # `chunk` rather than `text`, so the parameter is not shadowed.
        for chunk in self.pre_tokenizer_process(text):
            for start in range(0, len(chunk), TIKTOKEN_MAX_ENCODE_CHARS):
                all_substrs.extend(
                    self._split_whitespaces_or_nonwhitespaces(
                        chunk[start:start + TIKTOKEN_MAX_ENCODE_CHARS],
                        MAX_NO_WHITESPACES_CHARS))

        t: List[int] = []
        for substr in all_substrs:
            if allow_special_tokens:
                # Special-token text is mapped to its special token ID.
                t.extend(self.model.encode(
                    substr,
                    allowed_special="all",
                ))
            else:
                # Special-token text is treated as ordinary text.
                t.extend(self.model.encode(
                    substr,
                    disallowed_special=(),
                ))

        return t

    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (int or List[int]): A token ID or list of token IDs.
            **kwargs: If any are present the call is delegated to
                `super().decode(token_ids, **kwargs)`.

        Returns:
            str: The decoded string.
        """
        # If there are other args, we should call super().decode because there is a lot of code
        # to handle those args. super().decode will finally call convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        if isinstance(token_ids, int):
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
            s: str, max_consecutive_slice_len: int) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                # Run type flipped: start counting a new run.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    def pre_tokenizer_process(self, text: str) -> List[str]:
        """
        Pre-tokenizes the input text into a list of chunks.
        This implementation returns the text unchanged as a single chunk.
        """
        return [text]

    # ----- Below are the abstract methods required by PreTrainedTokenizer -----

    @property
    def vocab_size(self) -> int:
        """Total number of token IDs reported by the tiktoken encoding."""
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        """Return the token-string -> ID table (the internal dict, not a copy)."""
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Encode `text` and map every ID to its token string."""
        return [self.decoder[t] for t in self.encode(text)]

    def _convert_token_to_id(self, token: str) -> Optional[int]:
        """Look up a token string; unknown strings map to `unk_id`."""
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        """Look up an ID; returns None when the ID is not in the table."""
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """No clean-up is applied; the string is returned as is."""
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join token strings, map characters back to bytes, and decode as UTF-8 (invalid bytes are replaced)."""
        text = ''.join(tokens)
        text = bytearray([self.byte_decoder[c]
                          for c in text]).decode('utf-8', 'replace')
        return text

    def save_vocabulary(self,
                        save_directory: str,
                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the tiktoken model file into `save_directory`.

        Returns:
            A one-element tuple with the destination path.

        Raises:
            ValueError: if `save_directory` is not a directory.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(
                f"vocabulary path ({save_directory}) should be a directory")
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") +
            VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when source and destination are the same file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(
                out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file, )

    def apply_chat_template(self,
                            conversation,
                            tools: Optional[List[dict]] = None,
                            tokenize: bool = False,
                            add_generation_prompt: bool = True,
                            thinking: bool = True,
                            preserve_thinking: bool = False,
                            **kwargs):
        """
        Render `conversation` through the chat template.

        Tool declarations are key-sorted and, when possible, converted to a
        TypeScript-style string that is passed to the template as
        `tools_ts_str`. If the conversion fails, a warning is logged and
        `tools_ts_str` is not passed.
        """
        tools = deep_sort_dict(tools)

        # Convert tools to TypeScript style string if tools are provided
        tools_ts_str = None
        if tools:
            try:
                tools_ts_str = encode_tools_to_typescript_style(tools)
            except Exception as e:
                # Best effort: report through the module logger, not stdout.
                logger.warning(
                    f"Failed to convert tools to TypeScript style: {e}")
                tools_ts_str = None

        # Store the TypeScript string in kwargs so it can be accessed by the template
        if tools_ts_str is not None:
            kwargs['tools_ts_str'] = tools_ts_str
        return super().apply_chat_template(
            conversation,
            tools=tools,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            thinking=thinking,
            preserve_thinking=preserve_thinking,
            **kwargs)
346
+
347
+
348
def deep_sort_dict(obj: Any) -> Any:
    """
    Return a copy of `obj` in which every dict, at any nesting depth inside
    dicts and lists, has its items inserted in sorted order. Values that are
    neither dict nor list are returned unchanged.
    """
    if isinstance(obj, list):
        return [deep_sort_dict(element) for element in obj]
    if not isinstance(obj, dict):
        return obj

    ordered = {}
    for key, value in sorted(obj.items()):
        ordered[key] = deep_sort_dict(value)
    return ordered
tokenizer_config.json ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "163584": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "163585": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "163586": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "163587": {
28
+ "content": "<|im_user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "163588": {
36
+ "content": "<|im_assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "163590": {
44
+ "content": "<|start_header_id|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "163591": {
52
+ "content": "<|end_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "163593": {
60
+ "content": "[EOT]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "163594": {
68
+ "content": "<|im_system|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "163595": {
76
+ "content": "<|tool_calls_section_begin|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "163596": {
84
+ "content": "<|tool_calls_section_end|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "163597": {
92
+ "content": "<|tool_call_begin|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "163598": {
100
+ "content": "<|tool_call_argument_begin|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "163599": {
108
+ "content": "<|tool_call_end|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "163601": {
116
+ "content": "<|im_middle|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "163602": {
124
+ "content": "<|media_begin|>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "163603": {
132
+ "content": "<|media_content|>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "163604": {
140
+ "content": "<|media_end|>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "163605": {
148
+ "content": "<|media_pad|>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "163606": {
156
+ "content": "<think>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "163607": {
164
+ "content": "</think>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "163838": {
172
+ "content": "[UNK]",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "163839": {
180
+ "content": "[PAD]",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ }
187
+ },
188
+ "auto_map": {
189
+ "AutoTokenizer": [
190
+ "tokenization_kimi.TikTokenTokenizer",
191
+ null
192
+ ]
193
+ },
194
+ "backend": "custom",
195
+ "bos_token": "[BOS]",
196
+ "clean_up_tokenization_spaces": false,
197
+ "eos_token": "[EOS]",
198
+ "extra_special_tokens": [
199
+ "<|im_end|>",
200
+ "<|im_user|>",
201
+ "<|im_assistant|>",
202
+ "<|start_header_id|>",
203
+ "<|end_header_id|>",
204
+ "[EOT]",
205
+ "<|im_system|>",
206
+ "<|im_middle|>"
207
+ ],
208
+ "is_local": false,
209
+ "local_files_only": false,
210
+ "model_max_length": 1000000000000000019884624838656,
211
+ "pad_token": "[PAD]",
212
+ "tokenizer_class": "TikTokenTokenizer",
213
+ "unk_token": "[UNK]"
214
+ }
tool_declaration_ts.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Encode structured tool declaration to typescript style string.
3
+ """
4
+ import dataclasses
5
+ import json
6
+ import logging
7
+ from collections.abc import Sequence
8
+ from typing import Any
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ _TS_INDENT = " "
13
+ _TS_FIELD_DELIMITER = ",\n"
14
+
15
+
16
+ class _SchemaRegistry:
17
+ """Registry for schema definitions to handle $ref resolution"""
18
+
19
+ def __init__(self):
20
+ self.definitions = {}
21
+ self.has_self_ref = False
22
+
23
+ def register_definitions(self, defs: dict[str, Any]):
24
+ """Register schema definitions from $defs section"""
25
+ if not defs:
26
+ return
27
+ for def_name, def_schema in defs.items():
28
+ self.definitions[def_name] = def_schema
29
+
30
+ def resolve_ref(self, ref: str) -> dict[str, Any]:
31
+ """Resolve a reference to its schema definition"""
32
+ if ref == "#":
33
+ self.has_self_ref = True
34
+ return {"$self_ref": True}
35
+ elif ref.startswith("#/$defs/"):
36
+ def_name = ref.split("/")[-1]
37
+ if def_name not in self.definitions:
38
+ raise ValueError(f"Reference not found: {ref}")
39
+ return self.definitions[def_name]
40
+ else:
41
+ raise ValueError(f"Unsupported reference format: {ref}")
42
+
43
+
44
+ def _format_description(description: str, indent: str = "") -> str:
45
+ return "\n".join([
46
+ f"{indent}// {line}" if line else ""
47
+ for line in description.split("\n")
48
+ ])
49
+
50
+
51
+ class _BaseType:
52
+ description: str
53
+ constraints: dict[str, Any]
54
+
55
+ def __init__(
56
+ self,
57
+ extra_props: dict[str, Any],
58
+ *,
59
+ allowed_constraint_keys: Sequence[str] = (),
60
+ ):
61
+ self.description = extra_props.get("description", "")
62
+ self.constraints = {
63
+ k: v
64
+ for k, v in extra_props.items() if k in allowed_constraint_keys
65
+ }
66
+
67
+ def to_typescript_style(self, indent: str = "") -> str:
68
+ raise NotImplementedError
69
+
70
+ def format_docstring(self, indent: str) -> str:
71
+ lines = []
72
+ if self.description:
73
+ lines.append(_format_description(self.description, indent))
74
+ if self.constraints:
75
+ constraints_str = ", ".join(f"{k}: {v}" for k, v in sorted(
76
+ self.constraints.items(), key=lambda kv: kv[0]))
77
+ lines.append(f"{indent}// {constraints_str}")
78
+
79
+ return "".join(x + "\n" for x in lines)
80
+
81
+
82
class _ParameterTypeScalar(_BaseType):
    """A scalar parameter type identified by its JSON-schema type name."""

    type: str

    def __init__(self, type: str, extra_props: dict[str, Any] | None = None):
        """
        Args:
            type: The schema type name.
            extra_props: Optional schema properties; only the constraint keys
                relevant to `type` are retained.
        """
        self.type = type

        # Which schema keys count as constraints depends on the scalar type.
        if self.type in ("number", "integer"):
            constraint_keys: list[str] = ["maximum", "minimum"]
        elif self.type == "string":
            constraint_keys = ["maxLength", "minLength", "pattern"]
        else:
            constraint_keys = []

        super().__init__(extra_props or {},
                         allowed_constraint_keys=constraint_keys)

    def to_typescript_style(self, indent: str = "") -> str:
        """Return the type name, with `integer` rendered as `number`."""
        return "number" if self.type == "integer" else self.type
102
+
103
+
104
class _ParameterTypeObject(_BaseType):
    """An object parameter type built from a JSON-schema object definition."""

    properties: list["_Parameter"]
    additional_properties: Any | None = None

    def __init__(self,
                 json_schema_object: dict[str, Any],
                 registry: _SchemaRegistry | None = None):
        """
        Args:
            json_schema_object: The object schema; `$defs`,
                `additionalProperties`, `properties` and `required` are read.
            registry: Optional registry that receives the `$defs` entries and
                is passed on when nested types are parsed.
        """
        super().__init__(json_schema_object)

        self.properties = []
        self.additional_properties = None

        schema = json_schema_object
        if not schema:
            return

        if "$defs" in schema and registry:
            registry.register_definitions(schema["$defs"])

        # A dict-valued additionalProperties is itself a schema and is parsed;
        # any other value (True/False/None/...) is stored unchanged.
        extra = schema.get("additionalProperties")
        self.additional_properties = extra
        if isinstance(extra, dict):
            self.additional_properties = _parse_parameter_type(extra, registry)

        if "properties" not in schema:
            return

        declared = schema["properties"]
        required_names = set(schema.get("required", []))

        collected = []
        for name, prop in declared.items():
            collected.append(
                _Parameter(
                    name=name,
                    type=_parse_parameter_type(prop, registry),
                    # Every declared property not listed as required is optional.
                    optional=name not in required_names,
                    default=prop.get("default")
                    if isinstance(prop, dict) else None,
                ))
        self.properties = collected

    def to_typescript_style(self, indent: str = "") -> str:
        """
        Render the object as a braced field list: required fields first, then
        optional ones, each group sorted by name, followed by an index
        signature when `additional_properties` is set. Returns "{}" when
        there is nothing to render.

        Raises:
            ValueError: if `additional_properties` has an unsupported value.
        """
        required_fields = sorted(
            (p for p in self.properties if not p.optional),
            key=lambda p: p.name)
        optional_fields = sorted(
            (p for p in self.properties if p.optional),
            key=lambda p: p.name)

        child_indent = indent + _TS_INDENT
        fields = [
            p.to_typescript_style(indent=child_indent)
            for p in required_fields + optional_fields
        ]

        extra = self.additional_properties
        if extra is not None:
            if extra is True:
                extra_type = "any"
            elif extra is False:
                extra_type = "never"
            elif isinstance(extra, _ParameterType):
                extra_type = extra.to_typescript_style(indent=child_indent)
            else:
                raise ValueError(
                    f"Unknown additionalProperties: {self.additional_properties}"
                )
            fields.append(f"{child_indent}[k: string]: {extra_type}")

        if not fields:
            return "{}"

        body = _TS_FIELD_DELIMITER.join(fields)
        if body:
            # add new line before and after
            body = f"\n{body}\n"
        # always wrap with object
        return f"{{{body}{indent}}}"
183
+
184
+
185
class _ParameterTypeArray(_BaseType):
    """Array type, rendered as ``Array<item>``."""

    # Parsed element type; a scalar "any" when the schema has no usable "items".
    item: "_ParameterType"

    def __init__(self,
                 json_schema_object: dict[str, Any],
                 registry: _SchemaRegistry | None = None):
        """Parse an array schema.

        Args:
            json_schema_object: The schema dict; its ``"items"`` entry, when
                truthy, describes the element type.
            registry: Passed through to ``_parse_parameter_type`` for the
                element schema.
        """
        super().__init__(json_schema_object,
                         allowed_constraint_keys=("minItems", "maxItems"))
        items_schema = json_schema_object.get("items")
        if not items_schema:
            # missing or falsy "items": element type falls back to "any"
            self.item = _ParameterTypeScalar(type="any")
        else:
            self.item = _parse_parameter_type(items_schema, registry)

    def to_typescript_style(self, indent: str = "") -> str:
        """Render as ``Array<...>``.

        When the element type has a docstring, the element is placed on its
        own indented line below that docstring; otherwise everything stays on
        one line.
        """
        nested = indent + _TS_INDENT
        item_doc = self.item.format_docstring(nested)
        if not item_doc:
            return f"Array<{self.item.to_typescript_style(indent=indent)}>"

        rendered_item = self.item.to_typescript_style(indent=nested)
        return "".join(
            ["Array<\n", item_doc, nested, rendered_item, "\n", indent, ">"])
207
+
208
+
209
class _ParameterTypeEnum(_BaseType):
    """Enum type, rendered as a ``|``-separated union of literal values."""

    # support scalar types only
    enum: list[str | int | float | bool | None]

    def __init__(self, json_schema_object: dict[str, Any]):
        """Parse an enum schema and validate its members.

        Args:
            json_schema_object: Schema dict containing an ``"enum"`` list and
                optionally a ``"type"`` (a string, a one-element list, or a
                two-element list in which one entry is ``"null"``).

        Raises:
            ValueError: If ``"type"`` is a list of an unsupported shape, or if
                a non-``None`` member does not match the declared type.
        """
        super().__init__(json_schema_object)
        self.enum = json_schema_object["enum"]

        # Validate enum values against declared type if present
        if "type" in json_schema_object:
            typ = json_schema_object["type"]
            if isinstance(typ, list):
                if len(typ) == 1:
                    typ = typ[0]
                elif len(typ) == 2:
                    if "null" not in typ:
                        raise ValueError(f"Enum type {typ} is not supported")
                    # nullable type: validate against the non-null entry
                    typ = typ[0] if typ[0] != "null" else typ[1]
                else:
                    raise ValueError(f"Enum type {typ} is not supported")
            for val in self.enum:
                if val is None:
                    continue
                # bool is a subclass of int in Python, so True/False would
                # otherwise slip through the "number"/"integer" checks.
                is_bool = isinstance(val, bool)
                if typ == "string" and not isinstance(val, str):
                    raise ValueError(f"Enum value {val} is not a string")
                elif typ == "number" and (is_bool or
                                          not isinstance(val, (int, float))):
                    raise ValueError(f"Enum value {val} is not a number")
                elif typ == "integer" and (is_bool
                                           or not isinstance(val, int)):
                    raise ValueError(f"Enum value {val} is not an integer")
                elif typ == "boolean" and not is_bool:
                    raise ValueError(f"Enum value {val} is not a boolean")

    def to_typescript_style(self, indent: str = "") -> str:
        """Render the members joined by ``" | "``.

        Strings are wrapped in double quotes without escaping; every other
        value goes through ``str()``.
        """
        # NOTE(review): str() yields Python spellings ("None", "True") rather
        # than TypeScript literals; left unchanged so existing output stays
        # byte-identical -- confirm whether that is intended.
        return " | ".join(
            [f'"{e}"' if isinstance(e, str) else str(e) for e in self.enum])
245
+
246
+
247
class _ParameterTypeAnyOf(_BaseType):
    """``anyOf`` schema, rendered as a union of its alternatives."""

    # Parsed alternatives, in the order they appear under "anyOf".
    types: list["_ParameterType"]

    def __init__(
        self,
        json_schema_object: dict[str, Any],
        registry: _SchemaRegistry | None = None,
    ):
        """Parse each entry of ``json_schema_object["anyOf"]``.

        Args:
            json_schema_object: Schema dict with an ``"anyOf"`` list.
            registry: Passed through to ``_parse_parameter_type``.
        """
        super().__init__(json_schema_object)
        alternatives = []
        for sub_schema in json_schema_object["anyOf"]:
            alternatives.append(_parse_parameter_type(sub_schema, registry))
        self.types = alternatives

    def to_typescript_style(self, indent: str = "") -> str:
        """Render every alternative at ``indent`` and join them with ``" | "``."""
        rendered = []
        for alternative in self.types:
            rendered.append(alternative.to_typescript_style(indent=indent))
        return " | ".join(rendered)
264
+
265
+
266
class _ParameterTypeUnion(_BaseType):
    """Schema whose ``"type"`` is a list of type names, rendered as a union."""

    # TypeScript spelling of each listed JSON Schema type name.
    types: list[str]

    def __init__(self, json_schema_object: dict[str, Any]):
        """Translate every name in ``json_schema_object["type"]``.

        Raises:
            KeyError: If a listed type name has no TypeScript counterpart in
                the lookup table below.
        """
        super().__init__(json_schema_object)

        # "integer" collapses into "number"; objects and arrays lose their
        # inner structure here because only the type names are available.
        ts_names = dict(
            string="string",
            number="number",
            integer="number",
            boolean="boolean",
            null="null",
            object="{}",
            array="Array<any>",
        )
        translated = []
        for type_name in json_schema_object["type"]:
            translated.append(ts_names[type_name])
        self.types = translated

    def to_typescript_style(self, indent: str = "") -> str:
        """Join the translated names with ``" | "``; ``indent`` is unused."""
        separator = " | "
        return separator.join(self.types)
285
+
286
+
287
class _ParameterTypeRef(_BaseType):
    """``$ref`` schema, rendered as the bare name of the referenced type."""

    # Name emitted in the TypeScript output.
    ref_name: str
    # True when the resolved schema carries a truthy "$self_ref" entry.
    is_self_ref: bool = False

    def __init__(self, json_schema_object: dict[str, Any],
                 registry: _SchemaRegistry):
        """Resolve the reference through ``registry`` and pick its name.

        A resolved schema flagged with ``"$self_ref"`` is named
        ``"parameters"``; otherwise the last ``/``-separated segment of the
        ``$ref`` string is used.
        """
        super().__init__(json_schema_object)

        pointer = json_schema_object["$ref"]
        target = registry.resolve_ref(pointer)

        if not target.get("$self_ref", False):
            self.ref_name = pointer.rsplit("/", 1)[-1]
            return

        self.ref_name = "parameters"
        self.is_self_ref = True

    def to_typescript_style(self, indent: str = "") -> str:
        """Return the reference name; ``indent`` is unused."""
        return self.ref_name
306
+
307
+
308
# Union of the concrete parameter-type classes. Besides serving as a type
# annotation it is used at runtime in an isinstance() check when rendering
# additionalProperties, so it must remain a real union object.
_ParameterType = (_ParameterTypeScalar
                  | _ParameterTypeObject
                  | _ParameterTypeArray
                  | _ParameterTypeEnum
                  | _ParameterTypeAnyOf
                  | _ParameterTypeUnion
                  | _ParameterTypeRef)
315
+
316
+
317
@dataclasses.dataclass
class _Parameter:
    """
    A named slot: either a function parameter or a field of an object.
    Pairs the parsed type with the name, optionality and default value.
    """

    type: _ParameterType
    name: str = "_"
    optional: bool = True
    default: Any | None = None

    @classmethod
    def parse_extended(cls, attributes: dict[str, Any]) -> "_Parameter":
        """Build a parameter from a schema dict that also carries metadata.

        ``attributes`` is parsed as the type schema itself, and its
        ``"name"``, ``"optional"`` and ``"default"`` entries (when present)
        fill the remaining fields. Unlike the dataclass default, a missing
        ``"optional"`` entry means the parameter is required.

        Raises:
            ValueError: If ``attributes`` is empty.
        """
        if not attributes:
            raise ValueError("attributes is empty")

        param_name = attributes.get("name", "_")
        param_type = _parse_parameter_type(attributes)
        is_optional = attributes.get("optional", False)
        default_value = attributes.get("default")
        return cls(
            type=param_type,
            name=param_name,
            optional=is_optional,
            default=default_value,
        )

    def to_typescript_style(self, indent: str = "") -> str:
        """Render as ``name?: type``, preceded by docstring/default comments.

        Args:
            indent: Prefix applied to the comment lines and the field line.
        """
        header = self.type.format_docstring(indent)

        if self.default is not None:
            # ints, floats and bools are shown via repr(); everything else is
            # JSON-encoded without ASCII escaping.
            if isinstance(self.default, (int, float, bool)):
                shown = repr(self.default)
            else:
                shown = json.dumps(self.default, ensure_ascii=False)
            header += f"{indent}// Default: {shown}\n"

        marker = "?" if self.optional else ""
        rendered_type = self.type.to_typescript_style(indent=indent)
        return header + f"{indent}{self.name}{marker}: {rendered_type}"
354
+
355
+
356
def _parse_parameter_type(
        json_schema_object: dict[str, Any] | bool,
        registry: _SchemaRegistry | None = None) -> _ParameterType:
    """Dispatch a JSON Schema fragment to the matching parameter-type class.

    Checks are made in this order: boolean schema, ``$ref`` (only when a
    registry is supplied), ``anyOf``, ``enum``, ``type``, then the empty
    schema.

    Args:
        json_schema_object: A schema dict, or a boolean schema.
        registry: Optional registry handed to the types that may resolve
            references.

    Returns:
        The parsed type object.

    Raises:
        ValueError: If the dict matches none of the recognised shapes.
    """
    if isinstance(json_schema_object, bool):
        if not json_schema_object:
            logger.warning(
                f"Warning: Boolean value {json_schema_object} is not supported, use null instead."
            )
            return _ParameterTypeScalar(type="null")
        return _ParameterTypeScalar(type="any")

    if "$ref" in json_schema_object and registry:
        return _ParameterTypeRef(json_schema_object, registry)

    if "anyOf" in json_schema_object:
        return _ParameterTypeAnyOf(json_schema_object, registry)

    if "enum" in json_schema_object:
        return _ParameterTypeEnum(json_schema_object)

    if "type" in json_schema_object:
        declared = json_schema_object["type"]
        if isinstance(declared, list):
            return _ParameterTypeUnion(json_schema_object)
        if declared == "object":
            return _ParameterTypeObject(json_schema_object, registry)
        if declared == "array":
            return _ParameterTypeArray(json_schema_object, registry)
        # any other type name is handled by the scalar class
        return _ParameterTypeScalar(declared, json_schema_object)

    if json_schema_object == {}:
        # the empty schema places no constraint on the value
        return _ParameterTypeScalar(type="any")

    raise ValueError(f"Invalid JSON Schema object: {json_schema_object}")
389
+
390
+
391
def _openai_function_to_typescript_style(function: dict[str, Any], ) -> str:
    """Convert OpenAI function definition (dict) to TypeScript style string.

    The output consists of, in order and each only when non-empty:
    ``interface`` declarations (a root ``parameters`` interface when the
    registry reports a self reference, then one per registered definition),
    the formatted function description, and a
    ``type <name> = (_: <params>) => any;`` line.

    Args:
        function: Function definition; ``"name"`` (defaults to
            ``"function"``), ``"description"`` and ``"parameters"`` are read.

    Returns:
        The pieces above joined by newlines.
    """
    registry = _SchemaRegistry()
    parameters = function.get("parameters") or {}
    parsed = _ParameterTypeObject(parameters, registry)

    interfaces = []
    root_interface_name = None
    if registry.has_self_ref:
        # Self-referencing schemas need a named root so "$ref" targets can
        # refer back to it.
        root_interface_name = "parameters"
        params_str = _TS_FIELD_DELIMITER.join([
            p.to_typescript_style(indent=_TS_INDENT) for p in parsed.properties
        ])
        params_str = f"\n{params_str}\n" if params_str else ""
        interface_def = f"interface {root_interface_name} {{{params_str}}}"
        interfaces.append(interface_def)

    # Iterate over a snapshot: parsing a definition receives the registry and
    # could presumably add entries to it while we loop.
    definitions_copy = dict(registry.definitions)
    for def_name, def_schema in definitions_copy.items():
        obj_type = _parse_parameter_type(def_schema, registry)
        params_str = obj_type.to_typescript_style()

        description_part = ""
        # _parse_parameter_type also accepts boolean schemas, which have no
        # .get(); only dict definitions can carry a description.
        obj_description = (def_schema.get("description", "") if isinstance(
            def_schema, dict) else "")
        if obj_description:
            description_part = _format_description(obj_description) + "\n"

        interface_def = f"{description_part}interface {def_name} {params_str}"
        interfaces.append(interface_def)

    interface_str = "\n".join(interfaces)
    function_name = function.get("name", "function")
    if root_interface_name:
        type_def = f"type {function_name} = (_: {root_interface_name}) => any;"
    else:
        params_str = parsed.to_typescript_style()
        type_def = f"type {function_name} = (_: {params_str}) => any;"

    description = function.get("description")
    # filter(bool, ...) drops the empty sections so no blank lines are emitted
    return "\n".join(
        filter(
            bool,
            [
                interface_str,
                ((description and _format_description(description)) or ""),
                type_def,
            ],
        ))
438
+
439
+
440
def encode_tools_to_typescript_style(tools: list[dict[str, Any]], ) -> str:
    """
    Render a list of tool definitions as a TypeScript-style declaration block.

    Only entries in OpenAI format, {"type": "function", "function": {...}},
    are rendered; entries of any other type, or with an empty "function"
    payload, are ignored.

    Args:
        tools: List of tool definitions in dict format

    Returns:
        A "# Tools" section wrapping the functions in a
        ``namespace functions { ... }`` block, or an empty string when there
        is nothing to render
    """
    if not tools:
        return ""

    rendered = []
    for entry in tools:
        if entry.get("type") != "function":
            # Skip unsupported tool types (like "_plugin")
            continue
        spec = entry.get("function", {})
        if not spec:
            continue
        rendered.append(_openai_function_to_typescript_style(spec))

    if not rendered:
        return ""

    body = "\n".join(rendered)
    pieces = ["# Tools\n\n"]
    if body:
        pieces.append("## functions\nnamespace functions {\n")
        pieces.append(body + "\n")
        pieces.append("}\n")

    return "".join(pieces)