jeffreymeetkai committed on
Commit
0d1c0ff
·
verified ·
1 Parent(s): 652c490

Delete tokenization_functionary.py

Browse files
Files changed (1) hide show
  1. tokenization_functionary.py +0 -511
tokenization_functionary.py DELETED
@@ -1,511 +0,0 @@
1
- # Copyright (c) 2024, MeetKai Inc. All rights reserved.
2
-
3
- from copy import deepcopy
4
- import json
5
- from typing import Any, Dict, List, Literal, Optional, Union
6
-
7
- import jsonref
8
- from pydantic import BaseModel, Field, model_validator
9
- from typing_extensions import Self
10
-
11
- from transformers.tokenization_utils_base import BatchEncoding
12
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
13
- from transformers.utils import TensorType, logging
14
-
15
-
16
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# Base system prompt. Each entry below is one line of the prompt; the prompt
# ends with a newline after "Available functions:" so that text concatenated
# to it starts on its own line.
SYSTEM_PROMPT = "".join(
    [
        "You are capable of executing available function(s) if required.\n",
        "Only execute function(s) when absolutely necessary.\n",
        "Ask for the required input to:recipient==all\n",
        "Use JSON for function arguments.\n",
        "Respond in this format:\n",
        ">>>${recipient}\n",
        "${content}\n",
        "Available functions:\n",
    ]
)

# Extra system prompt text describing the code-interpreter (python) tool.
CODE_INTERPRETER_SYSTEM_PROMPT = "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."
27
-
28
class Function(BaseModel):
    """Description of a single callable function exposed to the model."""

    # Identifier of the function.
    name: str
    # Free-text description; defaults to an empty string.
    description: Optional[str] = ""
    # Parameter schema as a plain dict, or None when not provided.
    parameters: Optional[dict] = None
32
-
33
-
34
class Tool(BaseModel):
    """A tool entry: either a described function or the code interpreter.

    Validation enforces that ``function`` is present exactly when ``type`` is
    ``"function"``.
    """

    # Kind of tool; only these two literal values are accepted.
    type: Literal["function", "code_interpreter"]
    # Function description; required for "function", forbidden otherwise.
    function: Optional[Function] = None

    @model_validator(mode="after")
    def check_type_function_matches(self) -> Self:
        """Check that ``function`` is consistent with ``type``.

        Returns:
            Self: the validated instance.

        Raises:
            ValueError: if ``function`` is missing for a ``"function"`` tool or
                supplied for a ``"code_interpreter"`` tool.
        """
        # Explicit raises instead of `assert`: assert statements are removed
        # when Python runs with -O, which would silently disable this check.
        if self.type == "function":
            if self.function is None:
                raise ValueError('"function" must contain function description when `"type": "function"`')
        elif self.function is not None:
            raise ValueError('"function" must not be provided when `"type": "code_interpreter"`')
        return self
45
-
46
-
47
def convert_data_type(param_type: str) -> str:
    """Map a schema type name to the name used in the TypeScript-style output.

    Args:
        param_type (str): type name taken from the schema.

    Returns:
        str: ``"number"`` for ``"integer"`` and ``"float"``; any other value is
        returned unchanged.
    """
    return "number" if param_type in ("integer", "float") else param_type
57
-
58
-
59
def get_param_type(param: Dict) -> str:
    """Return the type string of a parameter schema.

    Args:
        param (Dict): one entry of a schema's ``properties``.

    Returns:
        str: the value of ``"type"`` (list values are joined with ``" | "``),
        or the union of the ``"type"`` values found in ``"oneOf"``, or
        ``"any"`` when neither key is present. The final string is passed
        through ``convert_data_type``.
    """
    if "type" in param:
        raw_param_type = param["type"]
        if isinstance(raw_param_type, list):
            param_type = " | ".join(raw_param_type)
        else:
            param_type = raw_param_type
    elif "oneOf" in param:
        # Many schemas describe alternatives with "oneOf" instead of "type".
        one_of_types = [
            convert_data_type(item["type"])
            for item in param["oneOf"]
            if "type" in item
        ]
        # De-duplicate in first-seen order. A set would make the order of the
        # union depend on string hash randomization and therefore vary from
        # one process to the next.
        param_type = " | ".join(dict.fromkeys(one_of_types))
    else:
        param_type = "any"
    return convert_data_type(param_type)
83
-
84
-
85
def get_format_param(param: Dict) -> Optional[str]:
    """Look up the ``"format"`` of a parameter schema.

    Args:
        param (Dict): parameter schema.

    Returns:
        Optional[str]: the schema's own ``"format"`` when present; otherwise
        the ``"format"`` values of its ``"oneOf"`` entries joined with
        ``" or "``; ``None`` when no format is found.
    """
    if "format" in param:
        return param["format"]
    if "oneOf" not in param:
        return None
    # The format may only be declared on the individual alternatives.
    alternatives = [option["format"] for option in param["oneOf"] if "format" in option]
    return " or ".join(alternatives) if alternatives else None
102
-
103
-
104
def get_param_info(param: Dict) -> Optional[str]:
    """Build the one-line ``// ...`` comment describing a parameter.

    The comment is assembled, in order, from the description (a trailing
    period is added if missing), the default value (quoted when the parameter
    type is ``"string"``), the format, and the ``maximum`` / ``minimum`` /
    ``maxLength`` / ``minLength`` constraints.

    Args:
        param (Dict): parameter schema.

    Returns:
        Optional[str]: the comment with newlines replaced by spaces, or
        ``None`` when the schema provides none of the pieces above.
    """
    notes = []

    if "description" in param:
        description = param["description"]
        notes.append(description if description.endswith(".") else description + ".")

    if "default" in param:
        default_repr = param["default"]
        if param.get("type", "any") == "string":
            default_repr = f'"{default_repr}"'  # string defaults are shown quoted
        notes.append(f"Default={default_repr}.")

    format_text = get_format_param(param)
    if format_text is not None:
        notes.append("Format=" + format_text)

    constraint_labels = (
        ("maximum", "Maximum"),
        ("minimum", "Minimum"),
        ("maxLength", "Maximum length"),
        ("minLength", "Minimum length"),
    )
    notes.extend(
        f"{label}=" + str(param[key]) for key, label in constraint_labels if key in param
    )

    if not notes:
        return None
    # The comment must stay on a single line.
    return ("// " + " ".join(notes)).replace("\n", " ")
143
-
144
-
145
def append_new_param_info(
    info_list: List[str],
    param_declaration: str,
    comment_info: Optional[str],
    examples_info: List,
    depth: int,
):
    """Append a parameter declaration, preceded by its comment and examples.

    Lines are appended to ``info_list`` in place, in this order: the comment
    (if any), each example line (if any), then the declaration. Example lines
    are emitted whether or not a comment exists, so they are never silently
    dropped.

    Args:
        info_list (List[str]): list of output lines, extended in place.
        param_declaration (str): declaration text, e.g. ``name: type,``.
        comment_info (Optional[str]): comment line for the parameter, or None.
        examples_info (List): example comment lines for the parameter.
        depth (int): nesting level; each line is prefixed with one indent
            unit per level.
    """
    # NOTE(review): the indent unit is a single space in the text under
    # review; whitespace may have been collapsed by the diff rendering —
    # confirm against the original file.
    offset = " " * depth
    if comment_info is not None:
        info_list.append(f"{offset}{comment_info}")
    info_list.extend(f"{offset}{example}" for example in examples_info)
    info_list.append(f"{offset}{param_declaration}")
174
-
175
-
176
def get_examples_info(param_name: str, examples: List) -> List:
    """Render the examples of a parameter as comment lines.

    Args:
        param_name (str): name of the parameter the examples belong to.
        examples (List): example values; dicts and lists are serialized as
            JSON, everything else with ``str``.

    Returns:
        List: a ``// Example <name>:`` header followed by one ``// <value>``
        line per example, with newlines inside a value escaped as ``\\n``.
    """

    def _as_comment(example) -> str:
        if isinstance(example, (dict, list)):
            text = json.dumps(example, ensure_ascii=False)
        else:
            text = str(example)
        # Keep every example on a single line.
        return "// " + text.replace("\n", "\\n")

    return [f"// Example {param_name}:", *map(_as_comment, examples)]
193
-
194
-
195
def get_enum_option_str(enum_options: List) -> str:
    """Join enum options into a ``|``-separated union string.

    Args:
        enum_options (List): the allowed values.

    Returns:
        str: the options separated by ``" | "``; values whose exact type is
        ``str`` are wrapped in double quotes, all others are rendered with
        ``str``.
    """
    rendered = []
    for option in enum_options:
        # Exact-type check on purpose: only plain str values get quoted.
        if type(option) is str:
            rendered.append(f'"{option}"')
        else:
            rendered.append(str(option))
    return " | ".join(rendered)
204
-
205
-
206
def get_array_typescript(
    param_name: Optional[str], param_dic: dict, depth: int = 0
) -> str:
    """Recursively render an array schema as a TypeScript-style declaration.

    Args:
        param_name (Optional[str]): name to prefix the declaration with; when
            None only the bare type expression is returned.
        param_dic (dict): the array schema; its ``"items"`` entry describes
            the element type.
        depth (int, optional): nesting level used for indentation. Defaults
            to 0.

    Returns:
        str: the rendered declaration. Only the named, non-enum primitive
        case ends with a trailing comma.
    """
    indent = " " * depth
    is_named = param_name is not None
    element_schema = param_dic.get("items", {})

    # No element description at all: untyped array.
    if len(element_schema) == 0:
        return f"{indent}{param_name}: []" if is_named else "[]"

    element_type = get_param_type(element_schema)

    if element_type == "object":
        nested_lines = get_parameter_typescript(
            element_schema.get("properties", {}),
            element_schema.get("required", []),
            depth + 1,
        )
        opening = f"{indent}{param_name}" + ": {" if is_named else f"{indent}" + "{"
        return "\n".join([opening, *nested_lines, f"{indent}" + "}[]"])

    if element_type == "array":
        # Array of arrays: render the inner array anonymously, then wrap it.
        inner = get_array_typescript(None, element_schema, depth + 1)
        if not is_named:
            return f"{inner}[]"
        return f"{indent}{param_name}: {inner.strip()}[]"

    if "enum" in element_schema:
        options = get_enum_option_str(element_schema["enum"])
        if not is_named:
            return f"({options})[]"
        return f"{indent}{param_name}: ({options})[]"

    if not is_named:
        return f"{element_type}[]"
    return f"{indent}{param_name}: {element_type}[],"
261
-
262
-
263
def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
    """Recursively render schema properties as TypeScript-style lines.

    Each parameter contributes an optional comment line, optional example
    lines and its declaration. These lines end up in the prompt.

    Args:
        properties: mapping of parameter name to parameter schema.
        required_params: list of required parameter names; when it is a list,
            names not in it are marked optional with ``?``.
        depth (int, optional): nesting level used for indentation. Defaults
            to 0.

    Returns:
        List[str]: the rendered lines for all parameters.
    """
    indent = " " * depth
    required_is_list = isinstance(required_params, list)
    rendered: List[str] = []

    for name, schema in properties.items():
        # "properties" sometimes carries a stray "required" list of strings
        # that does not belong there; anything that is not a dict is skipped.
        if not isinstance(schema, dict):
            continue

        comment = get_param_info(schema)
        examples = get_examples_info(name, schema["examples"]) if "examples" in schema else []

        declaration = f"{name}"
        if required_is_list and name not in required_params:
            declaration += "?"

        kind = get_param_type(schema)

        if kind == "object":
            nested_lines = get_parameter_typescript(
                schema.get("properties", {}), schema.get("required", []), depth + 1
            )
            if comment is not None:
                rendered.append(f"{indent}{comment}")
            rendered.extend(f"{indent}{line}" for line in examples)
            rendered.append(f"{indent}{declaration}" + ": {")
            rendered.extend(nested_lines)
            rendered.append(f"{indent}" + "},")
            continue

        if kind == "array":
            if "type" not in schema.get("items", {}):
                # Element type unknown: emit an untyped array.
                append_new_param_info(rendered, declaration + ": [],", comment, examples, depth)
                continue
            array_line = get_array_typescript(declaration, schema, depth)
            if not array_line.endswith(","):
                array_line += ","
            if comment is not None:
                rendered.append(f"{indent}{comment}")
            rendered.extend(f"{indent}{line}" for line in examples)
            rendered.append(array_line)
            continue

        # Primitive parameter, possibly restricted to an enum and/or nullable.
        if "enum" in schema:
            kind = get_enum_option_str(schema["enum"])
        if "nullable" in schema and schema["nullable"] is True:
            kind += " | null"
        append_new_param_info(rendered, declaration + f": {kind},", comment, examples, depth)

    return rendered
342
-
343
def generate_schema_from_functions(
    functions: List[Function], namespace="functions"
) -> str:
    """Render function definitions as a TypeScript-style namespace.

    Args:
        functions (List[Function]): functions to render; entries may be
            ``Function`` objects or plain dicts. Entries without a name are
            skipped.
        namespace (str, optional): name of the emitted namespace. Defaults to
            ``"functions"``.

    Returns:
        str: the schema text, with one ``type <name> = ...`` entry per
        function, each preceded by its description as a comment.
    """
    chunks = [
        "// Supported function definitions that should be called when necessary.\n",
        f"namespace {namespace} {{\n\n",
    ]

    for entry in functions:
        # Work on a plain dict regardless of the input representation.
        spec = entry if isinstance(entry, dict) else entry.model_dump()
        function_name = spec.get("name", None)
        if function_name is None:
            continue

        description = spec.get("description", "")
        chunks.append(f"// {description}\n")
        chunks.append(f"type {function_name}")

        parameters = spec.get("parameters", None)
        if parameters is None or parameters.get("properties") is None:
            # The function takes no parameters.
            chunks.append(" = () => any;\n\n")
            continue

        # Resolve JSON references before walking the properties.
        resolved = deepcopy(jsonref.JsonRef.replace_refs(parameters))
        required_params = resolved.get("required", [])
        param_lines = get_parameter_typescript(resolved.get("properties"), required_params, 0)
        chunks.append(" = (_: {\n")
        chunks.append("\n".join(param_lines))
        chunks.append("\n}) => any;\n\n")

    chunks.append(f"}} // namespace {namespace}")
    return "".join(chunks)
384
-
385
class FunctionaryTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer whose chat-template rendering prepends tool system prompts."""

    def apply_chat_template(
        self,
        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], str],
        tools: Optional[List[Dict[str, Any]]] = None,
        chat_template: Optional[str] = None,
        add_generation_prompt: bool = False,
        tokenize: bool = True,
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_dict: bool = False,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
        """Render one or more conversations with the chat template.

        A system message containing ``SYSTEM_PROMPT`` plus the schema of the
        given function tools is placed in front of every conversation; when a
        ``code_interpreter`` tool is present, a second system message with
        ``CODE_INTERPRETER_SYSTEM_PROMPT`` follows it. The caller's
        ``conversation`` object is never modified.

        Args:
            conversation: a list of messages, a batch (list of message lists
                or of objects with a ``messages`` attribute), or a single
                object with a ``messages`` attribute.
            tools: tool descriptions validated with ``Tool``; ``None`` is
                treated as an empty list.
            chat_template: template text, or the name of an entry when the
                tokenizer holds a dict of templates.
            add_generation_prompt: forwarded to the template.
            tokenize: when True the rendered text is tokenized.
            padding, truncation, max_length, return_tensors: forwarded to the
                tokenizer call when ``tokenize`` is True.
            return_dict: when True (and tokenizing) return the full tokenizer
                output instead of only ``input_ids``.
            tokenizer_kwargs: extra keyword arguments for the tokenizer call.
            **kwargs: extra variables for the template; they override special
                tokens of the same name.

        Returns:
            The rendered string(s) when ``tokenize`` is False, otherwise the
            tokenizer output or its ``input_ids``.

        Raises:
            ValueError: if ``return_dict`` is set without ``tokenize``, or if
                the tokenizer has several templates, none named "default",
                and no template was selected.
        """
        if return_dict and not tokenize:
            raise ValueError(
                "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
                "of tokenizer outputs to return."
            )

        if tokenizer_kwargs is None:
            tokenizer_kwargs = {}

        using_default_template = False

        # First, handle the cases when the model has a dict of multiple templates
        if isinstance(self.chat_template, dict) or (
            self.chat_template is None and isinstance(self.default_chat_template, dict)
        ):
            if self.chat_template is not None:
                template_dict = self.chat_template
                using_default_dict = False
            else:
                template_dict = self.default_chat_template
                using_default_dict = True
            if chat_template is not None and chat_template in template_dict:
                # The user can pass the name of a template to the chat template argument instead of an entire template
                chat_template = template_dict[chat_template]
                if using_default_dict:
                    using_default_template = True
            elif chat_template is None and "default" in template_dict:
                chat_template = template_dict["default"]
                if using_default_dict:
                    using_default_template = True
            elif chat_template is None:
                raise ValueError(
                    "This model has multiple chat templates with no default specified! Please either pass a chat "
                    "template or the name of the template you wish to use to the `chat_template` argument. Available "
                    f"template names are {sorted(template_dict.keys())}."
                )
        elif chat_template is None:
            # These are the cases when the model has a single template
            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
            if self.chat_template is not None:
                chat_template = self.chat_template
            else:
                chat_template = self.default_chat_template
                using_default_template = True

        if using_default_template:
            logger.warning_once(
                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
                "very error-prone, because models are often trained with templates different from the class default! "
                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
                "point any code depending on them will stop working. We recommend setting a valid chat template before "
                "then to ensure that this model continues working without issues."
            )

        # Split the tools into renderable functions and the code-interpreter flag.
        # `tools or []` keeps a None argument from crashing the loop.
        functions_pydantic_to_render = []
        has_code_interpreter = False
        for tool in tools or []:
            tool_pydantic = Tool.model_validate(tool)
            if tool_pydantic.type == "function":
                functions_pydantic_to_render.append(tool_pydantic.function)
            else:
                has_code_interpreter = True

        # System messages that go in front of every conversation.
        system_messages = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT + generate_schema_from_functions(functions_pydantic_to_render),
            }
        ]
        if has_code_interpreter:
            system_messages.append({"role": "system", "content": CODE_INTERPRETER_SYSTEM_PROMPT})

        # Compilation function uses a cache to avoid recompiling the same template
        compiled_template = self._compile_jinja_template(chat_template)

        # Batch detection runs on the untouched input, so prepending the
        # system messages cannot turn a batch into a single conversation.
        if (
            isinstance(conversation, (list, tuple))
            and len(conversation) > 0
            and (isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages"))
        ):
            conversations = conversation
            is_batched = True
        else:
            conversations = [conversation]
            is_batched = False

        rendered = []
        template_kwargs = {**self.special_tokens_map, **kwargs}  # kwargs overwrite special tokens if both are present
        for chat in conversations:
            if hasattr(chat, "messages"):
                # Indicates it's a Conversation object
                chat = chat.messages
            # Build a fresh message list per conversation instead of inserting
            # into the caller's list; repeated calls therefore never stack
            # system prompts.
            messages = [dict(message) for message in system_messages] + list(chat)
            rendered_chat = compiled_template.render(
                messages=messages, add_generation_prompt=add_generation_prompt, **template_kwargs
            )
            rendered.append(rendered_chat)

        if not is_batched:
            rendered = rendered[0]

        if not tokenize:
            return rendered

        out = self(
            rendered,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            add_special_tokens=False,
            return_tensors=return_tensors,
            **tokenizer_kwargs,
        )
        return out if return_dict else out["input_ids"]