jeffreymeetkai commited on
Commit
e884c36
·
verified ·
1 Parent(s): ae0a4b1

Delete tokenization_functionary.py

Browse files
Files changed (1) hide show
  1. tokenization_functionary.py +0 -208
tokenization_functionary.py DELETED
@@ -1,208 +0,0 @@
1
- # Copyright (c) 2024, MeetKai Inc. All rights reserved.
2
-
3
- from copy import deepcopy
4
- import datetime
5
- import json
6
- from typing import Any, Dict, List, Literal, Optional, Union
7
-
8
- import jsonref
9
- from pydantic import BaseModel, Field, model_validator
10
- from typing_extensions import Self
11
-
12
- from transformers.tokenization_utils_base import BatchEncoding
13
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
14
- from transformers.utils import TensorType, logging
15
-
16
-
17
- logger = logging.get_logger(__name__)
18
-
19
def get_instruction_string(custom_tool_definition) -> str:
    """Return the one-line usage instruction for a custom tool definition.

    Expects a mapping with "name" and "description" keys.
    """
    return "Use the function '{name}' to '{description}'".format(
        name=custom_tool_definition["name"],
        description=custom_tool_definition["description"],
    )
25
-
26
-
27
def get_parameters_string(custom_tool_definition) -> str:
    """Serialize the complete tool definition to a JSON string."""
    serialized = json.dumps(custom_tool_definition)
    return serialized
29
-
30
-
31
def get_system_prompt_for_custom_tools(custom_tools: List) -> str:
    """Render the system-prompt section that advertises the custom tools.

    Each tool contributes one instruction line plus its JSON definition,
    separated by a blank line; the combined text is embedded in the fixed
    function-calling instructions below.
    """
    custom_tool_params = "".join(
        get_instruction_string(tool) + "\n" + get_parameters_string(tool) + "\n\n"
        for tool in custom_tools
    )

    # The surrounding prompt text (including its exact wording) is part of the
    # model's trained format and is reproduced verbatim.
    content = f"""
You have access to the following functions:

{custom_tool_params}
Think very carefully before calling functions.
If a you choose to call a function ONLY reply in the following format:
<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{{"example_name": "example_value"}}</function>

Reminder:
- If looking for real time information use relevant functions before falling back to brave_search
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line

"""
    return content
62
-
63
-
64
def get_system_message_for_tools(tools: List[Dict], use_code_interpreter) -> Dict[str, str]:
    """Build the system chat message advertising the available tools.

    Args:
        tools: Custom tool definitions to describe in the prompt; may be empty.
        use_code_interpreter: When truthy, prepend the ipython environment line.

    Returns:
        A single chat-message dict with role "system". (The original return
        annotation said ``List[Dict]``, but the function has always returned a
        single dict — the annotation is corrected here.)
    """
    content = ""
    if use_code_interpreter:
        content += "Environment: ipython\n"

    # The knowledge-cutoff text is a fixed string. The original computed the
    # current date here (`datetime.datetime.now()` + strftime) but never used
    # it, so that dead code was removed.
    content += "\nCutting Knowledge Date: December 2023\n\n"

    if tools:
        content += get_system_prompt_for_custom_tools(tools)

    return {"role": "system", "content": content}
80
-
81
-
82
class FunctionaryTokenizer(PreTrainedTokenizerFast):
    """Tokenizer that injects a Functionary tool-definition system message into
    the conversation before applying the chat template."""

    def apply_chat_template(
        self,
        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], str],
        tools: Optional[List[Dict[str, Any]]],
        chat_template: Optional[str] = None,
        add_generation_prompt: bool = False,
        tokenize: bool = True,
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_dict: bool = False,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
        """Render a conversation through the chat template, first prepending a
        system message built from ``tools``, then optionally tokenize.

        Args:
            conversation: A single list of message dicts, a batch of such
                lists, or an object exposing ``.messages``.
            tools: OpenAI-style tool specs. Entries with a non-None
                ``"function"`` key contribute that value; entries with
                ``"type" == "code_interpreter"`` toggle the ipython
                environment line; anything else is passed through as-is.
            chat_template: A template string, or (when the tokenizer holds a
                dict of templates) a template name.
            tokenize: When False, return the rendered string(s) instead of ids.
            return_dict: Return the full BatchEncoding (requires tokenize=True).

        Raises:
            ValueError: if ``return_dict=True`` with ``tokenize=False``, or if
                multiple templates exist with no default and none was selected.
        """

        if return_dict and not tokenize:
            raise ValueError(
                "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
                "of tokenizer outputs to return."
            )

        if tokenizer_kwargs is None:
            tokenizer_kwargs = {}

        using_default_template = False

        # First, handle the cases when the model has a dict of multiple templates
        # NOTE(review): `default_chat_template` was removed from transformers in
        # v4.43+ — this code assumes an older transformers version; verify.
        if isinstance(self.chat_template, dict) or (
            self.chat_template is None and isinstance(self.default_chat_template, dict)
        ):
            if self.chat_template is not None:
                template_dict = self.chat_template
                using_default_dict = False
            else:
                template_dict = self.default_chat_template
                using_default_dict = True
            if chat_template is not None and chat_template in template_dict:
                # The user can pass the name of a template to the chat template argument instead of an entire template
                chat_template = template_dict[chat_template]
                if using_default_dict:
                    using_default_template = True
            elif chat_template is None and "default" in template_dict:
                chat_template = template_dict["default"]
                if using_default_dict:
                    using_default_template = True
            elif chat_template is None:
                raise ValueError(
                    "This model has multiple chat templates with no default specified! Please either pass a chat "
                    "template or the name of the template you wish to use to the `chat_template` argument. Available "
                    f"template names are {sorted(template_dict.keys())}."
                )
        elif chat_template is None:
            # These are the cases when the model has a single template
            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
            if self.chat_template is not None:
                chat_template = self.chat_template
            else:
                chat_template = self.default_chat_template
                using_default_template = True

        if using_default_template:
            logger.warning_once(
                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
                "very error-prone, because models are often trained with templates different from the class default! "
                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
                "point any code depending on them will stop working. We recommend setting a valid chat template before "
                "then to ensure that this model continues working without issues."
            )

        # Prepare tools/functions into schema
        functions_pydantic_to_render = []
        has_code_interpreter = False
        if tools is not None:
            for item in tools:
                if "function" in item and item["function"] is not None:
                    functions_pydantic_to_render.append(item["function"])
                elif "type" in item and item["type"] == "code_interpreter":
                    has_code_interpreter = True
                else:
                    functions_pydantic_to_render.append(item)
            tools_system_message = get_system_message_for_tools(functions_pydantic_to_render, has_code_interpreter)
            # NOTE(review): this mutates the caller's `conversation` list in
            # place, so calling apply_chat_template twice with the same list
            # inserts two system messages; it also assumes an unbatched
            # conversation at this point — confirm intended.
            conversation.insert(0, tools_system_message)

        # Compilation function uses a cache to avoid recompiling the same template
        compiled_template = self._compile_jinja_template(chat_template)

        # A list-of-lists (or list of Conversation-like objects) is a batch.
        if isinstance(conversation, (list, tuple)) and (
            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages")
        ):
            conversations = conversation
            is_batched = True
        else:
            conversations = [conversation]
            is_batched = False

        rendered = []
        template_kwargs = {**self.special_tokens_map, **kwargs}  # kwargs overwrite special tokens if both are present
        for chat in conversations:
            if hasattr(chat, "messages"):
                # Indicates it's a Conversation object
                chat = chat.messages
            rendered_chat = compiled_template.render(
                messages=chat, add_generation_prompt=add_generation_prompt, **template_kwargs
            )
            rendered.append(rendered_chat)

        if not is_batched:
            rendered = rendered[0]

        if tokenize:
            # Special tokens are assumed to come from the template itself, hence
            # add_special_tokens=False here.
            out = self(
                rendered,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                add_special_tokens=False,
                return_tensors=return_tensors,
                **tokenizer_kwargs,
            )
            if return_dict:
                return out
            else:
                return out["input_ids"]
        else:
            return rendered