Commit
·
3db8af6
1
Parent(s):
b63d65d
fix tokenization
Browse files
- tokenization_functionary.py +32 -28
- tokenizer_config.json +1 -1
tokenization_functionary.py
CHANGED
|
@@ -38,10 +38,8 @@ class Tool(BaseModel):
|
|
| 38 |
|
| 39 |
def convert_data_type(param_type: str) -> str:
|
| 40 |
"""convert data_type to typescript data type
|
| 41 |
-
|
| 42 |
Args:
|
| 43 |
param_type (str): param_type
|
| 44 |
-
|
| 45 |
Returns:
|
| 46 |
str: param type in typescript
|
| 47 |
"""
|
|
@@ -52,10 +50,8 @@ def convert_data_type(param_type: str) -> str:
|
|
| 52 |
|
| 53 |
def get_param_type(param: Dict) -> str:
|
| 54 |
"""get param_type of parameter
|
| 55 |
-
|
| 56 |
Args:
|
| 57 |
param (Dict): param dict in properties
|
| 58 |
-
|
| 59 |
Returns:
|
| 60 |
str: _description_
|
| 61 |
"""
|
|
@@ -80,10 +76,8 @@ def get_param_type(param: Dict) -> str:
|
|
| 80 |
|
| 81 |
def get_format_param(param: Dict) -> Optional[str]:
|
| 82 |
"""Get "format" from param. There are cases where format is not directly in param but in oneOf
|
| 83 |
-
|
| 84 |
Args:
|
| 85 |
param (Dict): _description_
|
| 86 |
-
|
| 87 |
Returns:
|
| 88 |
Optional[str]: _description_
|
| 89 |
"""
|
|
@@ -101,10 +95,8 @@ def get_format_param(param: Dict) -> Optional[str]:
|
|
| 101 |
|
| 102 |
def get_param_info(param: Dict) -> Optional[str]:
|
| 103 |
"""get additional information about parameter such as: format, default value, min, max, ...
|
| 104 |
-
|
| 105 |
Args:
|
| 106 |
param (Dict): _description_
|
| 107 |
-
|
| 108 |
Returns:
|
| 109 |
Optional[str]: _description_
|
| 110 |
"""
|
|
@@ -150,7 +142,6 @@ def append_new_param_info(
|
|
| 150 |
depth: int,
|
| 151 |
):
|
| 152 |
"""Append a new parameter with comment to the info_list
|
| 153 |
-
|
| 154 |
Args:
|
| 155 |
info_lines (List[str]): current info_list
|
| 156 |
param_declaration (str): param: type
|
|
@@ -176,11 +167,9 @@ def append_new_param_info(
|
|
| 176 |
|
| 177 |
def get_examples_info(param_name: str, examples: List) -> List:
|
| 178 |
"""get information about examples provided
|
| 179 |
-
|
| 180 |
Args:
|
| 181 |
param_name (str): _description_
|
| 182 |
examples (List): _description_
|
| 183 |
-
|
| 184 |
Returns:
|
| 185 |
List: _description_
|
| 186 |
"""
|
|
@@ -197,10 +186,8 @@ def get_examples_info(param_name: str, examples: List) -> List:
|
|
| 197 |
|
| 198 |
def get_enum_option_str(enum_options: List) -> str:
|
| 199 |
"""get enum option separated by: "|"
|
| 200 |
-
|
| 201 |
Args:
|
| 202 |
enum_options (List): list of options
|
| 203 |
-
|
| 204 |
Returns:
|
| 205 |
str: concatenation of options separated by "|"
|
| 206 |
"""
|
|
@@ -212,12 +199,10 @@ def get_array_typescript(
|
|
| 212 |
param_name: Optional[str], param_dic: dict, depth: int = 0
|
| 213 |
) -> str:
|
| 214 |
"""recursive implementation for generating type script of array
|
| 215 |
-
|
| 216 |
Args:
|
| 217 |
param_name (Optional[str]): name of param, optional
|
| 218 |
param_dic (dict): param_dic
|
| 219 |
depth (int, optional): nested level. Defaults to 0.
|
| 220 |
-
|
| 221 |
Returns:
|
| 222 |
str: TypeScript type of the array
|
| 223 |
"""
|
|
@@ -270,12 +255,10 @@ def get_array_typescript(
|
|
| 270 |
def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
|
| 271 |
"""Recursion, returning the information about parameters including data type, description and other information
|
| 272 |
This information will be put into the prompt
|
| 273 |
-
|
| 274 |
Args:
|
| 275 |
properties (_type_): properties in parameters
|
| 276 |
required_params (_type_): List of required parameters
|
| 277 |
depth (int, optional): the depth of params (nested level). Defaults to 0.
|
| 278 |
-
|
| 279 |
Returns:
|
| 280 |
List[str]: list of lines containing information about all parameters
|
| 281 |
"""
|
|
@@ -461,20 +444,41 @@ class FunctionaryTokenizer(PreTrainedTokenizerFast):
|
|
| 461 |
"point any code depending on them will stop working. We recommend setting a valid chat template before "
|
| 462 |
"then to ensure that this model continues working without issues."
|
| 463 |
)
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
# Prepare tools/functions into schema
|
| 466 |
functions_pydantic_to_render = []
|
| 467 |
has_code_interpreter = False
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
|
| 479 |
# Compilation function uses a cache to avoid recompiling the same template
|
| 480 |
compiled_template = self._compile_jinja_template(chat_template)
|
|
|
|
| 38 |
|
| 39 |
def convert_data_type(param_type: str) -> str:
|
| 40 |
"""convert data_type to typescript data type
|
|
|
|
| 41 |
Args:
|
| 42 |
param_type (str): param_type
|
|
|
|
| 43 |
Returns:
|
| 44 |
str: param type in typescript
|
| 45 |
"""
|
|
|
|
| 50 |
|
| 51 |
def get_param_type(param: Dict) -> str:
|
| 52 |
"""get param_type of parameter
|
|
|
|
| 53 |
Args:
|
| 54 |
param (Dict): param dict in properties
|
|
|
|
| 55 |
Returns:
|
| 56 |
str: _description_
|
| 57 |
"""
|
|
|
|
| 76 |
|
| 77 |
def get_format_param(param: Dict) -> Optional[str]:
|
| 78 |
"""Get "format" from param. There are cases where format is not directly in param but in oneOf
|
|
|
|
| 79 |
Args:
|
| 80 |
param (Dict): _description_
|
|
|
|
| 81 |
Returns:
|
| 82 |
Optional[str]: _description_
|
| 83 |
"""
|
|
|
|
| 95 |
|
| 96 |
def get_param_info(param: Dict) -> Optional[str]:
|
| 97 |
"""get additional information about parameter such as: format, default value, min, max, ...
|
|
|
|
| 98 |
Args:
|
| 99 |
param (Dict): _description_
|
|
|
|
| 100 |
Returns:
|
| 101 |
Optional[str]: _description_
|
| 102 |
"""
|
|
|
|
| 142 |
depth: int,
|
| 143 |
):
|
| 144 |
"""Append a new parameter with comment to the info_list
|
|
|
|
| 145 |
Args:
|
| 146 |
info_lines (List[str]): current info_list
|
| 147 |
param_declaration (str): param: type
|
|
|
|
| 167 |
|
| 168 |
def get_examples_info(param_name: str, examples: List) -> List:
|
| 169 |
"""get information about examples provided
|
|
|
|
| 170 |
Args:
|
| 171 |
param_name (str): _description_
|
| 172 |
examples (List): _description_
|
|
|
|
| 173 |
Returns:
|
| 174 |
List: _description_
|
| 175 |
"""
|
|
|
|
| 186 |
|
| 187 |
def get_enum_option_str(enum_options: List) -> str:
|
| 188 |
"""get enum option separated by: "|"
|
|
|
|
| 189 |
Args:
|
| 190 |
enum_options (List): list of options
|
|
|
|
| 191 |
Returns:
|
| 192 |
str: concatenation of options separated by "|"
|
| 193 |
"""
|
|
|
|
| 199 |
param_name: Optional[str], param_dic: dict, depth: int = 0
|
| 200 |
) -> str:
|
| 201 |
"""recursive implementation for generating type script of array
|
|
|
|
| 202 |
Args:
|
| 203 |
param_name (Optional[str]): name of param, optional
|
| 204 |
param_dic (dict): param_dic
|
| 205 |
depth (int, optional): nested level. Defaults to 0.
|
|
|
|
| 206 |
Returns:
|
| 207 |
str: TypeScript type of the array
|
| 208 |
"""
|
|
|
|
| 255 |
def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
|
| 256 |
"""Recursion, returning the information about parameters including data type, description and other information
|
| 257 |
This information will be put into the prompt
|
|
|
|
| 258 |
Args:
|
| 259 |
properties (_type_): properties in parameters
|
| 260 |
required_params (_type_): List of required parameters
|
| 261 |
depth (int, optional): the depth of params (nested level). Defaults to 0.
|
|
|
|
| 262 |
Returns:
|
| 263 |
List[str]: list of lines containing information about all parameters
|
| 264 |
"""
|
|
|
|
| 444 |
"point any code depending on them will stop working. We recommend setting a valid chat template before "
|
| 445 |
"then to ensure that this model continues working without issues."
|
| 446 |
)
|
| 447 |
+
|
| 448 |
+
PYTHON_RUN_SYS_MSG = "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."
|
| 449 |
+
SYSTEM_CONTENT = """You are capable of executing available function(s) if required.
|
| 450 |
+
Only execute function(s) when absolutely necessary.
|
| 451 |
+
Ask for the required input to:recipient==all
|
| 452 |
+
Use JSON for function arguments.
|
| 453 |
+
Respond in this format:
|
| 454 |
+
>>>${recipient}
|
| 455 |
+
${content}
|
| 456 |
+
Available functions:
|
| 457 |
+
"""
|
| 458 |
+
|
| 459 |
# Prepare tools/functions into schema
|
| 460 |
functions_pydantic_to_render = []
|
| 461 |
has_code_interpreter = False
|
| 462 |
+
if tools is not None:
|
| 463 |
+
for item in tools:
|
| 464 |
+
if (
|
| 465 |
+
"function" in item and item["function"] is not None
|
| 466 |
+
): # new data format: tools: [{"type": xx, "function": xxx}]
|
| 467 |
+
functions_pydantic_to_render.append(item["function"])
|
| 468 |
+
elif "type" in item and item["type"] == "code_interpreter":
|
| 469 |
+
has_code_interpreter = True
|
| 470 |
+
else:
|
| 471 |
+
functions_pydantic_to_render.append(item) # old format
|
| 472 |
+
|
| 473 |
+
conversation.insert(
|
| 474 |
+
0,
|
| 475 |
+
{
|
| 476 |
+
"role": "system",
|
| 477 |
+
"content": SYSTEM_CONTENT + generate_schema_from_functions(functions_pydantic_to_render),
|
| 478 |
+
},
|
| 479 |
+
)
|
| 480 |
+
if has_code_interpreter:
|
| 481 |
+
conversation.insert(1, {"role": "system", "content": PYTHON_RUN_SYS_MSG})
|
| 482 |
|
| 483 |
# Compilation function uses a cache to avoid recompiling the same template
|
| 484 |
compiled_template = self._compile_jinja_template(chat_template)
|
tokenizer_config.json
CHANGED
|
@@ -2050,7 +2050,7 @@
|
|
| 2050 |
}
|
| 2051 |
},
|
| 2052 |
"bos_token": "<|begin_of_text|>",
|
| 2053 |
-
"chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>
|
| 2054 |
"clean_up_tokenization_spaces": true,
|
| 2055 |
"eos_token": "<|eot_id|>",
|
| 2056 |
"legacy": true,
|
|
|
|
| 2050 |
}
|
| 2051 |
},
|
| 2052 |
"bos_token": "<|begin_of_text|>",
|
| 2053 |
+
"chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
|
| 2054 |
"clean_up_tokenization_spaces": true,
|
| 2055 |
"eos_token": "<|eot_id|>",
|
| 2056 |
"legacy": true,
|