Upload 2 files

Browse files

Files changed (2) hide show

modeling_functionary.py +109 -0
tokenization_functionary.py +520 -0

modeling_functionary.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# coding=utf-8
+# Copyright (c) 2024, MeetKai Inc. All rights reserved.
+"""PyTorch LLaMA model."""
+import json
+from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from transformers.generation.configuration_utils import GenerationConfig
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.utils import (
+    GenerateBeamDecoderOnlyOutput,
+    GenerateBeamEncoderDecoderOutput,
+    GenerateDecoderOnlyOutput,
+    GenerateEncoderDecoderOutput
+)
+from transformers.models.llama.modeling_llama import LlamaForCausalLM
+from transformers.utils import logging
+if TYPE_CHECKING:
+    from transformers.modeling_utils import PreTrainedModel
+    from transformers.generation.streamers import BaseStreamer
+logger = logging.get_logger(__name__)
+GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput]
+GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput]
+GenerateOutput = Union[GenerateNonBeamOutput, GenerateBeamOutput]
+class FunctionaryForCausalLM(LlamaForCausalLM):
+    """LLaMA causal LM with a helper that rewrites raw generations as assistant tool-call JSON."""
+    @staticmethod
+    def _parse_raw_output(
+        raw_output_str: str,
+        pad_token: Optional[str],
+        function_call_token: str = "<|reserved_special_token_249|>",
+        eot_token: str = "<|eot_id|>",
+    ) -> dict:
+        """Split one decoded generation into an assistant message dict.
+        Args:
+            raw_output_str (str): decoded text of the newly generated tokens
+            pad_token (Optional[str]): padding token text to erase, or None when the tokenizer has none
+            function_call_token (str): marker that precedes every tool call
+            eot_token (str): end-of-turn marker removed from the end of a chunk
+        Returns:
+            dict: {"role": "assistant", "content": str or None, "tool_calls": list or None}
+        """
+        message = {"role": "assistant", "content": None, "tool_calls": None}
+        tool_calls = []
+        # Text that does not start with the marker begins with plain assistant content
+        has_text = not raw_output_str.startswith(function_call_token)
+        for i, chunk in enumerate(raw_output_str.split(function_call_token)):
+            if len(chunk) == 0:
+                continue
+            if pad_token:  # str.replace(None, ...) would raise when no pad token is configured
+                chunk = chunk.replace(pad_token, "")
+            if i == 0 and has_text:
+                message["content"] = chunk.strip()[: -len(eot_token)] if chunk.endswith(eot_token) else chunk
+                continue
+            if chunk.endswith(eot_token):
+                chunk = chunk[: -len(eot_token)]
+            if len(chunk) == 0:  # nothing left once padding / end-of-turn markers are removed
+                continue
+            # A tool call is "<name>\n{<json arguments>}"; tolerate a call with no JSON body
+            name, separator, rest = chunk.partition("\n{")
+            tool_calls.append({"name": name, "arguments": "{" + rest if separator else ""})
+        if len(tool_calls) > 0:
+            message["tool_calls"] = tool_calls
+        return message
+    def generate_tool_use(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+        synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        """Run `generate` and replace each completion with its JSON assistant message.
+        All named arguments and remaining keyword arguments are forwarded to `self.generate`.
+        The prompt ids are read from the `input_ids` keyword argument, falling back to `inputs`.
+        Args:
+            tokenizer: required keyword argument; used to decode the raw output and to
+                re-encode the JSON message. It is removed from `kwargs` before `generate` is called.
+        Returns:
+            CPU tensor with one row per prompt: the prompt ids followed by the token ids of the
+            JSON message, right-padded with `tokenizer.eos_token_id` to the longest row.
+        Raises:
+            ValueError: if no tokenizer or no prompt ids were provided.
+        """
+        tokenizer = kwargs.pop("tokenizer", None)  # Pull this out first, we use it to parse raw output
+        if tokenizer is None:
+            raise ValueError("`generate_tool_use` requires a `tokenizer` keyword argument to parse the raw output.")
+        # Prompts may arrive either as `input_ids=...` or as the first argument `inputs`
+        input_ids = kwargs.get("input_ids", inputs)
+        if input_ids is None:
+            raise ValueError("`generate_tool_use` requires the prompt ids via `input_ids` or `inputs`.")
+        results = self.generate(
+            inputs=inputs,
+            generation_config=generation_config,
+            logits_processor=logits_processor,
+            stopping_criteria=stopping_criteria,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            synced_gpus=synced_gpus,
+            assistant_model=assistant_model,
+            streamer=streamer,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            **kwargs,
+        )
+        # With `return_dict_in_generate=True` the token ids live in `.sequences`
+        sequences = getattr(results, "sequences", results)
+        correct_results = []
+        for input_id, result in zip(input_ids, sequences):
+            prompt_len = len(input_id)
+            raw_output_str = tokenizer.decode(result[prompt_len:].cpu())
+            final_output_json = self._parse_raw_output(raw_output_str, tokenizer.pad_token)
+            final_output_str = json.dumps(final_output_json, indent=4)
+            final_output_ids = tokenizer(final_output_str, add_special_tokens=False)["input_ids"]
+            correct_results.append(
+                torch.cat(
+                    (result[:prompt_len].cpu(), torch.tensor(final_output_ids, dtype=result.dtype))
+                )
+            )
+        max_len = max(tensor.shape[0] for tensor in correct_results)
+        correct_results = [
+            torch.nn.functional.pad(
+                correct_result, (0, max_len - correct_result.shape[0]), value=tokenizer.eos_token_id
+            ) for correct_result in correct_results
+        ]
+        return torch.stack(correct_results)

tokenization_functionary.py ADDED Viewed

	@@ -0,0 +1,520 @@

+# Copyright (c) 2024, MeetKai Inc. All rights reserved.
+from copy import deepcopy
+import json
+from typing import Any, Dict, List, Literal, Optional, Union
+import jsonref
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Self
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import TensorType, logging
+logger = logging.get_logger(__name__)
+SYSTEM_PROMPT = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""
+CODE_INTERPRETER_SYSTEM_PROMPT = """When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."""
+class Function(BaseModel):
+    # Declaration of one callable function that can be offered to the model.
+    # name: identifier of the function
+    name: str
+    # description: free-text description, empty string when omitted
+    description: Optional[str] = ""
+    # parameters: JSON-schema-like dict describing the arguments, or None
+    parameters: Optional[dict] = None
+class Tool(BaseModel):
+    # One entry of the `tools` list: either a function declaration or the code interpreter.
+    type: Literal["function", "code_interpreter"]
+    # function: required when type == "function", forbidden otherwise
+    function: Optional[Function] = None
+    @model_validator(mode="after")
+    def check_type_function_matches(self) -> Self:
+        """Check that `function` is present exactly when `type` is "function".
+        Raises ValueError instead of using `assert`, so the check still runs under `python -O`.
+        Returns:
+            Self: the validated model
+        """
+        if self.type == "function":
+            if self.function is None:
+                raise ValueError('"function" must contain function description when `"type": "function"`')
+        elif self.function is not None:
+            raise ValueError('"function" must not be provided when `"type": "code_interpreter"`')
+        return self
+def convert_data_type(param_type: str) -> str:
+    """Map a JSON-schema type name to its TypeScript spelling.
+    Args:
+        param_type (str): type name taken from the schema
+    Returns:
+        str: "number" for "integer" and "float", otherwise the input unchanged
+    """
+    return "number" if param_type in ("integer", "float") else param_type
+def get_param_type(param: Dict) -> str:
+    """Get the TypeScript type string of a parameter.
+    Args:
+        param (Dict): param dict in properties
+    Returns:
+        str: the "type" entry (a list of types is joined with " | "), otherwise the union of
+            the types listed under "oneOf", otherwise "any"
+    """
+    param_type = "any"
+    if "type" in param:
+        raw_param_type = param["type"]
+        if type(raw_param_type) is list:
+            param_type = " | ".join(raw_param_type)
+        else:
+            param_type = raw_param_type
+    elif "oneOf" in param:  # in many cases, the json schema contains: oneOf instead of "type"
+        one_of_types = [
+            convert_data_type(item["type"]) for item in param["oneOf"] if "type" in item
+        ]
+        # Deduplicate in first-seen order; a set would make the rendered prompt depend on
+        # string hash randomization and change between processes.
+        param_type = " | ".join(dict.fromkeys(one_of_types))
+    return convert_data_type(param_type)
+def get_format_param(param: Dict) -> Optional[str]:
+    """Look up the "format" of a parameter, directly or through its "oneOf" alternatives.
+    Args:
+        param (Dict): param dict in properties
+    Returns:
+        Optional[str]: param["format"] when present; otherwise the formats found in the
+            "oneOf" items joined with " or "; None when no format is found
+    """
+    if "format" in param:
+        return param["format"]
+    if "oneOf" not in param:
+        return None
+    found = [alternative["format"] for alternative in param["oneOf"] if "format" in alternative]
+    return " or ".join(found) if found else None
+def get_param_info(param: Dict) -> Optional[str]:
+    """Build the one-line "// ..." comment describing a parameter.
+    The comment is assembled, in order, from: the description (a trailing "." is added when
+    missing), the default value (quoted when the declared type is "string"), the format, and
+    the maximum / minimum / maxLength / minLength constraints.
+    Args:
+        param (Dict): param dict in properties
+    Returns:
+        Optional[str]: the comment with newlines replaced by spaces, or None when the
+            parameter carries none of this information
+    """
+    declared_type = param.get("type", "any")
+    pieces = []
+    if "description" in param:
+        text = param["description"]
+        pieces.append(text if text.endswith(".") else text + ".")
+    if "default" in param:
+        shown_default = param["default"]
+        if declared_type == "string":
+            shown_default = f'"{shown_default}"'  # string defaults are shown quoted
+        pieces.append(f"Default={shown_default}.")
+    declared_format = get_format_param(param)
+    if declared_format is not None:
+        pieces.append("Format=" + declared_format)
+    constraints = (
+        ("maximum", "Maximum"),
+        ("minimum", "Minimum"),
+        ("maxLength", "Maximum length"),
+        ("minLength", "Minimum length"),
+    )
+    pieces.extend(f"{label}=" + str(param[key]) for key, label in constraints if key in param)
+    if not pieces:
+        return None
+    return ("// " + " ".join(pieces)).replace("\n", " ")
+def append_new_param_info(
+    info_list: List[str],
+    param_declaration: str,
+    comment_info: Optional[str],
+    examples_info: List,
+    depth: int,
+):
+    """Append one parameter declaration, preceded by its comment lines, to info_list in place.
+    Every appended line is indented by four spaces per nesting level. The example lines are
+    only written when a comment is present; without a comment just the declaration is added.
+    Args:
+        info_list (List[str]): list of output lines, extended in place
+        param_declaration (str): param: type
+        comment_info (Optional[str]): comment line for the parameter, or None
+        examples_info (List): example lines for the parameter
+        depth (int): level of nested param
+    """
+    indent = "    " * depth if depth >= 1 else ""
+    if comment_info is not None:
+        # format: //comment\n//examples\nparam: type
+        info_list.append(f"{indent}{comment_info}")
+        info_list.extend(f"{indent}{example_line}" for example_line in examples_info)
+    info_list.append(f"{indent}{param_declaration}")
+def get_examples_info(param_name: str, examples: List) -> List:
+    """Render the examples of a parameter as comment lines.
+    Args:
+        param_name (str): name of the parameter, used in the header line
+        examples (List): example values; dicts and lists are JSON-encoded, anything else
+            goes through str()
+    Returns:
+        List: "// Example <param_name>:" followed by one "// <example>" line per example,
+            with newlines inside an example written as the two characters "\\n"
+    """
+    def render(value) -> str:
+        if isinstance(value, (dict, list)):
+            text = json.dumps(value, ensure_ascii=False)
+        else:
+            text = str(value)
+        return text.replace('\n', '\\n')
+    return [f"// Example {param_name}:"] + [f"// {render(value)}" for value in examples]
+def get_enum_option_str(enum_options: List) -> str:
+    """Render enum options as a TypeScript union.
+    Args:
+        enum_options (List): list of options
+    Returns:
+        str: the options joined with " | "; options that are exactly `str` are wrapped in
+            double quotes, all others are converted with str()
+    """
+    rendered = []
+    for option in enum_options:
+        if type(option) is str:
+            rendered.append(f'"{option}"')
+        else:
+            rendered.append(str(option))
+    return " | ".join(rendered)
+def get_array_typescript(
+    param_name: Optional[str], param_dic: dict, depth: int = 0
+) -> str:
+    """Recursively generate the TypeScript declaration of an array parameter.
+    Args:
+        param_name (Optional[str]): name of param; None when rendering the anonymous item
+            type of an enclosing array
+        param_dic (dict): schema dict of the array; its "items" entry describes the elements
+        depth (int, optional): nested level. Defaults to 0.
+    Returns:
+        str: TypeScript declaration of the array. Only the named plain-type case ends with a
+            trailing ","; every other case is returned without one.
+    """
+    # Four spaces of indentation per nesting level
+    offset = ""
+    if depth >= 1:
+        offset = "".join(["    " for _ in range(depth)])
+    items_info = param_dic.get("items", {})
+    # No item description at all: render an untyped array
+    if len(items_info) == 0:
+        if param_name is not None:
+            return f"{offset}{param_name}: []"
+        else:
+            return "[]"
+    array_type = get_param_type(items_info)
+    if array_type == "object":
+        # Array of objects: render the object's properties one level deeper, wrapped in "{ ... }[]"
+        info_lines = []
+        child_lines = get_parameter_typescript(
+            items_info.get("properties", {}), items_info.get("required", []), depth + 1
+        )
+        # if comment_info is not None:
+        #    info_lines.append(f"{offset}{comment_info}")
+        if param_name is not None:
+            info_lines.append(f"{offset}{param_name}" + ": {")
+        else:
+            info_lines.append(f"{offset}" + "{")
+        info_lines.extend(child_lines)
+        info_lines.append(f"{offset}" + "}[]")
+        return "\n".join(info_lines)
+    elif array_type == "array":
+        # Array of arrays: render the inner array anonymously, then append "[]"
+        item_info = get_array_typescript(None, items_info, depth + 1)
+        if param_name is None:
+            return f"{item_info}[]"
+        return f"{offset}{param_name}: {item_info.strip()}[]"
+    else:
+        if "enum" in items_info:
+            # Array of enum values: the union is parenthesised so "[]" applies to all options
+            item_type = get_enum_option_str(items_info["enum"])
+            if param_name is None:
+                return f"({item_type})[]"
+            else:
+                return f"{offset}{param_name}: ({item_type})[]"
+        else:
+            if param_name is None:
+                return f"{array_type}[]"
+            else:
+                # NOTE(review): this is the only branch that already ends with ","
+                return f"{offset}{param_name}: {array_type}[],"
+def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
+    """Recursively render parameters as TypeScript lines: data type, description and other information.
+    These lines are put into the prompt.
+    Args:
+        properties (dict): "properties" mapping of a parameters schema (name -> param dict)
+        required_params (list): names of required parameters; a parameter not in this list
+            gets a "?" suffix. When this is not a list, no "?" is added.
+        depth (int, optional): the depth of params (nested level). Defaults to 0.
+    Returns:
+        List[str]: list of lines containing information about all parameters
+    """
+    tp_lines = []
+    for param_name, param in properties.items():
+        # Sometimes "properties" contains a "required" field holding a list of strings,
+        # even though it is not supposed to be under properties. Such non-dict entries are skipped.
+        if not isinstance(param, dict):
+            continue
+        # Param Description
+        comment_info = get_param_info(param)
+        # Param Examples
+        examples_info = []
+        if "examples" in param:
+            examples_info = get_examples_info(param_name, param["examples"])
+        # Param Name declaration
+        param_declaration = f"{param_name}"
+        if isinstance(required_params, list):
+            if param_name not in required_params:
+                param_declaration += "?"
+        param_type = get_param_type(param)
+        # Four spaces of indentation per nesting level
+        offset = ""
+        if depth >= 1:
+            offset = "".join(["    " for _ in range(depth)])
+        if param_type == "object":  # param_type is object
+            # Nested object: comment, examples, "name: {", child lines one level deeper, then "},"
+            child_lines = get_parameter_typescript(
+                param.get("properties", {}), param.get("required", []), depth + 1
+            )
+            if comment_info is not None:
+                tp_lines.append(f"{offset}{comment_info}")
+            if len(examples_info) > 0:
+                for example in examples_info:
+                    tp_lines.append(f"{offset}{example}")
+            param_declaration += ": {"
+            tp_lines.append(f"{offset}{param_declaration}")
+            tp_lines.extend(child_lines)
+            tp_lines.append(f"{offset}" + "},")
+        elif param_type == "array":  # param_type is an array
+            item_info = param.get("items", {})
+            if "type" not in item_info:  # don't know type of array
+                param_declaration += ": [],"
+                append_new_param_info(
+                    tp_lines, param_declaration, comment_info, examples_info, depth
+                )
+            else:
+                array_declaration = get_array_typescript(
+                    param_declaration, param, depth
+                )
+                # get_array_typescript adds the trailing "," in only one of its branches
+                if not array_declaration.endswith(","):
+                    array_declaration += ","
+                if comment_info is not None:
+                    tp_lines.append(f"{offset}{comment_info}")
+                if len(examples_info) > 0:
+                    for example in examples_info:
+                        tp_lines.append(f"{offset}{example}")
+                # array_declaration already carries its own indentation
+                tp_lines.append(array_declaration)
+        else:
+            # Scalar parameter: an "enum" replaces the type with the union of its options
+            if "enum" in param:
+                param_type = get_enum_option_str(param["enum"])
+                # param_type = " | ".join([f'"{v}"' for v in param["enum"]])
+            if "nullable" in param and param["nullable"] is True:
+                param_type += " | null"
+            param_declaration += f": {param_type},"
+            append_new_param_info(
+                tp_lines, param_declaration, comment_info, examples_info, depth
+            )
+    return tp_lines
+def generate_schema_from_functions(
+    functions: List[Function], namespace="functions"
+) -> str:
+    """
+    Render function declarations as a TypeScript-like namespace for the prompt.
+    Each entry may be a dict or an object exposing `model_dump()`. Entries without a name are
+    skipped. A function with a "properties" mapping becomes `type name = (_: {...}) => any;`,
+    any other function becomes `type name = () => any;`.
+    """
+    pieces = [
+        "// Supported function definitions that should be called when necessary.\n",
+        f"namespace {namespace} {{\n\n",
+    ]
+    for entry in functions:
+        # Convert a Function object to dict, if necessary
+        spec = entry if isinstance(entry, dict) else entry.model_dump()
+        function_name = spec.get("name", None)
+        if function_name is None:
+            continue
+        pieces.append(f"// {spec.get('description', '')}\n")
+        pieces.append(f"type {function_name}")
+        parameters = spec.get("parameters", None)
+        if parameters is None or parameters.get("properties") is None:
+            # Doesn't have any parameters
+            pieces.append(" = () => any;\n\n")
+            continue
+        # Resolve "$ref" entries before rendering the parameters
+        resolved = deepcopy(jsonref.JsonRef.replace_refs(parameters))
+        param_lines = get_parameter_typescript(
+            resolved.get("properties"),
+            resolved.get("required", []),
+            0,
+        )
+        pieces.append(" = (_: {\n")
+        pieces.append("\n".join(param_lines))
+        pieces.append("\n}) => any;\n\n")
+    pieces.append(f"}} // namespace {namespace}")
+    return "".join(pieces)
+class FunctionaryTokenizer(PreTrainedTokenizerFast):
+    """Fast tokenizer whose chat template rendering prepends the tool schema and a system prompt."""
+    def apply_chat_template(
+        self,
+        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], str],
+        tools: Optional[List[Dict[str, Any]]] = None,
+        chat_template: Optional[str] = None,
+        add_generation_prompt: bool = False,
+        tokenize: bool = True,
+        padding: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_dict: bool = False,
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
+        """Render a conversation (or a batch of conversations) with the chat template.
+        Two system messages are placed in front of every conversation: the schema generated
+        from the "function" tools, then SYSTEM_PROMPT (or CODE_INTERPRETER_SYSTEM_PROMPT when a
+        "code_interpreter" tool is present). The caller's `conversation` is never modified.
+        Args:
+            conversation: list of message dicts, or a list of such lists for a batch
+            tools: tool dicts validated with `Tool`; None is treated as no tools
+            chat_template: template text, or the name of a template when the tokenizer holds
+                a dict of templates
+            add_generation_prompt: forwarded to the template
+            tokenize: when False the rendered text is returned instead of token ids
+            padding, truncation, max_length, return_tensors: forwarded to the tokenizer call
+            return_dict: return the full tokenizer output instead of only "input_ids"
+            tokenizer_kwargs: extra keyword arguments for the tokenizer call
+            **kwargs: extra variables for the template; they override special tokens
+        Returns:
+            The rendered text, the token ids, or the tokenizer output, depending on `tokenize`
+            and `return_dict`; a list of those for batched input.
+        Raises:
+            ValueError: `return_dict=True` with `tokenize=False`, or several templates are
+                available and no default / name was given.
+            TypeError: `conversation` is a plain string.
+        """
+        if return_dict and not tokenize:
+            raise ValueError(
+                "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
+                "of tokenizer outputs to return."
+            )
+        if isinstance(conversation, str):
+            # A string cannot receive the system messages; iterating it would yield characters
+            raise TypeError("`conversation` must be a list of messages or a list of such lists, not a string.")
+        if tokenizer_kwargs is None:
+            tokenizer_kwargs = {}
+        using_default_template = False
+        # First, handle the cases when the model has a dict of multiple templates
+        if isinstance(self.chat_template, dict) or (
+            self.chat_template is None and isinstance(self.default_chat_template, dict)
+        ):
+            if self.chat_template is not None:
+                template_dict = self.chat_template
+                using_default_dict = False
+            else:
+                template_dict = self.default_chat_template
+                using_default_dict = True
+            if chat_template is not None and chat_template in template_dict:
+                # The user can pass the name of a template to the chat template argument instead of an entire template
+                chat_template = template_dict[chat_template]
+                if using_default_dict:
+                    using_default_template = True
+            elif chat_template is None and "default" in template_dict:
+                chat_template = template_dict["default"]
+                if using_default_dict:
+                    using_default_template = True
+            elif chat_template is None:
+                raise ValueError(
+                    "This model has multiple chat templates with no default specified! Please either pass a chat "
+                    "template or the name of the template you wish to use to the `chat_template` argument. Available "
+                    f"template names are {sorted(template_dict.keys())}."
+                )
+        elif chat_template is None:
+            # These are the cases when the model has a single template
+            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
+            if self.chat_template is not None:
+                chat_template = self.chat_template
+            else:
+                chat_template = self.default_chat_template
+                using_default_template = True
+        if using_default_template:
+            logger.warning_once(
+                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
+                "very error-prone, because models are often trained with templates different from the class default! "
+                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
+                "point any code depending on them will stop working. We recommend setting a valid chat template before "
+                "then to ensure that this model continues working without issues."
+            )
+        # Prepare tools/functions into schema
+        functions_pydantic_to_render = []
+        has_code_interpreter = False
+        for tool in tools or []:
+            tool_pydantic = Tool.model_validate(tool)
+            if tool_pydantic.type == "function":
+                functions_pydantic_to_render.append(tool_pydantic.function)
+            else:
+                has_code_interpreter = True
+        system_prompt_to_use = SYSTEM_PROMPT if not has_code_interpreter else CODE_INTERPRETER_SYSTEM_PROMPT
+        # Order matters: the schema message comes first, the system prompt second
+        system_messages = [
+            {"role": "system", "content": generate_schema_from_functions(functions_pydantic_to_render)},
+            {"role": "system", "content": system_prompt_to_use},
+        ]
+        # Compilation function uses a cache to avoid recompiling the same template
+        compiled_template = self._compile_jinja_template(chat_template)
+        # Detect batching on the untouched input, before any system message is added
+        if isinstance(conversation, (list, tuple)) and len(conversation) > 0 and (
+            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages")
+        ):
+            conversations = conversation
+            is_batched = True
+        else:
+            conversations = [conversation]
+            is_batched = False
+        rendered = []
+        template_kwargs = {**self.special_tokens_map, **kwargs}  # kwargs overwrite special tokens if both are present
+        for chat in conversations:
+            if hasattr(chat, "messages"):
+                # Indicates it's a Conversation object
+                chat = chat.messages
+            # Build a new list so the caller's conversation is left untouched
+            chat = [dict(message) for message in system_messages] + list(chat)
+            rendered_chat = compiled_template.render(
+                messages=chat, add_generation_prompt=add_generation_prompt, **template_kwargs
+            )
+            rendered.append(rendered_chat)
+        if not is_batched:
+            rendered = rendered[0]
+        if tokenize:
+            out = self(
+                rendered,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                add_special_tokens=False,
+                return_tensors=return_tensors,
+                **tokenizer_kwargs,
+            )
+            if return_dict:
+                return out
+            else:
+                return out["input_ids"]
+        else:
+            return rendered