# Copyright 2020-2026 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from pathlib import Path from typing import TypeVar from jinja2 import TemplateError from transformers import AddedToken, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin from .data_utils import prepare_multimodal_messages _CHAT_TEMPLATES_DIR = Path(__file__).parent / "chat_templates" def clone_chat_template( model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, source_tokenizer_path: str, resize_to_multiple_of: int | None = 64, ) -> tuple[PreTrainedModel, PreTrainedTokenizerBase, list[int]]: """ Clones a chat template from a source tokenizer to the target tokenizer and updates the model accordingly. This function: - Copies the chat template from a source tokenizer to the target tokenizer. - Adds any new tokens from the source tokenizer to the target tokenizer. - Sets and synchronizes the EOS token across the tokenizer and model. - Resizes the model's token embeddings to match the new vocabulary size, optionally rounding it up to a multiple of a specified value. In such cases, dummy tokens are added to the tokenizer to ensure the vocabulary size matches the embedding dimensions. Args: model ([`~transformers.PreTrainedModel`]): Model to update. tokenizer ([`~transformers.PreTrainedTokenizerBase`]): Tokenizer to update. source_tokenizer_path (`str`): Path or identifier of the pretrained tokenizer to clone from. resize_to_multiple_of (`int` or `None`, *optional*, defaults to `64`): The embedding layer will be resized to the new vocabulary size. If this is not `None`, it will round up the new vocabulary size to the nearest multiple of this value. Returns: model ([`~transformers.PreTrainedModel`]): Updated model with resized token embeddings and EOS token configured. tokenizer ([`~transformers.PreTrainedTokenizerBase`]): Updated tokenizer with the chat template and special tokens applied. added_tokens (`list[int]`): List of tokens that were added to the tokenizer from the source tokenizer. Example: ```python from transformers import AutoModelForCausalLM, AutoTokenizer from trl import clone_chat_template model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") model, tokenizer, added_tokens = clone_chat_template(model, tokenizer, "Qwen/Qwen3-0.6B") ``` """ # Load the source tokenizer containing the desired chat template tokenizer_source = AutoTokenizer.from_pretrained(source_tokenizer_path) # Copy the chat template from the source tokenizer tokenizer.chat_template = tokenizer_source.get_chat_template() # Ensure all added tokens from the source are available in the target tokenizer added_tokens = [ token for token in tokenizer_source.added_tokens_decoder.values() if token.content not in tokenizer.vocab ] tokenizer.add_tokens(added_tokens) # Set the EOS token from the source tokenizer (important for generation) tokenizer.eos_token = tokenizer_source.eos_token model.config.eos_token_id = tokenizer.eos_token_id if model.can_generate(): # Non-generative models (e.g. SequenceClassification) may not have a generation_config model.generation_config.eos_token_id = tokenizer.eos_token_id # Resize model embeddings to include any new tokens, optionally rounding up to a multiple model.resize_token_embeddings( # After studying many tokenizers, we found that len(tokenizer.vocab) is the most reliable way to get the vocab # size. Avoid using tokenizer.vocab_size or tokenizer.vocab_size + len(tokenizer.added_tokens_encoder), # as handling of special and added tokens varies across tokenizers. new_num_tokens=len(tokenizer.vocab), pad_to_multiple_of=resize_to_multiple_of if resize_to_multiple_of is not None else None, ) # After resizing, the embedding matrix size may exceed the vocabulary size. Add dummy tokens to the tokenizer to # ensure vocabulary size matches the embedding matrix dimensions. idx = 0 while model.vocab_size > len(tokenizer.vocab): dummy_token = AddedToken(f"") is_added = tokenizer.add_tokens(dummy_token) idx += 1 if is_added == 1: added_tokens.append(dummy_token) # Verify that vocabulary size now matches embedding dimensions if len(tokenizer.vocab) != model.vocab_size: raise RuntimeError( f"Vocabulary size mismatch after resizing: tokenizer vocab size is {len(tokenizer.vocab)}, but model " f"embedding size is {model.vocab_size}. This indicates an internal error in the token alignment process." ) added_tokens = [token.content for token in added_tokens] added_tokens = tokenizer.convert_tokens_to_ids(added_tokens) return model, tokenizer, added_tokens glm4moe_schema = { "x-regex": r"^(?:\n?\n?(?:(?P.*?\S.*?)\n?|[\s]*)\s*)?(?P.*?)(?:\n(?=))?(?=(?:|$))(?P(?:.+?\s*)+)?$", "type": "object", "properties": { "role": {"const": "assistant"}, "content": {"type": "string"}, "reasoning_content": {"type": "string"}, "tool_calls": { "type": "array", "x-regex-iterator": r"\s*(.+?)\s*", "items": { "type": "object", "properties": { "type": {"const": "function"}, "function": { "type": "object", "properties": { "name": {"type": "string", "x-regex": r"^(\S+)"}, "arguments": { "type": "object", "x-regex-key-value": r"(?P[^<]+)\s*\n(?P.*?)", "default": {}, "additionalProperties": { "x-parser": "json", "x-parser-args": {"allow_non_json": True}, }, }, }, }, }, }, }, }, } gptoss_schema = { # Normalize final content to analysis format so both map to the same "content" group. "x-regex-substitutions": [ [r"<\|channel\|>final<\|message\|>(.*?)<\|return\|>", r"<|channel|>analysis<|message|>\1<|end|>"], ], "x-regex": r"^(?:<\|channel\|>analysis<\|message\|>(?P.*?)<\|end\|>(?:<\|start\|>assistant)?)?\s*(?Pto=functions\.\S+<\|channel\|>commentary json<\|message\|>.*?<\|call\|>)?$", "type": "object", "properties": { "role": {"const": "assistant"}, "content": {"type": "string"}, "tool_calls": { "type": "array", "x-regex-iterator": r"(to=functions\.\S+<\|channel\|>commentary json<\|message\|>.*?<\|call\|>)", "items": { # Convert "to=functions.NAME<|channel|>commentary json<|message|>ARGS<|call|>" # into '{"name": "NAME", "arguments": ARGS}' so it can be parsed as JSON. "x-regex-substitutions": [ [ r"to=functions\.(\S+)<\|channel\|>commentary json<\|message\|>(.*?)<\|call\|>", r'{"name": "\1", "arguments": \2}', ], ], "x-parser": "json", "x-parser-args": {"transform": "{type: 'function', function: @}"}, "type": "object", "properties": { "type": {"const": "function"}, "function": { "type": "object", "properties": { "name": {"type": "string"}, "arguments": { "type": "object", "additionalProperties": {}, }, }, }, }, }, }, }, } # Adapted and corrected versions of the schemas from: # https://github.com/huggingface/transformers/blob/main/tests/utils/test_chat_parsing_utils.py qwen3_schema = { "x-regex": r"^(?:\n?(?:(?P.*?\S.*?)\n?|[\s]*)\s*)?(?P.*?)(?:\n(?=))?(?=(?:|<\|im_end\|>|$))(?P(?:.+?\s*)+)?\s*(?:<\|im_end\|>|$)", "type": "object", "properties": { "role": {"const": "assistant"}, "content": {"type": "string"}, "reasoning_content": {"type": "string"}, "tool_calls": { "type": "array", "x-regex-iterator": r"\s*(.+?)\s*", "items": { "x-parser": "json", "x-parser-args": {"transform": "{type: 'function', function: @}"}, "type": "object", "properties": { "type": {"const": "function"}, "function": { "type": "object", "properties": { "name": {"type": "string"}, "arguments": { "type": "object", "additionalProperties": {}, }, }, }, }, }, }, }, } llama3_schema = { # Llama 3.1 / 3.2 render a tool call as a single bare JSON object using the key "parameters" instead of # "arguments": `{"name": "", "parameters": }<|eot_id|>`. There is no surrounding marker, no # support for content alongside a tool call, and at most one tool call per assistant turn (the template raises # otherwise). Either we match a tool call (capturing the JSON) or we treat the response as plain content. "x-regex": r'^(?:(?P\{"name":\s*".+?",\s*"parameters":\s*.+\})|(?P.*?))(?:<\|eot_id\|>|$)', "type": "object", "properties": { "role": {"const": "assistant"}, "content": {"type": "string"}, "tool_calls": { "type": "array", "x-regex-iterator": r'(\{"name":\s*".+?",\s*"parameters":\s*.+\})', "items": { # Rewrite "parameters" → "arguments" so the JSON parses into the standard tool-call shape. Anchored # on the leading `{"name": "..."` so a stray `"parameters"` inside argument values is not touched. "x-regex-substitutions": [ [r'^(\{"name":\s*"[^"]+",\s*)"parameters":', r'\1"arguments":'], ], "x-parser": "json", "x-parser-args": {"transform": "{type: 'function', function: @}"}, "type": "object", "properties": { "type": {"const": "function"}, "function": { "type": "object", "properties": { "name": {"type": "string"}, "arguments": { "type": "object", "additionalProperties": {}, }, }, }, }, }, }, }, } qwen3_5_schema = { "x-regex": r"^(?:(?:\n?)?(?:(?P.*?\S.*?)\n?|[\s]*)\s*)?(?P.*?)(?:\n+(?=))?(?=(?:|<\|im_end\|>|$))(?P(?:.+?\s*)+)?\s*(?:<\|im_end\|>|$)", "type": "object", "properties": { "role": {"const": "assistant"}, "content": {"type": "string"}, "reasoning_content": {"type": "string"}, "tool_calls": { "type": "array", "x-regex-iterator": r"\s*(.+?)\s*", "items": { "type": "object", "properties": { "type": {"const": "function"}, "function": { "type": "object", "properties": { "name": {"type": "string", "x-regex": r"]+)>"}, "arguments": { "type": "object", "x-regex-key-value": r"[^>\n]+)>\n(?P.*?)\n", "default": {}, "additionalProperties": { "x-parser": "json", "x-parser-args": {"allow_non_json": True}, }, }, }, }, }, }, }, }, } deepseekv3_chat_template = (_CHAT_TEMPLATES_DIR / "deepseekv3.jinja").read_text() gemma_chat_template = (_CHAT_TEMPLATES_DIR / "gemma.jinja").read_text() glm4moe_chat_template = (_CHAT_TEMPLATES_DIR / "glm4moe.jinja").read_text() gptoss_chat_template = (_CHAT_TEMPLATES_DIR / "gptoss.jinja").read_text() llama3_chat_template = (_CHAT_TEMPLATES_DIR / "llama3.jinja").read_text() llama3_1_chat_template = (_CHAT_TEMPLATES_DIR / "llama3_1.jinja").read_text() llama3_2_chat_template = (_CHAT_TEMPLATES_DIR / "llama3_2.jinja").read_text() phi3_chat_template = (_CHAT_TEMPLATES_DIR / "phi3.jinja").read_text() qwen2_5_chat_template = (_CHAT_TEMPLATES_DIR / "qwen2_5.jinja").read_text() qwen3_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3.jinja").read_text() qwen3_vl_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3_vl.jinja").read_text() qwen3_5_chat_template_2b_and_below = (_CHAT_TEMPLATES_DIR / "qwen3_5_2b_and_below.jinja").read_text() qwen3_5_chat_template_4b_and_above = (_CHAT_TEMPLATES_DIR / "qwen3_5_4b_and_above.jinja").read_text() ProcessingClassT = TypeVar("ProcessingClassT", PreTrainedTokenizerBase, ProcessorMixin) def add_response_schema(processing_class: ProcessingClassT) -> ProcessingClassT: r""" Adds the appropriate response schema to the given tokenizer based on its chat template. At the time of initial implementation, most tokenizers do not have built-in support for response schemas. While waiting for broader adoption, we provide this utility function to manually set the response schema for known chat templates. When given a VLM processor, the schema is set on the inner tokenizer, since `parse_response` is a tokenizer method and reads `self.response_schema` from the tokenizer instance. Args: processing_class (`PreTrainedTokenizerBase` or `ProcessorMixin`): Tokenizer or VLM processor to which the response schema will be added. Returns: `PreTrainedTokenizerBase` or `ProcessorMixin`: The same object that was passed in, with the response schema set on the underlying tokenizer. Examples: ```python >>> from trl.chat_template_utils import add_response_schema >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") >>> tokenizer = add_response_schema(tokenizer) >>> assistant_text = '\n{"name": "multiply", "arguments": {"a": 3, "b": 4}}\n<|im_end|>' >>> tokenizer.parse_response(assistant_text) {'role': 'assistant', 'content': '', 'tool_calls': [{'type': 'function', 'function': {'name': 'multiply', 'arguments': {'a': 3, 'b': 4}}}]} ``` """ # For VLM processors, set the schema on the inner tokenizer (where `parse_response` reads it from). # Match against the top-level chat_template, since that's what was used historically and processors # may carry their own VLM-specific template separate from the inner tokenizer's. chat_template = processing_class.chat_template if isinstance(processing_class, ProcessorMixin): tokenizer = processing_class.tokenizer else: tokenizer = processing_class if chat_template == glm4moe_chat_template: tokenizer.response_schema = glm4moe_schema elif chat_template == gptoss_chat_template: tokenizer.response_schema = gptoss_schema elif chat_template in [llama3_1_chat_template, llama3_2_chat_template]: tokenizer.response_schema = llama3_schema elif chat_template in [qwen3_chat_template, qwen3_vl_chat_template]: tokenizer.response_schema = qwen3_schema elif chat_template in [qwen3_5_chat_template_2b_and_below, qwen3_5_chat_template_4b_and_above]: tokenizer.response_schema = qwen3_5_schema else: raise ValueError( "Unrecognized chat template, failed to add response schema. Please manually set the response schema on " "the tokenizer or processor. See the Transformers " "[docs](https://huggingface.co/docs/transformers/main/en/chat_response_parsing#response-parsing) for more " "details on response parsing." ) return processing_class def supports_tool_calling(processing_class) -> bool: """ Check if the processing class's chat template can render a full tool-calling conversation. This tests that (1) the template doesn't error when rendering a conversation with ``user → assistant (with tool_calls) → tool`` roles, and (2) every part of the tool-calling exchange — the assistant's tool call name, its arguments, and the tool message content — actually appears in the rendered output. Some templates silently swallow `tool_calls` (e.g. the basic Llama 3 template, which only reads `message['content']`) or tool messages (e.g. Cohere2, Phi3); both cases must be rejected. For VLMs (processors), the messages are converted to multimodal format via [`~trl.data_utils.prepare_multimodal_messages`] before rendering. Args: processing_class (`PreTrainedTokenizerBase` or `ProcessorMixin`): Tokenizer or processor instance to check. Returns: `bool`: `True` if the chat template supports tool-calling conversations, `False` otherwise. """ if processing_class.chat_template is None: return False is_vlm = isinstance(processing_class, ProcessorMixin) # Distinct sentinels so we can tell which part of the exchange a template drops. _name_sentinel = "tool_name_a8f3e2b1" _arg_key_sentinel = "tool_arg_key_b9d4f5c2" _arg_val_sentinel = "tool_arg_val_d6e7a9f3" _content_sentinel = "tool_content_c4f9a8e2" tool_calls = [ { "type": "function", "function": {"name": _name_sentinel, "arguments": {_arg_key_sentinel: _arg_val_sentinel}}, } ] messages = [ {"role": "user", "content": "hi"}, {"role": "assistant", "content": "", "tool_calls": tool_calls}, {"role": "tool", "name": _name_sentinel, "content": _content_sentinel}, ] # VLMs expect content as [{"type": "text", "text": "..."}] instead of plain strings if is_vlm: messages = prepare_multimodal_messages(messages) try: rendered = processing_class.apply_chat_template(messages, tokenize=False) except TemplateError: # TemplateError: template rejects the role sequence (Cohere, FalconMamba, Gemma, Gemma2, Gemma3) # UndefinedError (subclass): template indexes into content as a list for all roles, including tool # (Idefics2, Idefics3, LlavaNext, SmolVLM) return False except TypeError: # Best-effort fallback for templates that reject dict args (e.g. DeepSeek-V3). This is a chat template # bug (see transformers#45419), and the training chat template fixes it to avoid blocking users. tool_calls[0]["function"]["arguments"] = f'{{"{_arg_key_sentinel}": "{_arg_val_sentinel}"}}' try: rendered = processing_class.apply_chat_template(messages, tokenize=False) except TemplateError: return False # All four sentinels must survive: the tool name and arguments (assistant tool_calls) AND the tool message # content. Templates that silently drop either side (basic Llama 3 drops tool_calls; Cohere2/Phi3 drop tool # messages) will fail this check. return all(s in rendered for s in (_name_sentinel, _arg_key_sentinel, _arg_val_sentinel, _content_sentinel)) def is_chat_template_prefix_preserving(processing_class: PreTrainedTokenizerBase | ProcessorMixin) -> bool: """ Check whether the chat template preserves prefixes when applied. A prefix-preserving chat template renders earlier messages identically regardless of what messages follow. This property is required by `_get_tool_suffix_ids`, which extracts tool response formatting tokens by comparing tokenizations with and without tool messages appended. Args: processing_class (`PreTrainedTokenizerBase` or `ProcessorMixin`): Tokenizer or processor instance to check. Returns: `bool`: `True` if the chat template preserves prefixes, `False` otherwise. """ # Use the same dummy messages as _get_tool_suffix_ids to test the exact property it relies on. dummy_tool_calls = [{"type": "function", "function": {"name": "dummy", "arguments": {}}}] messages1 = [ {"role": "user", "content": "dummy"}, {"role": "assistant", "content": "", "tool_calls": dummy_tool_calls}, ] messages2 = [ {"role": "user", "content": "dummy"}, {"role": "assistant", "content": "", "tool_calls": dummy_tool_calls}, {"role": "tool", "name": "dummy", "content": "dummy"}, ] # VLM processors expect structured list-of-blocks content, and image-token expansion only kicks in when an image # is actually present, so include a dummy image to exercise the real code path. is_vlm = isinstance(processing_class, ProcessorMixin) if is_vlm: from PIL import Image dummy_image = Image.new("RGB", (8, 8)) messages1 = prepare_multimodal_messages(messages1, images=[dummy_image]) messages2 = prepare_multimodal_messages(messages2, images=[dummy_image]) try: ids1 = processing_class.apply_chat_template(messages1, tokenize=True, return_dict=False) ids2 = processing_class.apply_chat_template( messages2, tokenize=True, return_dict=False, add_generation_prompt=True ) except TypeError: # Best-effort fallback for templates that reject dict args (e.g. DeepSeek-V3). This is a chat template # bug (see transformers#45419), and the training chat template fixes it to avoid blocking users. dummy_tool_calls = [{"type": "function", "function": {"name": "dummy", "arguments": "{}"}}] messages1[1]["tool_calls"] = dummy_tool_calls messages2[1]["tool_calls"] = dummy_tool_calls ids1 = processing_class.apply_chat_template(messages1, tokenize=True, return_dict=False) ids2 = processing_class.apply_chat_template( messages2, tokenize=True, return_dict=False, add_generation_prompt=True ) # VLM processors return batched output (list of lists), unbatch for single conversation if is_vlm: ids1 = ids1[0] ids2 = ids2[0] return ids2[: len(ids1)] == ids1 deepseekv3_training_chat_template = (_CHAT_TEMPLATES_DIR / "deepseekv3_training.jinja").read_text() gemma_training_chat_template = (_CHAT_TEMPLATES_DIR / "gemma_training.jinja").read_text() glm4moe_training_chat_template = (_CHAT_TEMPLATES_DIR / "glm4moe_training.jinja").read_text() gptoss_training_chat_template = (_CHAT_TEMPLATES_DIR / "gptoss_training.jinja").read_text() llama3_training_chat_template = (_CHAT_TEMPLATES_DIR / "llama3_training.jinja").read_text() phi3_training_chat_template = (_CHAT_TEMPLATES_DIR / "phi3_training.jinja").read_text() qwen2_5_training_chat_template = (_CHAT_TEMPLATES_DIR / "qwen2_5_training.jinja").read_text() qwen3_training_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3_training.jinja").read_text() def get_training_chat_template(tokenizer: PreTrainedTokenizerBase) -> str | None: r""" Get a training-compatible chat template, if needed. Returns a patched chat template that is prefix-preserving and includes `{%% generation %%}` / `{%% endgeneration %%}` markers for assistant-only loss masking. Returns `None` if the tokenizer's template already satisfies both requirements. Currently DeepSeek-V3, Gemma, Gemma2, GLM-4-MoE, GPT-OSS, LLaMA 3, Phi-3, Qwen2.5, and Qwen3 are supported. Args: tokenizer (`PreTrainedTokenizerBase`): Tokenizer instance to check. Returns: `str` or `None`: Training-compatible chat template, or `None` if no patching is needed. Example: ```python >>> from trl.chat_template_utils import get_training_chat_template >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") >>> messages1 = [ ... {"role": "user", "content": "What is 2 * 3?"}, ... { ... "role": "assistant", ... "content": "", ... "tool_calls": [{"type": "function", "function": {"name": "multiply", "arguments": {"a": 2, "b": 3}}}], ... }, ... ] >>> messages2 = messages1 + [ ... {"role": "tool", "name": "multiply", "content": "6"}, ... ] >>> tokenizer.apply_chat_template(messages1, tokenize=False) '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n\n\n\n\n\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n<|im_end|>\n' >>> tokenizer.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True) '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n<|im_end|>\n<|im_start|>user\n\n6\n<|im_end|>\n<|im_start|>assistant\n' >>> # ^ think tags missing >>> chat_template = get_training_chat_template(tokenizer) >>> tokenizer.apply_chat_template(messages1, tokenize=False, chat_template=chat_template) '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n\n\n\n\n\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n<|im_end|>\n' >>> tokenizer.apply_chat_template( ... messages2, tokenize=False, add_generation_prompt=True, chat_template=chat_template ... ) '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n\n\n\n\n\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n<|im_end|>\n<|im_start|>user\n\n6\n<|im_end|>\n<|im_start|>assistant\n' ``` """ # First check if patching is needed. Prefix-preservation only matters when the template actually supports tools # (the check itself renders a tool message), so skip it otherwise. prefix_ok = not supports_tool_calling(tokenizer) or is_chat_template_prefix_preserving(tokenizer) if prefix_ok and "{% generation %}" in tokenizer.chat_template: return None # No patching needed if tokenizer.chat_template == deepseekv3_chat_template: return deepseekv3_training_chat_template if tokenizer.chat_template == gemma_chat_template: return gemma_training_chat_template if tokenizer.chat_template == glm4moe_chat_template: return glm4moe_training_chat_template if tokenizer.chat_template == gptoss_chat_template: return gptoss_training_chat_template if tokenizer.chat_template == llama3_chat_template: return llama3_training_chat_template if tokenizer.chat_template == phi3_chat_template: return phi3_training_chat_template if tokenizer.chat_template == qwen2_5_chat_template: return qwen2_5_training_chat_template if tokenizer.chat_template == qwen3_chat_template: return qwen3_training_chat_template raise ValueError( "The tokenizer's chat template is not training-compatible (missing prefix-preservation or " "`{% generation %}` markers) and patching is not supported for this template. " "Please manually modify the tokenizer's chat template for training." ) def _validate_tool_calls(tool_calls: list | None) -> None: """ Validate tool_calls to ensure all required fields exist with valid values. Raises ValueError when the model generates malformed tool calls (e.g., missing 'arguments' field) that are partially parsed. Args: tool_calls: List of tool call dictionaries, or None. """ if tool_calls is None: return None if not isinstance(tool_calls, list): raise ValueError("tool_calls must be a list or None.") for idx, tool_call in enumerate(tool_calls): if not isinstance(tool_call, dict): raise ValueError(f"tool_calls[{idx}] must be a dict.") # Handle nested function structure: {"type": "function", "function": {"name": ..., "arguments": ...}} if "function" in tool_call: func = tool_call["function"] if not isinstance(func, dict): raise ValueError(f"tool_calls[{idx}]['function'] must be a dict.") if not isinstance(func.get("name"), str): raise ValueError(f"tool_calls[{idx}]['function']['name'] must be a string.") # Some templates (e.g. Qwen3.5) omit arguments for valid no-arg calls; normalize to {}. if "arguments" not in func or func["arguments"] is None: func["arguments"] = {} else: # Handle flat structure: {"name": ..., "arguments": ...} if not isinstance(tool_call.get("name"), str): raise ValueError(f"tool_calls[{idx}]['name'] must be a string.") # Some templates (e.g. Qwen3.5) omit arguments for valid no-arg calls; normalize to {}. if "arguments" not in tool_call or tool_call["arguments"] is None: tool_call["arguments"] = {} def parse_response(processing_class: PreTrainedTokenizerBase | ProcessorMixin, ids: list[int]) -> dict: r""" Parse a token sequence into structured response dictionaries with fallback handling. Attempts to parse the sequence using `tokenizer.parse_response()`. If parsing fails (e.g., due to malformed tool calls like `{"type":"function"`), falls back to decoding as plain text. Also removes incorrectly appended EOS tokens from tool call content when present, and validates tool_calls to ensure all required fields exist. For VLM processors, automatically uses the inner tokenizer for parsing. Args: processing_class (`PreTrainedTokenizerBase` or VLM processor): Tokenizer or processor with a `parse_response()` method (directly or via inner tokenizer). ids (`list[int]`): List of token sequences. Returns: `dict`: Response dictionary. Example: ```python >>> from trl.chat_template_utils import parse_response, add_response_schema >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") >>> tokenizer = add_response_schema(tokenizer) # temporary until built-in support >>> text = '\n{"name": "multiply", "arguments": {"a": 3, "b": 4}}\n<|im_end|>' >>> ids = tokenizer(text)["input_ids"] >>> parse_response(tokenizer, ids) {'role': 'assistant', 'content': '', 'tool_calls': [{'type': 'function', 'function': {'name': 'multiply', 'arguments': {'a': 3, 'b': 4}}}]} ``` """ # VLM processors don't have parse_response directly; use the inner tokenizer tokenizer = getattr(processing_class, "tokenizer", processing_class) try: parsed = tokenizer.parse_response(ids) # Hotfix: remove incorrectly appended EOS token from tool calls # See https://github.com/huggingface/transformers/issues/42249 if isinstance(parsed.get("content"), str): parsed["content"] = parsed["content"].removesuffix(tokenizer.eos_token) # Normalize: ensure content is always a string (some models omit it or set it to None) if not parsed.get("content"): parsed["content"] = "" # Validate tool_calls to prevent Jinja2 Undefined errors when fields are missing if "tool_calls" in parsed: _validate_tool_calls(parsed["tool_calls"]) except (ValueError, TypeError): # Fallback: decode as plain text if parsing fails. This happens if the model outputs malformed tool calls. content = tokenizer.decode(ids, skip_special_tokens=True) parsed = {"role": "assistant", "content": content} return parsed