trl-mcsd / trl /chat_template_utils.py

Implement MCSD for experimental SDPO

1fa3c6c verified about 1 month ago

34.8 kB

	# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from pathlib import Path
	from typing import TypeVar

	from jinja2 import TemplateError
	from transformers import AddedToken, AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin

	from .data_utils import prepare_multimodal_messages


	# Directory, sitting next to this module, that holds the bundled Jinja chat templates.
	_CHAT_TEMPLATES_DIR = Path(__file__).with_name("chat_templates")


	def clone_chat_template(
	    model: PreTrainedModel,
	    tokenizer: PreTrainedTokenizerBase,
	    source_tokenizer_path: str,
	    resize_to_multiple_of: int | None = 64,
	) -> tuple[PreTrainedModel, PreTrainedTokenizerBase, list[int]]:
	    """
	    Clones a chat template from a source tokenizer to the target tokenizer and updates the model accordingly.

	    This function:
	    - Copies the chat template from a source tokenizer to the target tokenizer.
	    - Adds any new tokens from the source tokenizer to the target tokenizer.
	    - Sets and synchronizes the EOS token across the tokenizer and model.
	    - Resizes the model's token embeddings to match the new vocabulary size, optionally rounding it up to a multiple of
	      a specified value. In such cases, dummy tokens are added to the tokenizer to ensure the vocabulary size matches
	      the embedding dimensions.

	    Args:
	        model ([`~transformers.PreTrainedModel`]):
	            Model to update.
	        tokenizer ([`~transformers.PreTrainedTokenizerBase`]):
	            Tokenizer to update.
	        source_tokenizer_path (`str`):
	            Path or identifier of the pretrained tokenizer to clone from.
	        resize_to_multiple_of (`int` or `None`, optional, defaults to `64`):
	            The embedding layer will be resized to the new vocabulary size. If this is not `None`, it will round up the
	            new vocabulary size to the nearest multiple of this value.

	    Returns:
	        model ([`~transformers.PreTrainedModel`]):
	            Updated model with resized token embeddings and EOS token configured.
	        tokenizer ([`~transformers.PreTrainedTokenizerBase`]):
	            Updated tokenizer with the chat template and special tokens applied.
	        added_tokens (`list[int]`):
	            IDs of the tokens that were added to the tokenizer: the ones taken from the source tokenizer followed by
	            any `<extra_id_N>` padding tokens created to fill the resized embedding matrix.

	    Raises:
	        RuntimeError: If the tokenizer vocabulary size and the model embedding size still differ after padding.

	    Example:
	    ```python
	    from transformers import AutoModelForCausalLM, AutoTokenizer
	    from trl import clone_chat_template

	    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
	    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
	    model, tokenizer, added_tokens = clone_chat_template(model, tokenizer, "Qwen/Qwen3-0.6B")
	    ```
	    """
	    # Load the source tokenizer containing the desired chat template
	    tokenizer_source = AutoTokenizer.from_pretrained(source_tokenizer_path)

	    # Copy the chat template from the source tokenizer
	    tokenizer.chat_template = tokenizer_source.get_chat_template()

	    # Ensure all added tokens from the source are available in the target tokenizer. The vocabulary is fetched once
	    # up front: nothing is added while filtering, and `tokenizer.vocab` may rebuild the whole mapping on each access.
	    existing_vocab = tokenizer.vocab
	    added_tokens = [
	        token for token in tokenizer_source.added_tokens_decoder.values() if token.content not in existing_vocab
	    ]
	    tokenizer.add_tokens(added_tokens)

	    # Set the EOS token from the source tokenizer (important for generation)
	    tokenizer.eos_token = tokenizer_source.eos_token
	    model.config.eos_token_id = tokenizer.eos_token_id
	    if model.can_generate():  # Non-generative models (e.g. SequenceClassification) may not have a generation_config
	        model.generation_config.eos_token_id = tokenizer.eos_token_id

	    # Resize model embeddings to include any new tokens, optionally rounding up to a multiple
	    model.resize_token_embeddings(
	        # After studying many tokenizers, we found that len(tokenizer.vocab) is the most reliable way to get the vocab
	        # size. Avoid using tokenizer.vocab_size or tokenizer.vocab_size + len(tokenizer.added_tokens_encoder),
	        # as handling of special and added tokens varies across tokenizers.
	        new_num_tokens=len(tokenizer.vocab),
	        pad_to_multiple_of=resize_to_multiple_of,
	    )

	    # After resizing, the embedding matrix size may exceed the vocabulary size. Add dummy tokens to the tokenizer to
	    # ensure vocabulary size matches the embedding matrix dimensions.
	    idx = 0
	    while model.vocab_size > len(tokenizer.vocab):
	        dummy_token = AddedToken(f"<extra_id_{idx}>")
	        is_added = tokenizer.add_tokens(dummy_token)
	        idx += 1
	        # A name that already exists is not added; only record tokens that really went in.
	        if is_added == 1:
	            added_tokens.append(dummy_token)

	    # Verify that vocabulary size now matches embedding dimensions
	    if len(tokenizer.vocab) != model.vocab_size:
	        raise RuntimeError(
	            f"Vocabulary size mismatch after resizing: tokenizer vocab size is {len(tokenizer.vocab)}, but model "
	            f"embedding size is {model.vocab_size}. This indicates an internal error in the token alignment process."
	        )

	    # Report the additions as token IDs rather than AddedToken objects.
	    added_token_ids = tokenizer.convert_tokens_to_ids([token.content for token in added_tokens])
	    return model, tokenizer, added_token_ids


	# Response schema for the GLM-4-MoE chat template.
	#
	# The top-level "x-regex" splits the raw assistant text into three named groups: an optional
	# `<think>...</think>` block (`reasoning_content`), the free-text answer (`content`), and a trailing run of
	# `<tool_call>...</tool_call>` blocks (`tool_calls`). Each tool call is then split into a function name (first
	# non-whitespace token) and `<arg_key>`/`<arg_value>` pairs.
	# NOTE(review): the patterns need `.` to match newlines (tool calls span several lines) — presumably the
	# Transformers parser applies them with `re.DOTALL`; confirm against the parser implementation.
	glm4moe_schema = {
	    "x-regex": r"^(?:\n?<think>\n?(?:(?P<reasoning_content>.*?\S.*?)\n?|[\s]*)</think>\s*)?(?P<content>.*?)(?:\n(?=<tool_call>))?(?=(?:<tool_call>|$))(?P<tool_calls>(?:<tool_call>.+?</tool_call>\s*)+)?$",
	    "type": "object",
	    "properties": {
	        "role": {"const": "assistant"},
	        "content": {"type": "string"},
	        "reasoning_content": {"type": "string"},
	        "tool_calls": {
	            "type": "array",
	            # One match per tool call; the capture is the text between the tags, whitespace-trimmed.
	            "x-regex-iterator": r"<tool_call>\s*(.+?)\s*</tool_call>",
	            "items": {
	                "type": "object",
	                "properties": {
	                    "type": {"const": "function"},
	                    "function": {
	                        "type": "object",
	                        "properties": {
	                            "name": {"type": "string", "x-regex": r"^(\S+)"},
	                            "arguments": {
	                                "type": "object",
	                                "x-regex-key-value": r"<arg_key>(?P<key>[^<]+)</arg_key>\s*\n<arg_value>(?P<value>.*?)</arg_value>",
	                                "default": {},
	                                # Values are parsed as JSON when possible and kept as raw strings otherwise.
	                                "additionalProperties": {
	                                    "x-parser": "json",
	                                    "x-parser-args": {"allow_non_json": True},
	                                },
	                            },
	                        },
	                    },
	                },
	            },
	        },
	    },
	}

	# Response schema for the GPT-OSS chat template.
	#
	# GPT-OSS marks message parts with literal `<|channel|>`, `<|message|>`, `<|end|>`, `<|return|>`, `<|start|>` and
	# `<|call|>` markers, so every `|` belonging to a marker is escaped in the patterns below.
	# NOTE(review): the patterns need `.` to match newlines — presumably the Transformers parser applies them with
	# `re.DOTALL`; confirm against the parser implementation.
	gptoss_schema = {
	    # Normalize final content to analysis format so both map to the same "content" group.
	    "x-regex-substitutions": [
	        [r"<\|channel\|>final<\|message\|>(.*?)<\|return\|>", r"<|channel|>analysis<|message|>\1<|end|>"],
	    ],
	    "x-regex": r"^(?:<\|channel\|>analysis<\|message\|>(?P<content>.*?)<\|end\|>(?:<\|start\|>assistant)?)?\s*(?P<tool_calls>to=functions\.\S+<\|channel\|>commentary json<\|message\|>.*?<\|call\|>)?$",
	    "type": "object",
	    "properties": {
	        "role": {"const": "assistant"},
	        "content": {"type": "string"},
	        "tool_calls": {
	            "type": "array",
	            "x-regex-iterator": r"(to=functions\.\S+<\|channel\|>commentary json<\|message\|>.*?<\|call\|>)",
	            "items": {
	                # Convert "to=functions.NAME<|channel|>commentary json<|message|>ARGS<|call|>"
	                # into '{"name": "NAME", "arguments": ARGS}' so it can be parsed as JSON.
	                "x-regex-substitutions": [
	                    [
	                        r"to=functions\.(\S+)<\|channel\|>commentary json<\|message\|>(.*?)<\|call\|>",
	                        r'{"name": "\1", "arguments": \2}',
	                    ],
	                ],
	                "x-parser": "json",
	                "x-parser-args": {"transform": "{type: 'function', function: @}"},
	                "type": "object",
	                "properties": {
	                    "type": {"const": "function"},
	                    "function": {
	                        "type": "object",
	                        "properties": {
	                            "name": {"type": "string"},
	                            "arguments": {
	                                "type": "object",
	                                "additionalProperties": {},
	                            },
	                        },
	                    },
	                },
	            },
	        },
	    },
	}

	# Adapted and corrected versions of the schemas from:
	# https://github.com/huggingface/transformers/blob/main/tests/utils/test_chat_parsing_utils.py
	#
	# Response schema for the Qwen3 chat template. The top-level "x-regex" splits the assistant text into an optional
	# `<think>...</think>` block (`reasoning_content`), the free-text answer (`content`) and a trailing run of
	# `<tool_call>...</tool_call>` blocks (`tool_calls`), tolerating an optional closing `<|im_end|>` marker. Each tool
	# call body is a JSON object that gets wrapped as `{"type": "function", "function": <json>}`.
	# NOTE(review): the patterns need `.` to match newlines — presumably the Transformers parser applies them with
	# `re.DOTALL`; confirm against the parser implementation.
	qwen3_schema = {
	    "x-regex": r"^(?:<think>\n?(?:(?P<reasoning_content>.*?\S.*?)\n?|[\s]*)</think>\s*)?(?P<content>.*?)(?:\n(?=<tool_call>))?(?=(?:<tool_call>|<\|im_end\|>|$))(?P<tool_calls>(?:<tool_call>.+?</tool_call>\s*)+)?\s*(?:<\|im_end\|>|$)",
	    "type": "object",
	    "properties": {
	        "role": {"const": "assistant"},
	        "content": {"type": "string"},
	        "reasoning_content": {"type": "string"},
	        "tool_calls": {
	            "type": "array",
	            # One match per tool call; the capture is the JSON text between the tags, whitespace-trimmed.
	            "x-regex-iterator": r"<tool_call>\s*(.+?)\s*</tool_call>",
	            "items": {
	                "x-parser": "json",
	                "x-parser-args": {"transform": "{type: 'function', function: @}"},
	                "type": "object",
	                "properties": {
	                    "type": {"const": "function"},
	                    "function": {
	                        "type": "object",
	                        "properties": {
	                            "name": {"type": "string"},
	                            "arguments": {
	                                "type": "object",
	                                "additionalProperties": {},
	                            },
	                        },
	                    },
	                },
	            },
	        },
	    },
	}

	# Response schema for the Llama 3.1 / 3.2 chat templates.
	# NOTE(review): the patterns need `.` to match newlines for multi-line JSON — presumably the Transformers parser
	# applies them with `re.DOTALL`; confirm against the parser implementation.
	llama3_schema = {
	    # Llama 3.1 / 3.2 render a tool call as a single bare JSON object using the key "parameters" instead of
	    # "arguments": `{"name": "<name>", "parameters": <args_json>}<|eot_id|>`. There is no surrounding marker, no
	    # support for content alongside a tool call, and at most one tool call per assistant turn (the template raises
	    # otherwise). Either we match a tool call (capturing the JSON) or we treat the response as plain content.
	    "x-regex": r'^(?:(?P<tool_calls>\{"name":\s*".+?",\s*"parameters":\s*.+\})|(?P<content>.*?))(?:<\|eot_id\|>|$)',
	    "type": "object",
	    "properties": {
	        "role": {"const": "assistant"},
	        "content": {"type": "string"},
	        "tool_calls": {
	            "type": "array",
	            "x-regex-iterator": r'(\{"name":\s*".+?",\s*"parameters":\s*.+\})',
	            "items": {
	                # Rewrite "parameters" → "arguments" so the JSON parses into the standard tool-call shape. Anchored
	                # on the leading `{"name": "..."` so a stray `"parameters"` inside argument values is not touched.
	                "x-regex-substitutions": [
	                    [r'^(\{"name":\s*"[^"]+",\s*)"parameters":', r'\1"arguments":'],
	                ],
	                "x-parser": "json",
	                "x-parser-args": {"transform": "{type: 'function', function: @}"},
	                "type": "object",
	                "properties": {
	                    "type": {"const": "function"},
	                    "function": {
	                        "type": "object",
	                        "properties": {
	                            "name": {"type": "string"},
	                            "arguments": {
	                                "type": "object",
	                                "additionalProperties": {},
	                            },
	                        },
	                    },
	                },
	            },
	        },
	    },
	}

	# Response schema for the Qwen3.5 chat templates.
	#
	# Same overall layout as Qwen3 (optional reasoning, content, trailing `<tool_call>` blocks, optional `<|im_end|>`),
	# except that the opening `<think>` tag itself is optional in the pattern and tool calls are written as
	# `<function=NAME>` with `<parameter=KEY>` / `</parameter>` pairs instead of a JSON object.
	# NOTE(review): the patterns need `.` to match newlines — presumably the Transformers parser applies them with
	# `re.DOTALL`; confirm against the parser implementation.
	qwen3_5_schema = {
	    "x-regex": r"^(?:(?:<think>\n?)?(?:(?P<reasoning_content>.*?\S.*?)\n?|[\s]*)</think>\s*)?(?P<content>.*?)(?:\n+(?=<tool_call>))?(?=(?:<tool_call>|<\|im_end\|>|$))(?P<tool_calls>(?:<tool_call>.+?</tool_call>\s*)+)?\s*(?:<\|im_end\|>|$)",
	    "type": "object",
	    "properties": {
	        "role": {"const": "assistant"},
	        "content": {"type": "string"},
	        "reasoning_content": {"type": "string"},
	        "tool_calls": {
	            "type": "array",
	            # One match per tool call; the capture is the text between the tags, whitespace-trimmed.
	            "x-regex-iterator": r"<tool_call>\s*(.+?)\s*</tool_call>",
	            "items": {
	                "type": "object",
	                "properties": {
	                    "type": {"const": "function"},
	                    "function": {
	                        "type": "object",
	                        "properties": {
	                            "name": {"type": "string", "x-regex": r"<function=([^\n>]+)>"},
	                            "arguments": {
	                                "type": "object",
	                                "x-regex-key-value": r"<parameter=(?P<key>[^>\n]+)>\n(?P<value>.*?)\n</parameter>",
	                                "default": {},
	                                # Values are parsed as JSON when possible and kept as raw strings otherwise.
	                                "additionalProperties": {
	                                    "x-parser": "json",
	                                    "x-parser-args": {"allow_non_json": True},
	                                },
	                            },
	                        },
	                    },
	                },
	            },
	        },
	    },
	}


	# Reference copies of the stock chat templates shipped by each model family. They are compared for exact equality
	# against a tokenizer's `chat_template` elsewhere in this module, so they are decoded explicitly as UTF-8: relying
	# on the platform's default encoding could alter any non-ASCII characters and silently break those comparisons.
	deepseekv3_chat_template = (_CHAT_TEMPLATES_DIR / "deepseekv3.jinja").read_text(encoding="utf-8")

	gemma_chat_template = (_CHAT_TEMPLATES_DIR / "gemma.jinja").read_text(encoding="utf-8")

	glm4moe_chat_template = (_CHAT_TEMPLATES_DIR / "glm4moe.jinja").read_text(encoding="utf-8")

	gptoss_chat_template = (_CHAT_TEMPLATES_DIR / "gptoss.jinja").read_text(encoding="utf-8")

	llama3_chat_template = (_CHAT_TEMPLATES_DIR / "llama3.jinja").read_text(encoding="utf-8")

	llama3_1_chat_template = (_CHAT_TEMPLATES_DIR / "llama3_1.jinja").read_text(encoding="utf-8")

	llama3_2_chat_template = (_CHAT_TEMPLATES_DIR / "llama3_2.jinja").read_text(encoding="utf-8")

	phi3_chat_template = (_CHAT_TEMPLATES_DIR / "phi3.jinja").read_text(encoding="utf-8")

	qwen2_5_chat_template = (_CHAT_TEMPLATES_DIR / "qwen2_5.jinja").read_text(encoding="utf-8")

	qwen3_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3.jinja").read_text(encoding="utf-8")

	qwen3_vl_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3_vl.jinja").read_text(encoding="utf-8")

	qwen3_5_chat_template_2b_and_below = (_CHAT_TEMPLATES_DIR / "qwen3_5_2b_and_below.jinja").read_text(encoding="utf-8")

	qwen3_5_chat_template_4b_and_above = (_CHAT_TEMPLATES_DIR / "qwen3_5_4b_and_above.jinja").read_text(encoding="utf-8")


	# Constrained type variable used by `add_response_schema` to state that it returns the same kind of object
	# (tokenizer or processor) that it was given.
	ProcessingClassT = TypeVar("ProcessingClassT", PreTrainedTokenizerBase, ProcessorMixin)


	def add_response_schema(processing_class: ProcessingClassT) -> ProcessingClassT:
	    r"""
	    Attach the response schema that matches the chat template of the given tokenizer or processor.

	    Most tokenizers do not ship a response schema yet. Until they do, this helper recognizes a set of known chat
	    templates and sets the corresponding schema by hand.

	    For a VLM processor the schema is stored on the inner tokenizer, because `parse_response` is a tokenizer method
	    and reads `self.response_schema` from the tokenizer instance. The template used for the lookup is still the one
	    on the object that was passed in.

	    Args:
	        processing_class (`PreTrainedTokenizerBase` or `ProcessorMixin`):
	            Tokenizer or VLM processor to which the response schema will be added.

	    Returns:
	        `PreTrainedTokenizerBase` or `ProcessorMixin`:
	            The same object that was passed in, with the response schema set on the underlying tokenizer.

	    Raises:
	        ValueError: If the chat template is not one of the recognized templates.

	    Examples:

	    ```python
	    >>> from trl.chat_template_utils import add_response_schema
	    >>> from transformers import AutoTokenizer

	    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
	    >>> tokenizer = add_response_schema(tokenizer)
	    >>> assistant_text = '<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}}\n</tool_call><|im_end|>'
	    >>> tokenizer.parse_response(assistant_text)
	    {'role': 'assistant', 'content': '', 'tool_calls': [{'type': 'function', 'function': {'name': 'multiply', 'arguments': {'a': 3, 'b': 4}}}]}
	    ```
	    """
	    # The lookup key is the top-level template: a processor may carry its own VLM-specific template that differs
	    # from the one on its inner tokenizer.
	    template = processing_class.chat_template
	    # The schema itself lives on the tokenizer, which is where `parse_response` reads it from.
	    target = processing_class.tokenizer if isinstance(processing_class, ProcessorMixin) else processing_class

	    # Each entry maps a group of equivalent templates to the schema that parses their output.
	    known_schemas = (
	        ((glm4moe_chat_template,), glm4moe_schema),
	        ((gptoss_chat_template,), gptoss_schema),
	        ((llama3_1_chat_template, llama3_2_chat_template), llama3_schema),
	        ((qwen3_chat_template, qwen3_vl_chat_template), qwen3_schema),
	        ((qwen3_5_chat_template_2b_and_below, qwen3_5_chat_template_4b_and_above), qwen3_5_schema),
	    )
	    for templates, schema in known_schemas:
	        if template in templates:
	            target.response_schema = schema
	            return processing_class

	    raise ValueError(
	        "Unrecognized chat template, failed to add response schema. Please manually set the response schema on "
	        "the tokenizer or processor. See the Transformers "
	        "[docs](https://huggingface.co/docs/transformers/main/en/chat_response_parsing#response-parsing) for more "
	        "details on response parsing."
	    )


	def supports_tool_calling(processing_class) -> bool:
	    """
	    Check if the processing class's chat template can render a full tool-calling conversation.

	    This tests that (1) the template doesn't error when rendering a conversation with ``user → assistant (with
	    tool_calls) → tool`` roles, and (2) every part of the tool-calling exchange — the assistant's tool call name, its
	    arguments, and the tool message content — actually appears in the rendered output. Some templates silently swallow
	    `tool_calls` (e.g. the basic Llama 3 template, which only reads `message['content']`) or tool messages (e.g.
	    Cohere2, Phi3); both cases must be rejected.

	    For VLMs (processors), the messages are converted to multimodal format via
	    [`~trl.data_utils.prepare_multimodal_messages`] before rendering.

	    Args:
	        processing_class (`PreTrainedTokenizerBase` or `ProcessorMixin`):
	            Tokenizer or processor instance to check.

	    Returns:
	        `bool`:
	            `True` if the chat template supports tool-calling conversations, `False` otherwise (including when no
	            chat template is set).
	    """
	    if processing_class.chat_template is None:
	        return False

	    is_vlm = isinstance(processing_class, ProcessorMixin)
	    # Distinct sentinels so we can tell which part of the exchange a template drops.
	    _name_sentinel = "tool_name_a8f3e2b1"
	    _arg_key_sentinel = "tool_arg_key_b9d4f5c2"
	    _arg_val_sentinel = "tool_arg_val_d6e7a9f3"
	    _content_sentinel = "tool_content_c4f9a8e2"
	    tool_calls = [
	        {
	            "type": "function",
	            "function": {"name": _name_sentinel, "arguments": {_arg_key_sentinel: _arg_val_sentinel}},
	        }
	    ]
	    messages = [
	        {"role": "user", "content": "hi"},
	        {"role": "assistant", "content": "", "tool_calls": tool_calls},
	        {"role": "tool", "name": _name_sentinel, "content": _content_sentinel},
	    ]
	    # VLMs expect content as [{"type": "text", "text": "..."}] instead of plain strings
	    if is_vlm:
	        messages = prepare_multimodal_messages(messages)

	    try:
	        rendered = processing_class.apply_chat_template(messages, tokenize=False)
	    except TemplateError:
	        # TemplateError: template rejects the role sequence (Cohere, FalconMamba, Gemma, Gemma2, Gemma3)
	        # UndefinedError (subclass): template indexes into content as a list for all roles, including tool
	        # (Idefics2, Idefics3, LlavaNext, SmolVLM)
	        return False
	    except TypeError:
	        # Best-effort fallback for templates that reject dict args (e.g. DeepSeek-V3). This is a chat template
	        # bug (see transformers#45419), and the training chat template fixes it to avoid blocking users.
	        # The replacement is written into `messages` itself (as `is_chat_template_prefix_preserving` does) rather
	        # than into the original `tool_calls` list, so it also takes effect if the multimodal conversion returned
	        # copies of the message dicts.
	        messages[1]["tool_calls"] = [
	            {
	                "type": "function",
	                "function": {
	                    "name": _name_sentinel,
	                    "arguments": f'{{"{_arg_key_sentinel}": "{_arg_val_sentinel}"}}',
	                },
	            }
	        ]
	        try:
	            rendered = processing_class.apply_chat_template(messages, tokenize=False)
	        except TemplateError:
	            return False
	    # All four sentinels must survive: the tool name and arguments (assistant tool_calls) AND the tool message
	    # content. Templates that silently drop either side (basic Llama 3 drops tool_calls; Cohere2/Phi3 drop tool
	    # messages) will fail this check.
	    return all(s in rendered for s in (_name_sentinel, _arg_key_sentinel, _arg_val_sentinel, _content_sentinel))


	def is_chat_template_prefix_preserving(processing_class: PreTrainedTokenizerBase | ProcessorMixin) -> bool:
	    """
	    Check whether the chat template preserves prefixes when applied.

	    A prefix-preserving chat template renders earlier messages identically regardless of what messages follow. This
	    property is required by `_get_tool_suffix_ids`, which extracts tool response formatting tokens by comparing
	    tokenizations with and without tool messages appended.

	    Args:
	        processing_class (`PreTrainedTokenizerBase` or `ProcessorMixin`):
	            Tokenizer or processor instance to check.

	    Returns:
	        `bool`:
	            `True` if the chat template preserves prefixes, `False` otherwise.
	    """
	    # Use the same dummy messages as _get_tool_suffix_ids to test the exact property it relies on.
	    dummy_tool_calls = [{"type": "function", "function": {"name": "dummy", "arguments": {}}}]
	    messages1 = [
	        {"role": "user", "content": "dummy"},
	        {"role": "assistant", "content": "", "tool_calls": dummy_tool_calls},
	    ]
	    messages2 = [
	        {"role": "user", "content": "dummy"},
	        {"role": "assistant", "content": "", "tool_calls": dummy_tool_calls},
	        {"role": "tool", "name": "dummy", "content": "dummy"},
	    ]
	    # VLM processors expect structured list-of-blocks content, and image-token expansion only kicks in when an image
	    # is actually present, so include a dummy image to exercise the real code path.
	    is_vlm = isinstance(processing_class, ProcessorMixin)
	    if is_vlm:
	        from PIL import Image

	        dummy_image = Image.new("RGB", (8, 8))
	        messages1 = prepare_multimodal_messages(messages1, images=[dummy_image])
	        messages2 = prepare_multimodal_messages(messages2, images=[dummy_image])

	    def _tokenize_both():
	        # Tokenize the conversation without the tool message, then with it plus a generation prompt.
	        prefix_ids = processing_class.apply_chat_template(messages1, tokenize=True, return_dict=False)
	        full_ids = processing_class.apply_chat_template(
	            messages2, tokenize=True, return_dict=False, add_generation_prompt=True
	        )
	        return prefix_ids, full_ids

	    try:
	        ids1, ids2 = _tokenize_both()
	    except TypeError:
	        # Best-effort fallback for templates that reject dict args (e.g. DeepSeek-V3). This is a chat template
	        # bug (see transformers#45419), and the training chat template fixes it to avoid blocking users.
	        dummy_tool_calls = [{"type": "function", "function": {"name": "dummy", "arguments": "{}"}}]
	        messages1[1]["tool_calls"] = dummy_tool_calls
	        messages2[1]["tool_calls"] = dummy_tool_calls
	        ids1, ids2 = _tokenize_both()

	    # VLM processors return batched output (list of lists), unbatch for single conversation
	    if is_vlm:
	        ids1 = ids1[0]
	        ids2 = ids2[0]

	    # Prefix-preserving means the shorter tokenization is an exact prefix of the longer one.
	    return ids2[: len(ids1)] == ids1


	# Training-compatible variants of the stock chat templates, returned by `get_training_chat_template`. They are
	# decoded explicitly as UTF-8 so their content does not depend on the platform's default encoding.
	deepseekv3_training_chat_template = (_CHAT_TEMPLATES_DIR / "deepseekv3_training.jinja").read_text(encoding="utf-8")

	gemma_training_chat_template = (_CHAT_TEMPLATES_DIR / "gemma_training.jinja").read_text(encoding="utf-8")

	glm4moe_training_chat_template = (_CHAT_TEMPLATES_DIR / "glm4moe_training.jinja").read_text(encoding="utf-8")

	gptoss_training_chat_template = (_CHAT_TEMPLATES_DIR / "gptoss_training.jinja").read_text(encoding="utf-8")

	llama3_training_chat_template = (_CHAT_TEMPLATES_DIR / "llama3_training.jinja").read_text(encoding="utf-8")

	phi3_training_chat_template = (_CHAT_TEMPLATES_DIR / "phi3_training.jinja").read_text(encoding="utf-8")

	qwen2_5_training_chat_template = (_CHAT_TEMPLATES_DIR / "qwen2_5_training.jinja").read_text(encoding="utf-8")

	qwen3_training_chat_template = (_CHAT_TEMPLATES_DIR / "qwen3_training.jinja").read_text(encoding="utf-8")


	def get_training_chat_template(tokenizer: PreTrainedTokenizerBase) -> str | None:
	    r"""
	    Get a training-compatible chat template, if needed.

	    Returns a patched chat template that is prefix-preserving and includes `{%% generation %%}` / `{%% endgeneration
	    %%}` markers for assistant-only loss masking. Returns `None` if the tokenizer's template already satisfies both
	    requirements. Currently DeepSeek-V3, Gemma, Gemma2, GLM-4-MoE, GPT-OSS, LLaMA 3, Phi-3, Qwen2.5, and Qwen3 are
	    supported.

	    Args:
	        tokenizer (`PreTrainedTokenizerBase`):
	            Tokenizer instance to check.

	    Returns:
	        `str` or `None`:
	            Training-compatible chat template, or `None` if no patching is needed.

	    Raises:
	        ValueError: If the tokenizer has no chat template, or if its template needs patching but is not one of the
	            supported templates.

	    Example:

	    ```python
	    >>> from trl.chat_template_utils import get_training_chat_template
	    >>> from transformers import AutoTokenizer

	    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
	    >>> messages1 = [
	    ...     {"role": "user", "content": "What is 2 * 3?"},
	    ...     {
	    ...         "role": "assistant",
	    ...         "content": "",
	    ...         "tool_calls": [{"type": "function", "function": {"name": "multiply", "arguments": {"a": 2, "b": 3}}}],
	    ...     },
	    ... ]
	    >>> messages2 = messages1 + [
	    ...     {"role": "tool", "name": "multiply", "content": "6"},
	    ... ]
	    >>> tokenizer.apply_chat_template(messages1, tokenize=False)
	    '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n<tool_call>\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n</tool_call><|im_end|>\n'

	    >>> tokenizer.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
	    '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n<tool_call>\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n</tool_call><|im_end|>\n<|im_start|>user\n<tool_response>\n6\n</tool_response><|im_end|>\n<|im_start|>assistant\n'

	    >>> #                                                                ^ think tags missing
	    >>> chat_template = get_training_chat_template(tokenizer)
	    >>> tokenizer.apply_chat_template(messages1, tokenize=False, chat_template=chat_template)
	    '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n<tool_call>\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n</tool_call><|im_end|>\n'

	    >>> tokenizer.apply_chat_template(
	    ...     messages2, tokenize=False, add_generation_prompt=True, chat_template=chat_template
	    ... )
	    '<|im_start|>user\nWhat is 2 * 3?<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n<tool_call>\n{"name": "multiply", "arguments": {"a": 2, "b": 3}}\n</tool_call><|im_end|>\n<|im_start|>user\n<tool_response>\n6\n</tool_response><|im_end|>\n<|im_start|>assistant\n'
	    ```
	    """
	    chat_template = tokenizer.chat_template
	    # Without a template the marker check below would fail with an opaque `TypeError` (`in` on `None`); report the
	    # actual problem instead.
	    if chat_template is None:
	        raise ValueError(
	            "The tokenizer has no chat template, so no training-compatible chat template can be derived. "
	            "Please set a chat template on the tokenizer first."
	        )

	    # First check if patching is needed. Prefix-preservation only matters when the template actually supports tools
	    # (the check itself renders a tool message), so skip it otherwise.
	    prefix_ok = not supports_tool_calling(tokenizer) or is_chat_template_prefix_preserving(tokenizer)
	    if prefix_ok and "{% generation %}" in chat_template:
	        return None  # No patching needed

	    # Stock template -> patched training variant. Matching is by exact equality with the stock template text.
	    patched_templates = (
	        (deepseekv3_chat_template, deepseekv3_training_chat_template),
	        (gemma_chat_template, gemma_training_chat_template),
	        (glm4moe_chat_template, glm4moe_training_chat_template),
	        (gptoss_chat_template, gptoss_training_chat_template),
	        (llama3_chat_template, llama3_training_chat_template),
	        (phi3_chat_template, phi3_training_chat_template),
	        (qwen2_5_chat_template, qwen2_5_training_chat_template),
	        (qwen3_chat_template, qwen3_training_chat_template),
	    )
	    for stock_template, training_template in patched_templates:
	        if chat_template == stock_template:
	            return training_template

	    raise ValueError(
	        "The tokenizer's chat template is not training-compatible (missing prefix-preservation or "
	        "`{% generation %}` markers) and patching is not supported for this template. "
	        "Please manually modify the tokenizer's chat template for training."
	    )


	def _validate_tool_calls(tool_calls: list \| None) -> None:
	"""
	Validate tool_calls to ensure all required fields exist with valid values.

	Raises ValueError when the model generates malformed tool calls (e.g., missing 'arguments' field) that are
	partially parsed.

	Args:
	tool_calls: List of tool call dictionaries, or None.
	"""
	if tool_calls is None:
	return None
	if not isinstance(tool_calls, list):
	raise ValueError("tool_calls must be a list or None.")

	for idx, tool_call in enumerate(tool_calls):
	if not isinstance(tool_call, dict):
	raise ValueError(f"tool_calls[{idx}] must be a dict.")

	# Handle nested function structure: {"type": "function", "function": {"name": ..., "arguments": ...}}
	if "function" in tool_call:
	func = tool_call["function"]
	if not isinstance(func, dict):
	raise ValueError(f"tool_calls[{idx}]['function'] must be a dict.")
	if not isinstance(func.get("name"), str):
	raise ValueError(f"tool_calls[{idx}]['function']['name'] must be a string.")
	# Some templates (e.g. Qwen3.5) omit arguments for valid no-arg calls; normalize to {}.
	if "arguments" not in func or func["arguments"] is None:
	func["arguments"] = {}
	else:
	# Handle flat structure: {"name": ..., "arguments": ...}
	if not isinstance(tool_call.get("name"), str):
	raise ValueError(f"tool_calls[{idx}]['name'] must be a string.")
	# Some templates (e.g. Qwen3.5) omit arguments for valid no-arg calls; normalize to {}.
	if "arguments" not in tool_call or tool_call["arguments"] is None:
	tool_call["arguments"] = {}


	def parse_response(processing_class: PreTrainedTokenizerBase | ProcessorMixin, ids: list[int]) -> dict:
	    r"""
	    Parse a token sequence into a structured response dictionary, with fallback handling.

	    Attempts to parse the sequence using `tokenizer.parse_response()`. If parsing fails (e.g., due to malformed tool
	    calls like `<tool_call>{"type":"function"</tool_call>`), falls back to decoding as plain text.

	    Also removes incorrectly appended EOS tokens from the content when present, and validates tool_calls to ensure
	    all required fields exist.

	    For VLM processors, automatically uses the inner tokenizer for parsing.

	    Args:
	        processing_class (`PreTrainedTokenizerBase` or VLM processor):
	            Tokenizer or processor with a `parse_response()` method (directly or via inner tokenizer).
	        ids (`list[int]`):
	            Token IDs of the sequence to parse.

	    Returns:
	        `dict`:
	            Response dictionary. `content` is always set: the parsed content, or `""` when the parsed value is
	            missing or empty. On fallback the dictionary only holds `role` and the decoded text as `content`.

	    Example:
	    ```python
	    >>> from trl.chat_template_utils import parse_response, add_response_schema
	    >>> from transformers import AutoTokenizer

	    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
	    >>> tokenizer = add_response_schema(tokenizer)  # temporary until built-in support
	    >>> text = '<tool_call>\n{"name": "multiply", "arguments": {"a": 3, "b": 4}}\n</tool_call><|im_end|>'
	    >>> ids = tokenizer(text)["input_ids"]
	    >>> parse_response(tokenizer, ids)
	    {'role': 'assistant', 'content': '', 'tool_calls': [{'type': 'function', 'function': {'name': 'multiply', 'arguments': {'a': 3, 'b': 4}}}]}
	    ```
	    """
	    # VLM processors don't have parse_response directly; use the inner tokenizer
	    tokenizer = getattr(processing_class, "tokenizer", processing_class)
	    try:
	        parsed = tokenizer.parse_response(ids)
	        content = parsed.get("content")
	        # Hotfix: remove incorrectly appended EOS token from tool calls
	        # See https://github.com/huggingface/transformers/issues/42249
	        # Skipped when the tokenizer has no EOS token: `removesuffix(None)` would raise a TypeError that the
	        # handler below would mistake for a parsing failure, throwing away a valid parse.
	        eos_token = tokenizer.eos_token
	        if isinstance(content, str) and eos_token:
	            content = content.removesuffix(eos_token)
	        # Normalize: ensure content is always a string (some models omit it or set it to None)
	        parsed["content"] = content or ""
	        # Validate tool_calls to prevent Jinja2 Undefined errors when fields are missing
	        if "tool_calls" in parsed:
	            _validate_tool_calls(parsed["tool_calls"])
	    except (ValueError, TypeError):
	        # Fallback: decode as plain text if parsing fails. This happens if the model outputs malformed tool calls.
	        content = tokenizer.decode(ids, skip_special_tokens=True)
	        parsed = {"role": "assistant", "content": content}
	    return parsed