phucngodev's picture
Duplicate from froggeric/Qwen-Fixed-Chat-Templates
ff1b54b
#!/usr/bin/env python3
"""
Test suite for Qwen v18 chat templates.
Tests logical correctness using Python Jinja2 (minijinja compatibility verified separately).
Usage:
    python3 scripts/test_v18.py
    python3 scripts/test_v18.py qwen3.6  # test one variant (NOTE: no code in this file reads the argument yet)
"""
import sys
import os
from pathlib import Path
from jinja2 import Environment
# ── Setup ──────────────────────────────────────────────────────────────────────
# Parent of the directory that contains this script file.
ROOT = Path(__file__).parent.parent
# List of template variant names; only "root" is declared.
# NOTE(review): not referenced anywhere else in the visible source.
VARIANTS = ["root"]
# Version label of the chat template this suite targets (matches the module docstring).
VERSION = "v18"
def load_template() -> str:
    """Return the contents of ``chat_template.jinja`` found directly under ROOT.

    The file is decoded as UTF-8. Any error raised by ``Path.read_text``
    (for example ``FileNotFoundError``) propagates to the caller.
    """
    template_path = ROOT.joinpath("chat_template.jinja")
    return template_path.read_text(encoding="utf-8")
def render(template_src: str, messages: list, tools=None,
           add_generation_prompt: bool = True,
           enable_thinking=None, preserve_thinking=None) -> str:
    """Render ``template_src`` with Jinja2 and return the resulting prompt text.

    Args:
        template_src: Jinja template source.
        messages: Conversation passed to the template as ``messages``.
        tools: Tool definitions passed to the template as ``tools``.
        add_generation_prompt: Passed to the template under the same name.
        enable_thinking: Passed to the template only when not ``None``.
        preserve_thinking: Passed to the template only when not ``None``.

    The template can call ``raise_exception(msg)``, which raises
    ``ValueError(msg)`` on the Python side.
    """
    def _raise_value_error(msg):
        # Exposed to the template as the ``raise_exception`` global.
        raise ValueError(msg)

    jinja_env = Environment(keep_trailing_newline=False)
    jinja_env.globals["raise_exception"] = _raise_value_error
    compiled = jinja_env.from_string(template_src)

    context = {
        "messages": messages,
        "tools": tools,
        "add_generation_prompt": add_generation_prompt,
    }
    # Leave the optional switches undefined in the template unless the caller
    # set them explicitly, so the template's own defaults stay in effect.
    optional_flags = {
        "enable_thinking": enable_thinking,
        "preserve_thinking": preserve_thinking,
    }
    for flag_name, flag_value in optional_flags.items():
        if flag_value is not None:
            context[flag_name] = flag_value
    return compiled.render(**context)
# ── Test helpers ───────────────────────────────────────────────────────────────
# ANSI-coloured markers printed in front of each check line.
PASS = "\033[92m✓\033[0m"
FAIL = "\033[91m✗\033[0m"
# Outcome of every check() call, in call order.
results = []


def check(name: str, condition: bool, detail: str = ""):
    """Record one assertion outcome and print a status line for it.

    Args:
        name: Label printed for the check.
        condition: Outcome of the check; appended to ``results`` as given.
        detail: Extra context, printed in parentheses only when the check
            fails and the text is non-empty.
    """
    results.append(condition)
    if condition:
        marker, note = PASS, ""
    else:
        marker = FAIL
        note = f" ({detail})" if detail else ""
    print(f" {marker} {name}{note}")
# ── Tests ──────────────────────────────────────────────────────────────────────
def test_xml_tool_call_format(t: str):
    """Check that a historical tool call is rendered in XML ``<function=...>`` form, not JSON."""
    weather_call = {"function": {"name": "get_weather",
                                 "arguments": {"city": "Paris", "units": "celsius"}}}
    conversation = [
        {"role": "user", "content": "Get weather for Paris"},
        {"role": "assistant", "content": "", "tool_calls": [weather_call]},
        {"role": "tool", "content": "Sunny, 22°C"},
    ]
    weather_schema = {"type": "object",
                      "properties": {"city": {"type": "string"},
                                     "units": {"type": "string"}},
                      "required": ["city"]}
    tool_specs = [{"name": "get_weather", "description": "Get weather",
                   "parameters": weather_schema}]
    rendered = render(t, conversation, tools=tool_specs)

    # Locate the first and last <tool_call> markers once; the snippets below
    # are only used as diagnostic detail when a check fails.
    has_tool_call = "<tool_call>" in rendered
    first_at = rendered.find("<tool_call>")
    last_at = rendered.rfind("<tool_call>")
    first_snippet = rendered[first_at:first_at + 120] if has_tool_call else 'no <tool_call>'
    last_snippet = rendered[last_at:last_at + 120] if has_tool_call else 'none'

    check("XML format: <function=...> present", "<function=get_weather>" in rendered,
          f"got: {first_snippet!r}")
    check("XML format: <parameter=city> present", "<parameter=city>" in rendered)
    check("XML format: <parameter=units> present", "<parameter=units>" in rendered)
    check("XML format: </function> present", "</function>" in rendered)
    # Passes trivially when no <tool_call> marker was rendered at all.
    check("XML format: JSON {\"name\": absent in tool_call section",
          (not has_tool_call) or '{"name":' not in rendered[last_at:],
          f"tool_call section: {last_snippet!r}")
    check("XML format: tool_instructions mention <function=", "<function=" in rendered)
def test_tool_instructions_format(t: str):
    """Check that the tool instructions show the XML call example rather than a JSON one."""
    conversation = [{"role": "user", "content": "Hello"}]
    empty_schema = {"type": "object", "properties": {}, "required": []}
    tool_specs = [{"name": "noop", "description": "no-op", "parameters": empty_schema}]
    rendered = render(t, conversation, tools=tool_specs)

    check("Instructions: XML example present",
          "<function=example_function_name>" in rendered)
    check("Instructions: JSON example absent",
          '{"name": "tool_name"' not in rendered)
    check("Instructions: <parameter= in example",
          "<parameter=example_parameter_1>" in rendered)
def test_normal_generation_prompt(t: str):
    """Check that a plain user turn renders a prompt ending in an open ``<think>`` line."""
    rendered = render(t, [{"role": "user", "content": "Hello"}])
    ends_open = rendered.endswith("<think>\n")
    check("Normal gen-prompt: ends with <think>\\n", ends_open,
          f"tail: {rendered[-30:]!r}")
def test_thinking_bypass(t: str):
    """Check that ``enable_thinking=False`` yields an empty, closed think block."""
    empty_think = "<think>\n\n</think>\n\n"
    rendered = render(t, [{"role": "user", "content": "Hello"}], enable_thinking=False)

    check("Think bypass: <think>\\n\\n</think>\\n\\n present",
          empty_think in rendered, f"tail: {rendered[-60:]!r}")
    check("Think bypass: no open-only <think>\\n at end",
          not rendered.endswith("<think>\n"))
def test_think_off_token(t: str):
    """Check that ``<|think_off|>`` in the system message disables thinking and is not echoed."""
    system_turn = {"role": "system", "content": "You are helpful.<|think_off|>"}
    user_turn = {"role": "user", "content": "Hello"}
    rendered = render(t, [system_turn, user_turn])

    check("<|think_off|> disables thinking", "<think>\n\n</think>\n\n" in rendered)
    check("<|think_off|> token stripped from output", "<|think_off|>" not in rendered)
def test_think_on_after_off(t: str):
    """Check that a later ``<|think_on|>`` overrides an earlier ``<|think_off|>``."""
    conversation = [
        {"role": "system", "content": "<|think_off|>"},
        {"role": "user", "content": "Step 1"},
        {"role": "assistant", "content": "Done."},
        {"role": "user", "content": "<|think_on|>Now think"},
    ]
    rendered = render(t, conversation)
    reopened = rendered.endswith("<think>\n")
    check("<|think_on|> re-enables thinking", reopened, f"tail: {rendered[-40:]!r}")
def test_tier1_error_escalation(t: str):
    """Check that a single failing tool result adds the Tier 1 correction hint."""
    failed_call = {"function": {"name": "read_file",
                                "arguments": {"path": "/foo", "pages": "1-30"}}}
    conversation = [
        {"role": "user", "content": "Read a file"},
        {"role": "assistant", "content": "", "tool_calls": [failed_call]},
        {"role": "tool", "content": "Pages range exceeds maximum of 20 pages per request."},
    ]
    tool_specs = [{"name": "read_file", "description": "Read",
                   "parameters": {"type": "object",
                                  "properties": {"path": {"type": "string"}},
                                  "required": ["path"]}}]
    rendered = render(t, conversation, tools=tool_specs)

    check("Tier 1: correction hint in gen-prompt",
          "The previous tool call returned an error" in rendered,
          f"tail: {rendered[-120:]!r}")
    # NOTE(review): only a trailing newline is asserted here. The empty-think
    # bypass sequence also ends with a newline, so this check alone does not
    # rule out a bypass. Confirm the intended expectation against the template.
    check("Tier 1: think block still open (not bypassed)", rendered.endswith("\n"))
def test_tier2_error_escalation(t: str):
    """Check that two back-to-back tool errors produce the Tier 2 bypass and warning."""
    tool_error = "Pages range exceeds maximum of 20 pages per request."

    def attempt(pages):
        # One assistant turn calling read_file on /foo with the given page range.
        return {"role": "assistant", "content": "",
                "tool_calls": [{"function": {"name": "read_file",
                                             "arguments": {"path": "/foo", "pages": pages}}}]}

    conversation = [
        {"role": "user", "content": "Read a file"},
        attempt("1-30"),
        {"role": "tool", "content": tool_error},
        attempt("1-25"),
        {"role": "tool", "content": tool_error},
    ]
    tool_specs = [{"name": "read_file", "description": "Read",
                   "parameters": {"type": "object",
                                  "properties": {"path": {"type": "string"}},
                                  "required": ["path"]}}]
    rendered = render(t, conversation, tools=tool_specs)
    tail = f"tail: {rendered[-200:]!r}"

    check("Tier 2: think bypass injected", "<think>\n\n</think>\n\n" in rendered, tail)
    # NOTE(review): the first phrase contains the second, so this is
    # effectively a test for the word "consecutive" alone.
    warning_seen = any(phrase in rendered
                       for phrase in ("consecutive tool errors", "consecutive"))
    check("Tier 2: escalation warning present", warning_seen, tail)
def test_length_gated_detection(t: str):
    """Check that a long tool result mentioning 'error' is not treated as a tool failure."""
    js_snippet = ("// Error handling\nfunction handleError(e) {\n"
                  " throw new Error('invalid input');\n}\n")
    long_content = js_snippet * 30  # >> 500 chars
    read_call = {"function": {"name": "read_file",
                              "arguments": {"path": "app.js"}}}
    conversation = [
        {"role": "user", "content": "Read the code"},
        {"role": "assistant", "content": "", "tool_calls": [read_call]},
        {"role": "tool", "content": long_content},
    ]
    tool_specs = [{"name": "read_file", "description": "Read",
                   "parameters": {"type": "object",
                                  "properties": {"path": {"type": "string"}},
                                  "required": ["path"]}}]
    rendered = render(t, conversation, tools=tool_specs)

    hint_absent = "The previous tool call returned an error" not in rendered
    check("Length gate: long response with 'error' does NOT trigger hint",
          hint_absent, f"tail: {rendered[-80:]!r}")
    check("Length gate: normal gen-prompt after long response",
          rendered.endswith("<think>\n"), f"tail: {rendered[-40:]!r}")
def test_error_counter_resets_on_success(t: str):
    """Check that an error, a success, then another error yields Tier 1 output, not Tier 2."""
    tool_error = "Error: file not found."

    def read_turn(path):
        # Assistant turn that calls read_file on the given path.
        return {"role": "assistant", "content": "",
                "tool_calls": [{"function": {"name": "read_file",
                                             "arguments": {"path": path}}}]}

    conversation = [
        {"role": "user", "content": "Do something"},
        read_turn("/bad"),
        {"role": "tool", "content": tool_error},                 # error -> cf=1
        read_turn("/good"),
        {"role": "tool", "content": "file content here " * 30},  # success, long -> cf=0
        read_turn("/bad2"),
        {"role": "tool", "content": tool_error},                 # error -> cf=1 again (not 2)
    ]
    tool_specs = [{"name": "read_file", "description": "Read",
                   "parameters": {"type": "object",
                                  "properties": {"path": {"type": "string"}},
                                  "required": ["path"]}}]
    rendered = render(t, conversation, tools=tool_specs)

    # Expect the Tier 1 hint (cf=1) without the Tier 2 wording (cf>=2).
    tier1_present = "The previous tool call returned an error" in rendered
    tier2_absent = "consecutive tool errors" not in rendered
    check("Counter reset: Tier 1 after success+new error (not Tier 2)",
          tier1_present and tier2_absent,
          f"tail: {rendered[-200:]!r}")
def test_historical_thinking_stripped(t: str):
    """Check that think content from an earlier assistant turn is absent by default."""
    earlier_answer = "<think>\nmy thoughts\n</think>\n\nAnswer 1"
    conversation = [
        {"role": "user", "content": "Q1"},
        {"role": "assistant", "content": earlier_answer},
        {"role": "user", "content": "Q2"},
    ]
    rendered = render(t, conversation)
    check("Historical think stripped by default", "my thoughts" not in rendered)
def test_preserve_thinking(t: str):
    """Check that ``preserve_thinking=True`` keeps think content from an earlier assistant turn."""
    earlier_answer = "<think>\nmy thoughts\n</think>\n\nAnswer 1"
    conversation = [
        {"role": "user", "content": "Q1"},
        {"role": "assistant", "content": earlier_answer},
        {"role": "user", "content": "Q2"},
    ]
    rendered = render(t, conversation, preserve_thinking=True)
    check("preserve_thinking=True keeps historical thoughts", "my thoughts" in rendered)
def test_developer_role(t: str):
    """Check that a ``developer`` role message renders without raising and its text appears."""
    conversation = [
        {"role": "developer", "content": "You are a coder."},
        {"role": "user", "content": "Write code"},
    ]
    try:
        rendered = render(t, conversation)
        check("developer role: no crash", True)
        check("developer role: content rendered", "You are a coder." in rendered)
    except Exception as exc:
        # Any exception is reported as a failed check rather than aborting the run.
        check("developer role: no crash", False, str(exc))
def test_mid_conversation_system(t: str):
    """Check that a system message placed mid-conversation renders without raising."""
    conversation = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi"},
        {"role": "system", "content": "You must now speak in French."},
        {"role": "user", "content": "What's up?"},
    ]
    try:
        rendered = render(t, conversation)
        check("Mid-conv system: no crash", True)
        check("Mid-conv system: content present",
              "You must now speak in French." in rendered)
    except Exception as exc:
        # Any exception is reported as a failed check rather than aborting the run.
        check("Mid-conv system: no crash", False, str(exc))
def test_tool_response_wrapping(t: str):
    """Check that the output contains a ``<tool_response>`` tag and the tool's content."""
    fetch_call = {"function": {"name": "fetch",
                               "arguments": {"url": "https://example.com"}}}
    conversation = [
        {"role": "user", "content": "Get data"},
        {"role": "assistant", "content": "", "tool_calls": [fetch_call]},
        {"role": "tool", "content": "data here"},
    ]
    tool_specs = [{"name": "fetch", "description": "Fetch URL",
                   "parameters": {"type": "object",
                                  "properties": {"url": {"type": "string"}},
                                  "required": ["url"]}}]
    rendered = render(t, conversation, tools=tool_specs)

    check("Tool response: <tool_response> wrapper present", "<tool_response>" in rendered)
    # Only presence in the output is asserted, not position relative to the tag.
    check("Tool response: content inside wrapper", "data here" in rendered)
def test_no_tools_no_crash(t: str):
    """Check that rendering with no tools succeeds and ends in an open ``<think>`` line."""
    conversation = [{"role": "user", "content": "What is 2+2?"}]
    try:
        rendered = render(t, conversation)
        check("No tools: no crash", True)
        check("No tools: normal gen-prompt", rendered.endswith("<think>\n"))
    except Exception as exc:
        # Any exception is reported as a failed check rather than aborting the run.
        check("No tools: no crash", False, str(exc))
def test_string_arguments_passthrough(t: str):
    """Check that tool-call arguments supplied as a JSON string render without raising."""
    serialized_args = '{"query": "python", "limit": 5}'
    search_call = {"function": {"name": "search", "arguments": serialized_args}}
    conversation = [
        {"role": "user", "content": "Search"},
        {"role": "assistant", "content": "", "tool_calls": [search_call]},
        {"role": "tool", "content": "results"},
    ]
    tool_specs = [{"name": "search", "description": "Search",
                   "parameters": {"type": "object",
                                  "properties": {"query": {"type": "string"}},
                                  "required": ["query"]}}]
    rendered = render(t, conversation, tools=tool_specs)
    # Only asserts that the tool name appears somewhere in the output.
    check("String args: passthrough without crash", "search" in rendered)
# ── New tests for v16 fixes ────────────────────────────────────────────────────
def test_shell_result_false_positive(t: str):
    """Check that a short grep result containing 'error' in an identifier is not flagged as an error."""
    shell_result = '$ grep -n "error_message" orchestrator.go (timeout 5s)\n\n661: "error_message": "",\n\nTook 0.1s'
    grep_call = {"function": {"name": "shell",
                              "arguments": {"cmd": "grep -n error_message file.go"}}}
    conversation = [
        {"role": "user", "content": "Search for error_message"},
        {"role": "assistant", "content": "", "tool_calls": [grep_call]},
        {"role": "tool", "content": shell_result},
    ]
    tool_specs = [{"name": "shell", "description": "Run shell command",
                   "parameters": {"type": "object",
                                  "properties": {"cmd": {"type": "string"}},
                                  "required": ["cmd"]}}]
    rendered = render(t, conversation, tools=tool_specs)

    hint_absent = "The previous tool call returned an error" not in rendered
    check("Shell false-positive: grep with 'error_message' not flagged",
          hint_absent, f"tail: {rendered[-80:]!r}")
    check("Shell false-positive: normal gen-prompt after grep result",
          rendered.endswith("<think>\n"), f"tail: {rendered[-40:]!r}")
def test_no_thinking_with_error_escalation(t: str):
    """Check that with thinking disabled, a tool error adds the hint without opening a think block."""
    tool_error = "Pages range exceeds maximum of 20 pages per request."
    failed_call = {"function": {"name": "read_file",
                                "arguments": {"path": "/foo", "pages": "1-30"}}}
    conversation = [
        {"role": "user", "content": "Read a file"},
        {"role": "assistant", "content": "", "tool_calls": [failed_call]},
        {"role": "tool", "content": tool_error},
    ]
    tool_specs = [{"name": "read_file", "description": "Read",
                   "parameters": {"type": "object",
                                  "properties": {"path": {"type": "string"}},
                                  "required": ["path"]}}]
    rendered = render(t, conversation, tools=tool_specs, enable_thinking=False)
    long_tail = f"tail: {rendered[-120:]!r}"

    check("No-think + error: correction hint present",
          "The previous tool call returned an error" in rendered,
          long_tail)
    # Trailing whitespace is ignored when looking for a dangling <think>.
    check("No-think + error: does not end with open think block",
          not rendered.rstrip().endswith("<think>"),
          f"tail: {rendered[-80:]!r}")
    check("No-think + error: no unclosed <think> in error section",
          "<think>\nThe previous" not in rendered,
          long_tail)
# ── Runner ─────────────────────────────────────────────────────────────────────
# Ordered registry of test callables; run_tests() calls each one with the
# template source as its single argument.
TESTS = [
    test_xml_tool_call_format,
    test_tool_instructions_format,
    test_normal_generation_prompt,
    test_thinking_bypass,
    test_think_off_token,
    test_think_on_after_off,
    test_tier1_error_escalation,
    test_tier2_error_escalation,
    test_length_gated_detection,
    test_error_counter_resets_on_success,
    test_historical_thinking_stripped,
    test_preserve_thinking,
    test_developer_role,
    test_mid_conversation_system,
    test_tool_response_wrapping,
    test_no_tools_no_crash,
    test_string_arguments_passthrough,
    test_shell_result_false_positive,
    test_no_thinking_with_error_escalation,
]
# Re-binds the module-level `results` name (first assigned next to the
# PASS/FAIL markers) to a fresh, annotated empty list. check() looks the name
# up at call time, so outcomes are appended to this object.
results: list[bool] = []
def run_tests():
    """Run every registered test against the chat template.

    Loads the template once via ``load_template()`` and calls each function
    in ``TESTS`` with the template source. Individual outcomes accumulate in
    the module-level ``results`` list through ``check()``.

    If the template file is missing, a single failure is recorded in
    ``results`` so that the summary and exit status computed by the
    ``__main__`` driver do not report an empty run as success.
    """
    rule = "═" * 60
    print(f"\n{rule}")
    # Use the VERSION constant so the banner cannot drift from the suite's
    # declared version (it previously said "v17").
    print(f" Testing {VERSION} Chat Template")
    print(rule)
    try:
        tmpl = load_template()
    except FileNotFoundError:
        print(" Template not found: chat_template.jinja")
        # Without this, zero checks would be recorded and the run would end
        # as "0/0 passed (all passed)" with exit status 0.
        results.append(False)
        return
    for fn in TESTS:
        # "test_foo_bar" -> "Foo Bar" for the section heading.
        label = fn.__name__.replace("test_", "").replace("_", " ").title()
        print(f"\n [{label}]")
        fn(tmpl)
if __name__ == "__main__":
    # Start from an empty outcome list, run the suite, then print a summary
    # and exit non-zero when any check failed.
    results.clear()
    run_tests()

    passed = sum(results)
    total = len(results)
    failed = total - passed

    rule = "═" * 60
    print(f"\n{rule}")
    print(f" Results: {passed}/{total} passed", end="")
    if failed:
        print(f" \033[91m({failed} failed)\033[0m")
    else:
        print(" \033[92m(all passed)\033[0m")
    print(f"{rule}\n")
    sys.exit(1 if failed else 0)