#!/usr/bin/env python3 """ Test suite for Qwen v18 chat templates. Tests logical correctness using Python Jinja2 (minijinja compatibility verified separately). Usage: python3 scripts/test_v18.py python3 scripts/test_v18.py qwen3.6 # test one variant """ import sys import os from pathlib import Path from jinja2 import Environment # ── Setup ────────────────────────────────────────────────────────────────────── ROOT = Path(__file__).parent.parent VARIANTS = ["root"] VERSION = "v18" def load_template() -> str: path = ROOT / "chat_template.jinja" return path.read_text(encoding="utf-8") def render(template_src: str, messages: list, tools=None, add_generation_prompt: bool = True, enable_thinking=None, preserve_thinking=None) -> str: env = Environment(keep_trailing_newline=False) env.globals["raise_exception"] = lambda msg: (_ for _ in ()).throw(ValueError(msg)) tmpl = env.from_string(template_src) kwargs = dict(messages=messages, tools=tools, add_generation_prompt=add_generation_prompt) if enable_thinking is not None: kwargs["enable_thinking"] = enable_thinking if preserve_thinking is not None: kwargs["preserve_thinking"] = preserve_thinking return tmpl.render(**kwargs) # ── Test helpers ─────────────────────────────────────────────────────────────── PASS = "\033[92m✓\033[0m" FAIL = "\033[91m✗\033[0m" results = [] def check(name: str, condition: bool, detail: str = ""): status = PASS if condition else FAIL results.append(condition) suffix = f" ({detail})" if detail and not condition else "" print(f" {status} {name}{suffix}") # ── Tests ────────────────────────────────────────────────────────────────────── def test_xml_tool_call_format(t: str): """Tool calls in history must use XML format, not JSON.""" messages = [ {"role": "user", "content": "Get weather for Paris"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "get_weather", "arguments": {"city": "Paris", "units": "celsius"}}}]}, {"role": "tool", "content": "Sunny, 22°C"}, ] tools = [{"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {"city": {"type": "string"}, "units": {"type": "string"}}, "required": ["city"]}}] out = render(t, messages, tools=tools) check("XML format: present", "" in out, f"got: {repr(out[out.find(''):out.find('')+120] if '' in out else 'no ')}") check("XML format: present", "" in out) check("XML format: present", "" in out) check("XML format: present", "" in out) check("XML format: JSON {\"name\": absent in tool_call section", '{"name":' not in out[out.rfind(''):] if '' in out else True, f"tool_call section: {repr(out[out.rfind(''):out.rfind('')+120] if '' in out else 'none')}") check("XML format: tool_instructions mention " in out) check("Instructions: JSON example absent", '{"name": "tool_name"' not in out) check("Instructions: " in out) def test_normal_generation_prompt(t: str): """Normal generation prompt opens block.""" messages = [{"role": "user", "content": "Hello"}] out = render(t, messages) check("Normal gen-prompt: ends with \\n", out.endswith("\n"), f"tail: {repr(out[-30:])}") def test_thinking_bypass(t: str): """When thinking disabled via enable_thinking=False, bypass is injected.""" messages = [{"role": "user", "content": "Hello"}] out = render(t, messages, enable_thinking=False) check("Think bypass: \\n\\n\\n\\n present", "\n\n\n\n" in out, f"tail: {repr(out[-60:])}") check("Think bypass: no open-only \\n at end", not out.endswith("\n")) def test_think_off_token(t: str): """<|think_off|> in system message disables thinking.""" messages = [ {"role": "system", "content": "You are helpful.<|think_off|>"}, {"role": "user", "content": "Hello"}, ] out = render(t, messages) check("<|think_off|> disables thinking", "\n\n\n\n" in out) check("<|think_off|> token stripped from output", "<|think_off|>" not in out) def test_think_on_after_off(t: str): """<|think_on|> re-enables thinking after <|think_off|>.""" messages = [ {"role": "system", "content": "<|think_off|>"}, {"role": "user", "content": "Step 1"}, {"role": "assistant", "content": "Done."}, {"role": "user", "content": "<|think_on|>Now think"}, ] out = render(t, messages) check("<|think_on|> re-enables thinking", out.endswith("\n"), f"tail: {repr(out[-40:])}") def test_tier1_error_escalation(t: str): """First tool error injects Tier 1 correction hint.""" messages = [ {"role": "user", "content": "Read a file"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "/foo", "pages": "1-30"}}}]}, {"role": "tool", "content": "Pages range exceeds maximum of 20 pages per request."}, ] tools = [{"name": "read_file", "description": "Read", "parameters": {"type": "object", "properties": {"path": {"type": "string"}}, "required": ["path"]}}] out = render(t, messages, tools=tools) check("Tier 1: correction hint in gen-prompt", "The previous tool call returned an error" in out, f"tail: {repr(out[-120:])}") check("Tier 1: think block still open (not bypassed)", out.endswith("\n")) def test_tier2_error_escalation(t: str): """Two consecutive tool errors trigger Tier 2 bypass.""" tool_error = "Pages range exceeds maximum of 20 pages per request." messages = [ {"role": "user", "content": "Read a file"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "/foo", "pages": "1-30"}}}]}, {"role": "tool", "content": tool_error}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "/foo", "pages": "1-25"}}}]}, {"role": "tool", "content": tool_error}, ] tools = [{"name": "read_file", "description": "Read", "parameters": {"type": "object", "properties": {"path": {"type": "string"}}, "required": ["path"]}}] out = render(t, messages, tools=tools) check("Tier 2: think bypass injected", "\n\n\n\n" in out, f"tail: {repr(out[-200:])}") check("Tier 2: escalation warning present", "consecutive tool errors" in out or "consecutive" in out, f"tail: {repr(out[-200:])}") def test_length_gated_detection(t: str): """Long tool response (code content with 'error') must NOT trigger error flag.""" long_content = ("// Error handling\nfunction handleError(e) {\n" " throw new Error('invalid input');\n}\n") * 30 # >> 500 chars messages = [ {"role": "user", "content": "Read the code"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "app.js"}}}]}, {"role": "tool", "content": long_content}, ] tools = [{"name": "read_file", "description": "Read", "parameters": {"type": "object", "properties": {"path": {"type": "string"}}, "required": ["path"]}}] out = render(t, messages, tools=tools) check("Length gate: long response with 'error' does NOT trigger hint", "The previous tool call returned an error" not in out, f"tail: {repr(out[-80:])}") check("Length gate: normal gen-prompt after long response", out.endswith("\n"), f"tail: {repr(out[-40:])}") def test_error_counter_resets_on_success(t: str): """After a successful tool call, the consecutive failure counter resets.""" tool_error = "Error: file not found." messages = [ {"role": "user", "content": "Do something"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "/bad"}}}]}, {"role": "tool", "content": tool_error}, # error → cf=1 {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "/good"}}}]}, {"role": "tool", "content": "file content here " * 30}, # success, long → cf=0 {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "/bad2"}}}]}, {"role": "tool", "content": tool_error}, # error → cf=1 again (not 2) ] tools = [{"name": "read_file", "description": "Read", "parameters": {"type": "object", "properties": {"path": {"type": "string"}}, "required": ["path"]}}] out = render(t, messages, tools=tools) # Should be Tier 1 (cf=1), not Tier 2 (cf>=2) check("Counter reset: Tier 1 after success+new error (not Tier 2)", "The previous tool call returned an error" in out and "consecutive tool errors" not in out, f"tail: {repr(out[-200:])}") def test_historical_thinking_stripped(t: str): """Historical assistant blocks are stripped (not last message).""" messages = [ {"role": "user", "content": "Q1"}, {"role": "assistant", "content": "\nmy thoughts\n\n\nAnswer 1"}, {"role": "user", "content": "Q2"}, ] out = render(t, messages) check("Historical think stripped by default", "my thoughts" not in out) def test_preserve_thinking(t: str): """With preserve_thinking=True, historical blocks are kept.""" messages = [ {"role": "user", "content": "Q1"}, {"role": "assistant", "content": "\nmy thoughts\n\n\nAnswer 1"}, {"role": "user", "content": "Q2"}, ] out = render(t, messages, preserve_thinking=True) check("preserve_thinking=True keeps historical thoughts", "my thoughts" in out) def test_developer_role(t: str): """developer role is accepted (same as system).""" messages = [ {"role": "developer", "content": "You are a coder."}, {"role": "user", "content": "Write code"}, ] try: out = render(t, messages) check("developer role: no crash", True) check("developer role: content rendered", "You are a coder." in out) except Exception as e: check("developer role: no crash", False, str(e)) def test_mid_conversation_system(t: str): """System messages mid-conversation are rendered chronologically.""" messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi"}, {"role": "system", "content": "You must now speak in French."}, {"role": "user", "content": "What's up?"}, ] try: out = render(t, messages) check("Mid-conv system: no crash", True) check("Mid-conv system: content present", "You must now speak in French." in out) except Exception as e: check("Mid-conv system: no crash", False, str(e)) def test_tool_response_wrapping(t: str): """Tool responses are wrapped in tags.""" messages = [ {"role": "user", "content": "Get data"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "fetch", "arguments": {"url": "https://example.com"}}}]}, {"role": "tool", "content": "data here"}, ] tools = [{"name": "fetch", "description": "Fetch URL", "parameters": {"type": "object", "properties": {"url": {"type": "string"}}, "required": ["url"]}}] out = render(t, messages, tools=tools) check("Tool response: wrapper present", "" in out) check("Tool response: content inside wrapper", "data here" in out) def test_no_tools_no_crash(t: str): """Template works without any tools passed.""" messages = [{"role": "user", "content": "What is 2+2?"}] try: out = render(t, messages) check("No tools: no crash", True) check("No tools: normal gen-prompt", out.endswith("\n")) except Exception as e: check("No tools: no crash", False, str(e)) def test_string_arguments_passthrough(t: str): """String tool arguments (pre-serialized JSON) are passed through as-is.""" messages = [ {"role": "user", "content": "Search"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "search", "arguments": '{"query": "python", "limit": 5}'}}]}, {"role": "tool", "content": "results"}, ] tools = [{"name": "search", "description": "Search", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}] out = render(t, messages, tools=tools) check("String args: passthrough without crash", "search" in out) # ── New tests for v16 fixes ──────────────────────────────────────────────────── def test_shell_result_false_positive(t: str): """Short grep results containing 'error' in identifiers must NOT trigger error flag.""" shell_result = '$ grep -n "error_message" orchestrator.go (timeout 5s)\n\n661: "error_message": "",\n\nTook 0.1s' messages = [ {"role": "user", "content": "Search for error_message"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "shell", "arguments": {"cmd": "grep -n error_message file.go"}}}]}, {"role": "tool", "content": shell_result}, ] tools = [{"name": "shell", "description": "Run shell command", "parameters": {"type": "object", "properties": {"cmd": {"type": "string"}}, "required": ["cmd"]}}] out = render(t, messages, tools=tools) check("Shell false-positive: grep with 'error_message' not flagged", "The previous tool call returned an error" not in out, f"tail: {repr(out[-80:])}") check("Shell false-positive: normal gen-prompt after grep result", out.endswith("\n"), f"tail: {repr(out[-40:])}") def test_no_thinking_with_error_escalation(t: str): """When enable_thinking=False and a tool errors, correction hint must NOT open a block.""" tool_error = "Pages range exceeds maximum of 20 pages per request." messages = [ {"role": "user", "content": "Read a file"}, {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "read_file", "arguments": {"path": "/foo", "pages": "1-30"}}}]}, {"role": "tool", "content": tool_error}, ] tools = [{"name": "read_file", "description": "Read", "parameters": {"type": "object", "properties": {"path": {"type": "string"}}, "required": ["path"]}}] out = render(t, messages, tools=tools, enable_thinking=False) check("No-think + error: correction hint present", "The previous tool call returned an error" in out, f"tail: {repr(out[-120:])}") check("No-think + error: does not end with open think block", not out.rstrip().endswith(""), f"tail: {repr(out[-80:])}") check("No-think + error: no unclosed in error section", "\nThe previous" not in out, f"tail: {repr(out[-120:])}") # ── Runner ───────────────────────────────────────────────────────────────────── TESTS = [ test_xml_tool_call_format, test_tool_instructions_format, test_normal_generation_prompt, test_thinking_bypass, test_think_off_token, test_think_on_after_off, test_tier1_error_escalation, test_tier2_error_escalation, test_length_gated_detection, test_error_counter_resets_on_success, test_historical_thinking_stripped, test_preserve_thinking, test_developer_role, test_mid_conversation_system, test_tool_response_wrapping, test_no_tools_no_crash, test_string_arguments_passthrough, test_shell_result_false_positive, test_no_thinking_with_error_escalation, ] results: list[bool] = [] def run_tests(): print(f"\n{'═'*60}") print(f" Testing v17 Chat Template") print(f"{'═'*60}") try: tmpl = load_template() except FileNotFoundError: print(f" Template not found: chat_template.jinja") return for fn in TESTS: label = fn.__name__.replace("test_", "").replace("_", " ").title() print(f"\n [{label}]") fn(tmpl) if __name__ == "__main__": results.clear() run_tests() total = len(results) passed = sum(results) failed = total - passed print(f"\n{'═'*60}") print(f" Results: {passed}/{total} passed", end="") if failed: print(f" \033[91m({failed} failed)\033[0m") else: print(f" \033[92m(all passed)\033[0m") print(f"{'═'*60}\n") sys.exit(0 if failed == 0 else 1)