File size: 19,059 Bytes

ff1b54b

#!/usr/bin/env python3
"""
Test suite for Qwen v18 chat templates.
Tests logical correctness using Python Jinja2 (minijinja compatibility verified separately).

Usage:
    python3 scripts/test_v18.py
    python3 scripts/test_v18.py qwen3.6   # NOTE: the variant argument is currently ignored (sys.argv is never read)
"""

import sys
import os
from pathlib import Path
from jinja2 import Environment

# ── Setup ──────────────────────────────────────────────────────────────────────

# Directory two levels above this file; load_template() reads
# chat_template.jinja from here.
ROOT = Path(__file__).parent.parent
# NOTE(review): not read anywhere in the visible code -- presumably a leftover
# from multi-variant runs; confirm before relying on it.
VARIANTS = ["root"]
# Template version under test (matches the module docstring).
VERSION = "v18"

def load_template() -> str:
    """Return the text of ``chat_template.jinja`` found directly under ROOT.

    The file is decoded as UTF-8. A missing file surfaces as the
    ``FileNotFoundError`` raised by ``Path.read_text``.
    """
    return (ROOT / "chat_template.jinja").read_text(encoding="utf-8")

def render(template_src: str, messages: list, tools=None,
           add_generation_prompt: bool = True,
           enable_thinking=None, preserve_thinking=None) -> str:
    """Render *template_src* with Jinja2 and return the resulting prompt text.

    ``messages``, ``tools`` and ``add_generation_prompt`` are always passed to
    the template. ``enable_thinking`` and ``preserve_thinking`` are passed only
    when they are not ``None``, so the template sees them as undefined
    otherwise. A ``raise_exception(msg)`` global is made available to the
    template; calling it raises ``ValueError(msg)``.
    """
    def _raise_exception(msg):
        raise ValueError(msg)

    env = Environment(keep_trailing_newline=False)
    env.globals["raise_exception"] = _raise_exception

    context = {
        "messages": messages,
        "tools": tools,
        "add_generation_prompt": add_generation_prompt,
    }
    # Optional switches are forwarded only when the caller set them explicitly.
    optional = {
        "enable_thinking": enable_thinking,
        "preserve_thinking": preserve_thinking,
    }
    context.update({key: value for key, value in optional.items() if value is not None})
    return env.from_string(template_src).render(**context)

# ── Test helpers ───────────────────────────────────────────────────────────────

# ANSI-coloured status markers printed by check(): green check mark / red cross.
PASS = "\033[92m✓\033[0m"
FAIL = "\033[91m✗\033[0m"

# One entry per check() call, in call order; tallied by the __main__ block.
results: list[bool] = []

def check(name: str, condition: bool, detail: str = ""):
    """Record one check outcome in ``results`` and print a status line.

    The line shows the PASS or FAIL marker followed by *name*. *detail* is
    appended in parentheses only when the check failed and *detail* is
    non-empty.
    """
    results.append(condition)
    if condition:
        marker, suffix = PASS, ""
    else:
        marker = FAIL
        suffix = f"  ({detail})" if detail else ""
    print(f"  {marker}  {name}{suffix}")

# ── Tests ──────────────────────────────────────────────────────────────────────

def test_xml_tool_call_format(t: str):
    """Check that a past assistant tool call is rendered in XML form.

    Renders a user -> assistant(tool call) -> tool exchange for ``get_weather``
    and records six checks: the ``<function=...>``, ``<parameter=...>`` and
    ``</function>`` markers must be present, and the text from the last
    ``<tool_call>`` onward must not contain a JSON-style ``{"name":`` key.
    """
    weather_call = {"function": {"name": "get_weather",
                                 "arguments": {"city": "Paris", "units": "celsius"}}}
    conversation = [
        {"role": "user", "content": "Get weather for Paris"},
        {"role": "assistant", "content": "", "tool_calls": [weather_call]},
        {"role": "tool", "content": "Sunny, 22°C"},
    ]
    schema = {"type": "object",
              "properties": {"city": {"type": "string"},
                             "units": {"type": "string"}},
              "required": ["city"]}
    tool_defs = [{"name": "get_weather", "description": "Get weather",
                  "parameters": schema}]
    rendered = render(t, conversation, tools=tool_defs)

    # Locate the first and last <tool_call> once; the snippets are only used
    # as failure diagnostics.
    marker = "<tool_call>"
    has_call = marker in rendered
    first_at = rendered.find(marker)
    last_at = rendered.rfind(marker)
    first_snippet = rendered[first_at:first_at + 120] if has_call else "no <tool_call>"
    last_snippet = rendered[last_at:last_at + 120] if has_call else "none"
    json_absent = (not has_call) or '{"name":' not in rendered[last_at:]

    check("XML format: <function=...> present", "<function=get_weather>" in rendered,
          f"got: {first_snippet!r}")
    for param in ("city", "units"):
        check(f"XML format: <parameter={param}> present",
              f"<parameter={param}>" in rendered)
    check("XML format: </function> present", "</function>" in rendered)
    check('XML format: JSON {"name": absent in tool_call section', json_absent,
          f"tool_call section: {last_snippet!r}")
    check("XML format: tool_instructions mention <function=", "<function=" in rendered)

def test_tool_instructions_format(t: str):
    """Check that the rendered tool instructions carry an XML example.

    Renders a single user turn with one no-op tool and records three checks:
    the XML example function and parameter tags are present and the JSON
    example ``{"name": "tool_name"`` is absent.
    """
    noop_tool = {"name": "noop", "description": "no-op",
                 "parameters": {"type": "object", "properties": {}, "required": []}}
    rendered = render(t, [{"role": "user", "content": "Hello"}], tools=[noop_tool])

    expectations = (
        ("Instructions: XML example present",
         "<function=example_function_name>" in rendered),
        ("Instructions: JSON example absent",
         '{"name": "tool_name"' not in rendered),
        ("Instructions: <parameter= in example",
         "<parameter=example_parameter_1>" in rendered),
    )
    for label, ok in expectations:
        check(label, ok)

def test_normal_generation_prompt(t: str):
    """Check that a plain user turn renders a prompt ending in ``<think>\\n``."""
    rendered = render(t, [{"role": "user", "content": "Hello"}])
    tail = rendered[-30:]
    ends_open = rendered.endswith("<think>\n")
    check("Normal gen-prompt: ends with <think>\\n", ends_open, f"tail: {tail!r}")

def test_thinking_bypass(t: str):
    """Check the output when rendering with ``enable_thinking=False``.

    Records two checks: the empty, closed think block is present and the
    output does not end with an opening ``<think>\\n``.
    """
    rendered = render(t, [{"role": "user", "content": "Hello"}], enable_thinking=False)
    closed_block = "<think>\n\n</think>\n\n"
    ends_open = rendered.endswith("<think>\n")
    check("Think bypass: <think>\\n\\n</think>\\n\\n present",
          closed_block in rendered, f"tail: {rendered[-60:]!r}")
    check("Think bypass: no open-only <think>\\n at end", not ends_open)

def test_think_off_token(t: str):
    """Check handling of ``<|think_off|>`` placed in the system message.

    Records two checks: the empty, closed think block is present and the
    literal ``<|think_off|>`` token does not appear in the output.
    """
    system_msg = {"role": "system", "content": "You are helpful.<|think_off|>"}
    user_msg = {"role": "user", "content": "Hello"}
    rendered = render(t, [system_msg, user_msg])

    bypassed = "<think>\n\n</think>\n\n" in rendered
    token_gone = "<|think_off|>" not in rendered
    check("<|think_off|> disables thinking", bypassed)
    check("<|think_off|> token stripped from output", token_gone)

def test_think_on_after_off(t: str):
    """Check that a later ``<|think_on|>`` wins over an earlier ``<|think_off|>``.

    The system message carries ``<|think_off|>`` and the final user message
    carries ``<|think_on|>``; the output is expected to end with ``<think>\\n``.
    """
    turns = (
        ("system", "<|think_off|>"),
        ("user", "Step 1"),
        ("assistant", "Done."),
        ("user", "<|think_on|>Now think"),
    )
    conversation = [{"role": role, "content": text} for role, text in turns]
    rendered = render(t, conversation)
    check("<|think_on|> re-enables thinking", rendered.endswith("<think>\n"),
          f"tail: {rendered[-40:]!r}")

def test_tier1_error_escalation(t: str):
    """Check the prompt produced after a single failing tool call.

    Renders a user -> assistant(tool call) -> tool(error text) exchange and
    records two checks:

    * the correction hint text appears in the output;
    * the output ends with a newline AND its last ``<think>`` comes after its
      last ``</think>``, i.e. the final think block has not been closed.

    Args:
        t: Source text of the chat template under test.
    """
    messages = [
        {"role": "user", "content": "Read a file"},
        {"role": "assistant", "content": "",
         "tool_calls": [{"function": {"name": "read_file",
                                      "arguments": {"path": "/foo", "pages": "1-30"}}}]},
        {"role": "tool", "content": "Pages range exceeds maximum of 20 pages per request."},
    ]
    tools = [{"name": "read_file", "description": "Read",
              "parameters": {"type": "object", "properties": {"path": {"type": "string"}},
                             "required": ["path"]}}]
    out = render(t, messages, tools=tools)
    tail = f"tail: {out[-120:]!r}"
    check("Tier 1: correction hint in gen-prompt",
          "The previous tool call returned an error" in out,
          tail)
    # A trailing newline alone is also produced by the closed bypass block
    # "<think>\n\n</think>\n\n", so it cannot tell "open" from "bypassed".
    # Require that the most recent <think> has no </think> after it.
    think_still_open = out.rfind("<think>") > out.rfind("</think>")
    check("Tier 1: think block still open (not bypassed)",
          out.endswith("\n") and think_still_open,
          tail)

def test_tier2_error_escalation(t: str):
    """Check the prompt produced after two failing tool calls in a row.

    Records two checks: the empty, closed think block is present and the
    output mentions "consecutive".
    """
    tool_error = "Pages range exceeds maximum of 20 pages per request."

    def failed_attempt(pages):
        # One assistant tool call followed by the same error response.
        call = {"function": {"name": "read_file",
                             "arguments": {"path": "/foo", "pages": pages}}}
        return [
            {"role": "assistant", "content": "", "tool_calls": [call]},
            {"role": "tool", "content": tool_error},
        ]

    conversation = [
        {"role": "user", "content": "Read a file"},
        *failed_attempt("1-30"),
        *failed_attempt("1-25"),
    ]
    path_schema = {"type": "object", "properties": {"path": {"type": "string"}},
                   "required": ["path"]}
    tool_defs = [{"name": "read_file", "description": "Read", "parameters": path_schema}]
    rendered = render(t, conversation, tools=tool_defs)

    tail = f"tail: {rendered[-200:]!r}"
    warned = "consecutive tool errors" in rendered or "consecutive" in rendered
    check("Tier 2: think bypass injected", "<think>\n\n</think>\n\n" in rendered, tail)
    check("Tier 2: escalation warning present", warned, tail)

def test_length_gated_detection(t: str):
    """Check that a long tool response mentioning "error" is not flagged.

    The tool response is a code snippet repeated 30 times. Records two checks:
    the correction hint is absent and the output ends with ``<think>\\n``.
    """
    snippet = ("// Error handling\nfunction handleError(e) {\n"
               "  throw new Error('invalid input');\n}\n")
    long_content = snippet * 30  # >> 500 chars
    read_call = {"function": {"name": "read_file", "arguments": {"path": "app.js"}}}
    conversation = [
        {"role": "user", "content": "Read the code"},
        {"role": "assistant", "content": "", "tool_calls": [read_call]},
        {"role": "tool", "content": long_content},
    ]
    path_schema = {"type": "object", "properties": {"path": {"type": "string"}},
                   "required": ["path"]}
    tool_defs = [{"name": "read_file", "description": "Read", "parameters": path_schema}]
    rendered = render(t, conversation, tools=tool_defs)

    hint_absent = "The previous tool call returned an error" not in rendered
    check("Length gate: long response with 'error' does NOT trigger hint",
          hint_absent, f"tail: {rendered[-80:]!r}")
    check("Length gate: normal gen-prompt after long response",
          rendered.endswith("<think>\n"), f"tail: {rendered[-40:]!r}")

def test_error_counter_resets_on_success(t: str):
    """Check the prompt after an error, a long successful response, then an error.

    Records one check: the single-error correction hint is present and the
    text "consecutive tool errors" is absent.
    """
    tool_error = "Error: file not found."

    def attempt(path, response):
        # One read_file call and the tool response that answers it.
        call = {"function": {"name": "read_file", "arguments": {"path": path}}}
        return [
            {"role": "assistant", "content": "", "tool_calls": [call]},
            {"role": "tool", "content": response},
        ]

    conversation = [
        {"role": "user", "content": "Do something"},
        *attempt("/bad", tool_error),                   # first error
        *attempt("/good", "file content here " * 30),   # long, successful response
        *attempt("/bad2", tool_error),                  # error again after a success
    ]
    path_schema = {"type": "object", "properties": {"path": {"type": "string"}},
                   "required": ["path"]}
    tool_defs = [{"name": "read_file", "description": "Read", "parameters": path_schema}]
    rendered = render(t, conversation, tools=tool_defs)

    # Expect the single-error hint without the multi-error wording.
    single_error_hint = "The previous tool call returned an error" in rendered
    multi_error_text = "consecutive tool errors" in rendered
    check("Counter reset: Tier 1 after success+new error (not Tier 2)",
          single_error_hint and not multi_error_text,
          f"tail: {rendered[-200:]!r}")

def test_historical_thinking_stripped(t: str):
    """Check that think text from an earlier assistant turn is not rendered.

    The assistant turn is followed by another user turn; with default options
    the text "my thoughts" must not appear in the output.
    """
    past_answer = "<think>\nmy thoughts\n</think>\n\nAnswer 1"
    conversation = [
        {"role": "user", "content": "Q1"},
        {"role": "assistant", "content": past_answer},
        {"role": "user", "content": "Q2"},
    ]
    rendered = render(t, conversation)
    thoughts_absent = "my thoughts" not in rendered
    check("Historical think stripped by default", thoughts_absent)

def test_preserve_thinking(t: str):
    """Check that ``preserve_thinking=True`` keeps earlier think text.

    Same conversation as the stripping test; here the text "my thoughts" must
    appear in the output.
    """
    past_answer = "<think>\nmy thoughts\n</think>\n\nAnswer 1"
    conversation = [
        {"role": "user", "content": "Q1"},
        {"role": "assistant", "content": past_answer},
        {"role": "user", "content": "Q2"},
    ]
    rendered = render(t, conversation, preserve_thinking=True)
    thoughts_kept = "my thoughts" in rendered
    check("preserve_thinking=True keeps historical thoughts", thoughts_kept)

def test_developer_role(t: str):
    """Check that a ``developer`` role message renders without raising.

    On success two checks are recorded (no crash, content present); if
    rendering raises, a single failed "no crash" check carries the message.
    """
    conversation = [
        {"role": "developer", "content": "You are a coder."},
        {"role": "user", "content": "Write code"},
    ]
    try:
        rendered = render(t, conversation)
    except Exception as exc:
        check("developer role: no crash", False, str(exc))
    else:
        check("developer role: no crash", True)
        check("developer role: content rendered", "You are a coder." in rendered)

def test_mid_conversation_system(t: str):
    """Check that a system message placed mid-conversation is rendered.

    On success two checks are recorded (no crash, content present); if
    rendering raises, a single failed "no crash" check carries the message.
    """
    turns = (
        ("user", "Hello"),
        ("assistant", "Hi"),
        ("system", "You must now speak in French."),
        ("user", "What's up?"),
    )
    conversation = [{"role": role, "content": text} for role, text in turns]
    try:
        rendered = render(t, conversation)
    except Exception as exc:
        check("Mid-conv system: no crash", False, str(exc))
    else:
        check("Mid-conv system: no crash", True)
        check("Mid-conv system: content present",
              "You must now speak in French." in rendered)

def test_tool_response_wrapping(t: str):
    """Check that a tool message is rendered with a ``<tool_response>`` tag.

    Records two checks: the opening tag is present and the tool's content
    appears in the output.
    """
    fetch_call = {"function": {"name": "fetch",
                               "arguments": {"url": "https://example.com"}}}
    conversation = [
        {"role": "user", "content": "Get data"},
        {"role": "assistant", "content": "", "tool_calls": [fetch_call]},
        {"role": "tool", "content": "data here"},
    ]
    url_schema = {"type": "object", "properties": {"url": {"type": "string"}},
                  "required": ["url"]}
    tool_defs = [{"name": "fetch", "description": "Fetch URL", "parameters": url_schema}]
    rendered = render(t, conversation, tools=tool_defs)

    for label, needle in (
        ("Tool response: <tool_response> wrapper present", "<tool_response>"),
        ("Tool response: content inside wrapper", "data here"),
    ):
        check(label, needle in rendered)

def test_no_tools_no_crash(t: str):
    """Check that rendering without a ``tools`` argument works.

    On success two checks are recorded (no crash, output ends with
    ``<think>\\n``); if rendering raises, a single failed "no crash" check
    carries the message.
    """
    conversation = [{"role": "user", "content": "What is 2+2?"}]
    try:
        rendered = render(t, conversation)
    except Exception as exc:
        check("No tools: no crash", False, str(exc))
    else:
        check("No tools: no crash", True)
        check("No tools: normal gen-prompt", rendered.endswith("<think>\n"))

def test_string_arguments_passthrough(t: str):
    """Check that a tool call whose ``arguments`` is a JSON string renders.

    The assistant tool call carries ``arguments`` as a pre-serialized JSON
    string instead of a dict. If rendering raises, one failed check carrying
    the exception message is recorded, so the remaining tests still run.
    Otherwise two checks are recorded: the tool name appears in the output,
    and the argument value ``python`` (which occurs nowhere else in the
    inputs) appears in the output.

    Args:
        t: Source text of the chat template under test.
    """
    messages = [
        {"role": "user", "content": "Search"},
        {"role": "assistant", "content": "",
         "tool_calls": [{"function": {"name": "search",
                                      "arguments": '{"query": "python", "limit": 5}'}}]},
        {"role": "tool", "content": "results"},
    ]
    tools = [{"name": "search", "description": "Search",
              "parameters": {"type": "object", "properties": {"query": {"type": "string"}},
                             "required": ["query"]}}]
    try:
        out = render(t, messages, tools=tools)
    except Exception as exc:
        check("String args: passthrough without crash", False, str(exc))
        return
    check("String args: passthrough without crash", "search" in out)
    # "search" is also the declared tool's name, so the check above may be
    # satisfied by the tool listing alone (presumably rendered in the system
    # prompt -- confirm against the template). The value "python" can only
    # come from the string arguments themselves.
    check("String args: argument payload rendered", "python" in out,
          f"tool_call section: {out[out.rfind('<tool_call>'):][:160]!r}")

# ── New tests for v16 fixes ────────────────────────────────────────────────────

def test_shell_result_false_positive(t: str):
    """Check that a short grep result containing "error_message" is not flagged.

    Records two checks: the correction hint is absent and the output ends
    with ``<think>\\n``.
    """
    shell_result = "\n\n".join([
        '$ grep -n "error_message" orchestrator.go (timeout 5s)',
        '661: "error_message": "",',
        "Took 0.1s",
    ])
    shell_call = {"function": {"name": "shell",
                               "arguments": {"cmd": "grep -n error_message file.go"}}}
    conversation = [
        {"role": "user", "content": "Search for error_message"},
        {"role": "assistant", "content": "", "tool_calls": [shell_call]},
        {"role": "tool", "content": shell_result},
    ]
    cmd_schema = {"type": "object", "properties": {"cmd": {"type": "string"}},
                  "required": ["cmd"]}
    tool_defs = [{"name": "shell", "description": "Run shell command",
                  "parameters": cmd_schema}]
    rendered = render(t, conversation, tools=tool_defs)

    hint_absent = "The previous tool call returned an error" not in rendered
    check("Shell false-positive: grep with 'error_message' not flagged",
          hint_absent, f"tail: {rendered[-80:]!r}")
    check("Shell false-positive: normal gen-prompt after grep result",
          rendered.endswith("<think>\n"), f"tail: {rendered[-40:]!r}")


def test_no_thinking_with_error_escalation(t: str):
    """Check the prompt after one tool error when ``enable_thinking=False``.

    Records three checks: the correction hint is present, the output (ignoring
    trailing whitespace) does not end with ``<think>``, and the hint is not
    directly preceded by an opening ``<think>\\n``.
    """
    tool_error = "Pages range exceeds maximum of 20 pages per request."
    read_call = {"function": {"name": "read_file",
                              "arguments": {"path": "/foo", "pages": "1-30"}}}
    conversation = [
        {"role": "user", "content": "Read a file"},
        {"role": "assistant", "content": "", "tool_calls": [read_call]},
        {"role": "tool", "content": tool_error},
    ]
    path_schema = {"type": "object", "properties": {"path": {"type": "string"}},
                   "required": ["path"]}
    tool_defs = [{"name": "read_file", "description": "Read", "parameters": path_schema}]
    rendered = render(t, conversation, tools=tool_defs, enable_thinking=False)

    long_tail = f"tail: {rendered[-120:]!r}"
    hint_present = "The previous tool call returned an error" in rendered
    ends_with_open_tag = rendered.rstrip().endswith("<think>")
    hint_inside_think = "<think>\nThe previous" in rendered

    check("No-think + error: correction hint present", hint_present, long_tail)
    check("No-think + error: does not end with open think block",
          not ends_with_open_tag, f"tail: {rendered[-80:]!r}")
    check("No-think + error: no unclosed <think> in error section",
          not hint_inside_think, long_tail)


# ── Runner ─────────────────────────────────────────────────────────────────────

# Test functions in execution order; each is called with the template source.
# The ``results`` list they report into is defined once, next to check(); the
# duplicate re-declaration that used to follow this list has been dropped.
TESTS = [
    test_xml_tool_call_format,
    test_tool_instructions_format,
    test_normal_generation_prompt,
    test_thinking_bypass,
    test_think_off_token,
    test_think_on_after_off,
    test_tier1_error_escalation,
    test_tier2_error_escalation,
    test_length_gated_detection,
    test_error_counter_resets_on_success,
    test_historical_thinking_stripped,
    test_preserve_thinking,
    test_developer_role,
    test_mid_conversation_system,
    test_tool_response_wrapping,
    test_no_tools_no_crash,
    test_string_arguments_passthrough,
    test_shell_result_false_positive,
    test_no_thinking_with_error_escalation,
]

def run_tests():
    """Load the chat template and run every function in TESTS against it.

    Outcomes are recorded through check() into the module-level ``results``
    list; nothing is returned.

    * The banner uses VERSION instead of a hard-coded version string.
    * A missing template is recorded as a failed check, so a tally of
      ``results`` cannot report success with zero checks run.
    * An exception escaping a test function is recorded as a failed check
      and the remaining tests still run.
    """
    banner = "═" * 60
    print(f"\n{banner}")
    print(f"  Testing {VERSION} Chat Template")
    print(banner)
    try:
        tmpl = load_template()
    except FileNotFoundError:
        check("Template found: chat_template.jinja", False, f"looked in {ROOT}")
        return
    for fn in TESTS:
        label = fn.__name__.replace("test_", "").replace("_", " ").title()
        print(f"\n  [{label}]")
        try:
            fn(tmpl)
        except Exception as exc:
            # Keep going: one broken test should not hide the others' results.
            check(f"{label}: completed without raising", False,
                  f"{type(exc).__name__}: {exc}")

if __name__ == "__main__":
    # Start from an empty tally, run everything, then summarise.
    results.clear()
    run_tests()

    passed = sum(results)
    total = len(results)
    failed = total - passed

    rule = "═" * 60
    if failed:
        verdict = f"  \033[91m({failed} failed)\033[0m"
    else:
        verdict = "  \033[92m(all passed)\033[0m"

    print(f"\n{rule}")
    print(f"  Results: {passed}/{total} passed", end="")
    print(verdict)
    print(f"{rule}\n")
    # Exit status 1 when any check failed, 0 otherwise.
    sys.exit(1 if failed else 0)