# Spaces: osunlp / QUEST (Running)
# File size: 16,535 Bytes
# 8e8119b

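"""
Regression tests for the '<answer>...</answer>' placeholder bug that caused the
Space to render only a literal `...` instead of the real (often table-shaped)
final answer.

Later sections also cover escaped-whitespace decoding, search-backend error
handling (DuckDuckGo rate limits, Serper with a DuckDuckGo fallback), and the
streaming behaviour of `build_research_agent` / `run_ui`.

These tests are plain asserts that execute at import time; run them with
`python _test_markdown_fix.py`. They import the fixed helpers directly from
`app.py` without booting Gradio.
"""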

import os
import sys
from pathlib import Path

# Do not start the Gradio UI when importing app.py.
os.environ.setdefault("GRADIO_SERVER_PORT", "0")

HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE))

from unittest import mock

from app import (
    extract_answer,
    strip_think_blocks,
    ensure_markdown_table_blank_lines,
    decode_escaped_whitespace,
    _is_placeholder_answer,
    parse_tool_call,
)


def _check(name: str, actual, expected) -> None:
    """Compare ``actual`` with ``expected`` and report the outcome.

    Prints a ``[PASS]`` or ``[FAIL]`` line for ``name``. On a mismatch the
    ``repr`` of both values is printed as well, and the check raises.

    Args:
        name: Human-readable description of the check.
        actual: Value produced by the code under test.
        expected: Value that ``actual`` must compare equal to.

    Raises:
        AssertionError: If ``actual != expected``. It is raised explicitly
            rather than through an ``assert`` statement so that a failing
            check still stops the run when Python is started with ``-O``.
    """
    if actual == expected:
        print(f"[PASS] {name}")
        return
    print(f"[FAIL] {name}")
    print(f"  expected: {expected!r}")
    print(f"  actual  : {actual!r}")
    # Carry both values in the exception so the traceback alone is enough
    # to diagnose the failure.
    raise AssertionError(f"{name}: expected {expected!r}, got {actual!r}")


# -------------------------------------------------------------------------
# 1. The original bug: Quest-4B echoes the template literally.
# -------------------------------------------------------------------------
# Every "empty" <answer> body must be rejected, i.e. extract_answer -> None.
# The raw text is passed to extract_answer inside the loop so each case is
# evaluated and reported before the next one runs.
for _label, _raw in (
    ("echoed placeholder `<answer>...</answer>` is rejected", "<answer>...</answer>"),
    ("echoed unicode ellipsis `<answer>…</answer>` is rejected", "<answer>…</answer>"),
    ("whitespace-only `<answer>   </answer>` is rejected", "<answer>   </answer>"),
):
    _check(_label, extract_answer(_raw), None)

# The detector itself: pure punctuation is a placeholder, real prose is not.
for _label, _text, _verdict in (
    ("placeholder detector recognises ASCII dots", "...", True),
    ("placeholder detector recognises unicode ellipsis", "…", True),
    ("placeholder detector recognises interpunct", "·", True),
    ("placeholder detector accepts real text", "The answer is 3...", False),
):
    _check(_label, _is_placeholder_answer(_text), _verdict)


# -------------------------------------------------------------------------
# 2. A real Markdown table inside <answer> survives round-trip.
# -------------------------------------------------------------------------
# A four-line pipe table: header, separator, two data rows.
table_body = "\n".join(
    [
        "| Color | Hex |",
        "|---|---|",
        "| Red | #ff0000 |",
        "| Green | #00ff00 |",
    ]
)
# Wrapped in <answer> with a newline on each side; the check expects the
# table back without those surrounding newlines.
_check(
    "Markdown table inside <answer> is returned intact",
    extract_answer("<answer>\n" + table_body + "\n</answer>"),
    table_body,
)


# -------------------------------------------------------------------------
# 3. <think> block is stripped before extracting the answer.
# -------------------------------------------------------------------------
# (description, helper under test, raw model text, expected result)
for _label, _helper, _raw, _want in (
    (
        "<think>...</think> is removed from answer content",
        extract_answer,
        "<think>reasoning goes here</think><answer>real answer</answer>",
        "real answer",
    ),
    (
        "multi-line <think> is removed",
        extract_answer,
        "<think>line 1\nline 2\nline 3</think>\n<answer>the truth</answer>",
        "the truth",
    ),
    (
        "strip_think_blocks leaves non-think content alone",
        strip_think_blocks,
        "plain text",
        "plain text",
    ),
):
    _check(_label, _helper(_raw), _want)


# -------------------------------------------------------------------------
# 4. Truncated output: <answer> opened, never closed.
# -------------------------------------------------------------------------
# An unterminated <answer> keeps real text but still rejects a bare "...".
for _label, _raw, _want in (
    (
        "truncated `<answer>` with real text is still extracted",
        "<answer>Here is the partial answer",
        "Here is the partial answer",
    ),
    (
        "truncated `<answer>` that is just dots is still rejected",
        "<answer>...",
        None,
    ),
):
    _check(_label, extract_answer(_raw), _want)


# -------------------------------------------------------------------------
# 5. ensure_markdown_table_blank_lines inserts the required break.
# -------------------------------------------------------------------------
# Prose line immediately followed by a table header: a blank line must be
# inserted between them.
glued = "\n".join(
    ["Here is the comparison:", "| Col | Val |", "|---|---|", "| a | b |"]
)
fixed = ensure_markdown_table_blank_lines(glued)
assert "\n\n| Col | Val |" in fixed, f"blank line was not inserted: {fixed!r}"
print("[PASS] ensure_markdown_table_blank_lines inserts break before table")

# Inputs that must come back untouched.
already_ok = "Here is the comparison:\n\n| Col | Val |\n|---|---|\n| a | b |"
table_at_start = "| Col | Val |\n|---|---|\n| a | b |"
for _label, _unchanged in (
    (
        "ensure_markdown_table_blank_lines is a no-op when blank line already exists",
        already_ok,
    ),
    (
        "ensure_markdown_table_blank_lines leaves a table at the very start alone",
        table_at_start,
    ),
):
    _check(_label, ensure_markdown_table_blank_lines(_unchanged), _unchanged)


# -------------------------------------------------------------------------
# 6. parse_tool_call still works after the <think>-stripping refactor.
# -------------------------------------------------------------------------
# A <think> block on the first line, the JSON tool call on the second.
tool_out = "\n".join(
    [
        "<think>I should search for this</think>",
        '<tool_call>{"name": "search", "arguments": {"query": ["hello"]}}</tool_call>',
    ]
)
# parse_tool_call is unpacked here as (tool name, arguments, error).
name, args, err = parse_tool_call(tool_out)
assert err is None, f"unexpected parse error: {err}"
for _label, _got, _want in (
    ("parse_tool_call extracts name", name, "search"),
    ("parse_tool_call extracts arguments", args, {"query": ["hello"]}),
):
    _check(_label, _got, _want)


# -------------------------------------------------------------------------
# 7. Escaped-whitespace decoding (the 2nd reported bug):
#    the endpoint returned `\n` as literal 2-char sequences, so the
#    pipe table rendered as a one-line sentence of `| a | b |\n...`.
# -------------------------------------------------------------------------
# The payload as reported: every line break is the two-character sequence
# backslash + "n" rather than a real newline.
user_reported_payload = (
    "\\n| Color | Hex |\\n|---|---|\\n| Red | #FF0000 |"
    "\\n| Green | #00FF00 |\\n| Blue | #0000FF |\\n"
)
decoded_user_payload = decode_escaped_whitespace(user_reported_payload)
assert "\n| Color | Hex |" in decoded_user_payload, decoded_user_payload
assert "\\n" not in decoded_user_payload, decoded_user_payload
print("[PASS] decode_escaped_whitespace converts the user-reported payload")

# Extract from a full <answer> block whose content is escape-encoded.
escape_encoded_answer = f"<answer>{user_reported_payload}</answer>"
extracted_escape = extract_answer(escape_encoded_answer)
assert extracted_escape is not None, "escape-encoded <answer> was rejected"
assert "| Red | #FF0000 |" in extracted_escape, extracted_escape
assert "\\n" not in extracted_escape, extracted_escape
# And the separator must be on its own line so GFM recognises the table.
# A plain substring test would also pass if the escapes had been replaced by
# something other than a newline, so inspect the individual lines instead.
assert any(
    line.strip() == "|---|---|" for line in extracted_escape.splitlines()
), extracted_escape
print("[PASS] extract_answer decodes escape-encoded <answer> into real newlines")

# Heuristic, negative cases: the text must come back unchanged when
#   - escapes are rare (a real code example), or
#   - real newlines already dominate.
code_example = "Some prose with a single \\n in a code example."
mostly_real = "real\nnewlines\nhere\nwith\\none escape"
for _label, _sample in (
    ("decode_escaped_whitespace leaves lightly-escaped prose alone", code_example),
    ("decode_escaped_whitespace leaves mostly-real-newline text alone", mostly_real),
):
    _check(_label, decode_escaped_whitespace(_sample), _sample)

# Heuristic, positive case: DO decode when escapes clearly dominate
# (one real newline versus four escaped ones).
mostly_escaped = "one real\n then \\na \\nb \\nc \\nd"
decoded_ok = decode_escaped_whitespace(mostly_escaped)
assert mostly_escaped.count("\n") < decoded_ok.count("\n"), decoded_ok
assert "\\n" not in decoded_ok, decoded_ok
print("[PASS] decode_escaped_whitespace decodes when escapes dominate")


# -------------------------------------------------------------------------
# 8. End-to-end: the originally-reported scenario now renders a real table.
# -------------------------------------------------------------------------
buggy_output = "<answer>...</answer>"
# A well-formed reply: reasoning, then an answer whose table header directly
# follows a prose line (no blank line in between).
good_output = "\n".join(
    [
        "<think>let me build the table</think>",
        "<answer>",
        "Here is the table:",
        "| Planet | Distance (AU) |",
        "|---|---|",
        "| Mercury | 0.39 |",
        "| Venus | 0.72 |",
        "| Earth | 1.00 |",
        "</answer>",
    ]
)

# The echoed template is not an answer any more.
assert extract_answer(buggy_output) is None
# The real reply is extracted, and after the blank-line pass the table header
# is preceded by an empty line.
extracted = extract_answer(good_output)
assert extracted is not None
rendered_ready = ensure_markdown_table_blank_lines(extracted)
assert "\n\n| Planet | Distance (AU) |" in rendered_ready, rendered_ready
print("[PASS] end-to-end: placeholder rejected, real table rendered with blank line")

# -------------------------------------------------------------------------
# 9. Search backend rate-limit no longer crashes the whole agent.
#    Simulates the DuckDuckGo 202 Ratelimit error the user reported.
# -------------------------------------------------------------------------
import app as _app

class _FakeRatelimit(Exception):
    """Test-only exception raised by the fake DDGS client to mimic a rate limit."""


class _RatelimitedDDGS:
    """Context-manager stand-in for DDGS whose text search always fails.

    Every ``text`` call raises ``_FakeRatelimit`` with the 202 Ratelimit
    message, regardless of the arguments.
    """

    def text(self, *_args, **_kwargs):
        """Ignore all arguments and raise the simulated rate-limit error."""
        ratelimit_message = "https://html.duckduckgo.com/html 202 Ratelimit"
        raise _FakeRatelimit(ratelimit_message)

    def __enter__(self):
        """Return this object as the value bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Return False so exceptions raised inside the ``with`` body propagate."""
        return False


# Clear in-memory cache so the mock is actually exercised.
_app.SEARCH_CACHE.clear()

# Replace app.DDGS with the always-failing stand-in and turn app.time.sleep
# into a no-op (presumably so any waiting inside the search helper does not
# slow the test down -- confirm against app.py).
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
     mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    out = _app._run_search_single("iPhone 15 vs iPhone 16 features", max_results=3)

# The failure must come back as a result dict, not as an exception:
# ok=False, the backend message in "error", no results, and a "hint"
# mentioning training knowledge.
assert out["ok"] is False, out
assert "Ratelimit" in out["error"], out
assert out["results"] == []
assert "hint" in out and "training knowledge" in out["hint"], out
print("[PASS] _run_search_single converts DDG rate-limit into a graceful tool error")

# The caller that invokes build_research_agent wraps tool responses into a
# user message; the important thing is that _run_search_single NEVER raises,
# so the agent loop can continue and let the model produce an <answer>.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
     mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    # Record whether the multi-query entry point raised; its return value is
    # deliberately discarded.
    try:
        _ = _app.run_search(["q1", "q2"], max_results=3)
        raised = False
    except Exception:
        raised = True
assert not raised, "run_search should not raise when DDG rate-limits"
print("[PASS] run_search swallows backend errors across multi-query calls")


# -------------------------------------------------------------------------
# 10. Serper backend is preferred when SERPER_API_KEY is set, and DDG is
#     used as a fallback. Verifies the latency fix for the iPhone query.
# -------------------------------------------------------------------------
class _FakeResponse:
    """Minimal HTTP-response double exposing ``raise_for_status`` and ``json``."""

    def __init__(self, payload):
        # Stored as-is; json() hands back this very object.
        self._payload = payload

    def json(self):
        """Return the payload given at construction time."""
        return self._payload

    def raise_for_status(self):
        """Do nothing: this fake response always counts as successful."""


def _fake_serper_ok(url, headers, json, timeout):  # noqa: A002 - stands in for requests.post
    """Fake of ``requests.post`` that returns a canned successful payload.

    Asserts that the caller sent the expected ``X-API-KEY`` header, then
    returns a ``_FakeResponse`` whose JSON holds one ``answerBox`` entry and
    two ``organic`` entries. ``url``, ``json`` and ``timeout`` are accepted
    but not inspected.
    """
    assert headers.get("X-API-KEY") == "test-serper-key"

    answer_box = {
        "title": "iPhone 16 vs 15",
        "link": "https://example.com/answer",
        "snippet": "Apple replaced the mute switch with an action button.",
    }
    organic_hits = [
        {"title": hit_title, "link": hit_link, "snippet": hit_snippet}
        for hit_title, hit_link, hit_snippet in (
            (
                "iPhone 16 Specs",
                "https://example.com/iphone-16",
                "A18 chip, 48 MP camera, ...",
            ),
            (
                "iPhone 15 Specs",
                "https://example.com/iphone-15",
                "A16 Bionic, Dynamic Island...",
            ),
        )
    ]
    return _FakeResponse({"answerBox": answer_box, "organic": organic_hits})


# Start from an empty cache so the patched backend is actually called.
_app.SEARCH_CACHE.clear()
# Give app a Serper key and route requests.post to the canned-success fake.
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    serper_out = _app._run_search_single("iPhone 16 vs iPhone 15", max_results=5)

# Expect a successful result tagged with the "serper" backend, containing the
# answer box followed by the two organic hits (three entries in total).
assert serper_out["ok"] is True, serper_out
assert serper_out.get("backend") == "serper", serper_out
assert serper_out["results"][0]["title"] == "iPhone 16 vs 15", serper_out  # answer box first
assert len(serper_out["results"]) == 3, serper_out
print("[PASS] Serper backend is preferred when SERPER_API_KEY is set")


def _fake_serper_fail(url, headers, json, timeout):  # noqa: A002
    """Fake of ``requests.post`` that always fails with a quota error.

    All four arguments are accepted and ignored.

    Raises:
        RuntimeError: Always, with the message ``serper: 429 quota exceeded``.
    """
    quota_error = RuntimeError("serper: 429 quota exceeded")
    raise quota_error


class _WorkingDDGS:
    """Context-manager stand-in for DDGS whose text search yields one canned hit."""

    def text(self, *_args, **_kwargs):
        """Ignore all arguments and lazily yield a single result dict."""
        canned_hit = dict(
            title="DDG result",
            href="https://example.org/ddg",
            body="ddg fallback body",
        )
        yield canned_hit

    def __enter__(self):
        """Return this object as the value bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Return False so exceptions raised inside the ``with`` body propagate."""
        return False


# Scenario A: Serper is configured but its HTTP call raises, while the
# DuckDuckGo stand-in works. The result must come from DuckDuckGo.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
     mock.patch.object(_app, "DDGS", _WorkingDDGS):
    fallback_out = _app._run_search_single("anything", max_results=2)

assert fallback_out["ok"] is True, fallback_out
assert fallback_out.get("backend") == "duckduckgo", fallback_out
assert fallback_out["results"][0]["href"] == "https://example.org/ddg"
print("[PASS] Falls back to DuckDuckGo when Serper errors out")


# Scenario B: both backends fail. time.sleep is replaced with a no-op, as in
# the other rate-limit scenarios in this file.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
     mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
     mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    both_fail = _app._run_search_single("anything", max_results=2)

# The combined error text must name both backends (case-insensitively) and
# the result must still carry a "hint" key.
assert both_fail["ok"] is False, both_fail
assert "serper" in both_fail["error"].lower(), both_fail
assert "duckduckgo" in both_fail["error"].lower(), both_fail
assert "hint" in both_fail
print("[PASS] Returns graceful error when both Serper and DDG fail")


# -------------------------------------------------------------------------
# 11. build_research_agent streams progress (is a generator).
# -------------------------------------------------------------------------
import inspect as _inspect

# Both entry points must be generator functions. Each attribute is looked up
# inside the loop so the first function is checked before the second is
# even resolved.
for _attr, _why in (
    (
        "build_research_agent",
        "build_research_agent should be a generator so run_ui can stream progress",
    ),
    (
        "run_ui",
        "run_ui should be a generator so Gradio streams per-turn status to the UI",
    ),
):
    assert _inspect.isgeneratorfunction(getattr(_app, _attr)), _why
print("[PASS] build_research_agent and run_ui are streaming generators")


# -------------------------------------------------------------------------
# 12. End-to-end dry run of the generator: verify at least one progress
#     tuple is yielded BEFORE the final answer, and that the final yield
#     is a real answer (not a placeholder).
# -------------------------------------------------------------------------
# Scripted model turns, consumed front to back by _fake_call_model. Each entry
# is a (reply text, "fake-model") pair: turn 1 requests a search, turn 2
# delivers the final answer containing a table.
_fake_model_script = [
    (
        "".join(
            [
                "<think>I should search the web for Mercury distance.</think>",
                '<tool_call>{"name": "search", "arguments": {"query": ["Mercury distance AU"]}}</tool_call>',
            ]
        ),
        "fake-model",
    ),
    (
        "\n".join(
            [
                "<answer>",
                "Here is the table:",
                "| Planet | Distance (AU) |",
                "|---|---|",
                "| Mercury | 0.39 |",
                "</answer>",
            ]
        ),
        "fake-model",
    ),
]


def _fake_call_model(*_args, **_kwargs):
    """Return the next scripted turn, removing it from ``_fake_model_script``.

    All arguments are ignored. Raises ``IndexError`` once the script is
    exhausted.
    """
    scripted_turn = _fake_model_script.pop(0)
    return scripted_turn


class _FakeInferenceClient:
    """Inert client double: accepts any constructor arguments and keeps none."""

    def __init__(self, *_args, **_kwargs):
        """Accept and discard every positional and keyword argument."""


# Start from an empty search cache, then run the agent with everything
# external replaced:
#   - call_model            -> scripted turns from _fake_call_model
#   - _build_client_for_model -> a fixed 3-tuple (inert client, "fake-model", [])
#   - SERPER_API_KEY / requests.post -> the canned-success Serper fake
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "call_model", side_effect=_fake_call_model), \
     mock.patch.object(_app, "_build_client_for_model",
                       return_value=(_FakeInferenceClient(), "fake-model", [])), \
     mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    gen = _app.build_research_agent(
        question="How far is Mercury from the sun?",
        model="fake-model",
        max_turns=4,
        max_search_results=3,
        temperature=0.0,
    )
    # Drain the generator while the patches are still active.
    emitted = list(gen)

assert len(emitted) >= 3, f"expected multiple progress yields, got {len(emitted)}"
# Each yield is unpacked as a 2-tuple; only the first element (the answer
# text) is inspected below, final_trace is not used.
final_answer, final_trace = emitted[-1]
assert "Mercury" in final_answer, final_answer
assert "| Planet |" in final_answer, final_answer
# The echoed-template placeholder must not leak into the final answer.
assert "...</answer>" not in final_answer
# Intermediate yields should have progress scaffolding.
assert any("⏳ Researching" in ans for ans, _ in emitted[:-1]), (
    "no intermediate progress yield detected"
)
print("[PASS] build_research_agent streams progress then a real final answer")

# Reached only if every check above passed.
print()
print("All markdown-fix regression tests passed.")