"""
Regression tests for the '...' placeholder bug that caused the
Space to render only a literal `...` instead of the real (often table-shaped)
final answer.
These tests are plain asserts, runnable with `python _test_markdown_fix.py`.
They import the fixed helpers directly from `app.py` without booting Gradio.
"""
import os
import sys
from pathlib import Path
# Provide a default server port before app.py is imported further down.
# NOTE(review): the original comment states this keeps the Gradio UI from
# starting on import; that depends on how app.py launches -- confirm there.
os.environ.setdefault("GRADIO_SERVER_PORT", "0")
# Put this script's own directory first on sys.path so that `app` is looked
# up next to this file before anywhere else.
HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE))
from unittest import mock
from app import (
extract_answer,
strip_think_blocks,
ensure_markdown_table_blank_lines,
decode_escaped_whitespace,
_is_placeholder_answer,
parse_tool_call,
)
def _check(name: str, actual, expected) -> None:
ok = actual == expected
status = "PASS" if ok else "FAIL"
print(f"[{status}] {name}")
if not ok:
print(f" expected: {expected!r}")
print(f" actual : {actual!r}")
assert ok, name
# -------------------------------------------------------------------------
# 1. The original bug: Quest-4B echoes the template literally.
#    Placeholder-only output must be rejected: the checks below expect
#    extract_answer to return None and _is_placeholder_answer to return True
#    for such input.
# NOTE(review): the inputs are bare strings with no answer-tag wrapper; the
# wording elsewhere in this file suggests markup was lost from it -- confirm
# these literals against the version of the tests that is actually run.
# -------------------------------------------------------------------------
# Three ASCII dots.
_check(
"echoed placeholder `...` is rejected",
extract_answer("..."),
None,
)
# The single-character unicode ellipsis.
_check(
"echoed unicode ellipsis `…` is rejected",
extract_answer("…"),
None,
)
# Whitespace only.
_check(
"whitespace-only ` ` is rejected",
extract_answer(" "),
None,
)
# The detector itself, called directly on each placeholder glyph.
_check(
"placeholder detector recognises ASCII dots",
_is_placeholder_answer("..."),
True,
)
_check(
"placeholder detector recognises unicode ellipsis",
_is_placeholder_answer("…"),
True,
)
_check(
"placeholder detector recognises interpunct",
_is_placeholder_answer("·"),
True,
)
# Real text that merely ends in dots must not count as a placeholder.
_check(
"placeholder detector accepts real text",
_is_placeholder_answer("The answer is 3..."),
False,
)
# -------------------------------------------------------------------------
# 2. A real Markdown table inside the answer survives the round-trip.
#    The expected value is the bare table, i.e. without the newline that the
#    input places before and after it.
# NOTE(review): the input shows no answer-tag wrapper around the table --
# confirm that this is what extract_answer is meant to be fed here.
# -------------------------------------------------------------------------
table_body = "| Color | Hex |\n|---|---|\n| Red | #ff0000 |\n| Green | #00ff00 |"
_check(
"Markdown table inside is returned intact",
extract_answer(f"\n{table_body}\n"),
table_body,
)
# -------------------------------------------------------------------------
# 3. The think block is stripped before extracting the answer.
# NOTE(review): the inputs below contain no visible delimiter between the
# reasoning text and the answer text, yet only the answer part is expected
# back. The markup appears to be missing from these literals -- confirm.
# -------------------------------------------------------------------------
# Single-line reasoning followed by the answer.
_check(
"... is removed from answer content",
extract_answer("reasoning goes herereal answer"),
"real answer",
)
# Reasoning that spans several lines.
_check(
"multi-line is removed",
extract_answer(
"line 1\nline 2\nline 3\nthe truth"
),
"the truth",
)
# Text without any think block must come back unchanged.
_check(
"strip_think_blocks leaves non-think content alone",
strip_think_blocks("plain text"),
"plain text",
)
# -------------------------------------------------------------------------
# 4. Truncated output: the answer block is opened but never closed.
# NOTE(review): no opening tag is visible in the inputs below, and the
# backticks in the check names are empty; the markup appears to have been
# lost from these literals -- confirm.
# -------------------------------------------------------------------------
# Real text in a truncated answer is still returned.
_check(
"truncated `` with real text is still extracted",
extract_answer("Here is the partial answer"),
"Here is the partial answer",
)
# A truncated answer consisting only of dots is still a placeholder.
_check(
"truncated `` that is just dots is still rejected",
extract_answer("..."),
None,
)
# -------------------------------------------------------------------------
# 5. ensure_markdown_table_blank_lines inserts the required break.
#    Three shapes are covered: prose glued directly onto a table, prose
#    already separated from the table by a blank line, and a table that is
#    the very first thing in the text.
# -------------------------------------------------------------------------
glued = "Here is the comparison:\n| Col | Val |\n|---|---|\n| a | b |"
fixed = ensure_markdown_table_blank_lines(glued)
# The header row must now be preceded by an empty line.
assert "\n\n| Col | Val |" in fixed, f"blank line was not inserted: {fixed!r}"
print("[PASS] ensure_markdown_table_blank_lines inserts break before table")

already_ok = "Here is the comparison:\n\n| Col | Val |\n|---|---|\n| a | b |"
_check(
    name="ensure_markdown_table_blank_lines is a no-op when blank line already exists",
    actual=ensure_markdown_table_blank_lines(already_ok),
    expected=already_ok,
)

table_at_start = "| Col | Val |\n|---|---|\n| a | b |"
_check(
    name="ensure_markdown_table_blank_lines leaves a table at the very start alone",
    actual=ensure_markdown_table_blank_lines(table_at_start),
    expected=table_at_start,
)
# -------------------------------------------------------------------------
# 6. parse_tool_call still works after the think-stripping refactor.
#    The result is unpacked as (name, arguments, error); error must be None.
# NOTE(review): the model output below shows no markup around the reasoning
# line or the JSON payload -- confirm the literal against the real format.
# -------------------------------------------------------------------------
tool_out = (
"I should search for this\n"
'{"name": "search", "arguments": {"query": ["hello"]}}'
)
name, args, err = parse_tool_call(tool_out)
assert err is None, f"unexpected parse error: {err}"
_check("parse_tool_call extracts name", name, "search")
_check("parse_tool_call extracts arguments", args, {"query": ["hello"]})
# -------------------------------------------------------------------------
# 7. Escaped-whitespace decoding (the 2nd reported bug):
# the endpoint returned `\n` as literal 2-char sequences, so the
# pipe table rendered as a one-line sentence of `| a | b |\n...`.
# -------------------------------------------------------------------------
# Every "\\n" below is a backslash followed by the letter n, not a newline.
user_reported_payload = (
"\\n| Color | Hex |\\n|---|---|\\n| Red | #FF0000 |"
"\\n| Green | #00FF00 |\\n| Blue | #0000FF |\\n"
)
decoded_user_payload = decode_escaped_whitespace(user_reported_payload)
# After decoding, the header row follows a real newline and no two-character
# escape is left over.
assert "\n| Color | Hex |" in decoded_user_payload, decoded_user_payload
assert "\\n" not in decoded_user_payload, decoded_user_payload
print("[PASS] decode_escaped_whitespace converts the user-reported payload")
# Extract from a full answer block whose content is escape-encoded.
# NOTE(review): the f-string wraps the payload in nothing at all; the
# surrounding markup appears to be missing from this literal -- confirm.
escape_encoded_answer = f"{user_reported_payload}"
extracted_escape = extract_answer(escape_encoded_answer)
assert extracted_escape is not None
assert "| Red | #FF0000 |" in extracted_escape
assert "\\n" not in extracted_escape
# And the separator must be on its own line so GFM recognises the table.
# NOTE(review): this only checks that the separator text is present, not
# that it occupies a line of its own.
assert "|---|---|" in extracted_escape
print("[PASS] extract_answer decodes escape-encoded into real newlines")
# Heuristic: do NOT decode when escapes are rare (a real code example).
code_example = 'Some prose with a single \\n in a code example.'
_check(
"decode_escaped_whitespace leaves lightly-escaped prose alone",
decode_escaped_whitespace(code_example),
code_example,
)
# Heuristic: do NOT decode when real newlines already dominate.
# Here: four real newlines against one escaped sequence.
mostly_real = "real\nnewlines\nhere\nwith\\none escape"
_check(
"decode_escaped_whitespace leaves mostly-real-newline text alone",
decode_escaped_whitespace(mostly_real),
mostly_real,
)
# Heuristic: DO decode when escapes clearly dominate.
# Here: one real newline against four escaped sequences.
mostly_escaped = "one real\n then \\na \\nb \\nc \\nd"
decoded_ok = decode_escaped_whitespace(mostly_escaped)
assert decoded_ok.count("\n") > mostly_escaped.count("\n"), decoded_ok
assert decoded_ok.count("\\n") == 0, decoded_ok
print("[PASS] decode_escaped_whitespace decodes when escapes dominate")
# -------------------------------------------------------------------------
# 8. End-to-end: the originally-reported scenario now renders a real table.
# NOTE(review): good_output ends in an empty string literal and shows no
# markup around the reasoning line or the answer; these literals appear to
# have lost their tags -- confirm against the version that is actually run.
# -------------------------------------------------------------------------
buggy_output = "..."
good_output = (
"let me build the table\n"
"\n"
"Here is the table:\n"
"| Planet | Distance (AU) |\n"
"|---|---|\n"
"| Mercury | 0.39 |\n"
"| Venus | 0.72 |\n"
"| Earth | 1.00 |\n"
""
)
# The buggy case must no longer be accepted as an answer.
assert extract_answer(buggy_output) is None
# The good case must round-trip AND come out table-ready.
extracted = extract_answer(good_output)
assert extracted is not None
# The extracted text glues the intro sentence to the table, so the blank
# line has to be produced by ensure_markdown_table_blank_lines.
rendered_ready = ensure_markdown_table_blank_lines(extracted)
assert "\n\n| Planet | Distance (AU) |" in rendered_ready, rendered_ready
print("[PASS] end-to-end: placeholder rejected, real table rendered with blank line")
# -------------------------------------------------------------------------
# 9. Search backend rate-limit no longer crashes the whole agent.
# Simulates the DuckDuckGo 202 Ratelimit error the user reported.
# -------------------------------------------------------------------------
import app as _app
class _FakeRatelimit(Exception):
pass
class _RatelimitedDDGS:
"""Stand-in for DDGS that always raises the way ddgs does on 202."""
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
def text(self, *args, **kwargs):
raise _FakeRatelimit("https://html.duckduckgo.com/html 202 Ratelimit")
# Empty the search cache first so the patched backend is really called.
_app.SEARCH_CACHE.clear()
# DDGS is swapped for the always-failing double and time.sleep (as seen
# through the app module) becomes a no-op.
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS):
    with mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
        out = _app._run_search_single(
            "iPhone 15 vs iPhone 16 features",
            max_results=3,
        )
# The failure must come back as data: a not-ok result naming the rate limit,
# no results, and a hint mentioning training knowledge.
assert out["ok"] is False, out
assert "Ratelimit" in out["error"], out
assert out["results"] == []
assert "hint" in out and "training knowledge" in out["hint"], out
print("[PASS] _run_search_single converts DDG rate-limit into a graceful tool error")
# The caller that invokes build_research_agent wraps tool responses into a
# user message; the important thing is that the search helpers NEVER raise,
# so the agent loop can continue and let the model produce an answer.
_app.SEARCH_CACHE.clear()
caught = None
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    try:
        _ = _app.run_search(["q1", "q2"], max_results=3)
    except Exception as exc:
        # Keep the exception so a failure reports what actually escaped,
        # instead of only the fact that something did.
        caught = exc
raised = caught is not None
assert not raised, f"run_search should not raise when DDG rate-limits: {caught!r}"
print("[PASS] run_search swallows backend errors across multi-query calls")
# -------------------------------------------------------------------------
# 10. Serper backend is preferred when SERPER_API_KEY is set, and DDG is
# used as a fallback. Verifies the latency fix for the iPhone query.
# -------------------------------------------------------------------------
class _FakeResponse:
def __init__(self, payload):
self._payload = payload
def raise_for_status(self):
return None
def json(self):
return self._payload
def _fake_serper_ok(url, headers, json, timeout):  # noqa: A002 - gradio-style arg
    """Stand in for a successful Serper HTTP call.

    Checks that the request carries the test API key in ``X-API-KEY`` and
    returns a ``_FakeResponse`` holding one answer box plus two organic hits.
    """
    assert headers.get("X-API-KEY") == "test-serper-key"
    answer_box = {
        "title": "iPhone 16 vs 15",
        "link": "https://example.com/answer",
        "snippet": "Apple replaced the mute switch with an action button.",
    }
    organic_hits = [
        {
            "title": "iPhone 16 Specs",
            "link": "https://example.com/iphone-16",
            "snippet": "A18 chip, 48 MP camera, ...",
        },
        {
            "title": "iPhone 15 Specs",
            "link": "https://example.com/iphone-15",
            "snippet": "A16 Bionic, Dynamic Island...",
        },
    ]
    return _FakeResponse({"answerBox": answer_box, "organic": organic_hits})
# With an API key configured and the HTTP call faked, the result must be
# tagged as coming from Serper and list the answer box ahead of the two
# organic hits.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"):
    with mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
        serper_out = _app._run_search_single(
            "iPhone 16 vs iPhone 15",
            max_results=5,
        )
assert serper_out["ok"] is True, serper_out
assert serper_out.get("backend") == "serper", serper_out
# The answer box comes first.
assert serper_out["results"][0]["title"] == "iPhone 16 vs 15", serper_out
assert len(serper_out["results"]) == 3, serper_out
print("[PASS] Serper backend is preferred when SERPER_API_KEY is set")
def _fake_serper_fail(url, headers, json, timeout): # noqa: A002
raise RuntimeError("serper: 429 quota exceeded")
class _WorkingDDGS:
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
def text(self, *args, **kwargs):
yield {
"title": "DDG result",
"href": "https://example.org/ddg",
"body": "ddg fallback body",
}
# Serper is configured but its HTTP call fails, while DDGS works: the result
# must be successful and tagged as coming from DuckDuckGo.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"):
    with mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail):
        with mock.patch.object(_app, "DDGS", _WorkingDDGS):
            fallback_out = _app._run_search_single("anything", max_results=2)
assert fallback_out["ok"] is True, fallback_out
assert fallback_out.get("backend") == "duckduckgo", fallback_out
assert fallback_out["results"][0]["href"] == "https://example.org/ddg"
print("[PASS] Falls back to DuckDuckGo when Serper errors out")
# Both backends fail: Serper through the failing HTTP fake, DuckDuckGo
# through the rate-limited double. The result must be a not-ok value whose
# error text names both backends and which carries a hint.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"):
    with mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail):
        with mock.patch.object(_app, "DDGS", _RatelimitedDDGS):
            with mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
                both_fail = _app._run_search_single("anything", max_results=2)
assert both_fail["ok"] is False, both_fail
assert "serper" in both_fail["error"].lower(), both_fail
assert "duckduckgo" in both_fail["error"].lower(), both_fail
assert "hint" in both_fail
print("[PASS] Returns graceful error when both Serper and DDG fail")
# -------------------------------------------------------------------------
# 11. build_research_agent streams progress (is a generator).
#     run_ui is held to the same requirement.
# -------------------------------------------------------------------------
import inspect as _inspect

# Checked in this order; each entry pairs the attribute name on the app
# module with the message shown when it is not a generator function.
for _fn_name, _why in (
    (
        "build_research_agent",
        "build_research_agent should be a generator so run_ui can stream progress",
    ),
    (
        "run_ui",
        "run_ui should be a generator so Gradio streams per-turn status to the UI",
    ),
):
    assert _inspect.isgeneratorfunction(getattr(_app, _fn_name)), _why
print("[PASS] build_research_agent and run_ui are streaming generators")
# -------------------------------------------------------------------------
# 12. End-to-end dry run of the generator: verify at least one progress
# tuple is yielded BEFORE the final answer, and that the final yield
# is a real answer (not a placeholder).
# -------------------------------------------------------------------------
# Scripted model replies, handed out front to back by _fake_call_model.
# Each entry is a 2-tuple of (reply text, "fake-model").
# NOTE(review): neither reply shows any markup around its reasoning, tool
# call or answer, and the second one ends in an empty string literal; the
# tags appear to have been lost from these literals -- confirm.
_fake_model_script = [
# Reply 1: a reasoning sentence followed by a JSON search request.
(
"I should search the web for Mercury distance."
'{"name": "search", "arguments": {"query": ["Mercury distance AU"]}}',
"fake-model",
),
# Reply 2: the final text containing a one-row planet table.
(
"\n"
"Here is the table:\n"
"| Planet | Distance (AU) |\n"
"|---|---|\n"
"| Mercury | 0.39 |\n"
"",
"fake-model",
),
]
def _fake_call_model(*args, **kwargs):
    """Return the next scripted reply, removing it from the script.

    All arguments are ignored. Raises ``IndexError`` once the script has
    been used up.
    """
    next_reply = _fake_model_script[0]
    del _fake_model_script[0]
    return next_reply
class _FakeInferenceClient:
def __init__(self, *a, **k):
pass
# Drive the agent generator with everything external faked: model calls come
# from the script, the client builder returns an inert client, and the
# search goes through the successful Serper fake.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "call_model", side_effect=_fake_call_model):
    with mock.patch.object(
        _app,
        "_build_client_for_model",
        return_value=(_FakeInferenceClient(), "fake-model", []),
    ):
        with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"):
            with mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
                gen = _app.build_research_agent(
                    question="How far is Mercury from the sun?",
                    model="fake-model",
                    max_turns=4,
                    max_search_results=3,
                    temperature=0.0,
                )
                # Exhaust the generator while the patches are still active.
                emitted = list(gen)
assert len(emitted) >= 3, f"expected multiple progress yields, got {len(emitted)}"
# Every yield is unpacked as an (answer, trace) pair; the last is the result.
final_answer, final_trace = emitted[-1]
assert "Mercury" in final_answer, final_answer
assert "| Planet |" in final_answer, final_answer
assert "..." not in final_answer
# Intermediate yields should have progress scaffolding.
assert any("⏳ Researching" in ans for ans, _ in emitted[:-1]), (
    "no intermediate progress yield detected"
)
print("[PASS] build_research_agent streams progress then a real final answer")
print()
print("All markdown-fix regression tests passed.")