""" Regression tests for the '...' placeholder bug that caused the Space to render only a literal `...` instead of the real (often table-shaped) final answer. These tests are plain asserts, runnable with `python _test_markdown_fix.py`. They import the fixed helpers directly from `app.py` without booting Gradio. """ import os import sys from pathlib import Path # Do not start the Gradio UI when importing app.py. os.environ.setdefault("GRADIO_SERVER_PORT", "0") HERE = Path(__file__).resolve().parent sys.path.insert(0, str(HERE)) from unittest import mock from app import ( extract_answer, strip_think_blocks, ensure_markdown_table_blank_lines, decode_escaped_whitespace, _is_placeholder_answer, parse_tool_call, ) def _check(name: str, actual, expected) -> None: ok = actual == expected status = "PASS" if ok else "FAIL" print(f"[{status}] {name}") if not ok: print(f" expected: {expected!r}") print(f" actual : {actual!r}") assert ok, name # ------------------------------------------------------------------------- # 1. The original bug: Quest-4B echoes the template literally. # ------------------------------------------------------------------------- _check( "echoed placeholder `...` is rejected", extract_answer("..."), None, ) _check( "echoed unicode ellipsis `` is rejected", extract_answer(""), None, ) _check( "whitespace-only ` ` is rejected", extract_answer(" "), None, ) _check( "placeholder detector recognises ASCII dots", _is_placeholder_answer("..."), True, ) _check( "placeholder detector recognises unicode ellipsis", _is_placeholder_answer("…"), True, ) _check( "placeholder detector recognises interpunct", _is_placeholder_answer("·"), True, ) _check( "placeholder detector accepts real text", _is_placeholder_answer("The answer is 3..."), False, ) # ------------------------------------------------------------------------- # 2. A real Markdown table inside survives round-trip. 
# -------------------------------------------------------------------------
# NOTE(review): the f-string below wraps the table in bare newlines only; it
# looks as if surrounding markup was stripped from the literal. Reproduced
# exactly as found - confirm against the format app.extract_answer expects.
table_body = "| Color | Hex |\n|---|---|\n| Red | #ff0000 |\n| Green | #00ff00 |"
_check(
    "Markdown table inside is returned intact",
    extract_answer(f"\n{table_body}\n"),
    table_body,
)

# -------------------------------------------------------------------------
# 3. The think block is stripped before extracting the answer.
# -------------------------------------------------------------------------
# NOTE(review): the inputs below contain no visible think/answer delimiters
# ("reasoning goes herereal answer"), so markup appears to be missing from
# these literals as well. Kept byte-for-byte; verify before relying on them.
_check(
    "... is removed from answer content",
    extract_answer("reasoning goes herereal answer"),
    "real answer",
)
_check(
    "multi-line is removed",
    extract_answer(
        "line 1\nline 2\nline 3\nthe truth"
    ),
    "the truth",
)
_check(
    "strip_think_blocks leaves non-think content alone",
    strip_think_blocks("plain text"),
    "plain text",
)

# -------------------------------------------------------------------------
# 4. Truncated output: the answer block is opened but never closed.
# -------------------------------------------------------------------------
# NOTE(review): the opening delimiter these cases are about is not present in
# the literals as found - presumably stripped; confirm the intended inputs.
_check(
    "truncated `` with real text is still extracted",
    extract_answer("Here is the partial answer"),
    "Here is the partial answer",
)
_check(
    "truncated `` that is just dots is still rejected",
    extract_answer("..."),
    None,
)

# -------------------------------------------------------------------------
# 5. ensure_markdown_table_blank_lines inserts the required break.
# -------------------------------------------------------------------------
# A table glued directly under a prose line: a blank line must be inserted
# between the prose and the table header.
glued = "Here is the comparison:\n| Col | Val |\n|---|---|\n| a | b |"
fixed = ensure_markdown_table_blank_lines(glued)
assert "\n\n| Col | Val |" in fixed, f"blank line was not inserted: {fixed!r}"
print("[PASS] ensure_markdown_table_blank_lines inserts break before table")

# Input that already has the blank line must come back unchanged.
already_ok = "Here is the comparison:\n\n| Col | Val |\n|---|---|\n| a | b |"
_check(
    "ensure_markdown_table_blank_lines is a no-op when blank line already exists",
    ensure_markdown_table_blank_lines(already_ok),
    already_ok,
)

# A table on the very first line has nothing above it to separate from.
table_at_start = "| Col | Val |\n|---|---|\n| a | b |"
_check(
    "ensure_markdown_table_blank_lines leaves a table at the very start alone",
    ensure_markdown_table_blank_lines(table_at_start),
    table_at_start,
)

# -------------------------------------------------------------------------
# 6. parse_tool_call still works after the think-stripping refactor.
# -------------------------------------------------------------------------
# NOTE(review): the payload below is prose followed by raw JSON with no
# visible delimiters around either part - markup appears to have been
# stripped from the literal. Kept byte-for-byte; confirm the intended input.
tool_out = (
    "I should search for this\n"
    '{"name": "search", "arguments": {"query": ["hello"]}}'
)
# parse_tool_call is unpacked as a (name, arguments, error) triple here.
name, args, err = parse_tool_call(tool_out)
assert err is None, f"unexpected parse error: {err}"
_check("parse_tool_call extracts name", name, "search")
_check("parse_tool_call extracts arguments", args, {"query": ["hello"]})

# -------------------------------------------------------------------------
# 7. Escaped-whitespace decoding (the 2nd reported bug):
#    the endpoint returned `\n` as literal 2-char sequences, so the
#    pipe table rendered as a one-line sentence of `| a | b |\n...`.
# -------------------------------------------------------------------------
# Every "\\n" below is a literal backslash + "n" pair, not a real newline.
user_reported_payload = (
    "\\n| Color | Hex |\\n|---|---|\\n| Red | #FF0000 |"
    "\\n| Green | #00FF00 |\\n| Blue | #0000FF |\\n"
)
decoded_user_payload = decode_escaped_whitespace(user_reported_payload)
assert "\n| Color | Hex |" in decoded_user_payload, decoded_user_payload
assert "\\n" not in decoded_user_payload, decoded_user_payload
print("[PASS] decode_escaped_whitespace converts the user-reported payload")

# Extract from a full block whose content is escape-encoded.
# NOTE(review): this f-string adds nothing around the payload, so wrapping
# markup appears to have been stripped from the literal. Kept as found;
# confirm the intended input against what app.extract_answer expects.
escape_encoded_answer = f"{user_reported_payload}"
extracted_escape = extract_answer(escape_encoded_answer)
assert extracted_escape is not None
assert "| Red | #FF0000 |" in extracted_escape
assert "\\n" not in extracted_escape
# And the separator must be on its own line so GFM recognises the table.
# A plain substring test would also pass if the escapes had been replaced by
# spaces, so check that some line consists of the separator alone.
assert any(
    line.strip() == "|---|---|" for line in extracted_escape.splitlines()
), extracted_escape
print("[PASS] extract_answer decodes escape-encoded into real newlines")

# Heuristic: do NOT decode when escapes are rare (a real code example).
code_example = 'Some prose with a single \\n in a code example.'
_check(
    "decode_escaped_whitespace leaves lightly-escaped prose alone",
    decode_escaped_whitespace(code_example),
    code_example,
)

# Heuristic: do NOT decode when real newlines already dominate.
mostly_real = "real\nnewlines\nhere\nwith\\none escape"
_check(
    "decode_escaped_whitespace leaves mostly-real-newline text alone",
    decode_escaped_whitespace(mostly_real),
    mostly_real,
)

# Heuristic: DO decode when escapes clearly dominate.
mostly_escaped = "one real\n then \\na \\nb \\nc \\nd"
decoded_ok = decode_escaped_whitespace(mostly_escaped)
# Decoding must add real newlines and leave no escaped ones behind.
assert decoded_ok.count("\n") > mostly_escaped.count("\n"), decoded_ok
assert decoded_ok.count("\\n") == 0, decoded_ok
print("[PASS] decode_escaped_whitespace decodes when escapes dominate")

# -------------------------------------------------------------------------
# 8. End-to-end: the originally-reported scenario now renders a real table.
# -------------------------------------------------------------------------
# NOTE(review): neither literal below contains visible think/answer
# delimiters (note the stray "\n" and trailing "" fragments in good_output),
# so markup appears to have been stripped. Kept byte-for-byte; confirm the
# intended inputs against the format app.extract_answer expects.
buggy_output = "..."
good_output = (
    "let me build the table\n"
    "\n"
    "Here is the table:\n"
    "| Planet | Distance (AU) |\n"
    "|---|---|\n"
    "| Mercury | 0.39 |\n"
    "| Venus | 0.72 |\n"
    "| Earth | 1.00 |\n"
    ""
)

# The buggy case must no longer be accepted as an answer.
assert extract_answer(buggy_output) is None

# The good case must round-trip AND come out table-ready.
extracted = extract_answer(good_output)
assert extracted is not None
rendered_ready = ensure_markdown_table_blank_lines(extracted)
assert "\n\n| Planet | Distance (AU) |" in rendered_ready, rendered_ready
print("[PASS] end-to-end: placeholder rejected, real table rendered with blank line")

# -------------------------------------------------------------------------
# 9. Search backend rate-limit no longer crashes the whole agent.
#    Simulates the DuckDuckGo 202 Ratelimit error the user reported.
# -------------------------------------------------------------------------
# Module-object handle on app, needed as the target of mock.patch.object.
import app as _app


class _FakeRatelimit(Exception):
    """Local stand-in exception raised by the rate-limited DDGS fake."""


class _RatelimitedDDGS:
    """Stand-in for DDGS that always raises the way ddgs does on 202."""

    def __enter__(self):
        # Usable as `with DDGS() as client:`; the fake is its own client.
        return self

    def __exit__(self, exc_type, exc, tb):
        # Returning False lets the exception from text() propagate.
        return False

    def text(self, *args, **kwargs):
        """Ignore all arguments and raise the simulated rate-limit error."""
        raise _FakeRatelimit("https://html.duckduckgo.com/html 202 Ratelimit")


# Clear in-memory cache so the mock is actually exercised.
_app.SEARCH_CACHE.clear()
# time.sleep is replaced with a no-op so the test does not actually wait.
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    out = _app._run_search_single("iPhone 15 vs iPhone 16 features", max_results=3)
# The failure must come back as a structured tool error, not an exception.
assert out["ok"] is False, out
assert "Ratelimit" in out["error"], out
assert out["results"] == []
assert "hint" in out and "training knowledge" in out["hint"], out
print("[PASS] _run_search_single converts DDG rate-limit into a graceful tool error")

# The caller that invokes build_research_agent wraps tool responses into a
# user message; the important thing is that _run_search_single NEVER raises,
# so the agent loop can continue and let the model produce an answer.
_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    # Keep the exception object (not just a flag) so that a failure reports
    # what was actually raised instead of hiding it.
    raised = None
    try:
        _app.run_search(["q1", "q2"], max_results=3)
    except Exception as exc:  # noqa: BLE001 - any exception fails this check
        raised = exc
assert raised is None, (
    f"run_search should not raise when DDG rate-limits (got {raised!r})"
)
print("[PASS] run_search swallows backend errors across multi-query calls")

# -------------------------------------------------------------------------
# 10. Serper backend is preferred when SERPER_API_KEY is set, and DDG is
#     used as a fallback. Verifies the latency fix for the iPhone query.
# -------------------------------------------------------------------------
class _FakeResponse:
    """Minimal response double exposing raise_for_status() and json()."""

    def __init__(self, payload):
        self._payload = payload

    def raise_for_status(self):
        # Simulates a successful HTTP status: nothing to raise.
        return None

    def json(self):
        return self._payload


# The parameter names must match how app.py calls requests.post, since this
# function is installed as that mock's side_effect (presumably the call passes
# headers=, json= and timeout= as keywords - confirm against app.py).
def _fake_serper_ok(url, headers, json, timeout):  # noqa: A002 - names mirror the patched requests.post call
    """Return a canned Serper payload: one answer box plus two organic hits."""
    assert headers.get("X-API-KEY") == "test-serper-key"
    return _FakeResponse(
        {
            "answerBox": {
                "title": "iPhone 16 vs 15",
                "link": "https://example.com/answer",
                "snippet": "Apple replaced the mute switch with an action button.",
            },
            "organic": [
                {
                    "title": "iPhone 16 Specs",
                    "link": "https://example.com/iphone-16",
                    "snippet": "A18 chip, 48 MP camera, ...",
                },
                {
                    "title": "iPhone 15 Specs",
                    "link": "https://example.com/iphone-15",
                    "snippet": "A16 Bionic, Dynamic Island...",
                },
            ],
        }
    )


_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    serper_out = _app._run_search_single("iPhone 16 vs iPhone 15", max_results=5)
assert serper_out["ok"] is True, serper_out
assert serper_out.get("backend") == "serper", serper_out
assert serper_out["results"][0]["title"] == "iPhone 16 vs 15", serper_out  # answer box first
# One answer box + two organic entries from the canned payload.
assert len(serper_out["results"]) == 3, serper_out
print("[PASS] Serper backend is preferred when SERPER_API_KEY is set")


def _fake_serper_fail(url, headers, json, timeout):  # noqa: A002
    """Simulate a Serper failure by raising instead of returning a response."""
    raise RuntimeError("serper: 429 quota exceeded")


class _WorkingDDGS:
    """Stand-in for DDGS whose text() yields a single canned result."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Returning False means exceptions are not suppressed.
        return False

    def text(self, *args, **kwargs):
        # Generator: yields exactly one result dict regardless of arguments.
        yield {
            "title": "DDG result",
            "href": "https://example.org/ddg",
            "body": "ddg fallback body",
        }


_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
        mock.patch.object(_app, "DDGS", _WorkingDDGS):
    fallback_out = _app._run_search_single("anything", max_results=2)
assert fallback_out["ok"] is True, fallback_out
assert fallback_out.get("backend") == "duckduckgo", fallback_out
assert fallback_out["results"][0]["href"] == "https://example.org/ddg"
print("[PASS] Falls back to DuckDuckGo when Serper errors out")

_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
        mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    both_fail = _app._run_search_single("anything", max_results=2)
# With both backends failing, the error text must mention each of them.
assert both_fail["ok"] is False, both_fail
assert "serper" in both_fail["error"].lower(), both_fail
assert "duckduckgo" in both_fail["error"].lower(), both_fail
assert "hint" in both_fail
print("[PASS] Returns graceful error when both Serper and DDG fail")

# -------------------------------------------------------------------------
# 11. build_research_agent streams progress (is a generator).
# -------------------------------------------------------------------------
import inspect as _inspect

assert _inspect.isgeneratorfunction(_app.build_research_agent), (
    "build_research_agent should be a generator so run_ui can stream progress"
)
assert _inspect.isgeneratorfunction(_app.run_ui), (
    "run_ui should be a generator so Gradio streams per-turn status to the UI"
)
print("[PASS] build_research_agent and run_ui are streaming generators")

# -------------------------------------------------------------------------
# 12. End-to-end dry run of the generator: verify at least one progress
#     tuple is yielded BEFORE the final answer, and that the final yield
#     is a real answer (not a placeholder).
# -------------------------------------------------------------------------
# Scripted model replies, consumed in order: first a search request, then the
# final answer. Each entry is a 2-tuple whose second item is "fake-model".
# NOTE(review): the reply texts contain no visible think/tool-call/answer
# delimiters, so markup appears to have been stripped from these literals.
# Kept byte-for-byte; confirm against the format app.py parses.
_fake_model_script = [
    (
        "I should search the web for Mercury distance."
        '{"name": "search", "arguments": {"query": ["Mercury distance AU"]}}',
        "fake-model",
    ),
    (
        "\n"
        "Here is the table:\n"
        "| Planet | Distance (AU) |\n"
        "|---|---|\n"
        "| Mercury | 0.39 |\n"
        "",
        "fake-model",
    ),
]


def _fake_call_model(*args, **kwargs):
    """Return the next scripted reply; IndexError once the script is used up."""
    return _fake_model_script.pop(0)


class _FakeInferenceClient:
    """Inert client double; accepts and ignores any constructor arguments."""

    def __init__(self, *a, **k):
        pass


_app.SEARCH_CACHE.clear()
with mock.patch.object(_app, "call_model", side_effect=_fake_call_model), \
        mock.patch.object(_app, "_build_client_for_model", return_value=(_FakeInferenceClient(), "fake-model", [])), \
        mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    gen = _app.build_research_agent(
        question="How far is Mercury from the sun?",
        model="fake-model",
        max_turns=4,
        max_search_results=3,
        temperature=0.0,
    )
    # Drain the generator while the patches are still active: a generator
    # body only runs when it is iterated, not when it is created.
    emitted = list(gen)

assert len(emitted) >= 3, f"expected multiple progress yields, got {len(emitted)}"
# Every yield is unpacked as an (answer_text, trace) pair.
final_answer, final_trace = emitted[-1]
assert "Mercury" in final_answer, final_answer
assert "| Planet |" in final_answer, final_answer
assert "..." not in final_answer
# Intermediate yields should have progress scaffolding.
assert any("⏳ Researching" in ans for ans, _ in emitted[:-1]), (
    "no intermediate progress yield detected"
)
print("[PASS] build_research_agent streams progress then a real final answer")

print()
print("All markdown-fix regression tests passed.")