"""
Regression tests for the '<answer>...</answer>' placeholder bug that caused the
Space to render only a literal `...` instead of the real (often table-shaped)
final answer.

These tests are plain asserts, runnable with `python _test_markdown_fix.py`.
They import the fixed helpers directly from `app.py` without booting Gradio.
Later sections also cover the search-backend fallbacks and the streaming
agent loop.
"""
|
|
import inspect as _inspect
import os
import sys
from pathlib import Path
from unittest import mock

# Must run before `app` is imported; only takes effect when the variable is
# not already set. NOTE(review): presumably keeps the import from claiming a
# fixed Gradio port -- confirm against app.py.
os.environ.setdefault("GRADIO_SERVER_PORT", "0")

# Make the directory containing this script importable so `app` resolves.
HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE))

import app as _app  # noqa: E402  (needs the sys.path entry above)
from app import (  # noqa: E402
    extract_answer,
    strip_think_blocks,
    ensure_markdown_table_blank_lines,
    decode_escaped_whitespace,
    _is_placeholder_answer,
    parse_tool_call,
)
|
|
|
|
| def _check(name: str, actual, expected) -> None: |
| ok = actual == expected |
| status = "PASS" if ok else "FAIL" |
| print(f"[{status}] {name}") |
| if not ok: |
| print(f" expected: {expected!r}") |
| print(f" actual : {actual!r}") |
| assert ok, name |
|
|
|
|
# --- Placeholder answers are rejected ---
# An <answer> body that is only an ellipsis or whitespace must yield None.
for _label, _raw in (
    ("echoed placeholder `<answer>...</answer>` is rejected", "<answer>...</answer>"),
    ("echoed unicode ellipsis `<answer>…</answer>` is rejected", "<answer>…</answer>"),
    ("whitespace-only `<answer> </answer>` is rejected", "<answer> </answer>"),
):
    _check(_label, extract_answer(_raw), None)

# The detector itself: dot-like filler is a placeholder, real text that merely
# ends in dots is not.
for _label, _text, _expected in (
    ("placeholder detector recognises ASCII dots", "...", True),
    ("placeholder detector recognises unicode ellipsis", "…", True),
    ("placeholder detector recognises interpunct", "·", True),
    ("placeholder detector accepts real text", "The answer is 3...", False),
):
    _check(_label, _is_placeholder_answer(_text), _expected)
|
|
|
|
# --- A real Markdown table survives extraction ---
# The table is wrapped in newlines inside the tags; the extracted value is
# expected to be the table without that surrounding whitespace.
table_body = "| Color | Hex |\n|---|---|\n| Red | #ff0000 |\n| Green | #00ff00 |"
_check(
    "Markdown table inside <answer> is returned intact",
    extract_answer(f"<answer>\n{table_body}\n</answer>"),
    table_body,
)
|
|
|
|
# --- <think> blocks never leak into the answer ---
_check(
    "<think>...</think> is removed from answer content",
    extract_answer("<think>reasoning goes here</think><answer>real answer</answer>"),
    "real answer",
)

_check(
    "multi-line <think> is removed",
    extract_answer(
        "<think>line 1\nline 2\nline 3</think>\n<answer>the truth</answer>"
    ),
    "the truth",
)

# Input without any <think> tag must come back unchanged.
_check(
    "strip_think_blocks leaves non-think content alone",
    strip_think_blocks("plain text"),
    "plain text",
)
|
|
|
|
# --- Truncated output: an <answer> tag that is never closed ---
# Real text after the opening tag is still returned ...
_check(
    "truncated `<answer>` with real text is still extracted",
    extract_answer("<answer>Here is the partial answer"),
    "Here is the partial answer",
)

# ... but an unclosed tag followed only by dots is still a placeholder.
_check(
    "truncated `<answer>` that is just dots is still rejected",
    extract_answer("<answer>..."),
    None,
)
|
|
|
|
# --- Blank line before a Markdown table ---
# A table glued directly under a prose line must gain a separating blank line.
glued = "Here is the comparison:\n| Col | Val |\n|---|---|\n| a | b |"
fixed = ensure_markdown_table_blank_lines(glued)
assert "\n\n| Col | Val |" in fixed, f"blank line was not inserted: {fixed!r}"
print("[PASS] ensure_markdown_table_blank_lines inserts break before table")

# Text that already has the blank line must be returned unchanged.
already_ok = "Here is the comparison:\n\n| Col | Val |\n|---|---|\n| a | b |"
_check(
    "ensure_markdown_table_blank_lines is a no-op when blank line already exists",
    ensure_markdown_table_blank_lines(already_ok),
    already_ok,
)

# A table with nothing in front of it must be returned unchanged as well.
table_at_start = "| Col | Val |\n|---|---|\n| a | b |"
_check(
    "ensure_markdown_table_blank_lines leaves a table at the very start alone",
    ensure_markdown_table_blank_lines(table_at_start),
    table_at_start,
)
|
|
|
|
# --- Tool-call parsing ---
# A <think> block in front of the <tool_call> must not disturb parsing.
tool_out = (
    "<think>I should search for this</think>\n"
    '<tool_call>{"name": "search", "arguments": {"query": ["hello"]}}</tool_call>'
)
# parse_tool_call is unpacked as (name, arguments, error) here.
name, args, err = parse_tool_call(tool_out)
assert err is None, f"unexpected parse error: {err}"
_check("parse_tool_call extracts name", name, "search")
_check("parse_tool_call extracts arguments", args, {"query": ["hello"]})
|
|
|
|
# --- Escape-encoded whitespace ---
# Payload in which every line break is the two characters backslash + "n"
# instead of a real newline.
user_reported_payload = (
    "\\n| Color | Hex |\\n|---|---|\\n| Red | #FF0000 |"
    "\\n| Green | #00FF00 |\\n| Blue | #0000FF |\\n"
)
decoded_user_payload = decode_escaped_whitespace(user_reported_payload)
assert "\n| Color | Hex |" in decoded_user_payload, decoded_user_payload
assert "\\n" not in decoded_user_payload, decoded_user_payload
print("[PASS] decode_escaped_whitespace converts the user-reported payload")

# The same payload wrapped in <answer> tags must be decoded by extract_answer.
escape_encoded_answer = f"<answer>{user_reported_payload}</answer>"
extracted_escape = extract_answer(escape_encoded_answer)
assert extracted_escape is not None
assert "| Red | #FF0000 |" in extracted_escape
assert "\\n" not in extracted_escape
# The header separator row has to survive decoding.
assert "|---|---|" in extracted_escape
print("[PASS] extract_answer decodes escape-encoded <answer> into real newlines")

# A lone literal backslash-n in prose must be left untouched.
code_example = 'Some prose with a single \\n in a code example.'
_check(
    "decode_escaped_whitespace leaves lightly-escaped prose alone",
    decode_escaped_whitespace(code_example),
    code_example,
)

# Three real newlines against one escaped one: text must be left untouched.
mostly_real = "real\nnewlines\nhere\nwith\\none escape"
_check(
    "decode_escaped_whitespace leaves mostly-real-newline text alone",
    decode_escaped_whitespace(mostly_real),
    mostly_real,
)

# One real newline against four escaped ones: the escapes must be decoded.
mostly_escaped = "one real\n then \\na \\nb \\nc \\nd"
decoded_ok = decode_escaped_whitespace(mostly_escaped)
assert decoded_ok.count("\n") > mostly_escaped.count("\n"), decoded_ok
assert decoded_ok.count("\\n") == 0, decoded_ok
print("[PASS] decode_escaped_whitespace decodes when escapes dominate")
|
|
|
|
# --- End to end: placeholder output versus a real table answer ---
buggy_output = "<answer>...</answer>"
good_output = (
    "<think>let me build the table</think>\n"
    "<answer>\n"
    "Here is the table:\n"
    "| Planet | Distance (AU) |\n"
    "|---|---|\n"
    "| Mercury | 0.39 |\n"
    "| Venus | 0.72 |\n"
    "| Earth | 1.00 |\n"
    "</answer>"
)

# The echoed placeholder must not be accepted as an answer.
assert extract_answer(buggy_output) is None

# The real answer is extracted, then gets a blank line in front of its table.
extracted = extract_answer(good_output)
assert extracted is not None
rendered_ready = ensure_markdown_table_blank_lines(extracted)
assert "\n\n| Planet | Distance (AU) |" in rendered_ready, rendered_ready
print("[PASS] end-to-end: placeholder rejected, real table rendered with blank line")
|
|
| |
| |
| |
| |
| import app as _app |
|
|
| class _FakeRatelimit(Exception): |
| pass |
|
|
|
|
| class _RatelimitedDDGS: |
| """Stand-in for DDGS that always raises the way ddgs does on 202.""" |
|
|
| def __enter__(self): |
| return self |
|
|
| def __exit__(self, exc_type, exc, tb): |
| return False |
|
|
| def text(self, *args, **kwargs): |
| raise _FakeRatelimit("https://html.duckduckgo.com/html 202 Ratelimit") |
|
|
|
|
# --- DuckDuckGo rate limit, single query ---
# With DDGS replaced by a stub that always raises, _run_search_single must
# hand back an error payload rather than let the exception escape.
_app.SEARCH_CACHE.clear()  # start from an empty search cache

# SERPER_API_KEY is blanked so a key already present on the app module cannot
# send this query to the real Serper backend.
# NOTE(review): assumes app treats an empty SERPER_API_KEY as "Serper
# disabled" -- confirm against app.py.
# time.sleep is stubbed so any waits inside app return immediately.
with mock.patch.object(_app, "SERPER_API_KEY", ""), \
        mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    out = _app._run_search_single("iPhone 15 vs iPhone 16 features", max_results=3)

assert out["ok"] is False, out
assert "Ratelimit" in out["error"], out
assert out["results"] == []
assert "hint" in out and "training knowledge" in out["hint"], out
print("[PASS] _run_search_single converts DDG rate-limit into a graceful tool error")
|
|
# --- DuckDuckGo rate limit, multi-query entry point ---
# run_search must not raise even though every backend call fails.
_app.SEARCH_CACHE.clear()  # start from an empty search cache

# SERPER_API_KEY is blanked so a key already present on the app module cannot
# send these queries to the real Serper backend.
# NOTE(review): assumes app treats an empty SERPER_API_KEY as "Serper
# disabled" -- confirm against app.py.
with mock.patch.object(_app, "SERPER_API_KEY", ""), \
        mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    try:
        _app.run_search(["q1", "q2"], max_results=3)
    except Exception as exc:
        # Keep the original error attached so a failure shows what escaped.
        raise AssertionError(
            f"run_search should not raise when DDG rate-limits: {exc!r}"
        ) from exc
print("[PASS] run_search swallows backend errors across multi-query calls")
|
|
|
|
| |
| |
| |
| |
| class _FakeResponse: |
| def __init__(self, payload): |
| self._payload = payload |
|
|
| def raise_for_status(self): |
| return None |
|
|
| def json(self): |
| return self._payload |
|
|
|
|
def _fake_serper_ok(url, headers, json, timeout):
    """Replacement for ``requests.post`` that returns a canned Serper result.

    The parameter names must stay as they are: the stub is installed as a
    mock ``side_effect`` and receives whatever arguments the caller passes.

    Args:
        url: Ignored.
        headers: Request headers; must carry the test API key.
        json: Ignored request body.
        timeout: Ignored.

    Returns:
        A ``_FakeResponse`` holding one ``answerBox`` entry and two
        ``organic`` entries.

    Raises:
        AssertionError: If ``X-API-KEY`` is not ``"test-serper-key"``.
    """
    assert headers.get("X-API-KEY") == "test-serper-key"
    return _FakeResponse(
        {
            "answerBox": {
                "title": "iPhone 16 vs 15",
                "link": "https://example.com/answer",
                "snippet": "Apple replaced the mute switch with an action button.",
            },
            "organic": [
                {
                    "title": "iPhone 16 Specs",
                    "link": "https://example.com/iphone-16",
                    "snippet": "A18 chip, 48 MP camera, ...",
                },
                {
                    "title": "iPhone 15 Specs",
                    "link": "https://example.com/iphone-15",
                    "snippet": "A16 Bionic, Dynamic Island...",
                },
            ],
        }
    )
|
|
|
|
# --- Serper is used when a key is configured ---
_app.SEARCH_CACHE.clear()  # start from an empty search cache
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    serper_out = _app._run_search_single("iPhone 16 vs iPhone 15", max_results=5)

assert serper_out["ok"] is True, serper_out
assert serper_out.get("backend") == "serper", serper_out
# The answer box is expected first, followed by the two organic hits.
assert serper_out["results"][0]["title"] == "iPhone 16 vs 15", serper_out
assert len(serper_out["results"]) == 3, serper_out
print("[PASS] Serper backend is preferred when SERPER_API_KEY is set")
|
|
|
|
| def _fake_serper_fail(url, headers, json, timeout): |
| raise RuntimeError("serper: 429 quota exceeded") |
|
|
|
|
| class _WorkingDDGS: |
| def __enter__(self): |
| return self |
|
|
| def __exit__(self, exc_type, exc, tb): |
| return False |
|
|
| def text(self, *args, **kwargs): |
| yield { |
| "title": "DDG result", |
| "href": "https://example.org/ddg", |
| "body": "ddg fallback body", |
| } |
|
|
|
|
# --- Serper fails, DuckDuckGo answers ---
_app.SEARCH_CACHE.clear()  # start from an empty search cache
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
        mock.patch.object(_app, "DDGS", _WorkingDDGS):
    fallback_out = _app._run_search_single("anything", max_results=2)

assert fallback_out["ok"] is True, fallback_out
assert fallback_out.get("backend") == "duckduckgo", fallback_out
assert fallback_out["results"][0]["href"] == "https://example.org/ddg"
print("[PASS] Falls back to DuckDuckGo when Serper errors out")
|
|
|
|
# --- Serper and DuckDuckGo both fail ---
_app.SEARCH_CACHE.clear()  # start from an empty search cache
# time.sleep is stubbed so any waits inside app return immediately.
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
        mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    both_fail = _app._run_search_single("anything", max_results=2)

assert both_fail["ok"] is False, both_fail
# The error text has to name both backends that were tried.
assert "serper" in both_fail["error"].lower(), both_fail
assert "duckduckgo" in both_fail["error"].lower(), both_fail
assert "hint" in both_fail
print("[PASS] Returns graceful error when both Serper and DDG fail")
|
|
|
|
# --- Streaming contract ---
# Both entry points must be generator functions, not plain functions.
assert _inspect.isgeneratorfunction(_app.build_research_agent), (
    "build_research_agent should be a generator so run_ui can stream progress"
)
assert _inspect.isgeneratorfunction(_app.run_ui), (
    "run_ui should be a generator so Gradio streams per-turn status to the UI"
)
print("[PASS] build_research_agent and run_ui are streaming generators")
|
|
|
|
| |
| |
| |
| |
| |
| _fake_model_script = [ |
| ( |
| "<think>I should search the web for Mercury distance.</think>" |
| '<tool_call>{"name": "search", "arguments": {"query": ["Mercury distance AU"]}}</tool_call>', |
| "fake-model", |
| ), |
| ( |
| "<answer>\n" |
| "Here is the table:\n" |
| "| Planet | Distance (AU) |\n" |
| "|---|---|\n" |
| "| Mercury | 0.39 |\n" |
| "</answer>", |
| "fake-model", |
| ), |
| ] |
|
|
|
|
| def _fake_call_model(*args, **kwargs): |
| return _fake_model_script.pop(0) |
|
|
|
|
| class _FakeInferenceClient: |
| def __init__(self, *a, **k): |
| pass |
|
|
|
|
# --- Agent loop end to end with a scripted model and a fake Serper ---
_app.SEARCH_CACHE.clear()  # start from an empty search cache
with mock.patch.object(_app, "call_model", side_effect=_fake_call_model), \
        mock.patch.object(_app, "_build_client_for_model",
                          return_value=(_FakeInferenceClient(), "fake-model", [])), \
        mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    gen = _app.build_research_agent(
        question="How far is Mercury from the sun?",
        model="fake-model",
        max_turns=4,
        max_search_results=3,
        temperature=0.0,
    )
    # Drain the generator while the patches are still active.
    emitted = list(gen)

assert len(emitted) >= 3, f"expected multiple progress yields, got {len(emitted)}"
# Every yield is unpacked as an (answer, trace) pair; only the answer matters.
final_answer, _ = emitted[-1]
assert "Mercury" in final_answer, final_answer
assert "| Planet |" in final_answer, final_answer
assert "...</answer>" not in final_answer
# At least one yield before the last must carry the progress marker.
assert any("⏳ Researching" in ans for ans, _ in emitted[:-1]), (
    "no intermediate progress yield detected"
)
print("[PASS] build_research_agent streams progress then a real final answer")

print()
print("All markdown-fix regression tests passed.")
|
|