Spaces:

osunlp
/

QUEST

Running

App Files Files Community

QUEST / _test_markdown_fix.py

TomLii

Speed up Quest-4B research: add Serper backend and stream live progress

8e8119b 25 days ago

raw

history blame contribute delete

16.5 kB

	"""
	Regression tests for the '<answer>...</answer>' placeholder bug that caused the
	Space to render only a literal `...` instead of the real (often table-shaped)
	final answer.

	These tests are plain asserts, runnable with `python _test_markdown_fix.py`.
	They import the fixed helpers directly from `app.py` without booting Gradio.
	"""

	import os
	import sys
	from pathlib import Path

	# Do not start the Gradio UI when importing app.py.
	os.environ.setdefault("GRADIO_SERVER_PORT", "0")

	HERE = Path(__file__).resolve().parent
	sys.path.insert(0, str(HERE))

	from unittest import mock

	from app import (
	extract_answer,
	strip_think_blocks,
	ensure_markdown_table_blank_lines,
	decode_escaped_whitespace,
	_is_placeholder_answer,
	parse_tool_call,
	)


	def _check(name: str, actual, expected) -> None:
	    """Report one regression check and fail fast when it does not hold.

	    Prints a ``[PASS]`` or ``[FAIL]`` line labelled with ``name``. On a
	    mismatch the expected and actual values are printed with ``repr`` before
	    an ``AssertionError`` carrying ``name`` is raised.
	    """
	    matched = actual == expected
	    print(f"[{'PASS' if matched else 'FAIL'}] {name}")
	    if not matched:
	        # Show both sides so a failure can be diagnosed from the log alone.
	        print(f" expected: {expected!r}")
	        print(f" actual : {actual!r}")
	    assert matched, name


	# -------------------------------------------------------------------------
	# 1. The original bug: Quest-4B echoes the template literally.
	# -------------------------------------------------------------------------
	# Every echoed or empty template must be rejected, i.e. extract to None.
	for _label, _raw in (
	    ("echoed placeholder `<answer>...</answer>` is rejected", "<answer>...</answer>"),
	    ("echoed unicode ellipsis `<answer>…</answer>` is rejected", "<answer>…</answer>"),
	    ("whitespace-only `<answer> </answer>` is rejected", "<answer> </answer>"),
	):
	    _check(_label, extract_answer(_raw), None)

	# The detector itself: punctuation-only strings are placeholders, prose is not.
	for _label, _candidate, _is_placeholder in (
	    ("placeholder detector recognises ASCII dots", "...", True),
	    ("placeholder detector recognises unicode ellipsis", "…", True),
	    ("placeholder detector recognises interpunct", "·", True),
	    ("placeholder detector accepts real text", "The answer is 3...", False),
	):
	    _check(_label, _is_placeholder_answer(_candidate), _is_placeholder)


	# -------------------------------------------------------------------------
	# 2. A real Markdown table inside <answer> survives round-trip.
	# -------------------------------------------------------------------------
	# Plain pipes on purpose: a backslash before `|` is an unrecognized escape
	# sequence in a normal string literal and would leave a literal backslash in
	# every cell, so the fixture would no longer be a pipe table.
	table_body = "| Color | Hex |\n|---|---|\n| Red | #ff0000 |\n| Green | #00ff00 |"
	_check(
	    "Markdown table inside <answer> is returned intact",
	    extract_answer(f"<answer>\n{table_body}\n</answer>"),
	    table_body,
	)


	# -------------------------------------------------------------------------
	# 3. <think> block is stripped before extracting the answer.
	# -------------------------------------------------------------------------
	for _label, _raw, _want in (
	    (
	        "<think>...</think> is removed from answer content",
	        "<think>reasoning goes here</think><answer>real answer</answer>",
	        "real answer",
	    ),
	    (
	        "multi-line <think> is removed",
	        "<think>line 1\nline 2\nline 3</think>\n<answer>the truth</answer>",
	        "the truth",
	    ),
	):
	    _check(_label, extract_answer(_raw), _want)

	# Text without any <think> block must pass through unchanged.
	_plain = "plain text"
	_check(
	    "strip_think_blocks leaves non-think content alone",
	    strip_think_blocks(_plain),
	    "plain text",
	)


	# -------------------------------------------------------------------------
	# 4. Truncated output: <answer> opened, never closed.
	# -------------------------------------------------------------------------
	for _label, _raw, _want in (
	    (
	        "truncated `<answer>` with real text is still extracted",
	        "<answer>Here is the partial answer",
	        "Here is the partial answer",
	    ),
	    (
	        "truncated `<answer>` that is just dots is still rejected",
	        "<answer>...",
	        None,
	    ),
	):
	    _check(_label, extract_answer(_raw), _want)


	# -------------------------------------------------------------------------
	# 5. ensure_markdown_table_blank_lines inserts the required break.
	# -------------------------------------------------------------------------
	# Fixtures use plain pipes: `\|` is an unrecognized escape sequence and would
	# put a backslash at the start of every row, so no row would look like a table.
	glued = "Here is the comparison:\n| Col | Val |\n|---|---|\n| a | b |"
	fixed = ensure_markdown_table_blank_lines(glued)
	assert "\n\n| Col | Val |" in fixed, f"blank line was not inserted: {fixed!r}"
	print("[PASS] ensure_markdown_table_blank_lines inserts break before table")

	already_ok = "Here is the comparison:\n\n| Col | Val |\n|---|---|\n| a | b |"
	_check(
	    "ensure_markdown_table_blank_lines is a no-op when blank line already exists",
	    ensure_markdown_table_blank_lines(already_ok),
	    already_ok,
	)

	table_at_start = "| Col | Val |\n|---|---|\n| a | b |"
	_check(
	    "ensure_markdown_table_blank_lines leaves a table at the very start alone",
	    ensure_markdown_table_blank_lines(table_at_start),
	    table_at_start,
	)


	# -------------------------------------------------------------------------
	# 6. parse_tool_call still works after the <think>-stripping refactor.
	# -------------------------------------------------------------------------
	tool_out = "\n".join(
	    (
	        "<think>I should search for this</think>",
	        '<tool_call>{"name": "search", "arguments": {"query": ["hello"]}}</tool_call>',
	    )
	)
	name, args, err = parse_tool_call(tool_out)
	assert err is None, f"unexpected parse error: {err}"
	for _label, _got, _want in (
	    ("parse_tool_call extracts name", name, "search"),
	    ("parse_tool_call extracts arguments", args, {"query": ["hello"]}),
	):
	    _check(_label, _got, _want)


	# -------------------------------------------------------------------------
	# 7. Escaped-whitespace decoding (the 2nd reported bug):
	#    the endpoint returned `\n` as literal 2-char sequences, so the
	#    pipe table rendered as a one-line sentence of `| a | b |\n...`.
	# -------------------------------------------------------------------------
	# Only the newlines are escape-encoded (`\\n`); the pipes are plain. A `\|`
	# would be an unrecognized escape sequence and would add stray backslashes.
	user_reported_payload = (
	    "\\n| Color | Hex |\\n|---|---|\\n| Red | #FF0000 |"
	    "\\n| Green | #00FF00 |\\n| Blue | #0000FF |\\n"
	)
	decoded_user_payload = decode_escaped_whitespace(user_reported_payload)
	assert "\n| Color | Hex |" in decoded_user_payload, decoded_user_payload
	assert "\\n" not in decoded_user_payload, decoded_user_payload
	print("[PASS] decode_escaped_whitespace converts the user-reported payload")

	# Extract from a full <answer> block whose content is escape-encoded.
	escape_encoded_answer = f"<answer>{user_reported_payload}</answer>"
	extracted_escape = extract_answer(escape_encoded_answer)
	assert extracted_escape is not None
	assert "| Red | #FF0000 |" in extracted_escape
	assert "\\n" not in extracted_escape
	# And the separator must be on its own line so GFM recognises the table.
	assert "|---|---|" in extracted_escape
	print("[PASS] extract_answer decodes escape-encoded <answer> into real newlines")

	# Heuristic: do NOT decode when escapes are rare (a real code example).
	code_example = 'Some prose with a single \\n in a code example.'
	_check(
	    "decode_escaped_whitespace leaves lightly-escaped prose alone",
	    decode_escaped_whitespace(code_example),
	    code_example,
	)

	# Heuristic: do NOT decode when real newlines already dominate.
	mostly_real = "real\nnewlines\nhere\nwith\\none escape"
	_check(
	    "decode_escaped_whitespace leaves mostly-real-newline text alone",
	    decode_escaped_whitespace(mostly_real),
	    mostly_real,
	)

	# Heuristic: DO decode when escapes clearly dominate.
	mostly_escaped = "one real\n then \\na \\nb \\nc \\nd"
	decoded_ok = decode_escaped_whitespace(mostly_escaped)
	assert decoded_ok.count("\n") > mostly_escaped.count("\n"), decoded_ok
	assert decoded_ok.count("\\n") == 0, decoded_ok
	print("[PASS] decode_escaped_whitespace decodes when escapes dominate")


	# -------------------------------------------------------------------------
	# 8. End-to-end: the originally-reported scenario now renders a real table.
	# -------------------------------------------------------------------------
	buggy_output = "<answer>...</answer>"
	# Table rows use plain pipes; `\|` is an unrecognized escape sequence and
	# would leave a backslash in front of every cell separator.
	good_output = (
	    "<think>let me build the table</think>\n"
	    "<answer>\n"
	    "Here is the table:\n"
	    "| Planet | Distance (AU) |\n"
	    "|---|---|\n"
	    "| Mercury | 0.39 |\n"
	    "| Venus | 0.72 |\n"
	    "| Earth | 1.00 |\n"
	    "</answer>"
	)

	# The buggy case must no longer be accepted as an answer.
	assert extract_answer(buggy_output) is None
	# The good case must round-trip AND come out table-ready.
	extracted = extract_answer(good_output)
	assert extracted is not None
	rendered_ready = ensure_markdown_table_blank_lines(extracted)
	assert "\n\n| Planet | Distance (AU) |" in rendered_ready, rendered_ready
	print("[PASS] end-to-end: placeholder rejected, real table rendered with blank line")

	# -------------------------------------------------------------------------
	# 9. Search backend rate-limit no longer crashes the whole agent.
	# Simulates the DuckDuckGo 202 Ratelimit error the user reported.
	# -------------------------------------------------------------------------
	import app as _app

	class _FakeRatelimit(Exception):
	    """Exception raised by the fake backend to imitate a rate-limit failure."""


	class _RatelimitedDDGS:
	    """Stand-in for DDGS that always raises the way ddgs does on 202."""

	    def __enter__(self):
	        # Usable as `with DDGS() as ddgs:`; the context value is the fake itself.
	        return self

	    def __exit__(self, exc_type, exc, tb):
	        # Returning False lets an exception from the `with` body propagate.
	        return False

	    def text(self, *args, **kwargs):
	        """Always raise ``_FakeRatelimit``, whatever the call shape.

	        Accepts any positional and keyword arguments so the stub fails with
	        the simulated rate-limit error rather than with a ``TypeError`` when
	        the caller passes options by keyword.
	        """
	        raise _FakeRatelimit("https://html.duckduckgo.com/html 202 Ratelimit")


	# Clear in-memory cache so the mock is actually exercised.
	_app.SEARCH_CACHE.clear()

	# The sleep stub takes any call shape (`*_a, **_k`) so it never raises a
	# TypeError of its own, however the code under test calls `time.sleep`.
	with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
	        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
	    out = _app._run_search_single("iPhone 15 vs iPhone 16 features", max_results=3)

	assert out["ok"] is False, out
	assert "Ratelimit" in out["error"], out
	assert out["results"] == []
	assert "hint" in out and "training knowledge" in out["hint"], out
	print("[PASS] _run_search_single converts DDG rate-limit into a graceful tool error")

	# The caller that invokes build_research_agent wraps tool responses into a
	# user message; the important thing is that _run_search_single NEVER raises,
	# so the agent loop can continue and let the model produce an <answer>.
	_app.SEARCH_CACHE.clear()
	with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
	        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
	    # Deliberately broad: any exception escaping run_search is the failure.
	    try:
	        _ = _app.run_search(["q1", "q2"], max_results=3)
	        raised = False
	    except Exception:
	        raised = True
	assert not raised, "run_search should not raise when DDG rate-limits"
	print("[PASS] run_search swallows backend errors across multi-query calls")


	# -------------------------------------------------------------------------
	# 10. Serper backend is preferred when SERPER_API_KEY is set, and DDG is
	# used as a fallback. Verifies the latency fix for the iPhone query.
	# -------------------------------------------------------------------------
	class _FakeResponse:
	    """Minimal response double exposing only ``raise_for_status`` and ``json``."""

	    def __init__(self, payload):
	        # Stored untouched; json() hands back this same object.
	        self._payload = payload

	    def raise_for_status(self):
	        """Do nothing, so the fake always looks like a successful response."""
	        return None

	    def json(self):
	        """Return the payload supplied at construction time."""
	        return self._payload


	def _fake_serper_ok(url, headers, json, timeout):  # noqa: A002 - gradio-style arg
	    """Fake a successful Serper POST: one answer box plus two organic hits.

	    Asserts that the request carries the test API key in ``X-API-KEY`` and
	    returns a ``_FakeResponse`` wrapping the canned payload. ``url``, ``json``
	    and ``timeout`` are accepted but not inspected.
	    """
	    assert headers.get("X-API-KEY") == "test-serper-key"
	    answer_box = {
	        "title": "iPhone 16 vs 15",
	        "link": "https://example.com/answer",
	        "snippet": "Apple replaced the mute switch with an action button.",
	    }
	    organic_hits = [
	        {
	            "title": "iPhone 16 Specs",
	            "link": "https://example.com/iphone-16",
	            "snippet": "A18 chip, 48 MP camera, ...",
	        },
	        {
	            "title": "iPhone 15 Specs",
	            "link": "https://example.com/iphone-15",
	            "snippet": "A16 Bionic, Dynamic Island...",
	        },
	    ]
	    return _FakeResponse({"answerBox": answer_box, "organic": organic_hits})


	_app.SEARCH_CACHE.clear()
	# Build the patchers first, then activate both for a single search call.
	_key_patch = mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key")
	_post_patch = mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok)
	with _key_patch, _post_patch:
	    serper_out = _app._run_search_single("iPhone 16 vs iPhone 15", max_results=5)

	assert serper_out["ok"] is True, serper_out
	assert serper_out.get("backend") == "serper", serper_out
	# The answer box is expected ahead of the organic results.
	assert serper_out["results"][0]["title"] == "iPhone 16 vs 15", serper_out
	# One answer box plus the two organic hits from the fake payload.
	assert len(serper_out["results"]) == 3, serper_out
	print("[PASS] Serper backend is preferred when SERPER_API_KEY is set")


	def _fake_serper_fail(url, headers, json, timeout):  # noqa: A002
	    """Fake a failing Serper POST by raising a quota-exceeded ``RuntimeError``."""
	    quota_error = RuntimeError("serper: 429 quota exceeded")
	    raise quota_error


	class _WorkingDDGS:
	    """Stand-in for DDGS whose ``text`` search always yields one canned hit."""

	    def __enter__(self):
	        # Usable as `with DDGS() as ddgs:`; the context value is the fake itself.
	        return self

	    def __exit__(self, exc_type, exc, tb):
	        # Returning False lets an exception from the `with` body propagate.
	        return False

	    def text(self, *args, **kwargs):
	        """Yield a single fixed result, ignoring every argument.

	        Accepts any positional and keyword arguments so a caller that passes
	        options by keyword does not hit a ``TypeError`` in the stub.
	        """
	        yield {
	            "title": "DDG result",
	            "href": "https://example.org/ddg",
	            "body": "ddg fallback body",
	        }


	# Serper fails, DuckDuckGo works: the DDG result must be returned.
	_app.SEARCH_CACHE.clear()
	with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
	        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
	        mock.patch.object(_app, "DDGS", _WorkingDDGS):
	    fallback_out = _app._run_search_single("anything", max_results=2)

	assert fallback_out["ok"] is True, fallback_out
	assert fallback_out.get("backend") == "duckduckgo", fallback_out
	assert fallback_out["results"][0]["href"] == "https://example.org/ddg"
	print("[PASS] Falls back to DuckDuckGo when Serper errors out")


	# Both backends fail: the result must be a graceful error naming both.
	_app.SEARCH_CACHE.clear()
	# The sleep stub takes any call shape (`*_a, **_k`) so it never raises a
	# TypeError of its own, however the code under test calls `time.sleep`.
	with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
	        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
	        mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
	        mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
	    both_fail = _app._run_search_single("anything", max_results=2)

	assert both_fail["ok"] is False, both_fail
	assert "serper" in both_fail["error"].lower(), both_fail
	assert "duckduckgo" in both_fail["error"].lower(), both_fail
	assert "hint" in both_fail
	print("[PASS] Returns graceful error when both Serper and DDG fail")


	# -------------------------------------------------------------------------
	# 11. build_research_agent streams progress (is a generator).
	# -------------------------------------------------------------------------
	import inspect as _inspect

	# Attributes are looked up by name inside the loop so each function is
	# resolved only when its own assertion runs.
	for _fn_name, _why in (
	    (
	        "build_research_agent",
	        "build_research_agent should be a generator so run_ui can stream progress",
	    ),
	    (
	        "run_ui",
	        "run_ui should be a generator so Gradio streams per-turn status to the UI",
	    ),
	):
	    assert _inspect.isgeneratorfunction(getattr(_app, _fn_name)), _why
	print("[PASS] build_research_agent and run_ui are streaming generators")


	# -------------------------------------------------------------------------
	# 12. End-to-end dry run of the generator: verify at least one progress
	# tuple is yielded BEFORE the final answer, and that the final yield
	# is a real answer (not a placeholder).
	# -------------------------------------------------------------------------
	# Scripted model turns, consumed in order: first a search tool call, then the
	# final answer. Each entry is a (text, model-name) pair. Table rows use plain
	# pipes; `\|` is an unrecognized escape sequence and would leave backslashes
	# in the answer.
	_fake_model_script = [
	    (
	        "<think>I should search the web for Mercury distance.</think>"
	        '<tool_call>{"name": "search", "arguments": {"query": ["Mercury distance AU"]}}</tool_call>',
	        "fake-model",
	    ),
	    (
	        "<answer>\n"
	        "Here is the table:\n"
	        "| Planet | Distance (AU) |\n"
	        "|---|---|\n"
	        "| Mercury | 0.39 |\n"
	        "</answer>",
	        "fake-model",
	    ),
	]


	def _fake_call_model(*args, **kwargs):
	    """Return the next scripted turn, ignoring every argument.

	    Accepts any positional and keyword arguments so it can stand in for the
	    real model call whatever its call shape. Raises ``IndexError`` once the
	    script is exhausted.
	    """
	    return _fake_model_script.pop(0)


	class _FakeInferenceClient:
	    """Inert placeholder for the inference client; it has no behaviour."""

	    def __init__(self, *a, **k):
	        # Accept any construction arguments, including none at all: the
	        # dry run builds this fake with a bare `_FakeInferenceClient()`.
	        pass


	_app.SEARCH_CACHE.clear()
	with mock.patch.object(_app, "call_model", side_effect=_fake_call_model), \
	        mock.patch.object(_app, "_build_client_for_model",
	                          return_value=(_FakeInferenceClient(), "fake-model", [])), \
	        mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
	        mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
	    gen = _app.build_research_agent(
	        question="How far is Mercury from the sun?",
	        model="fake-model",
	        max_turns=4,
	        max_search_results=3,
	        temperature=0.0,
	    )
	    # Drain the generator while the patches are still active.
	    emitted = list(gen)

	assert len(emitted) >= 3, f"expected multiple progress yields, got {len(emitted)}"
	final_answer, final_trace = emitted[-1]
	assert "Mercury" in final_answer, final_answer
	# Plain pipe: `\|` is an unrecognized escape sequence and would require a
	# literal backslash in the rendered answer.
	assert "| Planet |" in final_answer, final_answer
	assert "...</answer>" not in final_answer
	# Intermediate yields should have progress scaffolding.
	assert any("⏳ Researching" in ans for ans, _ in emitted[:-1]), (
	    "no intermediate progress yield detected"
	)
	print("[PASS] build_research_agent streams progress then a real final answer")

	print()
	print("All markdown-fix regression tests passed.")