File size: 16,535 Bytes
8e8119b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
"""
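Regression tests for the '<answer>...</answer>' placeholder bug that caused the
Space to render only a literal `...` instead of the real (often table-shaped)
final answer.

Later sections also cover escaped-whitespace decoding, the search backends
(Serper preferred, DuckDuckGo as fallback, graceful rate-limit errors) and the
streaming behaviour of `build_research_agent` / `run_ui`.

These tests are plain asserts, runnable with `python _test_markdown_fix.py`.
They import the fixed helpers directly from `app.py` without booting Gradio.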
"""

import os
import sys
from pathlib import Path

# Do not start the Gradio UI when importing app.py.
os.environ.setdefault("GRADIO_SERVER_PORT", "0")

HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE))

from unittest import mock

from app import (
    extract_answer,
    strip_think_blocks,
    ensure_markdown_table_blank_lines,
    decode_escaped_whitespace,
    _is_placeholder_answer,
    parse_tool_call,
)


def _check(name: str, actual, expected) -> None:
    """Report one named check and abort the script when it does not hold.

    Prints ``[PASS] name`` on success. On a mismatch prints ``[FAIL] name``
    followed by the reprs of the expected and actual values, then raises.

    Args:
        name: Human-readable label printed next to the status.
        actual: Value produced by the code under test.
        expected: Value that ``actual`` must compare equal to (``==``).

    Raises:
        AssertionError: With ``name`` as its message when the values differ.
    """
    if actual == expected:
        print(f"[PASS] {name}")
        return
    print(f"[FAIL] {name}")
    print(f"  expected: {expected!r}")
    print(f"  actual  : {actual!r}")
    # Raise explicitly instead of using `assert`: assert statements are
    # stripped under `python -O`, which would let a failed check print
    # [FAIL] and then carry on as if nothing had happened.
    raise AssertionError(name)


# -------------------------------------------------------------------------
# 1. The original bug: Quest-4B echoes the template literally.
# -------------------------------------------------------------------------
# Rows are (label, helper under test, raw input, expected result). They are
# evaluated one at a time, in order, so the report reads top to bottom.
_placeholder_cases = (
    (
        "echoed placeholder `<answer>...</answer>` is rejected",
        extract_answer,
        "<answer>...</answer>",
        None,
    ),
    (
        "echoed unicode ellipsis `<answer>…</answer>` is rejected",
        extract_answer,
        "<answer>…</answer>",
        None,
    ),
    (
        "whitespace-only `<answer>   </answer>` is rejected",
        extract_answer,
        "<answer>   </answer>",
        None,
    ),
    (
        "placeholder detector recognises ASCII dots",
        _is_placeholder_answer,
        "...",
        True,
    ),
    (
        "placeholder detector recognises unicode ellipsis",
        _is_placeholder_answer,
        "…",
        True,
    ),
    (
        "placeholder detector recognises interpunct",
        _is_placeholder_answer,
        "·",
        True,
    ),
    (
        "placeholder detector accepts real text",
        _is_placeholder_answer,
        "The answer is 3...",
        False,
    ),
)
for _label, _helper, _raw, _expected in _placeholder_cases:
    _check(_label, _helper(_raw), _expected)


# -------------------------------------------------------------------------
# 2. A real Markdown table inside <answer> survives round-trip.
# -------------------------------------------------------------------------
# Assemble the table row by row; the joined text is exactly what
# extract_answer is expected to hand back.
table_body = "\n".join(
    (
        "| Color | Hex |",
        "|---|---|",
        "| Red | #ff0000 |",
        "| Green | #00ff00 |",
    )
)
_wrapped_table = "<answer>\n" + table_body + "\n</answer>"
_check(
    "Markdown table inside <answer> is returned intact",
    extract_answer(_wrapped_table),
    table_body,
)


# -------------------------------------------------------------------------
# 3. <think> block is stripped before extracting the answer.
# -------------------------------------------------------------------------
# Inputs are named up front; the checks below then run in the same order.
_single_line_think = "<think>reasoning goes here</think><answer>real answer</answer>"
_multi_line_think = "\n".join(
    (
        "<think>line 1",
        "line 2",
        "line 3</think>",
        "<answer>the truth</answer>",
    )
)
_no_think = "plain text"

_check(
    "<think>...</think> is removed from answer content",
    extract_answer(_single_line_think),
    "real answer",
)
_check(
    "multi-line <think> is removed",
    extract_answer(_multi_line_think),
    "the truth",
)
_check(
    "strip_think_blocks leaves non-think content alone",
    strip_think_blocks(_no_think),
    _no_think,
)


# -------------------------------------------------------------------------
# 4. Truncated output: <answer> opened, never closed.
# -------------------------------------------------------------------------
# (label, truncated model output, expected extraction)
for _label, _truncated, _expected in (
    (
        "truncated `<answer>` with real text is still extracted",
        "<answer>Here is the partial answer",
        "Here is the partial answer",
    ),
    (
        "truncated `<answer>` that is just dots is still rejected",
        "<answer>...",
        None,
    ),
):
    _check(_label, extract_answer(_truncated), _expected)


# -------------------------------------------------------------------------
# 5. ensure_markdown_table_blank_lines inserts the required break.
# -------------------------------------------------------------------------
# All three inputs share the same table text and differ only in what
# precedes it.
_table = "| Col | Val |\n|---|---|\n| a | b |"
_intro = "Here is the comparison:"

glued = f"{_intro}\n{_table}"
fixed = ensure_markdown_table_blank_lines(glued)
assert "\n\n| Col | Val |" in fixed, f"blank line was not inserted: {fixed!r}"
print("[PASS] ensure_markdown_table_blank_lines inserts break before table")

already_ok = f"{_intro}\n\n{_table}"
table_at_start = _table
# Both remaining inputs must come back unchanged.
for _label, _text in (
    (
        "ensure_markdown_table_blank_lines is a no-op when blank line already exists",
        already_ok,
    ),
    (
        "ensure_markdown_table_blank_lines leaves a table at the very start alone",
        table_at_start,
    ),
):
    _check(_label, ensure_markdown_table_blank_lines(_text), _text)


# -------------------------------------------------------------------------
# 6. parse_tool_call still works after the <think>-stripping refactor.
# -------------------------------------------------------------------------
# A <think> block on one line, followed by the tool call on the next.
tool_out = "\n".join(
    (
        "<think>I should search for this</think>",
        '<tool_call>{"name": "search", "arguments": {"query": ["hello"]}}</tool_call>',
    )
)
name, args, err = parse_tool_call(tool_out)
assert err is None, f"unexpected parse error: {err}"
for _label, _got, _want in (
    ("parse_tool_call extracts name", name, "search"),
    ("parse_tool_call extracts arguments", args, {"query": ["hello"]}),
):
    _check(_label, _got, _want)


# -------------------------------------------------------------------------
# 7. Escaped-whitespace decoding (the 2nd reported bug):
#    the endpoint returned `\n` as literal 2-char sequences, so the
#    pipe table rendered as a one-line sentence of `| a | b |\n...`.
# -------------------------------------------------------------------------
# Backslash followed by "n": two characters, not a real newline.
_ESCAPED_NEWLINE = "\\n"
_payload_rows = (
    "| Color | Hex |",
    "|---|---|",
    "| Red | #FF0000 |",
    "| Green | #00FF00 |",
    "| Blue | #0000FF |",
)
# Every row is preceded by an escaped newline, and one more closes the payload.
user_reported_payload = (
    _ESCAPED_NEWLINE + _ESCAPED_NEWLINE.join(_payload_rows) + _ESCAPED_NEWLINE
)
decoded_user_payload = decode_escaped_whitespace(user_reported_payload)
assert "\n| Color | Hex |" in decoded_user_payload, decoded_user_payload
assert "\\n" not in decoded_user_payload, decoded_user_payload
print("[PASS] decode_escaped_whitespace converts the user-reported payload")

# Extract from a full <answer> block whose content is escape-encoded.
escape_encoded_answer = f"<answer>{user_reported_payload}</answer>"
extracted_escape = extract_answer(escape_encoded_answer)
assert extracted_escape is not None, (
    "extract_answer rejected the escape-encoded answer"
)
assert "| Red | #FF0000 |" in extracted_escape, extracted_escape
assert "\\n" not in extracted_escape, extracted_escape
# And the separator must be on its own line so GFM recognises the table.
# A plain substring test would also pass if the separator were still glued
# to neighbouring cells on one long line, so compare whole lines instead.
assert any(
    line.strip() == "|---|---|" for line in extracted_escape.splitlines()
), extracted_escape
print("[PASS] extract_answer decodes escape-encoded <answer> into real newlines")

# Heuristic: do NOT decode when escapes are rare (a real code example),
# and do NOT decode when real newlines already dominate.
code_example = 'Some prose with a single \\n in a code example.'
mostly_real = "real\nnewlines\nhere\nwith\\none escape"
# Both inputs must come back unchanged.
for _label, _untouched in (
    (
        "decode_escaped_whitespace leaves lightly-escaped prose alone",
        code_example,
    ),
    (
        "decode_escaped_whitespace leaves mostly-real-newline text alone",
        mostly_real,
    ),
):
    _check(_label, decode_escaped_whitespace(_untouched), _untouched)

# Heuristic: DO decode when escapes clearly dominate.
mostly_escaped = "one real\n then \\na \\nb \\nc \\nd"
decoded_ok = decode_escaped_whitespace(mostly_escaped)
_real_newlines_before = mostly_escaped.count("\n")
# Decoding must add real newlines and leave no escaped ones behind.
assert decoded_ok.count("\n") > _real_newlines_before, decoded_ok
assert decoded_ok.count("\\n") == 0, decoded_ok
print("[PASS] decode_escaped_whitespace decodes when escapes dominate")


# -------------------------------------------------------------------------
# 8. End-to-end: the originally-reported scenario now renders a real table.
# -------------------------------------------------------------------------
buggy_output = "<answer>...</answer>"
# One entry per output line; joined with real newlines, no trailing newline.
good_output = "\n".join(
    [
        "<think>let me build the table</think>",
        "<answer>",
        "Here is the table:",
        "| Planet | Distance (AU) |",
        "|---|---|",
        "| Mercury | 0.39 |",
        "| Venus | 0.72 |",
        "| Earth | 1.00 |",
        "</answer>",
    ]
)

# The buggy case must no longer be accepted as an answer.
assert extract_answer(buggy_output) is None
# The good case must round-trip AND come out table-ready.
extracted = extract_answer(good_output)
assert extracted is not None
rendered_ready = ensure_markdown_table_blank_lines(extracted)
_header_after_blank_line = "\n\n| Planet | Distance (AU) |"
assert _header_after_blank_line in rendered_ready, rendered_ready
print("[PASS] end-to-end: placeholder rejected, real table rendered with blank line")

# -------------------------------------------------------------------------
# 9. Search backend rate-limit no longer crashes the whole agent.
#    Simulates the DuckDuckGo 202 Ratelimit error the user reported.
# -------------------------------------------------------------------------
import app as _app

class _FakeRatelimit(Exception):
    """Exception type the fake DDGS client raises to simulate a rate limit."""


class _RatelimitedDDGS:
    """DDGS double, usable as a context manager, whose `text` always fails.

    Every `text` call raises `_FakeRatelimit` with a DuckDuckGo-style
    "202 Ratelimit" message.
    """

    def __enter__(self):
        # Entering the `with` block yields the double itself.
        return self

    def __exit__(self, exc_type, exc, tb):
        # A falsy return value means exceptions are never suppressed.
        return False

    def text(self, *args, **kwargs):
        # Any call shape is accepted; the outcome is always the same error.
        rate_limit_error = _FakeRatelimit(
            "https://html.duckduckgo.com/html 202 Ratelimit"
        )
        raise rate_limit_error


# Clear in-memory cache so the mock is actually exercised.
_app.SEARCH_CACHE.clear()

# Replace the app's DDGS with the always-failing stand-in and turn time.sleep
# into a no-op (presumably so any retry back-off inside app.py does not slow
# the test down - confirm against app.py).
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
     mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    out = _app._run_search_single("iPhone 15 vs iPhone 16 features", max_results=3)

# The failure must come back as a structured result instead of an exception:
# ok flag cleared, the backend's "Ratelimit" text kept in the error, no
# results, and a hint that mentions "training knowledge".
assert out["ok"] is False, out
assert "Ratelimit" in out["error"], out
assert out["results"] == []
assert "hint" in out and "training knowledge" in out["hint"], out
print("[PASS] _run_search_single converts DDG rate-limit into a graceful tool error")

# The caller that invokes build_research_agent wraps tool responses into a
# user message; the important thing is that _run_search_single NEVER raises,
# so the agent loop can continue and let the model produce an <answer>.
_app.SEARCH_CACHE.clear()
# Keep the exception object itself (not just a flag) so that a failure
# reports what was actually raised.
_run_search_error = None
with mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
     mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    try:
        _ = _app.run_search(["q1", "q2"], max_results=3)
    except Exception as exc:  # any exception at all is the failure under test
        # `exc` is unbound when the except clause ends, so copy it out.
        _run_search_error = exc
raised = _run_search_error is not None
assert not raised, (
    f"run_search should not raise when DDG rate-limits: {_run_search_error!r}"
)
print("[PASS] run_search swallows backend errors across multi-query calls")


# -------------------------------------------------------------------------
# 10. Serper backend is preferred when SERPER_API_KEY is set, and DDG is
#     used as a fallback. Verifies the latency fix for the iPhone query.
# -------------------------------------------------------------------------
class _FakeResponse:
    """Response double that never reports an error and returns a canned payload."""

    def __init__(self, payload):
        # Stored as-is; json() hands back this very object.
        self._payload = payload

    def raise_for_status(self):
        """Do nothing: this fake response always counts as successful."""
        return None

    def json(self):
        """Return the payload supplied at construction time."""
        canned = self._payload
        return canned


def _fake_serper_ok(url, headers, json, timeout):  # noqa: A002 - `json` mirrors the patched requests.post keyword
    """Stand-in for a successful Serper POST.

    Checks that the request carries the test API key, then returns a
    `_FakeResponse` holding one answer box followed by two organic hits.
    """
    assert headers.get("X-API-KEY") == "test-serper-key"

    answer_box = {
        "title": "iPhone 16 vs 15",
        "link": "https://example.com/answer",
        "snippet": "Apple replaced the mute switch with an action button.",
    }
    # (title, link, snippet) for each organic hit, in response order.
    organic_rows = (
        (
            "iPhone 16 Specs",
            "https://example.com/iphone-16",
            "A18 chip, 48 MP camera, ...",
        ),
        (
            "iPhone 15 Specs",
            "https://example.com/iphone-15",
            "A16 Bionic, Dynamic Island...",
        ),
    )
    organic_hits = [
        {"title": title, "link": link, "snippet": snippet}
        for title, link, snippet in organic_rows
    ]
    return _FakeResponse({"answerBox": answer_box, "organic": organic_hits})


_app.SEARCH_CACHE.clear()
# With a Serper key configured and requests.post stubbed to succeed, the
# search is expected to be answered by the Serper backend.
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    serper_out = _app._run_search_single("iPhone 16 vs iPhone 15", max_results=5)

assert serper_out["ok"] is True, serper_out
assert serper_out.get("backend") == "serper", serper_out
assert serper_out["results"][0]["title"] == "iPhone 16 vs 15", serper_out  # answer box first
# One answer box plus the two organic hits from the canned payload.
assert len(serper_out["results"]) == 3, serper_out
print("[PASS] Serper backend is preferred when SERPER_API_KEY is set")


def _fake_serper_fail(url, headers, json, timeout):  # noqa: A002
    """Stand-in for a failing Serper POST: always raises RuntimeError."""
    quota_error = RuntimeError("serper: 429 quota exceeded")
    raise quota_error


class _WorkingDDGS:
    """DDGS double, usable as a context manager, whose `text` yields one hit."""

    def __enter__(self):
        # Entering the `with` block yields the double itself.
        return self

    def __exit__(self, exc_type, exc, tb):
        # A falsy return value means exceptions are never suppressed.
        return False

    def text(self, *args, **kwargs):
        # Generator: produces a single freshly built result dict per call,
        # whatever arguments it is given.
        canned_hit = dict(
            title="DDG result",
            href="https://example.org/ddg",
            body="ddg fallback body",
        )
        yield from (canned_hit,)


_app.SEARCH_CACHE.clear()
# Serper is configured but its POST raises, while the DDGS stand-in works:
# the result is expected to come from DuckDuckGo instead.
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
     mock.patch.object(_app, "DDGS", _WorkingDDGS):
    fallback_out = _app._run_search_single("anything", max_results=2)

assert fallback_out["ok"] is True, fallback_out
assert fallback_out.get("backend") == "duckduckgo", fallback_out
# The single hit yielded by _WorkingDDGS.text must be the first result.
assert fallback_out["results"][0]["href"] == "https://example.org/ddg"
print("[PASS] Falls back to DuckDuckGo when Serper errors out")


_app.SEARCH_CACHE.clear()
# Both backends fail here: the Serper POST raises and every DDGS.text call is
# rate-limited. time.sleep is stubbed out as in the earlier rate-limit test.
with mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_fail), \
     mock.patch.object(_app, "DDGS", _RatelimitedDDGS), \
     mock.patch.object(_app.time, "sleep", lambda *_a, **_k: None):
    both_fail = _app._run_search_single("anything", max_results=2)

# The combined error text must name both backends (case-insensitively) and
# the result must still carry a hint.
assert both_fail["ok"] is False, both_fail
assert "serper" in both_fail["error"].lower(), both_fail
assert "duckduckgo" in both_fail["error"].lower(), both_fail
assert "hint" in both_fail
print("[PASS] Returns graceful error when both Serper and DDG fail")


# -------------------------------------------------------------------------
# 11. build_research_agent streams progress (is a generator).
# -------------------------------------------------------------------------
import inspect as _inspect

# (attribute of app, message shown when it is not a generator function)
_streaming_entry_points = (
    (
        "build_research_agent",
        "build_research_agent should be a generator so run_ui can stream progress",
    ),
    (
        "run_ui",
        "run_ui should be a generator so Gradio streams per-turn status to the UI",
    ),
)
for _attr, _why in _streaming_entry_points:
    assert _inspect.isgeneratorfunction(getattr(_app, _attr)), _why
print("[PASS] build_research_agent and run_ui are streaming generators")


# -------------------------------------------------------------------------
# 12. End-to-end dry run of the generator: verify at least one progress
#     tuple is yielded BEFORE the final answer, and that the final yield
#     is a real answer (not a placeholder).
# -------------------------------------------------------------------------
# Scripted model replies, consumed front to back by _fake_call_model.
# Each entry is a 2-tuple of (raw model output, "fake-model"); presumably this
# mirrors what app.call_model returns - confirm against app.py.
_fake_model_script = [
    # Reply 1: a <think> block followed by a `search` tool call.
    (
        "<think>I should search the web for Mercury distance.</think>"
        '<tool_call>{"name": "search", "arguments": {"query": ["Mercury distance AU"]}}</tool_call>',
        "fake-model",
    ),
    # Reply 2: the final <answer>, containing a one-row Markdown table.
    (
        "<answer>\n"
        "Here is the table:\n"
        "| Planet | Distance (AU) |\n"
        "|---|---|\n"
        "| Mercury | 0.39 |\n"
        "</answer>",
        "fake-model",
    ),
]


def _fake_call_model(*args, **kwargs):
    """Ignore all arguments and return the next scripted model reply.

    Each call removes and returns the first remaining entry of
    ``_fake_model_script``.
    """
    next_reply = _fake_model_script.pop(0)
    return next_reply


class _FakeInferenceClient:
    """Inert client double: takes any constructor arguments, stores nothing."""

    def __init__(self, *_args, **_kwargs):
        """Accept and discard every positional and keyword argument."""


_app.SEARCH_CACHE.clear()
# Drive the agent entirely offline: call_model replays the scripted replies,
# _build_client_for_model returns a dummy client with the "fake-model" name and
# an empty list, and the Serper key plus requests.post are patched so the
# scripted search tool call is presumably answered by the canned Serper
# payload (confirm against app.py).
with mock.patch.object(_app, "call_model", side_effect=_fake_call_model), \
     mock.patch.object(_app, "_build_client_for_model",
                       return_value=(_FakeInferenceClient(), "fake-model", [])), \
     mock.patch.object(_app, "SERPER_API_KEY", "test-serper-key"), \
     mock.patch.object(_app.requests, "post", side_effect=_fake_serper_ok):
    gen = _app.build_research_agent(
        question="How far is Mercury from the sun?",
        model="fake-model",
        max_turns=4,
        max_search_results=3,
        temperature=0.0,
    )
    # Drain the generator while the patches are still active, keeping every
    # yielded item for the assertions below.
    emitted = list(gen)

assert len(emitted) >= 3, f"expected multiple progress yields, got {len(emitted)}"
# Each yielded item unpacks into two values; the last item carries the final answer.
final_answer, final_trace = emitted[-1]
assert "Mercury" in final_answer, final_answer
assert "| Planet |" in final_answer, final_answer
# The echoed-template placeholder must not leak into the final answer.
assert "...</answer>" not in final_answer
# Intermediate yields should have progress scaffolding.
assert any("⏳ Researching" in ans for ans, _ in emitted[:-1]), (
    "no intermediate progress yield detected"
)
print("[PASS] build_research_agent streams progress then a real final answer")

print()
print("All markdown-fix regression tests passed.")