"""W14-FIX-STUB end-to-end verification.

Injects a broken-LLM fixture that always returns an empty string (the most
common LLM-glitch failure mode) and walks an event through the full
analysts -> translators -> synthesizer -> quality_eval -> polymarket
pipeline. Verifies that every stub gate fires:

1. ``translators.propose_candidates`` emits a ``logger.warning`` and
   sets ``candidate.is_stub = True``.
2. ``analysts._parse_response`` emits a ``logger.warning`` for a
   missing ``JSON:`` marker.
3. ``synthesizer.synthesize`` propagates ``is_stub`` to the
   :class:`Question`.
4. ``quality_eval.score_question`` returns ``score=0.0`` and
   ``passed=False`` with ``"stub_detected"`` in the rationale.
5. ``polymarket.client._build_gamma_payload`` raises ``ValueError``
   before any HTTP traffic.

Run::

    python scripts/test_stub_blocking.py
"""
from __future__ import annotations
import asyncio
import logging
import sys
from typing import List
# Force-import everything we need from the package.
from polyglot_alpha import analysts, quality_eval, synthesizer, translators
from polyglot_alpha.polymarket.client import _build_gamma_payload
from polyglot_alpha.polymarket.types import Question as PMQuestion
from polyglot_alpha.schemas import NewsEvent
from polyglot_alpha.stub_detector import is_stub
# --------------------------------------------------------------------------- #
# Helpers #
# --------------------------------------------------------------------------- #
class WarningCollector(logging.Handler):
    """In-memory logging handler whose level is fixed at WARNING.

    Every record passed to :meth:`emit` is kept in ``self.records`` so the
    verification steps can assert on what a given logger emitted.
    """

    def __init__(self) -> None:
        super().__init__(logging.WARNING)
        # Records are stored in arrival order.
        self.records: List[logging.LogRecord] = []

    def emit(self, record: logging.LogRecord) -> None:
        """Store *record* as-is; message formatting is deferred."""
        self.records.append(record)

    def messages_for(self, logger_name: str) -> List[str]:
        """Return formatted messages of records whose logger name equals *logger_name*."""
        matching = (rec for rec in self.records if rec.name == logger_name)
        return [rec.getMessage() for rec in matching]
def _install_collector() -> WarningCollector:
    """Attach a fresh ``WarningCollector`` to the ``polyglot_alpha`` logger.

    The logger's own level is set to WARNING as part of the installation.

    Returns:
        The collector that was just attached.
    """
    collector = WarningCollector()
    package_logger = logging.getLogger("polyglot_alpha")
    package_logger.addHandler(collector)
    package_logger.setLevel(logging.WARNING)
    return collector
async def _broken_llm(prompt: str) -> str:
    """Stand-in for a glitching LLM: ignores *prompt* and replies with ``""``."""
    empty_reply = ""
    return empty_reply
# --------------------------------------------------------------------------- #
# Test body #
# --------------------------------------------------------------------------- #
async def _run() -> int:
    """Walk one event through all five pipeline layers with a broken LLM.

    Each layer's stub gate is checked in pipeline order. A ``[k/5]`` progress
    line is printed after every gate, followed by a summary once all five
    have fired.

    Checks are explicit ``raise AssertionError`` statements rather than
    ``assert``: ``python -O`` strips ``assert``, which would let this script
    report success without verifying anything.

    Returns:
        ``0`` when all five gates fire; ``1`` when ``_build_gamma_payload``
        accepts the stub question without raising.

    Raises:
        AssertionError: if an expectation in layers 1-4 is not met, or if the
            layer-5 ``ValueError`` message does not mention "stub".
    """
    collector = _install_collector()
    event = NewsEvent(
        event_id="evt-stub-blocking-test",
        url="https://example.com/news/stub-test",
        title_zh="测试事件 — LLM glitch simulation",
        body_zh="此事件被故意触发以验证 stub-blocking 链。",
        cutoff_ts=1_800_000_000,
    )

    # ----- Layer 1: analysts. Empty LLM -> empty entities/risks + WARNING.
    reports = await analysts.run_analysts(event, _broken_llm)
    if len(reports) < 1:
        raise AssertionError("expected at least one analyst report")
    for report in reports:
        if report.relevant_entities != []:
            raise AssertionError(report.relevant_entities)
        if report.risk_factors != []:
            raise AssertionError(report.risk_factors)
    analyst_warns = collector.messages_for("polyglot_alpha.analysts")
    if not any(
        "JSON parse failed" in m or "missing 'JSON:' marker" in m
        for m in analyst_warns
    ):
        raise AssertionError(
            f"analysts: expected a JSON-parse warning, got: {analyst_warns!r}"
        )
    print(f"[1/5] analysts: emitted parse-fail WARNING (count={len(analyst_warns)})")

    # ----- Layer 2: translators. Empty LLM -> is_stub=True + WARNING.
    candidates = await translators.propose_candidates(event, reports, _broken_llm, n=2)
    if len(candidates) != 2:
        raise AssertionError(
            f"translators: expected 2 candidates, got {len(candidates)}"
        )
    for c in candidates:
        if not getattr(c, "is_stub", False):
            raise AssertionError(
                f"translators: expected is_stub=True on candidate {c.translator_id}"
            )
        # Stub strings must come through verbatim so quality_eval can catch them.
        if not (is_stub(c.question_en) or is_stub(c.resolution_criteria)):
            raise AssertionError(
                f"translators: expected stub text on {c.translator_id}, got {c.question_en!r}"
            )
    translator_warns = collector.messages_for("polyglot_alpha.translators")
    if not any("falling back to stub" in m for m in translator_warns):
        raise AssertionError(
            f"translators: expected fallback WARNING, got: {translator_warns!r}"
        )
    print("[2/5] translators: emitted stub-fallback WARNING + is_stub=True on both candidates")

    # ----- Layer 3: synthesizer. is_stub must propagate to the Question.
    question = synthesizer.synthesize(event, candidates)
    if not getattr(question, "is_stub", False):
        raise AssertionError(
            "synthesizer: expected is_stub=True propagated to Question"
        )
    print("[3/5] synthesizer: propagated is_stub=True to Question")

    # ----- Layer 4: quality_eval. score=0.0, passed=False, reason contains stub_detected.
    qs = quality_eval.score_question(question)
    if qs.score != 0.0:
        raise AssertionError(f"quality_eval: expected score=0.0, got {qs.score}")
    if qs.passed is not False:
        raise AssertionError("quality_eval: expected passed=False")
    if "stub_detected" not in qs.rationale:
        raise AssertionError(
            f"quality_eval: expected 'stub_detected' in rationale, got {qs.rationale!r}"
        )
    print(f"[4/5] quality_eval: score=0.0, passed=False, rationale={qs.rationale!r}")

    # ----- Layer 5: polymarket. _build_gamma_payload must raise BEFORE any HTTP.
    pm_question = PMQuestion(
        question_id="q-stub-blocking-test",
        text=question.question_en,
        end_date_iso=question.end_date_iso,
    )
    try:
        _build_gamma_payload(pm_question, "builder-test", None)
    except ValueError as exc:
        # A ValueError for some unrelated reason does not prove the stub gate fired.
        if "stub" not in str(exc).lower():
            raise AssertionError(f"polymarket: unexpected error: {exc}") from exc
        print(f"[5/5] polymarket: refused to build Gamma payload — {exc}")
    else:
        print("[5/5] FAIL: polymarket did NOT raise on stub question!", file=sys.stderr)
        return 1

    # --- Summary --------------------------------------------------------- #
    print()
    print("=" * 72)
    print("ALL 5 STUB GATES FIRED CORRECTLY")
    print(f" analysts WARNINGs: {len(analyst_warns)}")
    print(f" translators WARNINGs: {len(translator_warns)}")
    print(f" synthesizer is_stub: {getattr(question, 'is_stub', False)}")
    print(f" quality_eval score: {qs.score} (passed={qs.passed})")
    print(" polymarket payload: blocked with ValueError")
    print("=" * 72)
    return 0
if __name__ == "__main__":
    # Use the coroutine's return value as the process exit status.
    exit_code = asyncio.run(_run())
    sys.exit(exit_code)
|