polyglot-alpha / tests /_pass_path_mocks.py
licaomeng
deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)
88d2f2a
"""Mock infrastructure for end-to-end PASS-path audit runs.
Goal: trigger the orchestrator's full lifecycle end-to-end without
spending real money on Anthropic LLM calls. Specifically, the panel's
11 judges (which collectively call Anthropic Haiku 4.5 ~10-14 times
per event) are short-circuited to a deterministic PASS PanelVerdict
in-process, and any other LLM entry points (synthesizer, critics,
moderator, refine) are routed through a MockLLM so any straggler that
slips past the panel patch still cannot reach api.anthropic.com.
Use :func:`install_mocks` from a top-level audit script BEFORE
invoking :func:`polyglot_alpha.orchestrator.run_lifecycle`. The patches
mutate ``polyglot_alpha.judges.panel`` and ``polyglot_alpha.llm`` at
module scope, so they are visible to every coroutine the orchestrator
spawns inside the same Python interpreter.
The patches DO NOT touch:
* On-chain calls (``polyglot_alpha.chain.*``) — Arc testnet is free gas.
* IPFS publish/fetch — local-file fallback is offline.
* SQLite persistence — same DB as the running backend.
* Polymarket — defaults to ``POLYMARKET_MODE=dry_run`` which never
posts to the live Gamma API. We assert that explicitly.
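The mock call counts are exposed via :data:`mock_llm_calls` and
:data:`panel_evaluate_calls` so audit scripts can assert on how many calls
the mocks intercepted. Read them as attributes of this module: both are
rebound on every increment, so a ``from ... import`` copy goes stale.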
"""
from __future__ import annotations
import asyncio
import json
import os
from typing import Any, Awaitable, Callable, Optional
# ---------------------------------------------------------------------------
# Internal call-count counter so the audit script can assert no real
# Anthropic call slipped through.
# ---------------------------------------------------------------------------
#: Incremented every time the mock panel.evaluate is invoked.
#: NOTE: this int is rebound through ``global`` on each increment, so read it
#: as a module attribute; a ``from ... import`` binding keeps the old value.
panel_evaluate_calls: int = 0
#: Incremented every time a MockLLM stand-in fields a prompt (this includes
#: the patched ``llm.complete`` / ``llm.complete_json`` paths). The same
#: rebinding caveat as ``panel_evaluate_calls`` applies.
mock_llm_calls: int = 0
#: List of (label, prompt_preview) for debugging; previews are cut to the
#: first 120 characters of the prompt. Mutated in place, never rebound.
mock_llm_log: list[tuple[str, str]] = []
def _reset_counters() -> None:
    """Zero both call counters and empty the mock-LLM log in place.

    The log list is emptied rather than replaced, so existing references
    to ``mock_llm_log`` keep pointing at the live list.
    """
    global panel_evaluate_calls, mock_llm_calls
    del mock_llm_log[:]
    panel_evaluate_calls = mock_llm_calls = 0
# ---------------------------------------------------------------------------
# Mock PanelVerdict factory
# ---------------------------------------------------------------------------
def _build_pass_verdict(question: Any) -> Any:
    """Assemble a fixed, always-passing ``PanelVerdict``.

    The verdict is built from the project's own ``JudgeResult`` and
    ``PanelVerdict`` types (imported lazily so this module loads without
    them). It carries eleven passing judge results: BLEU (raw 42), COMET
    (raw 0.78), MQM (raw 95, no errors) and the eight ``d1``..``d8`` style
    judges, with ``overall_score=92``.

    Args:
        question: Accepted for signature compatibility; the value is not
            inspected, so every call yields an equivalent verdict.

    Returns:
        A freshly constructed ``PanelVerdict`` with ``overall_pass=True``.
    """
    from polyglot_alpha.judges.types import (
        JudgeResult,
        PanelVerdict,
        VERDICT_PASS,
    )

    d_judge_names = (
        "d1_structural",
        "d2_stylistic",
        "d3_framing",
        "d4_granularity",
        "d5_resolution_clarity",
        "d6_source_reliability",
        "d7_leading_check",
        "d8_duplicate_detection",
    )

    # Translation-quality judges first, in the order they appear in the
    # final ``judge_results`` list.
    translation_judges = [
        JudgeResult(
            name="bleu",
            passed=True,
            score=0.42,
            reason="Mock BLEU above threshold.",
            evidence={"bleu_raw": 42.0, "mocked": True},
        ),
        JudgeResult(
            name="comet",
            passed=True,
            score=0.78,
            reason="Mock COMET above threshold.",
            evidence={"comet_raw": 0.78, "mocked": True},
        ),
        JudgeResult(
            name="mqm_llm",
            passed=True,
            score=0.95,
            reason="Mock MQM score=95 with zero major errors.",
            evidence={
                "score_raw": 95,
                "major_count": 0,
                "minor_count": 0,
                "errors": [],
                "rationale": "mocked",
                "provider": "mock",
            },
        ),
    ]

    # One passing result per style judge; each gets its own evidence dict.
    style_judges = [
        JudgeResult(
            name=judge_name,
            passed=True,
            score=1.0,
            reason="Mocked PASS for end-to-end audit.",
            evidence={"mocked": True},
        )
        for judge_name in d_judge_names
    ]

    mqm_summary = {
        "score": 95,
        "major_count": 0,
        "minor_count": 0,
        "errors": [],
    }
    return PanelVerdict(
        overall_pass=True,
        verdict=VERDICT_PASS,
        overall_score=92,
        translation_scores={"bleu": 42.0, "comet": 0.78, "mqm": mqm_summary},
        style_alignment_passes=dict.fromkeys(
            (f"d{index}" for index in range(1, 9)), True
        ),
        judge_results=translation_judges + style_judges,
        notes=["mocked PASS verdict for end-to-end PASS-path audit"],
    )
# ---------------------------------------------------------------------------
# Mock LLM (stand-in for AnthropicLLM)
# ---------------------------------------------------------------------------
# Canned response body returned by every mock LLM entry point: a JSON object
# with ``question_en``, ``resolution_criteria``, ``end_date_iso`` and ``tags``.
_CANNED_JSON_QUESTION = json.dumps(
    dict(
        question_en="Will the FOMC raise rates by 25bp at the June 2026 meeting?",
        resolution_criteria=(
            "Resolves YES if the Federal Reserve announces a 25bp rate hike at "
            "the June 17-18, 2026 FOMC meeting; otherwise resolves NO."
        ),
        end_date_iso="2026-12-31T23:59:59Z",
        tags=["fomc", "rates", "macro", "mock"],
    )
)
class _AuditMockLLM:
    """Network-free LLM double handed out by the patched factories.

    Every invocation resolves to the same canned JSON document. The model
    identifier passed at construction is stored on ``model`` but never
    influences the reply.
    """

    def __init__(
        self, *args: Any, label: str = "audit_mock", **kwargs: Any
    ) -> None:
        # Prefer an explicit (truthy) ``model=`` keyword, then the first
        # positional argument, then the literal "mock".
        chosen_model = kwargs.get("model")
        if not chosen_model:
            chosen_model = args[0] if args else "mock"
        self.model = chosen_model
        self._label = label

    async def complete(
        self, system: str, user: str, **_kwargs: Any
    ) -> str:
        """Answer a system/user prompt pair; only ``user`` is forwarded."""
        return await self.__call__(user)

    async def __call__(self, prompt: str) -> str:
        """Record the call and return the canned JSON body."""
        global mock_llm_calls
        # Yield to the event loop once before recording the call.
        await asyncio.sleep(0)
        mock_llm_calls += 1
        preview = prompt[:120] if prompt else ""
        mock_llm_log.append((self._label, preview))
        return _CANNED_JSON_QUESTION
# ---------------------------------------------------------------------------
# Install / uninstall
# ---------------------------------------------------------------------------
# Registry of the original attributes displaced by install_mocks(), keyed by
# "<module>.<attr>" (e.g. "llm.make_llm"). The extra "installed" key is set to
# True while the patches are active; uninstall_mocks() clears the whole dict.
_INSTALLED: dict[str, Any] = {}
def install_mocks() -> None:
    """Monkey-patch every Anthropic entry point and the panel adapter.

    Idempotent: a second call is a no-op so audit scripts can call this
    from multiple entry points without breaking.

    Patches applied (the originals are stashed in ``_INSTALLED`` under
    ``"<module>.<attr>"`` keys so :func:`uninstall_mocks` can restore them):

    1. ``panel.evaluate`` -> coroutine returning the canned PASS verdict.
    2. ``llm.AnthropicLLM`` -> subclass of :class:`_AuditMockLLM`.
    3. ``llm.make_llm`` -> factory returning an :class:`_AuditMockLLM`.
    4. ``llm.complete`` / ``llm.complete_json`` -> canned JSON responders.
    5. ``llm.get_anthropic_client`` -> always raises ``RuntimeError``.

    Environment side effects (not reverted by :func:`uninstall_mocks`):
    ``POLYMARKET_MODE`` is overwritten with ``dry_run`` even when the
    inherited environment says otherwise, and ``PLATFORM_TREASURY_ADDRESS``
    is defaulted when unset.

    Raises:
        ImportError: if the ``polyglot_alpha`` modules cannot be imported.
        AttributeError: if any patch target is missing. This is raised
            before any module attribute has been replaced, so a failed
            install never leaves the modules half-patched.
    """
    if _INSTALLED.get("installed"):
        return
    _reset_counters()

    from polyglot_alpha import llm as llm_mod
    from polyglot_alpha.judges import panel as panel_mod

    # Snapshot every original BEFORE mutating anything. If one target is
    # missing we fail here with the modules untouched, and a later retry
    # cannot mistake an already-installed mock for the original.
    originals: dict[str, Any] = {
        "panel.evaluate": panel_mod.evaluate,
        "llm.AnthropicLLM": llm_mod.AnthropicLLM,
        "llm.make_llm": llm_mod.make_llm,
        "llm.complete": llm_mod.complete,
        "llm.complete_json": llm_mod.complete_json,
        "llm.get_anthropic_client": llm_mod.get_anthropic_client,
    }

    # ---- 1. panel.evaluate: return a canned PASS verdict ----
    async def _patched_evaluate(question: Any, *_args: Any, **_kwargs: Any) -> Any:
        global panel_evaluate_calls
        panel_evaluate_calls += 1
        await asyncio.sleep(0)
        return _build_pass_verdict(question)

    # ---- 2. llm.AnthropicLLM: no-network stand-in ----
    class _PatchedAnthropicLLM(_AuditMockLLM):
        pass

    # ---- 3. llm.make_llm: per-agent factories also return mocks ----
    def _patched_make_llm(
        model_id: str,
        *,
        mock: bool = False,
        system: Optional[str] = None,
        temperature: float = 0.2,
        max_tokens: int = 1024,
    ) -> Callable[[str], Awaitable[str]]:
        # The tuning parameters are accepted for signature compatibility
        # only; the mock's reply does not depend on them.
        return _AuditMockLLM(model=model_id, label=f"make_llm:{model_id}")

    # ---- 4. llm.complete / complete_json (synthesizer path) ----
    async def _patched_complete(
        prompt: str, *_args: Any, **_kwargs: Any
    ) -> str:
        global mock_llm_calls
        mock_llm_calls += 1
        mock_llm_log.append(("llm.complete", (prompt or "")[:120]))
        await asyncio.sleep(0)
        return _CANNED_JSON_QUESTION

    async def _patched_complete_json(
        prompt: str, *_args: Any, **_kwargs: Any
    ) -> Any:
        raw = await _patched_complete(prompt)
        return json.loads(raw)

    # ---- 5. Hard guard: refuse to construct a real AsyncAnthropic client ----
    def _refuse_anthropic_client(api_key: Optional[str] = None) -> Any:
        raise RuntimeError(
            "audit-mode: refusing to construct a real AsyncAnthropic client"
        )

    # All replacements are built; record the originals and swap them in.
    _INSTALLED.update(originals)
    panel_mod.evaluate = _patched_evaluate  # type: ignore[assignment]
    llm_mod.AnthropicLLM = _PatchedAnthropicLLM  # type: ignore[assignment,misc]
    llm_mod.make_llm = _patched_make_llm  # type: ignore[assignment]
    llm_mod.complete = _patched_complete  # type: ignore[assignment]
    llm_mod.complete_json = _patched_complete_json  # type: ignore[assignment]
    llm_mod.get_anthropic_client = _refuse_anthropic_client  # type: ignore[assignment]

    # ---- 6. Force Polymarket into dry_run regardless of inherited env ----
    # Defensive — the audit MUST NOT post to live Polymarket. A plain
    # assignment is required here: ``setdefault`` would leave an inherited
    # ``POLYMARKET_MODE=live`` in place.
    os.environ["POLYMARKET_MODE"] = "dry_run"

    # Treasury wallet so the 90/10 split fires; fall back to operator wallet.
    # An explicitly configured treasury address is respected.
    os.environ.setdefault(
        "PLATFORM_TREASURY_ADDRESS",
        os.environ.get(
            "HACKATHON_WALLET_ADDRESS",
            "0x000000000000000000000000000000000000dead",
        ),
    )
    _INSTALLED["installed"] = True
def uninstall_mocks() -> None:
    """Put back the module attributes replaced by :func:`install_mocks`.

    Does nothing when no mocks are installed. Environment variables are
    not touched. Mostly useful in pytest.
    """
    if not _INSTALLED.get("installed"):
        return

    from polyglot_alpha import llm as llm_mod
    from polyglot_alpha.judges import panel as panel_mod

    # (target module, attribute name, key under which the original was saved)
    restore_plan = (
        (panel_mod, "evaluate", "panel.evaluate"),
        (llm_mod, "AnthropicLLM", "llm.AnthropicLLM"),
        (llm_mod, "make_llm", "llm.make_llm"),
        (llm_mod, "complete", "llm.complete"),
        (llm_mod, "complete_json", "llm.complete_json"),
        (llm_mod, "get_anthropic_client", "llm.get_anthropic_client"),
    )
    for target_module, attribute, saved_key in restore_plan:
        setattr(target_module, attribute, _INSTALLED[saved_key])

    # Emptying the registry also drops the "installed" flag.
    _INSTALLED.clear()
# Public surface for audit scripts.
# NOTE: ``panel_evaluate_calls`` and ``mock_llm_calls`` are ints that get
# rebound on each increment, so a ``from <module> import mock_llm_calls``
# binding keeps the value seen at import time. Read them through the module
# object for live counts. ``mock_llm_log`` is mutated in place and is safe
# to import by name.
__all__ = [
    "install_mocks",
    "uninstall_mocks",
    "panel_evaluate_calls",
    "mock_llm_calls",
    "mock_llm_log",
]