Spaces:
Running
Running
| """Mock infrastructure for end-to-end PASS-path audit runs. | |
| Goal: trigger the orchestrator's full lifecycle end-to-end without | |
| spending real money on Anthropic LLM calls. Specifically, the panel's | |
| 11 judges (which collectively call Anthropic Haiku 4.5 ~10-14 times | |
| per event) are short-circuited to a deterministic PASS PanelVerdict | |
| in-process, and any other LLM entry points (synthesizer, critics, | |
| moderator, refine) are routed through a MockLLM so any straggler that | |
| slips past the panel patch still cannot reach api.anthropic.com. | |
| Use :func:`install_mocks` from a top-level audit script BEFORE | |
| invoking :func:`polyglot_alpha.orchestrator.run_lifecycle`. The patches | |
| mutate ``polyglot_alpha.judges.panel`` and ``polyglot_alpha.llm`` at | |
| module scope, so they are visible to every coroutine the orchestrator | |
| spawns inside the same Python interpreter. | |
| The patches DO NOT touch: | |
| * On-chain calls (``polyglot_alpha.chain.*``) — Arc testnet is free gas. | |
| * IPFS publish/fetch — local-file fallback is offline. | |
| * SQLite persistence — same DB as the running backend. | |
| * Polymarket — defaults to ``POLYMARKET_MODE=dry_run`` which never | |
| posts to the live Gamma API. We assert that explicitly. | |
| The MockLLM count is exposed via :data:`mock_llm_calls` (and the | |
| patched-panel invocation count via :data:`panel_evaluate_calls`) so | |
| audit scripts can assert ``count == 0`` against the patched panel. | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import json | |
| import os | |
| from typing import Any, Awaitable, Callable, Optional | |
# ---------------------------------------------------------------------------
# Call counters. Audit scripts inspect these after a run to assert that no
# real Anthropic call slipped through.
#
# NOTE: the two integer counters are *rebound* on every increment, so read
# them as attributes of this module. A ``from <module> import mock_llm_calls``
# copy keeps the value it had at import time and never updates.
# ---------------------------------------------------------------------------

#: Number of times the mock ``panel.evaluate`` has been invoked.
panel_evaluate_calls: int = 0

#: Number of prompts fielded by a MockLLM stand-in.
mock_llm_calls: int = 0

#: ``(label, prompt_preview)`` pairs kept for debugging, in call order.
mock_llm_log: list[tuple[str, str]] = []
def _reset_counters() -> None:
    """Zero both call counters and empty the debug log.

    The log list is emptied in place, so any existing reference to
    ``mock_llm_log`` observes the reset as well.
    """
    global panel_evaluate_calls, mock_llm_calls
    panel_evaluate_calls = mock_llm_calls = 0
    del mock_llm_log[:]
| # --------------------------------------------------------------------------- | |
| # Mock PanelVerdict factory | |
| # --------------------------------------------------------------------------- | |
def _build_pass_verdict(question: Any) -> Any:
    """Build the deterministic PASS ``PanelVerdict`` used by the mock panel.

    The real :class:`PanelVerdict` dataclass is instantiated (rather than a
    look-alike) so the orchestrator's downstream
    :func:`_evaluate_with_judges` adapter converts it without surprise.

    Contents: BLEU raw=42, COMET raw=0.78, MQM raw=95 with zero errors,
    all 8 D-judges passing, ``overall_score=92``.

    Args:
        question: Accepted for signature compatibility with the patched
            ``panel.evaluate``; the verdict does not depend on it.

    Returns:
        A ``PanelVerdict`` whose ``judge_results`` are ordered
        bleu, comet, mqm_llm, then d1..d8.
    """
    # Imported lazily so merely importing this module does not pull in the
    # judges package.
    from polyglot_alpha.judges.types import (
        VERDICT_PASS,
        JudgeResult,
        PanelVerdict,
    )

    # Translation-quality judges: (name, normalised score, reason, evidence).
    translation_specs = (
        (
            "bleu",
            0.42,
            "Mock BLEU above threshold.",
            {"bleu_raw": 42.0, "mocked": True},
        ),
        (
            "comet",
            0.78,
            "Mock COMET above threshold.",
            {"comet_raw": 0.78, "mocked": True},
        ),
        (
            "mqm_llm",
            0.95,
            "Mock MQM score=95 with zero major errors.",
            {
                "score_raw": 95,
                "major_count": 0,
                "minor_count": 0,
                "errors": [],
                "rationale": "mocked",
                "provider": "mock",
            },
        ),
    )
    translation_judges = [
        JudgeResult(
            name=judge_name,
            passed=True,
            score=judge_score,
            reason=judge_reason,
            evidence=judge_evidence,
        )
        for judge_name, judge_score, judge_reason, judge_evidence in translation_specs
    ]

    d_judge_names = (
        "d1_structural",
        "d2_stylistic",
        "d3_framing",
        "d4_granularity",
        "d5_resolution_clarity",
        "d6_source_reliability",
        "d7_leading_check",
        "d8_duplicate_detection",
    )
    # Each D-judge gets its own evidence dict (no sharing between results).
    d_judges = [
        JudgeResult(
            name=judge_name,
            passed=True,
            score=1.0,
            reason="Mocked PASS for end-to-end audit.",
            evidence={"mocked": True},
        )
        for judge_name in d_judge_names
    ]

    mqm_summary = {
        "score": 95,
        "major_count": 0,
        "minor_count": 0,
        "errors": [],
    }
    translation_scores = {"bleu": 42.0, "comet": 0.78, "mqm": mqm_summary}

    # Short keys "d1".."d8", all marked as passing.
    style_alignment = dict.fromkeys((f"d{idx}" for idx in range(1, 9)), True)

    return PanelVerdict(
        overall_pass=True,
        verdict=VERDICT_PASS,
        overall_score=92,
        translation_scores=translation_scores,
        style_alignment_passes=style_alignment,
        judge_results=translation_judges + d_judges,
        notes=["mocked PASS verdict for end-to-end PASS-path audit"],
    )
| # --------------------------------------------------------------------------- | |
| # Mock LLM (stand-in for AnthropicLLM) | |
| # --------------------------------------------------------------------------- | |
# Fields of the single canned market question every mock LLM call answers
# with. Key order is significant: it fixes the serialised string below.
_CANNED_QUESTION_PAYLOAD = {
    "question_en": "Will the FOMC raise rates by 25bp at the June 2026 meeting?",
    "resolution_criteria": (
        "Resolves YES if the Federal Reserve announces a 25bp rate hike "
        "at the June 17-18, 2026 FOMC meeting; otherwise resolves NO."
    ),
    "end_date_iso": "2026-12-31T23:59:59Z",
    "tags": ["fomc", "rates", "macro", "mock"],
}

#: JSON text returned verbatim by every mocked LLM entry point; serialised
#: once at import time.
_CANNED_JSON_QUESTION = json.dumps(_CANNED_QUESTION_PAYLOAD)
class _AuditMockLLM:
    """Network-free stand-in handed out by the patched make_llm/AnthropicLLM.

    Every call answers with the same canned JSON question, whatever the
    model id or prompt; agent differentiation is irrelevant when the panel
    verdict is forced to PASS downstream. Each call is counted in
    ``mock_llm_calls`` and logged to ``mock_llm_log``.
    """

    def __init__(
        self, *args: Any, label: str = "audit_mock", **kwargs: Any
    ) -> None:
        """Accept any constructor arguments the real LLM class would take.

        The model name is resolved as: a truthy ``model=`` keyword, else the
        first positional argument, else the literal ``"mock"``.

        Args:
            *args: Positional arguments; only the first is inspected.
            label: Tag recorded next to each prompt in ``mock_llm_log``.
            **kwargs: Keyword arguments; only ``model`` is inspected.
        """
        requested_model = kwargs.get("model")
        if requested_model:
            self.model = requested_model
        elif args:
            self.model = args[0]
        else:
            self.model = "mock"
        self._label = label

    async def complete(
        self, system: str, user: str, **_kwargs: Any
    ) -> str:
        """Answer a system/user prompt pair; only ``user`` is logged."""
        return await self(user)

    async def __call__(self, prompt: str) -> str:
        """Record the call and return the canned JSON question.

        Args:
            prompt: Prompt text; its first 120 characters are logged. A
                falsy prompt is logged as an empty string.
        """
        global mock_llm_calls
        # Yield to the event loop once, as a real awaitable call would.
        await asyncio.sleep(0)
        mock_llm_calls += 1
        preview = (prompt or "")[:120]
        mock_llm_log.append((self._label, preview))
        return _CANNED_JSON_QUESTION
| # --------------------------------------------------------------------------- | |
| # Install / uninstall | |
| # --------------------------------------------------------------------------- | |
#: Originals saved by :func:`install_mocks`, keyed ``"<module>.<attr>"``, plus
#: an ``"installed"`` flag. Empty while no mocks are active.
_INSTALLED: dict[str, Any] = {}


def install_mocks() -> None:
    """Monkey-patch every Anthropic entry point and the panel adapter.

    Replaces, at module scope:

    * ``polyglot_alpha.judges.panel.evaluate`` with a coroutine returning the
      canned PASS verdict from :func:`_build_pass_verdict`;
    * ``polyglot_alpha.llm.AnthropicLLM`` and ``polyglot_alpha.llm.make_llm``
      with :class:`_AuditMockLLM` stand-ins;
    * ``polyglot_alpha.llm.complete`` / ``complete_json`` with coroutines that
      return the canned question (raw text / parsed JSON);
    * ``polyglot_alpha.llm.get_anthropic_client`` with a function that always
      raises :class:`RuntimeError`.

    It also resets the call counters, forces ``POLYMARKET_MODE=dry_run``
    (overriding any inherited value) and gives ``PLATFORM_TREASURY_ADDRESS``
    a default when it is unset.

    Idempotent: a second call is a no-op so audit scripts can call this
    from multiple entry points without breaking.

    Raises:
        ImportError: If the ``polyglot_alpha`` modules cannot be imported.
        AttributeError: If a patch target is missing. This is raised while
            snapshotting, before any module attribute has been replaced.
    """
    if _INSTALLED.get("installed"):
        return
    _reset_counters()

    from polyglot_alpha import llm as llm_mod
    from polyglot_alpha.judges import panel as panel_mod

    # Snapshot ALL originals before touching anything. If a target is
    # missing we fail here with the modules still pristine, instead of
    # leaving them half-patched with no record that uninstall_mocks() could
    # use (a retry would then save the mocks as the "originals").
    originals: dict[str, Any] = {
        "panel.evaluate": panel_mod.evaluate,
        "llm.AnthropicLLM": llm_mod.AnthropicLLM,
        "llm.make_llm": llm_mod.make_llm,
        "llm.complete": llm_mod.complete,
        "llm.complete_json": llm_mod.complete_json,
        "llm.get_anthropic_client": llm_mod.get_anthropic_client,
    }

    # ---- 1. panel.evaluate -> canned PASS verdict ----
    async def _patched_evaluate(question: Any, *_args: Any, **_kwargs: Any) -> Any:
        global panel_evaluate_calls
        panel_evaluate_calls += 1
        await asyncio.sleep(0)
        return _build_pass_verdict(question)

    # ---- 2. llm.AnthropicLLM -> no-network stand-in ----
    class _PatchedAnthropicLLM(_AuditMockLLM):
        pass

    # ---- 3. llm.make_llm -> per-agent factories also return mocks ----
    # The keyword-only parameters are accepted for call compatibility and
    # otherwise ignored.
    def _patched_make_llm(
        model_id: str,
        *,
        mock: bool = False,
        system: Optional[str] = None,
        temperature: float = 0.2,
        max_tokens: int = 1024,
    ) -> Callable[[str], Awaitable[str]]:
        return _AuditMockLLM(model=model_id, label=f"make_llm:{model_id}")

    # ---- 4. llm.complete / complete_json (synthesizer path) ----
    async def _patched_complete(
        prompt: str, *_args: Any, **_kwargs: Any
    ) -> str:
        global mock_llm_calls
        mock_llm_calls += 1
        mock_llm_log.append(("llm.complete", (prompt or "")[:120]))
        await asyncio.sleep(0)
        return _CANNED_JSON_QUESTION

    async def _patched_complete_json(
        prompt: str, *_args: Any, **_kwargs: Any
    ) -> Any:
        raw = await _patched_complete(prompt)
        return json.loads(raw)

    # ---- 5. Hard guard: refuse to construct a real AsyncAnthropic client ----
    def _refuse_anthropic_client(api_key: Optional[str] = None) -> Any:
        raise RuntimeError(
            "audit-mode: refusing to construct a real AsyncAnthropic client"
        )

    # Apply every patch only now that all replacements exist.
    panel_mod.evaluate = _patched_evaluate  # type: ignore[assignment]
    llm_mod.AnthropicLLM = _PatchedAnthropicLLM  # type: ignore[assignment,misc]
    llm_mod.make_llm = _patched_make_llm  # type: ignore[assignment]
    llm_mod.complete = _patched_complete  # type: ignore[assignment]
    llm_mod.complete_json = _patched_complete_json  # type: ignore[assignment]
    llm_mod.get_anthropic_client = _refuse_anthropic_client  # type: ignore[assignment]

    # ---- 6. Force Polymarket into dry_run regardless of inherited env ----
    # Defensive — the audit MUST NOT post to live Polymarket. This is a plain
    # assignment on purpose: setdefault() would leave an inherited
    # POLYMARKET_MODE (e.g. a live mode exported in the shell) untouched.
    os.environ["POLYMARKET_MODE"] = "dry_run"

    # Treasury wallet so the 90/10 split fires; fall back to operator wallet.
    # Here setdefault() is intended: an explicitly configured treasury wins.
    os.environ.setdefault(
        "PLATFORM_TREASURY_ADDRESS",
        os.environ.get(
            "HACKATHON_WALLET_ADDRESS",
            "0x000000000000000000000000000000000000dead",
        ),
    )

    _INSTALLED.update(originals)
    _INSTALLED["installed"] = True
def uninstall_mocks() -> None:
    """Put back the module attributes saved by ``install_mocks``.

    Does nothing when no mocks are currently installed. Environment
    variables set during installation are left as they are. Mostly useful
    in pytest.
    """
    if not _INSTALLED.get("installed"):
        return

    from polyglot_alpha import llm as llm_mod
    from polyglot_alpha.judges import panel as panel_mod

    # (module, attribute name, key under which the original was saved)
    restore_plan = (
        (panel_mod, "evaluate", "panel.evaluate"),
        (llm_mod, "AnthropicLLM", "llm.AnthropicLLM"),
        (llm_mod, "make_llm", "llm.make_llm"),
        (llm_mod, "complete", "llm.complete"),
        (llm_mod, "complete_json", "llm.complete_json"),
        (llm_mod, "get_anthropic_client", "llm.get_anthropic_client"),
    )
    for target_module, attribute, saved_key in restore_plan:
        setattr(target_module, attribute, _INSTALLED[saved_key])

    # Forget everything, including the "installed" flag, so a later
    # install_mocks() call patches again.
    _INSTALLED.clear()
# Public surface: the two lifecycle hooks plus the instrumentation that
# audit scripts inspect after a run.
__all__ = [
    # lifecycle
    "install_mocks",
    "uninstall_mocks",
    # instrumentation
    "panel_evaluate_calls",
    "mock_llm_calls",
    "mock_llm_log",
]