Spaces:

messili
/

polyglot-alpha

Running

polyglot-alpha / tests /_pass_path_mocks.py

licaomeng

deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)

88d2f2a 5 days ago

10.8 kB

	"""Mock infrastructure for end-to-end PASS-path audit runs.

	Goal: trigger the orchestrator's full lifecycle end-to-end without
	spending real money on Anthropic LLM calls. Specifically, the panel's
	11 judges (which collectively call Anthropic Haiku 4.5 ~10-14 times
	per event) are short-circuited to a deterministic PASS PanelVerdict
	in-process, and any other LLM entry points (synthesizer, critics,
	moderator, refine) are routed through a MockLLM so any straggler that
	slips past the panel patch still cannot reach api.anthropic.com.

	Use :func:`install_mocks` from a top-level audit script BEFORE
	invoking :func:`polyglot_alpha.orchestrator.run_lifecycle`. The patches
	mutate ``polyglot_alpha.judges.panel`` and ``polyglot_alpha.llm`` at
	module scope, so they are visible to every coroutine the orchestrator
	spawns inside the same Python interpreter.

	The patches DO NOT touch:

	* On-chain calls (``polyglot_alpha.chain.*``) — Arc testnet is free gas.
	* IPFS publish/fetch — local-file fallback is offline.
	* SQLite persistence — same DB as the running backend.
	* Polymarket — defaults to ``POLYMARKET_MODE=dry_run`` which never
	posts to the live Gamma API. We assert that explicitly.

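	The call counts are exposed via :data:`panel_evaluate_calls` and
	:data:`mock_llm_calls` (with prompt previews in :data:`mock_llm_log`) so
	audit scripts can assert on them, for example ``mock_llm_calls == 0`` when
	nothing is expected to slip past the patched panel.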
	"""

	from __future__ import annotations

	import asyncio
	import json
	import os
	from typing import Any, Awaitable, Callable, Optional


	# ---------------------------------------------------------------------------
	# Internal call-count counter so the audit script can assert no real
	# Anthropic call slipped through.
	# ---------------------------------------------------------------------------

	#: Incremented every time the mock panel.evaluate is invoked.
	#: NOTE: this int is rebound (``global`` + ``+=``), not mutated in place, so
	#: read it as a module attribute; a ``from ... import panel_evaluate_calls``
	#: copy keeps the value it had at import time.
	panel_evaluate_calls: int = 0

	#: Incremented every time a MockLLM stand-in fields a prompt.
	#: The same rebinding caveat as for ``panel_evaluate_calls`` applies.
	mock_llm_calls: int = 0

	#: List of (label, prompt_preview) for debugging.
	#: Mutated in place (append/clear), so imported references stay current.
	mock_llm_log: list[tuple[str, str]] = []


	def _reset_counters() -> None:
	    """Zero both call counters and empty the prompt log in place."""
	    global panel_evaluate_calls, mock_llm_calls
	    # Empty the existing list object so outside references see the reset too.
	    del mock_llm_log[:]
	    panel_evaluate_calls = mock_llm_calls = 0


	# ---------------------------------------------------------------------------
	# Mock PanelVerdict factory
	# ---------------------------------------------------------------------------


	def _build_pass_verdict(question: Any) -> Any:
	    """Build the canned PASS verdict handed out by the patched panel.

	    ``question`` is accepted so the signature lines up with the caller, but
	    it is never read: every call produces a verdict with the same contents.

	    The verdict is a real :class:`PanelVerdict` holding eleven
	    :class:`JudgeResult` entries, in this order: BLEU (raw 42), COMET
	    (raw 0.78), MQM (raw 95, no errors), then the eight ``d1``..``d8``
	    judges. All are marked as passed and ``overall_score`` is 92.
	    """
	    # Imported lazily so this module can be loaded without the judges package.
	    from polyglot_alpha.judges.types import (
	        VERDICT_PASS,
	        JudgeResult,
	        PanelVerdict,
	    )

	    translation_judges = [
	        JudgeResult(
	            name="bleu",
	            passed=True,
	            score=0.42,
	            reason="Mock BLEU above threshold.",
	            evidence={"bleu_raw": 42.0, "mocked": True},
	        ),
	        JudgeResult(
	            name="comet",
	            passed=True,
	            score=0.78,
	            reason="Mock COMET above threshold.",
	            evidence={"comet_raw": 0.78, "mocked": True},
	        ),
	        JudgeResult(
	            name="mqm_llm",
	            passed=True,
	            score=0.95,
	            reason="Mock MQM score=95 with zero major errors.",
	            evidence={
	                "score_raw": 95,
	                "major_count": 0,
	                "minor_count": 0,
	                "errors": [],
	                "rationale": "mocked",
	                "provider": "mock",
	            },
	        ),
	    ]

	    dimension_names = (
	        "d1_structural",
	        "d2_stylistic",
	        "d3_framing",
	        "d4_granularity",
	        "d5_resolution_clarity",
	        "d6_source_reliability",
	        "d7_leading_check",
	        "d8_duplicate_detection",
	    )
	    # Each judge gets its own evidence dict (the literal is re-evaluated per item).
	    dimension_judges = [
	        JudgeResult(
	            name=judge_name,
	            passed=True,
	            score=1.0,
	            reason="Mocked PASS for end-to-end audit.",
	            evidence={"mocked": True},
	        )
	        for judge_name in dimension_names
	    ]

	    mqm_summary = {
	        "score": 95,
	        "major_count": 0,
	        "minor_count": 0,
	        "errors": [],
	    }
	    return PanelVerdict(
	        overall_pass=True,
	        verdict=VERDICT_PASS,
	        overall_score=92,
	        translation_scores={"bleu": 42.0, "comet": 0.78, "mqm": mqm_summary},
	        # Short keys "d1".."d8", every one flagged as passing.
	        style_alignment_passes=dict.fromkeys(
	            (f"d{index}" for index in range(1, 9)), True
	        ),
	        judge_results=translation_judges + dimension_judges,
	        notes=["mocked PASS verdict for end-to-end PASS-path audit"],
	    )


	# ---------------------------------------------------------------------------
	# Mock LLM (stand-in for AnthropicLLM)
	# ---------------------------------------------------------------------------


	_CANNED_JSON_QUESTION = json.dumps(
	{
	"question_en": (
	"Will the FOMC raise rates by 25bp at the June 2026 meeting?"
	),
	"resolution_criteria": (
	"Resolves YES if the Federal Reserve announces a 25bp rate hike at"
	" the June 17-18, 2026 FOMC meeting; otherwise resolves NO."
	),
	"end_date_iso": "2026-12-31T23:59:59Z",
	"tags": ["fomc", "rates", "macro", "mock"],
	}
	)


	class _AuditMockLLM:
	"""In-process stand-in returned by patched make_llm/AnthropicLLM.

	Yields a deterministic JSON-shape response on every call. Returns the
	SAME body regardless of model_id; agent differentiation is irrelevant
	when the panel verdict is forced to PASS downstream.
	"""

	def __init__(
	self, args: Any, label: str = "audit_mock", *kwargs: Any
	) -> None:
	self.model = kwargs.get("model") or (args[0] if args else "mock")
	self._label = label

	async def complete(
	self, system: str, user: str, **_kwargs: Any
	) -> str:
	return await self.__call__(user)

	async def __call__(self, prompt: str) -> str:
	global mock_llm_calls
	await asyncio.sleep(0)
	mock_llm_calls += 1
	mock_llm_log.append((self._label, (prompt or "")[:120]))
	return _CANNED_JSON_QUESTION


	# ---------------------------------------------------------------------------
	# Install / uninstall
	# ---------------------------------------------------------------------------


	# Registry of the original attributes replaced by install_mocks(), keyed by
	# labels such as "panel.evaluate" or "llm.make_llm", plus an "installed"
	# flag that is set once patching has finished. Emptied by uninstall_mocks().
	_INSTALLED: dict[str, Any] = {}


	def install_mocks() -> None:
	    """Monkey-patch every Anthropic entry point and the panel adapter.

	    Replaces, at module scope:

	    * ``polyglot_alpha.judges.panel.evaluate`` with a coroutine that returns
	      the canned PASS verdict and bumps ``panel_evaluate_calls``;
	    * ``polyglot_alpha.llm.AnthropicLLM`` and ``polyglot_alpha.llm.make_llm``
	      with :class:`_AuditMockLLM` stand-ins;
	    * ``polyglot_alpha.llm.complete`` / ``complete_json`` with coroutines that
	      return the canned JSON (raw string / parsed object);
	    * ``polyglot_alpha.llm.get_anthropic_client`` with a function that always
	      raises :class:`RuntimeError`.

	    It also resets the call counters, overwrites ``POLYMARKET_MODE`` with
	    ``dry_run`` and gives ``PLATFORM_TREASURY_ADDRESS`` a default if unset.
	    The originals are stored in ``_INSTALLED`` for :func:`uninstall_mocks`.

	    Idempotent: second call is a no-op so audit scripts can call this
	    from multiple entry points without breaking.

	    NOTE(review): code that already bound these names with
	    ``from polyglot_alpha.llm import ...`` before this runs would keep the
	    originals. Confirm no such early imports exist in the audited path.
	    """

	    if _INSTALLED.get("installed"):
	        return

	    _reset_counters()

	    from polyglot_alpha import llm as llm_mod
	    from polyglot_alpha.judges import panel as panel_mod

	    # Capture every original BEFORE mutating anything: if one attribute is
	    # missing, the AttributeError fires here and neither module is left
	    # half-patched with no way for uninstall_mocks() to undo it.
	    originals: dict[str, Any] = {
	        "panel.evaluate": panel_mod.evaluate,
	        "llm.AnthropicLLM": llm_mod.AnthropicLLM,
	        "llm.make_llm": llm_mod.make_llm,
	        "llm.complete": llm_mod.complete,
	        "llm.complete_json": llm_mod.complete_json,
	        "llm.get_anthropic_client": llm_mod.get_anthropic_client,
	    }

	    # ---- 1. Patch panel.evaluate to return a canned PASS verdict ----
	    async def _patched_evaluate(
	        question: Any, *_args: Any, **_kwargs: Any
	    ) -> Any:
	        global panel_evaluate_calls
	        panel_evaluate_calls += 1
	        await asyncio.sleep(0)
	        return _build_pass_verdict(question)

	    # ---- 2. Patch llm.AnthropicLLM with a no-network stand-in ----
	    class _PatchedAnthropicLLM(_AuditMockLLM):
	        pass

	    # ---- 3. Patch llm.make_llm so per-agent llm factories also return mocks ----
	    def _patched_make_llm(
	        model_id: str,
	        *,
	        mock: bool = False,
	        system: Optional[str] = None,
	        temperature: float = 0.2,
	        max_tokens: int = 1024,
	    ) -> Callable[[str], Awaitable[str]]:
	        # Every option except model_id is accepted and ignored.
	        return _AuditMockLLM(model=model_id, label=f"make_llm:{model_id}")

	    # ---- 4. Patch top-level llm.complete / complete_json (synthesizer path) ----
	    async def _patched_complete(
	        prompt: str, *_args: Any, **_kwargs: Any
	    ) -> str:
	        global mock_llm_calls
	        mock_llm_calls += 1
	        mock_llm_log.append(("llm.complete", (prompt or "")[:120]))
	        await asyncio.sleep(0)
	        return _CANNED_JSON_QUESTION

	    async def _patched_complete_json(
	        prompt: str, *_args: Any, **_kwargs: Any
	    ) -> Any:
	        # Goes through _patched_complete so the call is counted and logged.
	        raw = await _patched_complete(prompt)
	        return json.loads(raw)

	    # ---- 5. Hard guard: refuse to construct a real AsyncAnthropic client ----
	    def _refuse_anthropic_client(api_key: Optional[str] = None) -> Any:
	        raise RuntimeError(
	            "audit-mode: refusing to construct a real AsyncAnthropic client"
	        )

	    # Apply all patches together now that nothing above can fail.
	    panel_mod.evaluate = _patched_evaluate  # type: ignore[assignment]
	    llm_mod.AnthropicLLM = _PatchedAnthropicLLM  # type: ignore[assignment,misc]
	    llm_mod.make_llm = _patched_make_llm  # type: ignore[assignment]
	    llm_mod.complete = _patched_complete  # type: ignore[assignment]
	    llm_mod.complete_json = _patched_complete_json  # type: ignore[assignment]
	    llm_mod.get_anthropic_client = _refuse_anthropic_client  # type: ignore[assignment]

	    # ---- 6. Force Polymarket into dry_run regardless of inherited env -------
	    # Defensive — the audit MUST NOT post to live Polymarket. Plain assignment
	    # (not setdefault) so an inherited non-dry-run value is overridden too.
	    os.environ["POLYMARKET_MODE"] = "dry_run"
	    # Treasury wallet so the 90/10 split fires; fall back to operator wallet.
	    os.environ.setdefault(
	        "PLATFORM_TREASURY_ADDRESS",
	        os.environ.get(
	            "HACKATHON_WALLET_ADDRESS",
	            "0x000000000000000000000000000000000000dead",
	        ),
	    )

	    _INSTALLED.update(originals)
	    _INSTALLED["installed"] = True


	def uninstall_mocks() -> None:
	    """Put back the attributes saved by install_mocks (handy in pytest).

	    Does nothing when the mocks are not currently installed. Afterwards the
	    ``_INSTALLED`` registry is empty, so a later install starts fresh.
	    """
	    if not _INSTALLED.get("installed"):
	        return

	    from polyglot_alpha import llm as llm_mod
	    from polyglot_alpha.judges import panel as panel_mod

	    # (module, attribute name, registry key), restored in this order.
	    restore_plan = (
	        (panel_mod, "evaluate", "panel.evaluate"),
	        (llm_mod, "AnthropicLLM", "llm.AnthropicLLM"),
	        (llm_mod, "make_llm", "llm.make_llm"),
	        (llm_mod, "complete", "llm.complete"),
	        (llm_mod, "complete_json", "llm.complete_json"),
	        (llm_mod, "get_anthropic_client", "llm.get_anthropic_client"),
	    )
	    for module, attribute, saved_key in restore_plan:
	        setattr(module, attribute, _INSTALLED[saved_key])

	    _INSTALLED.clear()


	# Public surface of this module: the install/uninstall pair plus the
	# counters and log that audit scripts inspect.
	# NOTE: the two ``*_calls`` ints are rebound on every increment, so a
	# star-import or ``from ... import`` copy goes stale; read them as
	# attributes of the module object instead.
	__all__ = [
	"install_mocks",
	"uninstall_mocks",
	"panel_evaluate_calls",
	"mock_llm_calls",
	"mock_llm_log",
	]