Spaces:

messili
/

polyglot-alpha

Running

File size: 10,775 Bytes

88d2f2a

"""Mock infrastructure for end-to-end PASS-path audit runs.

Goal: trigger the orchestrator's full lifecycle end-to-end without
spending real money on Anthropic LLM calls. Specifically, the panel's
11 judges (which collectively call Anthropic Haiku 4.5 ~10-14 times
per event) are short-circuited to a deterministic PASS PanelVerdict
in-process, and any other LLM entry points (synthesizer, critics,
moderator, refine) are routed through a MockLLM so any straggler that
slips past the panel patch still cannot reach api.anthropic.com.

Use :func:`install_mocks` from a top-level audit script BEFORE
invoking :func:`polyglot_alpha.orchestrator.run_lifecycle`. The patches
mutate ``polyglot_alpha.judges.panel`` and ``polyglot_alpha.llm`` at
module scope, so they are visible to every coroutine the orchestrator
spawns inside the same Python interpreter.

The patches DO NOT touch:

* On-chain calls (``polyglot_alpha.chain.*``) — Arc testnet is free gas.
* IPFS publish/fetch — local-file fallback is offline.
* SQLite persistence — same DB as the running backend.
* Polymarket — defaults to ``POLYMARKET_MODE=dry_run`` which never
  posts to the live Gamma API. We assert that explicitly.

The MockLLM call count is exposed via :data:`mock_llm_calls` so audit
scripts can assert ``count == 0`` against the patched panel.
"""

from __future__ import annotations

import asyncio
import json
import os
from typing import Any, Awaitable, Callable, Optional


# ---------------------------------------------------------------------------
# Internal call-count counter so the audit script can assert no real
# Anthropic call slipped through.
# ---------------------------------------------------------------------------

#: Number of times the patched ``panel.evaluate`` has been invoked since the
#: last counter reset. The name is rebound on every increment, so read it as
#: an attribute of this module; a copy obtained with ``from ... import`` keeps
#: the value it had at import time.
panel_evaluate_calls: int = 0

#: Number of prompts answered by a mock LLM stand-in (an ``_AuditMockLLM``
#: instance or the patched ``llm.complete``) since the last counter reset.
#: Rebound on every increment as well, so read it through the module too.
mock_llm_calls: int = 0

#: One ``(label, prompt_preview)`` entry per mocked LLM call, for debugging.
#: The preview is the first 120 characters of the prompt. The list object is
#: mutated in place (append / clear), never rebound.
mock_llm_log: list[tuple[str, str]] = []


def _reset_counters() -> None:
    """Zero both call counters and empty the mock LLM log in place."""
    global panel_evaluate_calls, mock_llm_calls
    panel_evaluate_calls = mock_llm_calls = 0
    # Slice deletion empties the existing list object instead of rebinding
    # the name, so references held elsewhere see the cleared log.
    del mock_llm_log[:]


# ---------------------------------------------------------------------------
# Mock PanelVerdict factory
# ---------------------------------------------------------------------------


def _build_pass_verdict(question: Any) -> Any:
    """Build the canned, all-green PanelVerdict returned by the patched panel.

    A real :class:`PanelVerdict` (from ``polyglot_alpha.judges.types``) is
    instantiated rather than a look-alike, so whatever consumes the panel
    output receives the type it expects. The verdict carries eleven
    ``JudgeResult`` entries, in this order: BLEU (raw 42), COMET (raw 0.78),
    MQM (raw 95, no errors), then the eight D-judges, each passing with
    score 1.0. ``overall_score`` is 92.

    Args:
        question: Accepted so the signature lines up with the caller; the
            value is never read, the verdict is identical for every input.

    Returns:
        A ``PanelVerdict`` with ``overall_pass=True`` and ``VERDICT_PASS``.
    """

    # Imported lazily so this module can be imported without pulling in the
    # judges package until a verdict is actually needed.
    from polyglot_alpha.judges.types import (
        VERDICT_PASS,
        JudgeResult,
        PanelVerdict,
    )

    # The three translation-quality judges, each with its own evidence dict.
    translation_judges = [
        JudgeResult(
            name="bleu",
            passed=True,
            score=0.42,
            reason="Mock BLEU above threshold.",
            evidence={"bleu_raw": 42.0, "mocked": True},
        ),
        JudgeResult(
            name="comet",
            passed=True,
            score=0.78,
            reason="Mock COMET above threshold.",
            evidence={"comet_raw": 0.78, "mocked": True},
        ),
        JudgeResult(
            name="mqm_llm",
            passed=True,
            score=0.95,
            reason="Mock MQM score=95 with zero major errors.",
            evidence={
                "score_raw": 95,
                "major_count": 0,
                "minor_count": 0,
                "errors": [],
                "rationale": "mocked",
                "provider": "mock",
            },
        ),
    ]

    # The eight D-judges share everything except their name.
    d_judge_names = (
        "d1_structural",
        "d2_stylistic",
        "d3_framing",
        "d4_granularity",
        "d5_resolution_clarity",
        "d6_source_reliability",
        "d7_leading_check",
        "d8_duplicate_detection",
    )
    d_judges = [
        JudgeResult(
            name=judge_name,
            passed=True,
            score=1.0,
            reason="Mocked PASS for end-to-end audit.",
            evidence={"mocked": True},
        )
        for judge_name in d_judge_names
    ]

    # Summary scores are built as fresh objects, independent of the evidence
    # dicts attached to the individual judge results above.
    mqm_summary = {
        "score": 95,
        "major_count": 0,
        "minor_count": 0,
        "errors": [],
    }
    scores = {"bleu": 42.0, "comet": 0.78, "mqm": mqm_summary}

    # Keys are the short ids "d1" .. "d8", every one marked as passing.
    alignment = dict.fromkeys((f"d{idx}" for idx in range(1, 9)), True)

    return PanelVerdict(
        overall_pass=True,
        verdict=VERDICT_PASS,
        overall_score=92,
        translation_scores=scores,
        style_alignment_passes=alignment,
        judge_results=translation_judges + d_judges,
        notes=["mocked PASS verdict for end-to-end PASS-path audit"],
    )


# ---------------------------------------------------------------------------
# Mock LLM (stand-in for AnthropicLLM)
# ---------------------------------------------------------------------------


# Canned reply handed back by every mocked LLM entry point: one JSON object,
# serialized once at import time, with the keys question_en,
# resolution_criteria, end_date_iso and tags (in that order).
_CANNED_JSON_QUESTION = json.dumps(
    dict(
        question_en=(
            "Will the FOMC raise rates by 25bp "
            "at the June 2026 meeting?"
        ),
        resolution_criteria=(
            "Resolves YES if the Federal Reserve announces a 25bp rate hike "
            "at the June 17-18, 2026 FOMC meeting; otherwise resolves NO."
        ),
        end_date_iso="2026-12-31T23:59:59Z",
        tags=["fomc", "rates", "macro", "mock"],
    )
)


class _AuditMockLLM:
    """Network-free LLM double handed out by the patched factories.

    Each call answers with the same canned JSON document, whatever the model
    id or prompt; the reply does not need to vary per agent because the panel
    verdict is forced to PASS downstream. Every call bumps the module-level
    ``mock_llm_calls`` counter and appends to ``mock_llm_log``.
    """

    def __init__(
        self, *args: Any, label: str = "audit_mock", **kwargs: Any
    ) -> None:
        """Record a model id and a log label; all other arguments are ignored.

        The model id is the ``model`` keyword when it is truthy, otherwise
        the first positional argument, otherwise the placeholder ``"mock"``.
        """
        chosen_model = kwargs.get("model")
        if not chosen_model:
            chosen_model = args[0] if args else "mock"
        self.model = chosen_model
        self._label = label

    async def complete(
        self, system: str, user: str, **_kwargs: Any
    ) -> str:
        """Answer a system/user prompt pair; only ``user`` is forwarded."""
        reply = await self.__call__(user)
        return reply

    async def __call__(self, prompt: str) -> str:
        """Count and log the call, then return the canned JSON string."""
        global mock_llm_calls
        # Yield to the event loop once before doing the bookkeeping.
        await asyncio.sleep(0)
        mock_llm_calls += 1
        # Only the first 120 characters are kept; a falsy prompt logs as "".
        preview = (prompt or "")[:120]
        mock_llm_log.append((self._label, preview))
        return _CANNED_JSON_QUESTION


# ---------------------------------------------------------------------------
# Install / uninstall
# ---------------------------------------------------------------------------


# Registry of the original attributes replaced by install_mocks(), keyed by
# "<module>.<attribute>" (e.g. "panel.evaluate", "llm.make_llm"), plus an
# "installed" flag that is set once every patch is in place.
# uninstall_mocks() reads it to restore the originals and then clears it.
_INSTALLED: dict[str, Any] = {}


def install_mocks() -> None:
    """Monkey-patch every Anthropic entry point and the panel adapter.

    Replaces the following module attributes, remembering the originals in
    ``_INSTALLED``:

    * ``polyglot_alpha.judges.panel.evaluate`` -> coroutine returning the
      canned PASS verdict from :func:`_build_pass_verdict`.
    * ``polyglot_alpha.llm.AnthropicLLM`` -> a subclass of
      :class:`_AuditMockLLM`.
    * ``polyglot_alpha.llm.make_llm`` -> factory returning an
      :class:`_AuditMockLLM` labelled ``make_llm:<model_id>``.
    * ``polyglot_alpha.llm.complete`` / ``complete_json`` -> coroutines
      returning the canned JSON string / its parsed form.
    * ``polyglot_alpha.llm.get_anthropic_client`` -> function that raises
      ``RuntimeError``.

    It also resets the call counters, forces ``POLYMARKET_MODE`` to
    ``dry_run`` (overriding any inherited value) and gives
    ``PLATFORM_TREASURY_ADDRESS`` a default when it is unset. This function
    does not record the previous environment values.

    Idempotent: second call is a no-op so audit scripts can call this
    from multiple entry points without breaking.

    Only lookups that go through the module attribute see the mocks; a name
    bound earlier with ``from polyglot_alpha.llm import ...`` keeps
    referring to the original object.

    Raises:
        AttributeError: if one of the attributes listed above is missing.
            Nothing has been patched when this is raised.
    """

    if _INSTALLED.get("installed"):
        return

    _reset_counters()

    from polyglot_alpha import llm as llm_mod
    from polyglot_alpha.judges import panel as panel_mod

    # Capture every original BEFORE patching anything. If an attribute is
    # missing, the AttributeError is raised while both modules are still
    # untouched; otherwise a failure part-way through would leave some
    # patches applied with the "installed" flag unset, and uninstall_mocks()
    # would refuse to undo them.
    originals: dict[str, Any] = {
        "panel.evaluate": panel_mod.evaluate,
        "llm.AnthropicLLM": llm_mod.AnthropicLLM,
        "llm.make_llm": llm_mod.make_llm,
        "llm.complete": llm_mod.complete,
        "llm.complete_json": llm_mod.complete_json,
        "llm.get_anthropic_client": llm_mod.get_anthropic_client,
    }
    _INSTALLED.update(originals)

    # ---- 1. Patch panel.evaluate to return a canned PASS verdict ----
    async def _patched_evaluate(question: Any, *_args: Any, **_kwargs: Any) -> Any:
        global panel_evaluate_calls
        panel_evaluate_calls += 1
        await asyncio.sleep(0)
        return _build_pass_verdict(question)

    panel_mod.evaluate = _patched_evaluate  # type: ignore[assignment]

    # ---- 2. Patch llm.AnthropicLLM with a no-network stand-in ----
    class _PatchedAnthropicLLM(_AuditMockLLM):
        pass

    llm_mod.AnthropicLLM = _PatchedAnthropicLLM  # type: ignore[assignment,misc]

    # ---- 3. Patch llm.make_llm so per-agent llm factories also return mocks ----
    def _patched_make_llm(
        model_id: str,
        *,
        mock: bool = False,
        system: Optional[str] = None,
        temperature: float = 0.2,
        max_tokens: int = 1024,
    ) -> Callable[[str], Awaitable[str]]:
        # Every option besides the model id is accepted and ignored.
        return _AuditMockLLM(model=model_id, label=f"make_llm:{model_id}")

    llm_mod.make_llm = _patched_make_llm  # type: ignore[assignment]

    # ---- 4. Patch top-level llm.complete / complete_json (synthesizer path) ----
    async def _patched_complete(
        prompt: str, *_args: Any, **_kwargs: Any
    ) -> str:
        global mock_llm_calls
        mock_llm_calls += 1
        mock_llm_log.append(("llm.complete", (prompt or "")[:120]))
        await asyncio.sleep(0)
        return _CANNED_JSON_QUESTION

    async def _patched_complete_json(
        prompt: str, *_args: Any, **_kwargs: Any
    ) -> Any:
        # Goes through _patched_complete, so it is counted and logged under
        # the "llm.complete" label as well.
        raw = await _patched_complete(prompt)
        return json.loads(raw)

    llm_mod.complete = _patched_complete  # type: ignore[assignment]
    llm_mod.complete_json = _patched_complete_json  # type: ignore[assignment]

    # ---- 5. Hard guard: refuse to construct a real AsyncAnthropic client ----
    def _refuse_anthropic_client(api_key: Optional[str] = None) -> Any:
        raise RuntimeError(
            "audit-mode: refusing to construct a real AsyncAnthropic client"
        )

    llm_mod.get_anthropic_client = _refuse_anthropic_client  # type: ignore[assignment]

    # ---- 6. Force Polymarket into dry_run regardless of inherited env -------
    # Defensive — the audit MUST NOT post to live Polymarket. A plain
    # assignment is required here: setdefault() would keep an inherited
    # value such as POLYMARKET_MODE=live.
    os.environ["POLYMARKET_MODE"] = "dry_run"
    # Treasury wallet so the 90/10 split fires; fall back to operator wallet.
    # This one stays a default: an explicitly configured treasury is kept.
    os.environ.setdefault(
        "PLATFORM_TREASURY_ADDRESS",
        os.environ.get(
            "HACKATHON_WALLET_ADDRESS",
            "0x000000000000000000000000000000000000dead",
        ),
    )

    _INSTALLED["installed"] = True


def uninstall_mocks() -> None:
    """Put back the attributes saved at install time. Mostly useful in pytest.

    Does nothing unless the ``installed`` flag is set in ``_INSTALLED``.
    After restoring, the registry is emptied so the mocks can be installed
    again later.
    """

    if not _INSTALLED.get("installed"):
        return

    from polyglot_alpha import llm as llm_mod
    from polyglot_alpha.judges import panel as panel_mod

    # (module, attribute name, registry key), restored in this order.
    restore_plan = (
        (panel_mod, "evaluate", "panel.evaluate"),
        (llm_mod, "AnthropicLLM", "llm.AnthropicLLM"),
        (llm_mod, "make_llm", "llm.make_llm"),
        (llm_mod, "complete", "llm.complete"),
        (llm_mod, "complete_json", "llm.complete_json"),
        (llm_mod, "get_anthropic_client", "llm.get_anthropic_client"),
    )
    for target_module, attr_name, registry_key in restore_plan:
        setattr(target_module, attr_name, _INSTALLED[registry_key])

    _INSTALLED.clear()


# Public names of this module. NOTE: ``panel_evaluate_calls`` and
# ``mock_llm_calls`` are plain ints that are rebound on every increment, so a
# name obtained with ``from <module> import ...`` (including ``import *``) is
# a snapshot of the value at import time. Read them as attributes of the
# module to observe live counts. ``mock_llm_log`` is a list mutated in place,
# so an imported reference to it stays current.
__all__ = [
    "install_mocks",
    "uninstall_mocks",
    "panel_evaluate_calls",
    "mock_llm_calls",
    "mock_llm_log",
]