# polyglot-alpha / tests/test_internal_debate.py
# licaomeng
# deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)
# 88d2f2a
"""Unit tests for the internal-debate orchestrator.
All tests stay offline: LLM callables are replaced with ``AsyncMock`` or
plain async stubs so no network / OPENROUTER_API_KEY is required.
Run with: ``.venv/bin/pytest -xvs tests/test_internal_debate.py``
"""
from __future__ import annotations
import asyncio
import hashlib
import json
from typing import Any, Dict, List
from unittest.mock import AsyncMock
import pytest
from polyglot_alpha.agents.critics import CRITIC_MODEL_A, CRITIC_MODEL_B
from polyglot_alpha.agents.internal_debate import (
InternalDebateResult,
run_internal_debate,
)
from polyglot_alpha.agents.moderator import MODERATOR_MODEL
# --------------------------------------------------------------------------- #
# Fixtures #
# --------------------------------------------------------------------------- #
@pytest.fixture()
def event() -> Dict[str, Any]:
    """Provide the single source event that every debate test feeds in.

    The Chinese title reads roughly "The Fed announces a 25 basis point
    rate hike"; the body restates it for the May 2026 meeting.
    """
    source_event: Dict[str, Any] = dict(
        event_id="0xdebate_smoke",
        title_zh="美联储宣布加息25个基点",
        body_zh="美联储于2026年5月议息会议宣布加息25个基点。",
        url="https://example.com/fed",
        cutoff_ts=1735689600,
    )
    return source_event
@pytest.fixture()
def two_candidates() -> List[Dict[str, Any]]:
    """Provide a pair of translator candidates of deliberately unequal quality.

    Index 0 (``t0``) names an exact threshold, a source and a deadline;
    index 1 (``t1``) is vague and has an empty ``resolution_source``.
    """
    # The strong candidate uses the same text for its title and its question.
    precise_question = (
        "Will the Fed raise rates by 25bps at the June 2026 FOMC meeting?"
    )
    vague_question = "Will the Fed hike soon?"

    strong_candidate: Dict[str, Any] = {
        "translator_id": "t0",
        "title": precise_question,
        "question_en": precise_question,
        "category": "macro",
        "resolution_criteria": (
            "Resolves YES if the Federal Reserve's June 2026 FOMC "
            "statement raises the federal funds target range by "
            "exactly 25 basis points, as published on federalreserve.gov "
            "by 2026-06-30T23:59:59Z."
        ),
        "resolution_source": "federalreserve.gov",
        "end_date_iso": "2026-06-30T23:59:59Z",
        "tags": ["macro", "fed", "rates"],
        "meta": {"model": "deepseek/deepseek-chat"},
    }
    weak_candidate: Dict[str, Any] = {
        "translator_id": "t1",
        "title": vague_question,
        "question_en": vague_question,
        "category": "macro",
        "resolution_criteria": "Resolves YES if the Fed hikes.",
        "resolution_source": "",
        "end_date_iso": "2026-12-31T23:59:59Z",
        "tags": ["fed"],
        "meta": {"model": "qwen/qwen-2.5-72b-instruct"},
    }
    return [strong_candidate, weak_candidate]
def _critic_json(
    issues: List[str], strengths: List[str], verdict: str, confidence: float
) -> str:
    """Build the JSON text a stubbed critic LLM returns.

    The serialized object has exactly four keys, emitted in this order:
    ``issues``, ``strengths``, ``verdict``, ``confidence``.
    """
    critique = dict(
        issues=issues,
        strengths=strengths,
        verdict=verdict,
        confidence=confidence,
    )
    return json.dumps(critique)
def _moderator_json(
    winning_index: int,
    reasoning: List[str],
    confidence: float,
    critique_signal: str,
) -> str:
    """Build the JSON text a stubbed moderator LLM returns.

    The serialized object has exactly four keys, emitted in this order:
    ``winning_index``, ``reasoning``, ``confidence``, ``critique_signal``.
    """
    verdict = dict(
        winning_index=winning_index,
        reasoning=reasoning,
        confidence=confidence,
        critique_signal=critique_signal,
    )
    return json.dumps(verdict)
def _hash_candidate(candidate: Dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of *candidate* in canonical JSON form.

    Canonical here means: keys sorted, compact ``","`` / ``":"`` separators,
    then UTF-8 encoded. Key insertion order therefore does not affect the hash.
    """
    canonical_text = json.dumps(candidate, separators=(",", ":"), sort_keys=True)
    hasher = hashlib.sha256()
    hasher.update(canonical_text.encode("utf-8"))
    return hasher.hexdigest()
# --------------------------------------------------------------------------- #
# Tests #
# --------------------------------------------------------------------------- #
@pytest.mark.asyncio
async def test_internal_debate_full_flow_mock_llms(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """Happy path: critics, moderator and refine all run against stub LLMs.

    Every LLM callable is an ``AsyncMock``, so each mock's await count
    shows how many times that stage was invoked.
    """

    async def fresh_candidates(_event: Dict[str, Any]) -> List[Dict[str, Any]]:
        # A JSON round-trip returns brand-new dicts on every call, so the
        # orchestrator never shares objects with the fixture.
        serialized = json.dumps(two_candidates)
        return json.loads(serialized)

    harsh_review = _critic_json(
        issues=["'soon' is vague", "no source", "drift"],
        strengths=["concise"],
        verdict="needs_refinement",
        confidence=0.84,
    )
    lenient_review = _critic_json(
        issues=["minor: monetary-policy tag"],
        strengths=["precise threshold", "explicit source"],
        verdict="accept_as_is",
        confidence=0.91,
    )
    critic_a = AsyncMock(return_value=harsh_review)
    critic_b = AsyncMock(return_value=lenient_review)

    # Scanned in order: model A is checked before model B.
    critic_stubs = ((CRITIC_MODEL_A, critic_a), (CRITIC_MODEL_B, critic_b))

    def critic_factory(model_id: str):
        for known_model, stub in critic_stubs:
            if model_id == known_model:
                return stub
        # Any other model id means the orchestrator asked for a critic this
        # test did not plan for.
        raise AssertionError(f"unexpected critic model: {model_id}")

    moderator_mock = AsyncMock(
        return_value=_moderator_json(
            winning_index=0,
            reasoning=[
                "candidate 0 has explicit federalreserve.gov source",
                "candidate 0 specifies exact 25bps threshold",
                "candidate 1 has unmeasurable 'soon'",
            ],
            confidence=0.9,
            critique_signal=(
                "Add monetary-policy tag and clarify upper-bound interpretation."
            ),
        )
    )

    # What the stubbed refine LLM emits: sharper wording plus one extra tag.
    refined_payload = dict(
        title="Will the Fed raise rates by 25bps at the June 2026 FOMC meeting?",
        question_en=(
            "Will the Fed raise the federal funds upper-bound rate by 25bps "
            "at the June 2026 FOMC meeting?"
        ),
        category="macro",
        resolution_criteria=(
            "Resolves YES if the Federal Reserve's June 2026 FOMC statement, "
            "as published on federalreserve.gov by 2026-06-30T23:59:59Z, raises "
            "the federal funds target range upper bound by exactly 25 bps."
        ),
        resolution_source="federalreserve.gov",
        end_date_iso="2026-06-30T23:59:59Z",
        tags=["macro", "fed", "rates", "monetary-policy"],
    )
    refine_mock = AsyncMock(return_value=json.dumps(refined_payload))

    result = await run_internal_debate(
        event,
        propose_candidates_fn=fresh_candidates,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=refine_mock,
    )

    assert isinstance(result, InternalDebateResult)

    # Each stub LLM was awaited exactly once (the two critics count separately).
    for stage_stub in (critic_a, critic_b, moderator_mock, refine_mock):
        assert stage_stub.await_count == 1

    # The moderator's pick is candidate 0, and the refine output shows through.
    final = result.final_candidate
    assert result.moderator_verdict.winning_index == 0
    assert "monetary-policy" in final["tags"]
    assert final["question_en"].startswith(
        "Will the Fed raise the federal funds upper-bound rate"
    )

    # The audit trail keeps both proposals and both critiques.
    proposals = result.intermediate_candidates
    assert len(proposals) == 2
    assert proposals[0]["translator_id"] == "t0"
    assert len(result.critiques) == 2
    seen_verdicts = {critique.verdict for critique in result.critiques}
    assert seen_verdicts == {"needs_refinement", "accept_as_is"}

    # Bookkeeping fields are populated.
    assert result.total_duration_ms >= 0
    # 6 = 2 proposer "calls" (one per candidate) + 2 critics + 1 moderator
    # + 1 refine -- presumably how the orchestrator tallies them.
    assert result.total_llm_calls == 6
@pytest.mark.asyncio
async def test_internal_debate_handles_critic_timeout(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """A critic that outlasts ``critic_timeout`` soft-fails; the debate still ends."""

    async def fresh_candidates(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        return json.loads(json.dumps(two_candidates))

    async def stalled_critic(_prompt: str) -> str:
        # Sleeps 5 s, far beyond the 0.05 s timeout passed below. Its
        # "reject" verdict differs from the soft-fail default asserted later,
        # so a reply that got through would fail this test.
        await asyncio.sleep(5.0)
        return _critic_json([], [], "reject", 1.0)

    prompt_critic = AsyncMock(
        return_value=_critic_json(["minor"], ["clear"], "accept_as_is", 0.7)
    )

    def critic_factory(model_id: str):
        # Only model A stalls; every other model id gets the instant stub.
        return stalled_critic if model_id == CRITIC_MODEL_A else prompt_critic

    # The moderator stub always declares candidate 0 the winner.
    moderator_reply = _moderator_json(
        winning_index=0,
        reasoning=["A", "B", "C"],
        confidence=0.8,
        critique_signal="ok",
    )
    moderator_mock = AsyncMock(return_value=moderator_reply)
    # An empty JSON object from the refine stub, so no refine edits are supplied.
    noop_refine = AsyncMock(return_value="{}")

    result = await run_internal_debate(
        event,
        propose_candidates_fn=fresh_candidates,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=noop_refine,
        critic_timeout=0.05,
    )

    # Exactly one critique belongs to the stalled model, and it carries the
    # soft-fail placeholder instead of a real verdict.
    from_stalled_model = [
        critique
        for critique in result.critiques
        if critique.critic_model == CRITIC_MODEL_A
    ]
    assert len(from_stalled_model) == 1
    placeholder = from_stalled_model[0]
    assert placeholder.verdict == "accept_as_is"  # the soft-fail default
    assert "soft-fail" in placeholder.raw_response

    # The pipeline still delivered a final candidate: the moderator's pick, t0.
    assert result.final_candidate["translator_id"] == "t0"
    assert result.moderator_verdict.winning_index == 0
@pytest.mark.asyncio
async def test_internal_debate_refine_no_op_preserves_candidate(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """Malformed refine output is a no-op: the winning candidate is untouched.

    The refine stub answers with plain prose instead of JSON; the test
    expects the final candidate to equal candidate 0 and the diff summary
    to explain why nothing changed.
    """

    async def proposer(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        # JSON round-trip: the orchestrator gets its own copies, leaving the
        # fixture pristine for the equality check below.
        return json.loads(json.dumps(two_candidates))

    def critic_factory(_mid: str) -> AsyncMock:
        # A proper ``def`` (PEP 8 discourages binding a lambda to a name).
        # Each model id gets its own fresh mock returning the same approving
        # critique.
        return AsyncMock(
            return_value=_critic_json([], ["good"], "accept_as_is", 0.8)
        )

    moderator_mock = AsyncMock(
        return_value=_moderator_json(
            winning_index=0,
            reasoning=["x", "y", "z"],
            confidence=0.9,
            critique_signal="add detail",
        )
    )

    # The refine LLM returns prose rather than JSON, which should make the
    # refine stage fall back to the unmodified winner.
    refine_garbage = AsyncMock(return_value="sorry I can't help with that")

    result = await run_internal_debate(
        event,
        propose_candidates_fn=proposer,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=refine_garbage,
    )

    # final_candidate equals candidate 0 (the moderator's pick, no edits).
    winning = two_candidates[0]
    assert result.final_candidate == winning
    # Refine was still attempted exactly once.
    assert refine_garbage.await_count == 1

    # diff_summary explains the no-op; either wording is accepted.
    diff_text = " ".join(result.refine_result.diff_summary).lower()
    assert "malformed json" in diff_text or "original candidate kept" in diff_text
@pytest.mark.asyncio
async def test_candidate_hash_matches_final_candidate(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """Provenance check: the hash of ``final_candidate`` is reproducible.

    The digest computed here with ``_hash_candidate`` must equal the one
    from ``BaseTranslatorAgent.hash_candidate_dict`` (what gets committed
    on-chain, per the original author's note), and it must differ from the
    hash of the pre-refine winner.
    """

    async def proposer(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        # JSON round-trip: the orchestrator gets its own copies of the fixture.
        return json.loads(json.dumps(two_candidates))

    def critic_factory(_mid: str) -> AsyncMock:
        # A proper ``def`` (PEP 8 discourages binding a lambda to a name).
        # Each model id gets its own fresh mock returning the same approving
        # critique.
        return AsyncMock(
            return_value=_critic_json([], ["good"], "accept_as_is", 0.8)
        )

    moderator_mock = AsyncMock(
        return_value=_moderator_json(
            winning_index=0,
            reasoning=["a", "b", "c"],
            confidence=0.9,
            critique_signal="be precise",
        )
    )

    # Refine emits a small, deterministic edit: candidate 0 with only its
    # resolution criteria reworded. Key order is unchanged because the key
    # already exists in the copied dict.
    # NOTE(review): the original comment says title / category /
    # end_date_iso are preserved by the refine.py contract -- not visible here.
    refined = {
        **two_candidates[0],
        "resolution_criteria": (
            "Resolves YES if the FOMC June 2026 statement on federalreserve.gov "
            "raises the federal funds upper bound by 25 bps before 2026-06-30T23:59:59Z."
        ),
    }
    refine_mock = AsyncMock(return_value=json.dumps(refined))

    result = await run_internal_debate(
        event,
        propose_candidates_fn=proposer,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=refine_mock,
    )

    # Compute the hash externally -- exactly what an external operator would.
    expected_hash = _hash_candidate(result.final_candidate)

    # And compute it the way BaseTranslatorAgent does -- must match.
    # Imported inside the test, as in the original; the reason is not visible
    # from this file.
    from polyglot_alpha.agents.base import BaseTranslatorAgent

    on_chain_hash = BaseTranslatorAgent.hash_candidate_dict(
        result.final_candidate
    )
    # ``.hex()`` implies a bytes-like return value -- presumably the raw digest.
    assert on_chain_hash.hex() == expected_hash

    # The hash MUST differ from the hash of the pre-refine winning candidate
    # whenever refine actually changed something -- otherwise the
    # post-refine provenance is meaningless.
    pre_refine_hash = BaseTranslatorAgent.hash_candidate_dict(two_candidates[0])
    assert pre_refine_hash != on_chain_hash, (
        "candidate_hash should reflect the post-refine candidate, "
        "not the pre-debate translator output"
    )
@pytest.mark.asyncio
async def test_internal_debate_requires_two_candidates(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """A proposer that yields a single candidate makes the debate raise ValueError."""

    async def lonely(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        # One-element list holding only the first fixture candidate.
        return two_candidates[:1]

    # Only the proposer is supplied; every other argument is left to the
    # orchestrator's defaults.
    expect_rejection = pytest.raises(ValueError, match="at least 2 candidates")
    with expect_rejection:
        await run_internal_debate(event, propose_candidates_fn=lonely)