Spaces:
Sleeping
Sleeping
| """Unit tests for the internal-debate orchestrator. | |
| All tests stay offline: LLM callables are replaced with ``AsyncMock`` or | |
| plain async stubs so no network / OPENROUTER_API_KEY is required. | |
| Run with: ``.venv/bin/pytest -xvs tests/test_internal_debate.py`` | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import hashlib | |
| import json | |
| from typing import Any, Dict, List | |
| from unittest.mock import AsyncMock | |
| import pytest | |
| from polyglot_alpha.agents.critics import CRITIC_MODEL_A, CRITIC_MODEL_B | |
| from polyglot_alpha.agents.internal_debate import ( | |
| InternalDebateResult, | |
| run_internal_debate, | |
| ) | |
| from polyglot_alpha.agents.moderator import MODERATOR_MODEL | |
| # --------------------------------------------------------------------------- # | |
| # Fixtures # | |
| # --------------------------------------------------------------------------- # | |
@pytest.fixture
def event() -> Dict[str, Any]:
    """Return the source-event payload handed to ``run_internal_debate``.

    Registered as a pytest fixture because every test in this module requests
    ``event`` by parameter name; without the decorator pytest cannot resolve
    that argument. A new dict is built on each request, so a test that
    mutates it cannot affect another test.

    Returns:
        A dict with the keys ``event_id``, ``title_zh``, ``body_zh``, ``url``
        and ``cutoff_ts``.
    """
    return {
        "event_id": "0xdebate_smoke",
        "title_zh": "美联储宣布加息25个基点",
        "body_zh": "美联储于2026年5月议息会议宣布加息25个基点。",
        "url": "https://example.com/fed",
        "cutoff_ts": 1735689600,
    }
@pytest.fixture
def two_candidates() -> List[Dict[str, Any]]:
    """Return two translator candidates; candidate 0 is intentionally stronger.

    Registered as a pytest fixture because the tests in this module request
    ``two_candidates`` by parameter name; without the decorator pytest cannot
    resolve that argument.

    Candidate 0 (``t0``) carries an exact threshold, a named resolution source
    and a concrete deadline. Candidate 1 (``t1``) is deliberately vague and
    has an empty ``resolution_source``.

    Returns:
        A freshly built two-element list of candidate dicts.
    """
    return [
        {
            "translator_id": "t0",
            "title": "Will the Fed raise rates by 25bps at the June 2026 FOMC meeting?",
            "question_en": "Will the Fed raise rates by 25bps at the June 2026 FOMC meeting?",
            "category": "macro",
            "resolution_criteria": (
                "Resolves YES if the Federal Reserve's June 2026 FOMC "
                "statement raises the federal funds target range by "
                "exactly 25 basis points, as published on federalreserve.gov "
                "by 2026-06-30T23:59:59Z."
            ),
            "resolution_source": "federalreserve.gov",
            "end_date_iso": "2026-06-30T23:59:59Z",
            "tags": ["macro", "fed", "rates"],
            "meta": {"model": "deepseek/deepseek-chat"},
        },
        {
            "translator_id": "t1",
            "title": "Will the Fed hike soon?",
            "question_en": "Will the Fed hike soon?",
            "category": "macro",
            "resolution_criteria": "Resolves YES if the Fed hikes.",
            "resolution_source": "",
            "end_date_iso": "2026-12-31T23:59:59Z",
            "tags": ["fed"],
            "meta": {"model": "qwen/qwen-2.5-72b-instruct"},
        },
    ]
def _critic_json(
    issues: List[str], strengths: List[str], verdict: str, confidence: float
) -> str:
    """Render a critic reply as the JSON text a mocked critic LLM returns.

    The object keys are emitted in the order ``issues``, ``strengths``,
    ``verdict``, ``confidence``.
    """
    reply = dict(
        issues=issues,
        strengths=strengths,
        verdict=verdict,
        confidence=confidence,
    )
    return json.dumps(reply)
def _moderator_json(
    winning_index: int,
    reasoning: List[str],
    confidence: float,
    critique_signal: str,
) -> str:
    """Render a moderator verdict as the JSON text a mocked moderator returns.

    The object keys are emitted in the order ``winning_index``, ``reasoning``,
    ``confidence``, ``critique_signal``.
    """
    verdict = dict(
        winning_index=winning_index,
        reasoning=reasoning,
        confidence=confidence,
        critique_signal=critique_signal,
    )
    return json.dumps(verdict)
def _hash_candidate(candidate: Dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of a candidate's canonical JSON form.

    Canonical here means keys sorted and no whitespace between tokens, so the
    digest does not depend on the dict's insertion order.
    """
    canonical = json.dumps(candidate, sort_keys=True, separators=(",", ":"))
    hasher = hashlib.sha256()
    hasher.update(canonical.encode())
    return hasher.hexdigest()
| # --------------------------------------------------------------------------- # | |
| # Tests # | |
| # --------------------------------------------------------------------------- # | |
# NOTE(review): no ``@pytest.mark.asyncio`` is visible on this coroutine test;
# presumably pytest-asyncio runs in auto mode. Confirm against project config.
async def test_internal_debate_full_flow_mock_llms(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """Happy path: critic round, moderator and refine all run on mocked LLMs."""

    async def proposer(_event: Dict[str, Any]) -> List[Dict[str, Any]]:
        # A JSON round-trip yields a deep copy, mimicking the real translator
        # stage, which produces fresh dicts on each call.
        return json.loads(json.dumps(two_candidates))

    harsh_reply = _critic_json(
        issues=["'soon' is vague", "no source", "drift"],
        strengths=["concise"],
        verdict="needs_refinement",
        confidence=0.84,
    )
    lenient_reply = _critic_json(
        issues=["minor: monetary-policy tag"],
        strengths=["precise threshold", "explicit source"],
        verdict="accept_as_is",
        confidence=0.91,
    )
    critic_a = AsyncMock(return_value=harsh_reply)
    critic_b = AsyncMock(return_value=lenient_reply)

    def critic_factory(model_id: str):
        # Only the two known critic models may be requested.
        if model_id not in (CRITIC_MODEL_A, CRITIC_MODEL_B):
            raise AssertionError(f"unexpected critic model: {model_id}")
        return critic_a if model_id == CRITIC_MODEL_A else critic_b

    verdict_json = _moderator_json(
        winning_index=0,
        reasoning=[
            "candidate 0 has explicit federalreserve.gov source",
            "candidate 0 specifies exact 25bps threshold",
            "candidate 1 has unmeasurable 'soon'",
        ],
        confidence=0.9,
        critique_signal=(
            "Add monetary-policy tag and clarify upper-bound interpretation."
        ),
    )
    moderator_mock = AsyncMock(return_value=verdict_json)

    refined_fields = {
        "title": "Will the Fed raise rates by 25bps at the June 2026 FOMC meeting?",
        "question_en": (
            "Will the Fed raise the federal funds upper-bound rate by 25bps "
            "at the June 2026 FOMC meeting?"
        ),
        "category": "macro",
        "resolution_criteria": (
            "Resolves YES if the Federal Reserve's June 2026 FOMC statement, "
            "as published on federalreserve.gov by 2026-06-30T23:59:59Z, raises "
            "the federal funds target range upper bound by exactly 25 bps."
        ),
        "resolution_source": "federalreserve.gov",
        "end_date_iso": "2026-06-30T23:59:59Z",
        "tags": ["macro", "fed", "rates", "monetary-policy"],
    }
    refine_mock = AsyncMock(return_value=json.dumps(refined_fields))

    result = await run_internal_debate(
        event,
        propose_candidates_fn=proposer,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=refine_mock,
    )

    assert isinstance(result, InternalDebateResult)

    # Each mocked LLM was awaited exactly once (the two critics run in parallel).
    for llm_mock in (critic_a, critic_b, moderator_mock, refine_mock):
        assert llm_mock.await_count == 1

    # The moderator chose candidate 0 and refine contributed the new tag.
    assert result.moderator_verdict.winning_index == 0
    final = result.final_candidate
    assert "monetary-policy" in final["tags"]

    # The refined question text is what ends up in the final candidate.
    assert final["question_en"].startswith(
        "Will the Fed raise the federal funds upper-bound rate"
    )

    # The intermediate trace is kept for auditing.
    assert len(result.intermediate_candidates) == 2
    assert result.intermediate_candidates[0]["translator_id"] == "t0"
    assert len(result.critiques) == 2
    seen_verdicts = set(critique.verdict for critique in result.critiques)
    assert seen_verdicts == {"needs_refinement", "accept_as_is"}

    # Duration and LLM-call counters are populated.
    assert result.total_duration_ms >= 0
    # 2 proposer "calls" (one per candidate) + 2 critics + 1 moderator + 1 refine
    assert result.total_llm_calls == 6
async def test_internal_debate_handles_critic_timeout(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """A critic that outlives ``critic_timeout`` soft-fails; the debate goes on."""

    async def proposer(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        return json.loads(json.dumps(two_candidates))

    async def slow_llm(_prompt: str) -> str:
        # Sleeps far longer than the 0.05 s critic_timeout passed below, so
        # this reply should never be observed by the orchestrator.
        await asyncio.sleep(5.0)
        return _critic_json([], [], "reject", 1.0)

    fast_reply = _critic_json(["minor"], ["clear"], "accept_as_is", 0.7)
    fast_mock = AsyncMock(return_value=fast_reply)

    def critic_factory(model_id: str):
        # CRITIC_MODEL_A gets the slow callable; any other model answers at once.
        return slow_llm if model_id == CRITIC_MODEL_A else fast_mock

    # The moderator always picks candidate 0 in this scenario.
    moderator_reply = _moderator_json(
        winning_index=0,
        reasoning=["A", "B", "C"],
        confidence=0.8,
        critique_signal="ok",
    )
    moderator_mock = AsyncMock(return_value=moderator_reply)
    refine_noop = AsyncMock(return_value="{}")  # empty object: refine no-op

    result = await run_internal_debate(
        event,
        propose_candidates_fn=proposer,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=refine_noop,
        critic_timeout=0.05,
    )

    # Exactly one critique belongs to the slow critic, and it carries the
    # soft-fail defaults.
    timed_out = [
        critique
        for critique in result.critiques
        if critique.critic_model == CRITIC_MODEL_A
    ]
    assert len(timed_out) == 1
    placeholder = timed_out[0]
    assert placeholder.verdict == "accept_as_is"  # the soft-fail default
    assert "soft-fail" in placeholder.raw_response

    # The pipeline still delivered a final candidate (moderator chose 0).
    assert result.final_candidate["translator_id"] == "t0"
    assert result.moderator_verdict.winning_index == 0
async def test_internal_debate_refine_no_op_preserves_candidate(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """Unparseable refine output leaves the winning candidate unchanged."""

    async def proposer(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        return json.loads(json.dumps(two_candidates))

    def critic_factory(_mid: str) -> AsyncMock:
        # A new mock per requested model, each returning the same approving reply.
        return AsyncMock(
            return_value=_critic_json([], ["good"], "accept_as_is", 0.8)
        )

    moderator_reply = _moderator_json(
        winning_index=0,
        reasoning=["x", "y", "z"],
        confidence=0.9,
        critique_signal="add detail",
    )
    moderator_mock = AsyncMock(return_value=moderator_reply)

    # The refine LLM answers with prose instead of JSON.
    refine_garbage = AsyncMock(return_value="sorry I can't help with that")

    result = await run_internal_debate(
        event,
        propose_candidates_fn=proposer,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=refine_garbage,
    )

    # The final candidate is candidate 0 exactly as proposed (no edits).
    assert result.final_candidate == two_candidates[0]
    assert refine_garbage.await_count == 1

    # diff_summary explains why nothing changed.
    diff_text = " ".join(result.refine_result.diff_summary).lower()
    assert any(
        marker in diff_text
        for marker in ("malformed json", "original candidate kept")
    )
async def test_candidate_hash_matches_final_candidate(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """Provenance check on the hash of the post-refine candidate.

    The digest computed independently by ``_hash_candidate`` must equal the
    one ``BaseTranslatorAgent.hash_candidate_dict`` produces, and it must
    differ from the digest of the pre-refine winner.
    """

    async def proposer(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        return json.loads(json.dumps(two_candidates))

    def critic_factory(_mid: str) -> AsyncMock:
        # A new mock per requested model, each returning the same approving reply.
        return AsyncMock(
            return_value=_critic_json([], ["good"], "accept_as_is", 0.8)
        )

    moderator_reply = _moderator_json(
        winning_index=0,
        reasoning=["a", "b", "c"],
        confidence=0.9,
        critique_signal="be precise",
    )
    moderator_mock = AsyncMock(return_value=moderator_reply)

    # Refine returns candidate 0 with only resolution_criteria reworded;
    # every other field is carried over unchanged.
    refined = {
        **two_candidates[0],
        "resolution_criteria": (
            "Resolves YES if the FOMC June 2026 statement on federalreserve.gov "
            "raises the federal funds upper bound by 25 bps before 2026-06-30T23:59:59Z."
        ),
    }
    refine_mock = AsyncMock(return_value=json.dumps(refined))

    result = await run_internal_debate(
        event,
        propose_candidates_fn=proposer,
        critic_llm_factory=critic_factory,
        moderator_llm_factory=lambda _mid: moderator_mock,
        refine_llm=refine_mock,
    )
    final = result.final_candidate

    # Digest as an external operator would compute it.
    expected_hash = _hash_candidate(final)

    # Digest as BaseTranslatorAgent computes it; the two must agree.
    from polyglot_alpha.agents.base import BaseTranslatorAgent

    on_chain_hash = BaseTranslatorAgent.hash_candidate_dict(final)
    assert on_chain_hash.hex() == expected_hash

    # Refine changed the candidate, so its digest must differ from that of the
    # pre-refine winner; otherwise post-refine provenance is meaningless.
    pre_refine_hash = BaseTranslatorAgent.hash_candidate_dict(two_candidates[0])
    assert pre_refine_hash != on_chain_hash, (
        "candidate_hash should reflect the post-refine candidate, "
        "not the pre-debate translator output"
    )
async def test_internal_debate_requires_two_candidates(
    event: Dict[str, Any], two_candidates: List[Dict[str, Any]]
) -> None:
    """A proposer that yields a single candidate makes the debate raise."""

    async def lonely(_e: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Hand back only the first fixture candidate.
        first, *_ = two_candidates
        return [first]

    expected_error = pytest.raises(ValueError, match="at least 2 candidates")
    with expected_error:
        await run_internal_debate(event, propose_candidates_fn=lonely)