polyglot-alpha / tests /test_refine.py
licaomeng
deploy: main@8970ffb → HF Spaces (2026-05-27T05:19Z)
88d2f2a
"""Unit tests for the Layer 5 refine module.
All tests stay fully offline: the LLM is injected as an in-process async
callable so no network calls fire.
"""
from __future__ import annotations
import asyncio
import json
from typing import Any, Awaitable, Callable, Dict
import pytest
from polyglot_alpha.agents.refine import (
DEFAULT_REFINE_TIMEOUT_S,
PRESERVED_FIELDS,
RefineResult,
refine_with_critique,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture()
def event() -> Dict[str, Any]:
return {
"event_id": "evt_refine_001",
"title_zh": "中国宣布新关税政策",
"body_zh": "中国财政部宣布将就关税政策做出回应。",
}
@pytest.fixture()
def winning_candidate() -> Dict[str, Any]:
"""A representative moderator-picked candidate dict."""
return {
"translator_id": "t0",
"title": "Will China announce new tariffs by 2026-06-30?",
"question_en": "Will China announce new tariffs by 2026-06-30?",
"category": "geopolitics",
"end_date_iso": "2026-06-30T23:59:59Z",
"resolution_criteria": "Resolves YES if tariffs are announced.",
"resolution_source": "",
"tags": ["china", "tariffs"],
"meta": {"model": "deepseek/deepseek-chat"},
}
def _llm_returning(payload: Any) -> Callable[[str], Awaitable[str]]:
"""Build an LLM stub that returns a fixed string."""
text = payload if isinstance(payload, str) else json.dumps(payload)
async def _call(_prompt: str) -> str:
return text
return _call
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_refine_returns_revised_question(
winning_candidate: Dict[str, Any], event: Dict[str, Any]
) -> None:
revised = {
# The LLM tries to mutate the title — refine must restore it.
"title": "DIFFERENT TITLE (should be reverted)",
"question_en": "Will China's Ministry of Finance announce new tariffs by 2026-06-30?",
"category": "geopolitics",
"end_date_iso": "2026-06-30T23:59:59Z",
"resolution_criteria": (
"Resolves YES if an official report by China's Ministry of "
"Finance announces new tariffs before 2026-06-30T23:59:59Z."
),
"resolution_source": "https://www.mof.gov.cn/",
"tags": ["china", "tariffs", "trade-policy"],
}
result = asyncio.run(
refine_with_critique(
winning_candidate,
critique_signal=(
"Resolution criteria are too vague; name the official "
"source and the exact cutoff."
),
event=event,
llm=_llm_returning(revised),
)
)
assert isinstance(result, RefineResult)
assert result.refine_model == "deepseek/deepseek-chat"
# Edited fields propagate.
assert (
result.refined_question["resolution_criteria"]
== revised["resolution_criteria"]
)
assert result.refined_question["resolution_source"] == "https://www.mof.gov.cn/"
# Identity fields are forcibly preserved.
assert (
result.refined_question["title"]
== winning_candidate["title"]
)
assert result.duration_ms >= 0
assert result.raw_response # raw LLM text retained
def test_refine_preserves_required_fields(
winning_candidate: Dict[str, Any], event: Dict[str, Any]
) -> None:
"""title / category / end_date_iso must survive any LLM mutation."""
revised = {
"title": "MALICIOUS REWRITE",
"category": "MALICIOUS REWRITE",
"end_date_iso": "2099-01-01T00:00:00Z",
"question_en": "Refined wording.",
"resolution_criteria": "Refined criteria.",
"resolution_source": "https://example.com/",
"tags": [],
}
result = asyncio.run(
refine_with_critique(
winning_candidate,
critique_signal="Add an official source.",
event=event,
llm=_llm_returning(revised),
)
)
for field_name in PRESERVED_FIELDS:
assert (
result.refined_question[field_name]
== winning_candidate[field_name]
), f"refine must not mutate preserved field {field_name!r}"
def test_refine_no_op_on_malformed_json(
winning_candidate: Dict[str, Any], event: Dict[str, Any]
) -> None:
result = asyncio.run(
refine_with_critique(
winning_candidate,
critique_signal="Tighten the criteria.",
event=event,
llm=_llm_returning("this is not JSON at all, just prose"),
)
)
assert result.refined_question == winning_candidate
assert any(
"malformed JSON" in bullet for bullet in result.diff_summary
), result.diff_summary
assert result.refine_model == "deepseek/deepseek-chat"
def test_refine_no_op_on_timeout(
winning_candidate: Dict[str, Any], event: Dict[str, Any]
) -> None:
async def _hang(_prompt: str) -> str:
await asyncio.sleep(10)
return "{}"
result = asyncio.run(
refine_with_critique(
winning_candidate,
critique_signal="Improve precision.",
event=event,
llm=_hang,
timeout_s=0.05,
)
)
assert result.refined_question == winning_candidate
assert any("timed out" in bullet for bullet in result.diff_summary), (
result.diff_summary
)
def test_diff_summary_detects_resolution_criteria_changes(
winning_candidate: Dict[str, Any], event: Dict[str, Any]
) -> None:
"""diff_summary must surface ``official report by`` and similar precision
markers added to the resolution_criteria."""
revised = dict(winning_candidate)
revised["resolution_criteria"] = (
"Resolves YES if an official report by China's Ministry of Finance "
"announces tariffs before 2026-06-30T23:59:59Z."
)
result = asyncio.run(
refine_with_critique(
winning_candidate,
critique_signal="Name the official source.",
event=event,
llm=_llm_returning(revised),
)
)
joined = " | ".join(result.diff_summary)
assert "official report by" in joined, result.diff_summary
assert "precision markers" in joined, result.diff_summary
def test_model_id_resolution_uses_explicit_override(
winning_candidate: Dict[str, Any], event: Dict[str, Any]
) -> None:
"""When ``model_id`` is passed, it overrides ``meta.model``."""
result = asyncio.run(
refine_with_critique(
winning_candidate,
critique_signal="Improve precision.",
event=event,
model_id="qwen/qwen-2.5-72b-instruct",
llm=_llm_returning(dict(winning_candidate)),
)
)
assert result.refine_model == "qwen/qwen-2.5-72b-instruct"
def test_model_id_resolution_falls_back_when_meta_missing(
event: Dict[str, Any],
) -> None:
"""No meta.model + no explicit override falls back to the default."""
bare_candidate = {
"title": "Will it rain by 2026-06-30?",
"category": "weather",
"end_date_iso": "2026-06-30T23:59:59Z",
"question_en": "Will it rain by 2026-06-30?",
"resolution_criteria": "Resolves YES if it rains.",
}
result = asyncio.run(
refine_with_critique(
bare_candidate,
critique_signal="add a source",
event=event,
llm=_llm_returning(dict(bare_candidate)),
)
)
# Default fallback documented in refine.py — Anthropic Haiku 4.5
# after the OpenRouter swap.
from polyglot_alpha.llm import CLAUDE_HAIKU
assert result.refine_model == CLAUDE_HAIKU
def test_default_timeout_constant() -> None:
assert DEFAULT_REFINE_TIMEOUT_S == 45.0