Spaces:
Running
Running
| """Unit tests for the Layer 5 refine module. | |
| All tests stay fully offline: the LLM is injected as an in-process async | |
| callable so no network calls fire. | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import json | |
| from typing import Any, Awaitable, Callable, Dict | |
| import pytest | |
| from polyglot_alpha.agents.refine import ( | |
| DEFAULT_REFINE_TIMEOUT_S, | |
| PRESERVED_FIELDS, | |
| RefineResult, | |
| refine_with_critique, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Fixtures | |
| # --------------------------------------------------------------------------- | |
| def event() -> Dict[str, Any]: | |
| return { | |
| "event_id": "evt_refine_001", | |
| "title_zh": "中国宣布新关税政策", | |
| "body_zh": "中国财政部宣布将就关税政策做出回应。", | |
| } | |
| def winning_candidate() -> Dict[str, Any]: | |
| """A representative moderator-picked candidate dict.""" | |
| return { | |
| "translator_id": "t0", | |
| "title": "Will China announce new tariffs by 2026-06-30?", | |
| "question_en": "Will China announce new tariffs by 2026-06-30?", | |
| "category": "geopolitics", | |
| "end_date_iso": "2026-06-30T23:59:59Z", | |
| "resolution_criteria": "Resolves YES if tariffs are announced.", | |
| "resolution_source": "", | |
| "tags": ["china", "tariffs"], | |
| "meta": {"model": "deepseek/deepseek-chat"}, | |
| } | |
| def _llm_returning(payload: Any) -> Callable[[str], Awaitable[str]]: | |
| """Build an LLM stub that returns a fixed string.""" | |
| text = payload if isinstance(payload, str) else json.dumps(payload) | |
| async def _call(_prompt: str) -> str: | |
| return text | |
| return _call | |
| # --------------------------------------------------------------------------- | |
| # Tests | |
| # --------------------------------------------------------------------------- | |
| def test_refine_returns_revised_question( | |
| winning_candidate: Dict[str, Any], event: Dict[str, Any] | |
| ) -> None: | |
| revised = { | |
| # The LLM tries to mutate the title — refine must restore it. | |
| "title": "DIFFERENT TITLE (should be reverted)", | |
| "question_en": "Will China's Ministry of Finance announce new tariffs by 2026-06-30?", | |
| "category": "geopolitics", | |
| "end_date_iso": "2026-06-30T23:59:59Z", | |
| "resolution_criteria": ( | |
| "Resolves YES if an official report by China's Ministry of " | |
| "Finance announces new tariffs before 2026-06-30T23:59:59Z." | |
| ), | |
| "resolution_source": "https://www.mof.gov.cn/", | |
| "tags": ["china", "tariffs", "trade-policy"], | |
| } | |
| result = asyncio.run( | |
| refine_with_critique( | |
| winning_candidate, | |
| critique_signal=( | |
| "Resolution criteria are too vague; name the official " | |
| "source and the exact cutoff." | |
| ), | |
| event=event, | |
| llm=_llm_returning(revised), | |
| ) | |
| ) | |
| assert isinstance(result, RefineResult) | |
| assert result.refine_model == "deepseek/deepseek-chat" | |
| # Edited fields propagate. | |
| assert ( | |
| result.refined_question["resolution_criteria"] | |
| == revised["resolution_criteria"] | |
| ) | |
| assert result.refined_question["resolution_source"] == "https://www.mof.gov.cn/" | |
| # Identity fields are forcibly preserved. | |
| assert ( | |
| result.refined_question["title"] | |
| == winning_candidate["title"] | |
| ) | |
| assert result.duration_ms >= 0 | |
| assert result.raw_response # raw LLM text retained | |
| def test_refine_preserves_required_fields( | |
| winning_candidate: Dict[str, Any], event: Dict[str, Any] | |
| ) -> None: | |
| """title / category / end_date_iso must survive any LLM mutation.""" | |
| revised = { | |
| "title": "MALICIOUS REWRITE", | |
| "category": "MALICIOUS REWRITE", | |
| "end_date_iso": "2099-01-01T00:00:00Z", | |
| "question_en": "Refined wording.", | |
| "resolution_criteria": "Refined criteria.", | |
| "resolution_source": "https://example.com/", | |
| "tags": [], | |
| } | |
| result = asyncio.run( | |
| refine_with_critique( | |
| winning_candidate, | |
| critique_signal="Add an official source.", | |
| event=event, | |
| llm=_llm_returning(revised), | |
| ) | |
| ) | |
| for field_name in PRESERVED_FIELDS: | |
| assert ( | |
| result.refined_question[field_name] | |
| == winning_candidate[field_name] | |
| ), f"refine must not mutate preserved field {field_name!r}" | |
| def test_refine_no_op_on_malformed_json( | |
| winning_candidate: Dict[str, Any], event: Dict[str, Any] | |
| ) -> None: | |
| result = asyncio.run( | |
| refine_with_critique( | |
| winning_candidate, | |
| critique_signal="Tighten the criteria.", | |
| event=event, | |
| llm=_llm_returning("this is not JSON at all, just prose"), | |
| ) | |
| ) | |
| assert result.refined_question == winning_candidate | |
| assert any( | |
| "malformed JSON" in bullet for bullet in result.diff_summary | |
| ), result.diff_summary | |
| assert result.refine_model == "deepseek/deepseek-chat" | |
| def test_refine_no_op_on_timeout( | |
| winning_candidate: Dict[str, Any], event: Dict[str, Any] | |
| ) -> None: | |
| async def _hang(_prompt: str) -> str: | |
| await asyncio.sleep(10) | |
| return "{}" | |
| result = asyncio.run( | |
| refine_with_critique( | |
| winning_candidate, | |
| critique_signal="Improve precision.", | |
| event=event, | |
| llm=_hang, | |
| timeout_s=0.05, | |
| ) | |
| ) | |
| assert result.refined_question == winning_candidate | |
| assert any("timed out" in bullet for bullet in result.diff_summary), ( | |
| result.diff_summary | |
| ) | |
| def test_diff_summary_detects_resolution_criteria_changes( | |
| winning_candidate: Dict[str, Any], event: Dict[str, Any] | |
| ) -> None: | |
| """diff_summary must surface ``official report by`` and similar precision | |
| markers added to the resolution_criteria.""" | |
| revised = dict(winning_candidate) | |
| revised["resolution_criteria"] = ( | |
| "Resolves YES if an official report by China's Ministry of Finance " | |
| "announces tariffs before 2026-06-30T23:59:59Z." | |
| ) | |
| result = asyncio.run( | |
| refine_with_critique( | |
| winning_candidate, | |
| critique_signal="Name the official source.", | |
| event=event, | |
| llm=_llm_returning(revised), | |
| ) | |
| ) | |
| joined = " | ".join(result.diff_summary) | |
| assert "official report by" in joined, result.diff_summary | |
| assert "precision markers" in joined, result.diff_summary | |
| def test_model_id_resolution_uses_explicit_override( | |
| winning_candidate: Dict[str, Any], event: Dict[str, Any] | |
| ) -> None: | |
| """When ``model_id`` is passed, it overrides ``meta.model``.""" | |
| result = asyncio.run( | |
| refine_with_critique( | |
| winning_candidate, | |
| critique_signal="Improve precision.", | |
| event=event, | |
| model_id="qwen/qwen-2.5-72b-instruct", | |
| llm=_llm_returning(dict(winning_candidate)), | |
| ) | |
| ) | |
| assert result.refine_model == "qwen/qwen-2.5-72b-instruct" | |
| def test_model_id_resolution_falls_back_when_meta_missing( | |
| event: Dict[str, Any], | |
| ) -> None: | |
| """No meta.model + no explicit override falls back to the default.""" | |
| bare_candidate = { | |
| "title": "Will it rain by 2026-06-30?", | |
| "category": "weather", | |
| "end_date_iso": "2026-06-30T23:59:59Z", | |
| "question_en": "Will it rain by 2026-06-30?", | |
| "resolution_criteria": "Resolves YES if it rains.", | |
| } | |
| result = asyncio.run( | |
| refine_with_critique( | |
| bare_candidate, | |
| critique_signal="add a source", | |
| event=event, | |
| llm=_llm_returning(dict(bare_candidate)), | |
| ) | |
| ) | |
| # Default fallback documented in refine.py — Anthropic Haiku 4.5 | |
| # after the OpenRouter swap. | |
| from polyglot_alpha.llm import CLAUDE_HAIKU | |
| assert result.refine_model == CLAUDE_HAIKU | |
| def test_default_timeout_constant() -> None: | |
| assert DEFAULT_REFINE_TIMEOUT_S == 45.0 | |