"""Integration tests for Free Tier synthesis using HuggingFace Inference.

File: tests/integration/test_simple_mode_synthesis.py
"""
from unittest.mock import AsyncMock
import pytest
from src.orchestrators.simple import Orchestrator
from src.utils.models import (
AssessmentDetails,
Citation,
Evidence,
JudgeAssessment,
OrchestratorConfig,
SearchResult,
)
def make_evidence(title: str) -> Evidence:
    """Build a minimal Evidence fixture whose citation carries *title*."""
    citation = Citation(
        title=title,
        url="http://test.com",
        date="2025",
        source="pubmed",
    )
    return Evidence(content="content", citation=citation)
@pytest.mark.integration
@pytest.mark.asyncio
async def test_simple_mode_synthesizes_before_max_iterations():
    """Verify simple mode produces useful output with mocked judge."""
    # Search handler always yields five pieces of evidence.
    search_mock = AsyncMock()
    search_mock.execute.return_value = SearchResult(
        query="test query",
        evidence=[make_evidence(f"Paper {i}") for i in range(5)],
        errors=[],
        sources_searched=["pubmed"],
        total_found=5,
    )

    # A plain AsyncMock exposes a `synthesize` attribute by default, so the
    # orchestrator takes the free-tier (LLM synthesis) path; stub the
    # narrative it would produce.
    judge_mock = AsyncMock()
    judge_mock.synthesize.return_value = "This is a synthesized report for MagicDrug."

    # Iteration 1: weak scores, judge asks for more searching.
    low_assessment = JudgeAssessment(
        details=AssessmentDetails(
            mechanism_score=2,
            mechanism_reasoning="reasoning is sufficient for valid model",
            clinical_evidence_score=2,
            clinical_reasoning="reasoning is sufficient for valid model",
            drug_candidates=[],
            key_findings=[],
        ),
        sufficient=False,
        confidence=0.5,
        recommendation="continue",
        next_search_queries=["q2"],
        reasoning="need more evidence to support conclusions about this topic",
    )
    # Iteration 2: strong scores plus a candidate. Even though the judge
    # conservatively recommends "continue", the high scores should trigger
    # early synthesis.
    high_assessment = JudgeAssessment(
        details=AssessmentDetails(
            mechanism_score=8,
            mechanism_reasoning="reasoning is sufficient for valid model",
            clinical_evidence_score=7,
            clinical_reasoning="reasoning is sufficient for valid model",
            drug_candidates=["MagicDrug"],
            key_findings=["It works"],
        ),
        sufficient=False,  # Judge is conservative
        confidence=0.9,
        recommendation="continue",  # Judge still says continue (simulating bias)
        next_search_queries=[],
        reasoning="good scores but maybe more evidence needed technically",
    )
    judge_mock.assess.side_effect = [low_assessment, high_assessment]

    orchestrator = Orchestrator(
        search_handler=search_mock,
        judge_handler=judge_mock,
        config=OrchestratorConfig(max_iterations=5),
    )

    collected = []
    async for event in orchestrator.run("test query"):
        collected.append(event)
        if event.type == "complete":
            break

    # Exactly one completion event must surface the drug candidate.
    finals = [e for e in collected if e.type == "complete"]
    assert len(finals) == 1
    final = finals[0]
    assert "MagicDrug" in final.message
    # SPEC_12: LLM synthesis produces narrative prose, not template with "Drug Candidates" header
    # Check for narrative structure (LLM may omit ### prefix) OR template fallback
    assert (
        "Executive Summary" in final.message
        or "Drug Candidates" in final.message
        or "synthesized report" in final.message
    )
    assert final.data.get("synthesis_reason") == "high_scores_with_candidates"
    assert final.iteration == 2  # Should stop at it 2
@pytest.mark.integration
@pytest.mark.asyncio
async def test_partial_synthesis_generation():
    """Verify partial synthesis includes drug candidates even if max iterations reached."""
    # Search handler never finds anything.
    search_mock = AsyncMock()
    search_mock.execute.return_value = SearchResult(
        query="test", evidence=[], errors=[], sources_searched=["pubmed"], total_found=0
    )

    # Judge always scores low (3 + 3 = 6, below the late threshold of 8),
    # so no early synthesis — but it DOES name a candidate, which the
    # partial synthesis at max iterations must still report.
    judge_mock = AsyncMock()
    judge_mock.assess.return_value = JudgeAssessment(
        details=AssessmentDetails(
            mechanism_score=3,
            mechanism_reasoning="reasoning is sufficient for valid model",
            clinical_evidence_score=3,
            clinical_reasoning="reasoning is sufficient for valid model",
            drug_candidates=["PartialDrug"],
            key_findings=["Partial finding"],
        ),
        sufficient=False,
        confidence=0.5,
        recommendation="continue",
        next_search_queries=[],
        reasoning="keep going to find more evidence about this topic please",
    )

    orchestrator = Orchestrator(
        search_handler=search_mock,
        judge_handler=judge_mock,
        config=OrchestratorConfig(max_iterations=2),
    )

    collected = [event async for event in orchestrator.run("test")]

    finals = [e for e in collected if e.type == "complete"]
    assert len(finals) == 1, (
        f"Expected exactly one complete event, got {len(finals)}"
    )
    final = finals[0]
    assert final.data.get("max_reached") is True
    # The output message should contain the drug candidate from the last assessment
    assert "PartialDrug" in final.message
    assert "Maximum iterations reached" in final.message