from unittest.mock import AsyncMock

import pytest

from src.orchestrators.simple import Orchestrator
from src.utils.models import (
    AssessmentDetails,
    Citation,
    Evidence,
    JudgeAssessment,
    OrchestratorConfig,
    SearchResult,
)


def make_evidence(title: str) -> Evidence:
    return Evidence(
        content="content",
        citation=Citation(title=title, url="http://test.com", date="2025", source="pubmed"),
    )


@pytest.mark.integration
@pytest.mark.asyncio
async def test_simple_mode_synthesizes_before_max_iterations():
"""Verify simple mode produces useful output with mocked judge."""
# Mock search to return evidence
mock_search = AsyncMock()
mock_search.execute.return_value = SearchResult(
query="test query",
evidence=[make_evidence(f"Paper {i}") for i in range(5)],
errors=[],
sources_searched=["pubmed"],
total_found=5,
)
    # Mock the judge to return good scores on the second iteration.
    # A pure AsyncMock (rather than MockJudgeHandler) lets us control the scores precisely.
    mock_judge = AsyncMock()
    # Because a Mock auto-creates a 'synthesize' attribute, simple mode takes the
    # free-tier path; stub its return value to simulate a successful narrative.
    mock_judge.synthesize.return_value = "This is a synthesized report for MagicDrug."
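    # NOTE: the exact path-selection check is an assumption, not asserted here;
    # presumably something along the lines of:
    #     if getattr(judge_handler, "synthesize", None) is not None:
    #         report = await judge_handler.synthesize(...)
    # Any Mock-like object therefore exercises the LLM-synthesis branch.
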
    # Iteration 1: Low scores
    assess_1 = JudgeAssessment(
        details=AssessmentDetails(
            mechanism_score=2,
            mechanism_reasoning="reasoning is sufficient for valid model",
            clinical_evidence_score=2,
            clinical_reasoning="reasoning is sufficient for valid model",
            drug_candidates=[],
            key_findings=[],
        ),
        sufficient=False,
        confidence=0.5,
        recommendation="continue",
        next_search_queries=["q2"],
        reasoning="need more evidence to support conclusions about this topic",
    )

    # Iteration 2: High scores (should trigger synthesis)
    assess_2 = JudgeAssessment(
        details=AssessmentDetails(
            mechanism_score=8,
            mechanism_reasoning="reasoning is sufficient for valid model",
            clinical_evidence_score=7,
            clinical_reasoning="reasoning is sufficient for valid model",
            drug_candidates=["MagicDrug"],
            key_findings=["It works"],
        ),
        sufficient=False,  # Judge is conservative
        confidence=0.9,
        recommendation="continue",  # Judge still says continue (simulating bias)
        next_search_queries=[],
        reasoning="good scores but maybe more evidence needed technically",
    )
    mock_judge.assess.side_effect = [assess_1, assess_2]
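    # side_effect hands out one assessment per call: iteration 1 gets assess_1,
    # iteration 2 gets assess_2. The high scores plus a named candidate should
    # make the orchestrator synthesize despite the judge's "continue".
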
    orchestrator = Orchestrator(
        search_handler=mock_search,
        judge_handler=mock_judge,
        config=OrchestratorConfig(max_iterations=5),
    )

    events = []
    async for event in orchestrator.run("test query"):
        events.append(event)
        if event.type == "complete":
            break

    # Must have synthesis with drug candidates
    complete_events = [e for e in events if e.type == "complete"]
    assert len(complete_events) == 1
    complete_event = complete_events[0]
    assert "MagicDrug" in complete_event.message
    # SPEC_12: LLM synthesis produces narrative prose, not a template with a
    # "Drug Candidates" header. Accept narrative structure (the LLM may omit
    # the ### prefix), the template fallback, or the mocked report.
    assert (
        "Executive Summary" in complete_event.message
        or "Drug Candidates" in complete_event.message
        or "synthesized report" in complete_event.message
    )
    assert complete_event.data.get("synthesis_reason") == "high_scores_with_candidates"
    assert complete_event.iteration == 2  # Should stop at iteration 2


@pytest.mark.integration
@pytest.mark.asyncio
async def test_partial_synthesis_generation():
"""Verify partial synthesis includes drug candidates even if max iterations reached."""
mock_search = AsyncMock()
mock_search.execute.return_value = SearchResult(
query="test", evidence=[], errors=[], sources_searched=["pubmed"], total_found=0
)
    mock_judge = AsyncMock()
    # Always return low scores but WITH candidates.
    # Scores 3 + 3 = 6 < 8 (late threshold), so it should NOT synthesize early.
    mock_judge.assess.return_value = JudgeAssessment(
        details=AssessmentDetails(
            mechanism_score=3,
            mechanism_reasoning="reasoning is sufficient for valid model",
            clinical_evidence_score=3,
            clinical_reasoning="reasoning is sufficient for valid model",
            drug_candidates=["PartialDrug"],
            key_findings=["Partial finding"],
        ),
        sufficient=False,
        confidence=0.5,
        recommendation="continue",
        next_search_queries=[],
        reasoning="keep going to find more evidence about this topic please",
    )
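    # With return_value (rather than side_effect) every iteration sees the same
    # sub-threshold assessment, so the run is expected to exhaust max_iterations
    # and fall back to a partial synthesis built from the last assessment.
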
    orchestrator = Orchestrator(
        search_handler=mock_search,
        judge_handler=mock_judge,
        config=OrchestratorConfig(max_iterations=2),
    )

    events = []
    async for event in orchestrator.run("test"):
        events.append(event)

    complete_events = [e for e in events if e.type == "complete"]
    assert len(complete_events) == 1, (
        f"Expected exactly one complete event, got {len(complete_events)}"
    )
    complete_event = complete_events[0]
    assert complete_event.data.get("max_reached") is True
    # The output message should contain the drug candidate from the last assessment.
    assert "PartialDrug" in complete_event.message
    assert "Maximum iterations reached" in complete_event.message