"""Unit tests for JudgeHandler.""" |
|
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch |
|
|
|
|
|
import pytest |
|
|
|
|
|
from src.agent_factory.judges import JudgeHandler, MockJudgeHandler |
|
|
from src.utils.models import AssessmentDetails, Citation, Evidence, JudgeAssessment |
|
|
|
|
|
|
|
|


@pytest.mark.unit
class TestJudgeHandler:
    """Tests for JudgeHandler."""

    @pytest.mark.asyncio
    async def test_assess_returns_assessment(self):
        """JudgeHandler should return the JudgeAssessment produced by the LLM."""
        expected_confidence = 0.85
        mock_assessment = JudgeAssessment(
            details=AssessmentDetails(
                mechanism_score=8,
                mechanism_reasoning="Strong mechanistic evidence",
                clinical_evidence_score=7,
                clinical_reasoning="Good clinical support",
                drug_candidates=["Testosterone"],
                key_findings=["Libido enhancement effects"],
            ),
            sufficient=True,
            confidence=expected_confidence,
            recommendation="synthesize",
            next_search_queries=[],
            reasoning="Evidence is sufficient for synthesis",
        )

        mock_result = MagicMock()
        mock_result.output = mock_assessment

        # Patch the model factory and the Agent class so no real LLM call is made.
        with (
            patch("src.agent_factory.judges.get_model") as mock_get_model,
            patch("src.agent_factory.judges.Agent") as mock_agent_class,
        ):
            mock_get_model.return_value = MagicMock()
            mock_agent = AsyncMock()
            mock_agent.run = AsyncMock(return_value=mock_result)
            mock_agent_class.return_value = mock_agent

            handler = JudgeHandler()
            handler.agent = mock_agent

            evidence = [
                Evidence(
                    content="Sildenafil shows efficacy in ED...",
                    citation=Citation(
                        source="pubmed",
                        title="Sildenafil in ED",
                        url="https://pubmed.ncbi.nlm.nih.gov/12345/",
                        date="2024-01-01",
                    ),
                )
            ]

            result = await handler.assess("sildenafil efficacy", evidence)

            assert result.sufficient is True
            assert result.recommendation == "synthesize"
            assert result.confidence == expected_confidence
            assert "Testosterone" in result.details.drug_candidates

    @pytest.mark.asyncio
    async def test_assess_empty_evidence(self):
        """JudgeHandler should handle empty evidence gracefully."""
        mock_assessment = JudgeAssessment(
            details=AssessmentDetails(
                mechanism_score=0,
                mechanism_reasoning="No evidence to assess",
                clinical_evidence_score=0,
                clinical_reasoning="No evidence to assess",
                drug_candidates=[],
                key_findings=[],
            ),
            sufficient=False,
            confidence=0.0,
            recommendation="continue",
            next_search_queries=["sildenafil mechanism"],
            reasoning="No evidence found, need to search more",
        )

        mock_result = MagicMock()
        mock_result.output = mock_assessment

        with (
            patch("src.agent_factory.judges.get_model") as mock_get_model,
            patch("src.agent_factory.judges.Agent") as mock_agent_class,
        ):
            mock_get_model.return_value = MagicMock()
            mock_agent = AsyncMock()
            mock_agent.run = AsyncMock(return_value=mock_result)
            mock_agent_class.return_value = mock_agent

            handler = JudgeHandler()
            handler.agent = mock_agent

            result = await handler.assess("sildenafil efficacy", [])

            assert result.sufficient is False
            assert result.recommendation == "continue"
            assert len(result.next_search_queries) > 0
            assert "sildenafil mechanism" in result.next_search_queries

    @pytest.mark.asyncio
    async def test_assess_handles_llm_failure(self):
        """JudgeHandler should return a fallback assessment on LLM failure."""
        with (
            patch("src.agent_factory.judges.get_model") as mock_get_model,
            patch("src.agent_factory.judges.Agent") as mock_agent_class,
        ):
            mock_get_model.return_value = MagicMock()
            # Simulate the LLM call raising so the fallback path runs.
            mock_agent = AsyncMock()
            mock_agent.run = AsyncMock(side_effect=Exception("API Error"))
            mock_agent_class.return_value = mock_agent

            handler = JudgeHandler()
            handler.agent = mock_agent

            evidence = [
                Evidence(
                    content="Some content",
                    citation=Citation(
                        source="pubmed",
                        title="Title",
                        url="url",
                        date="2024",
                    ),
                )
            ]

            result = await handler.assess("test question", evidence)

            assert result.sufficient is False
            assert result.recommendation == "continue"
            assert "failed" in result.reasoning.lower()
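
    # Hypothetical addition (a minimal sketch, not part of the original suite):
    # it assumes JudgeHandler.assess awaits self.agent.run exactly once per call,
    # which the mocking pattern above implies but never asserts directly. The
    # test name is illustrative.
    @pytest.mark.asyncio
    async def test_assess_delegates_to_agent_once(self):
        """Sketch: assess() should await the underlying agent exactly once."""
        mock_result = MagicMock()
        mock_result.output = MagicMock()

        with (
            patch("src.agent_factory.judges.get_model") as mock_get_model,
            patch("src.agent_factory.judges.Agent") as mock_agent_class,
        ):
            mock_get_model.return_value = MagicMock()
            mock_agent = AsyncMock()
            mock_agent.run = AsyncMock(return_value=mock_result)
            mock_agent_class.return_value = mock_agent

            handler = JudgeHandler()
            handler.agent = mock_agent

            await handler.assess("test question", [])

            # Only the delegation is checked here; the returned value is not inspected.
            assert mock_agent.run.await_count == 1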


@pytest.mark.unit
class TestMockJudgeHandler:
    """Tests for MockJudgeHandler."""

    @pytest.mark.asyncio
    async def test_mock_handler_returns_default(self):
        """MockJudgeHandler should return a default assessment."""
        handler = MockJudgeHandler()

        evidence = [
            Evidence(
                content="Content 1",
                citation=Citation(source="pubmed", title="T1", url="u1", date="2024"),
            ),
            Evidence(
                content="Content 2",
                citation=Citation(source="pubmed", title="T2", url="u2", date="2024"),
            ),
        ]

        result = await handler.assess("test", evidence)

        expected_evidence_len = 2
        expected_mech_score = min(10, expected_evidence_len * 2)

        assert handler.call_count == 1
        assert handler.last_question == "test"
        assert handler.last_evidence is not None
        assert len(handler.last_evidence) == expected_evidence_len
        assert result.details.mechanism_score == expected_mech_score
        assert result.sufficient is False
        assert result.recommendation == "continue"
        assert "Demo mode" in result.reasoning

    @pytest.mark.asyncio
    async def test_mock_handler_custom_response(self):
        """MockJudgeHandler should return the custom response when one is provided."""
        expected_score = 10
        custom_assessment = JudgeAssessment(
            details=AssessmentDetails(
                mechanism_score=expected_score,
                mechanism_reasoning="Custom reasoning",
                clinical_evidence_score=expected_score,
                clinical_reasoning="Custom clinical",
                drug_candidates=["CustomDrug"],
                key_findings=["Custom finding"],
            ),
            sufficient=True,
            confidence=1.0,
            recommendation="synthesize",
            next_search_queries=[],
            reasoning="Custom assessment logic for testing purposes must be at least 20 chars long",
        )

        handler = MockJudgeHandler(mock_response=custom_assessment)
        result = await handler.assess("test", [])

        assert result.details.mechanism_score == expected_score
        assert result.details.drug_candidates == ["CustomDrug"]

    @pytest.mark.asyncio
    async def test_mock_handler_insufficient_with_few_evidence(self):
        """MockJudgeHandler should recommend continuing with fewer than 3 evidence items."""
        handler = MockJudgeHandler()

        evidence = [
            Evidence(
                content="Content",
                citation=Citation(source="pubmed", title="T", url="u", date="2024"),
            ),
            Evidence(
                content="Content 2",
                citation=Citation(source="pubmed", title="T2", url="u2", date="2024"),
            ),
        ]

        result = await handler.assess("test", evidence)

        assert result.sufficient is False
        assert result.recommendation == "continue"
        assert len(result.next_search_queries) > 0
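
    # Hypothetical addition (a sketch, not in the original suite): it assumes
    # MockJudgeHandler.call_count increments once per assess() call and that
    # last_question always reflects the most recent call, as suggested by
    # test_mock_handler_returns_default.
    @pytest.mark.asyncio
    async def test_mock_handler_tracks_repeated_calls(self):
        """Sketch: call_count and last_question should track repeated calls."""
        handler = MockJudgeHandler()

        await handler.assess("first question", [])
        result = await handler.assess("second question", [])

        assert handler.call_count == 2
        assert handler.last_question == "second question"
        assert result is not None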