Phase 3 Implementation Spec: Judge Vertical Slice
Goal: Implement the "Brain" of the agent, which evaluates evidence quality.
Philosophy: "Structured Output or Bust."
Estimated Effort: 3-4 hours
Prerequisite: Phase 2 complete
1. The Slice Definition
This slice covers:
- Input: Question + list of Evidence.
- Process (see the end-to-end sketch below):
  - Construct prompt with evidence.
  - Call LLM (PydanticAI).
  - Parse into JudgeAssessment.
- Output: JudgeAssessment object.

Files:
- src/utils/models.py: Add Judge models
- src/prompts/judge.py: Prompt templates
- src/agent_factory/judges.py: Handler logic
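For orientation, this is roughly how the slice fits together once the three files exist. A minimal sketch only; the Evidence and Citation field names are assumed from Phase 2 and may differ in your models.py:

import asyncio

from src.agent_factory.judges import JudgeHandler
from src.utils.models import Citation, Evidence  # Phase 2 models (assumed fields)

async def main() -> None:
    # One piece of illustrative evidence; real runs get these from the Phase 2 search tools.
    evidence = [
        Evidence(
            content="Metformin shows anti-tumor activity via AMPK activation...",
            citation=Citation(
                source="pubmed",
                title="Metformin and cancer",
                date="2023-01-15",
                url="https://pubmed.ncbi.nlm.nih.gov/12345678/",  # dummy URL
            ),
        )
    ]
    handler = JudgeHandler()
    assessment = await handler.assess(
        question="Can metformin be repurposed for colorectal cancer?",
        evidence=evidence,
    )
    print(assessment.recommendation, assessment.overall_quality_score)

asyncio.run(main())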
2. Models (src/utils/models.py)
Add these to the existing models file, alongside Phase 2's Evidence and Citation. They require BaseModel and Field from pydantic plus Literal from typing, so confirm those imports are present:
class DrugCandidate(BaseModel):
    """A potential drug repurposing candidate."""

    drug_name: str = Field(..., description="Name of the drug")
    original_indication: str = Field(..., description="What the drug was originally approved for")
    proposed_indication: str = Field(..., description="The new proposed use")
    mechanism: str = Field(..., description="Proposed mechanism of action")
    evidence_strength: Literal["weak", "moderate", "strong"] = Field(
        ...,
        description="Strength of supporting evidence"
    )


class JudgeAssessment(BaseModel):
    """The judge's assessment of the collected evidence."""

    sufficient: bool = Field(
        ...,
        description="Is there enough evidence to write a report?"
    )
    recommendation: Literal["continue", "synthesize"] = Field(
        ...,
        description="Should we search more or synthesize a report?"
    )
    reasoning: str = Field(
        ...,
        max_length=500,
        description="Explanation of the assessment"
    )
    overall_quality_score: int = Field(
        ...,
        ge=0,
        le=10,
        description="Overall quality of evidence (0-10)"
    )
    coverage_score: int = Field(
        ...,
        ge=0,
        le=10,
        description="How well does evidence cover the query (0-10)"
    )
    candidates: list[DrugCandidate] = Field(
        default_factory=list,
        description="Drug candidates identified in the evidence"
    )
    next_search_queries: list[str] = Field(
        default_factory=list,
        max_length=5,
        description="Suggested follow-up queries if more evidence needed"
    )
    gaps: list[str] = Field(
        default_factory=list,
        description="Information gaps identified in current evidence"
    )
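The ge/le bounds and Literal values are enforced by Pydantic at parse time, which is the point of "Structured Output or Bust": a malformed LLM response fails loudly instead of flowing downstream. A quick REPL sanity check, with illustrative values only:

from pydantic import ValidationError

# Well-formed assessment: scores inside [0, 10], recommendation matches the Literal.
ok = JudgeAssessment(
    sufficient=False,
    recommendation="continue",
    reasoning="Only two sources found; mechanism unclear.",
    overall_quality_score=4,
    coverage_score=3,
    next_search_queries=["metformin AMPK colorectal cancer"],
)

# Out-of-range score: rejected by the le=10 constraint.
try:
    JudgeAssessment(
        sufficient=True,
        recommendation="synthesize",
        reasoning="Great evidence.",
        overall_quality_score=11,
        coverage_score=9,
    )
except ValidationError as e:
    print(e.error_count(), "validation error(s)")  # 1 validation error(s)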
3. Prompts (src/prompts/judge.py)
"""Prompt templates for the Judge."""
from typing import List
from src.utils.models import Evidence
JUDGE_SYSTEM_PROMPT = """You are a biomedical research quality assessor specializing in drug repurposing.
Your job is to evaluate evidence retrieved from PubMed and web searches, and decide if:
1. There is SUFFICIENT evidence to write a research report
2. More searching is needed to fill gaps
## Evaluation Criteria
### For "sufficient" = True (ready to synthesize):
- At least 3 relevant pieces of evidence
- At least one peer-reviewed source (PubMed)
- Clear mechanism of action identified
- Drug candidates with at least "moderate" evidence strength
### For "sufficient" = False (continue searching):
- Fewer than 3 relevant pieces
- No clear drug candidates identified
- Major gaps in mechanism understanding
- All evidence is low quality
## Output Requirements
- Be STRICT. Only mark sufficient=True if evidence is genuinely adequate
- Always provide reasoning for your decision
- If continuing, suggest SPECIFIC, ACTIONABLE search queries
- Identify concrete gaps, not vague statements
## Important
- You are assessing DRUG REPURPOSING potential
- Focus on: mechanism of action, existing clinical data, safety profile
- Ignore marketing content or non-scientific sources"""
def format_evidence_for_prompt(evidence_list: List[Evidence]) -> str:
"""Format evidence list into a string for the prompt."""
if not evidence_list:
return "NO EVIDENCE COLLECTED YET"
formatted = []
for i, ev in enumerate(evidence_list, 1):
formatted.append(f"""
---
Source: {ev.citation.source.upper()}
Title: {ev.citation.title}
Date: {ev.citation.date}
URL: {ev.citation.url}
Content:
{ev.content[:1500]}
---")
return "\n".join(formatted)
def build_judge_user_prompt(question: str, evidence: List[Evidence]) -> str:
"""Build the user prompt for the judge."""
evidence_text = format_evidence_for_prompt(evidence)
return f"""## Research Question
{question}
## Collected Evidence ({len(evidence)} pieces)
{evidence_text}
## Your Task
Assess the evidence above and provide your structured assessment.
If evidence is insufficient, suggest 2-3 specific follow-up search queries."""
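You can eyeball the rendered prompt without touching the LLM; for the empty-evidence case it looks like this:

from src.prompts.judge import build_judge_user_prompt

print(build_judge_user_prompt("Can metformin treat colorectal cancer?", []))
# ## Research Question
# Can metformin treat colorectal cancer?
#
# ## Collected Evidence (0 pieces)
# NO EVIDENCE COLLECTED YET
#
# ## Your Task
# ...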
4. Handler (src/agent_factory/judges.py)
"""Judge handler - evaluates evidence quality."""
import structlog
from typing import List
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.models.anthropic import AnthropicModel
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from src.utils.config import settings
from src.utils.exceptions import JudgeError
from src.utils.models import JudgeAssessment, Evidence
from src.prompts.judge import JUDGE_SYSTEM_PROMPT, build_judge_user_prompt
logger = structlog.get_logger()
def get_llm_model():
"""Get the configured LLM model for PydanticAI."""
if settings.llm_provider == "openai":
return OpenAIModel(
settings.llm_model,
api_key=settings.get_api_key(),
)
elif settings.llm_provider == "anthropic":
return AnthropicModel(
settings.llm_model,
api_key=settings.get_api_key(),
)
else:
raise JudgeError(f"Unknown LLM provider: {settings.llm_provider}")
# Initialize Agent
judge_agent = Agent(
model=get_llm_model(),
result_type=JudgeAssessment,
system_prompt=JUDGE_SYSTEM_PROMPT,
)
class JudgeHandler:
"""Handles evidence assessment using LLM."""
def __init__(self, agent: Agent | None = None):
"""
Initialize the judge handler.
Args:
agent: Optional PydanticAI agent (for testing injection)
"""
self.agent = agent or judge_agent
self._call_count = 0
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=2, max=10),
retry=retry_if_exception_type((TimeoutError, ConnectionError)),
reraise=True,
)
async def assess(
self,
question: str,
evidence: List[Evidence],
) -> JudgeAssessment:
"""
Assess the quality and sufficiency of evidence.
Args:
question: The original research question
evidence: List of Evidence objects to assess
Returns:
JudgeAssessment with decision and recommendations
Raises:
JudgeError: If assessment fails after retries
"""
logger.info(
"Starting evidence assessment",
question=question[:100],
evidence_count=len(evidence),
)
self._call_count += 1
# Build the prompt
user_prompt = build_judge_user_prompt(question, evidence)
try:
# Run the agent - PydanticAI handles structured output
result = await self.agent.run(user_prompt)
# result.data is already a JudgeAssessment (typed!)
assessment = result.data
logger.info(
"Assessment complete",
sufficient=assessment.sufficient,
recommendation=assessment.recommendation,
quality_score=assessment.overall_quality_score,
candidates_found=len(assessment.candidates),
)
return assessment
except Exception as e:
logger.error("Judge assessment failed", error=str(e))
raise JudgeError(f"Failed to assess evidence: {e}") from e
async def should_continue(self, assessment: JudgeAssessment) -> bool:
"""
Decide if the search loop should continue based on the assessment.
Returns:
True if we should search more, False if we should stop (synthesize or give up).
"""
return not assessment.sufficient and assessment.recommendation == "continue"
@property
def call_count(self) -> int:
"""Number of LLM calls made (for budget tracking)."""
return self._call_count
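Wiring this into the full loop is a later phase, but for context the handler is meant to sit in something like the sketch below; search_more and max_iterations are placeholders, not part of this slice:

# Hypothetical orchestration loop (sketch only, inside an async function).
handler = JudgeHandler()
evidence: list[Evidence] = []

for _ in range(max_iterations):  # max_iterations: assumed budget knob
    assessment = await handler.assess(question, evidence)
    if not await handler.should_continue(assessment):
        break
    # Feed the judge's suggested queries back into the Phase 2 search tools.
    evidence += await search_more(assessment.next_search_queries)  # placeholder helper

# If assessment.sufficient, hand off to synthesis; otherwise report failure.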
5. TDD Workflow
Test File: tests/unit/agent_factory/test_judges.py
"""Unit tests for JudgeHandler."""
import pytest
from unittest.mock import AsyncMock, MagicMock
class TestJudgeHandler:
@pytest.mark.asyncio
async def test_assess_returns_assessment(self, mocker):
from src.agent_factory.judges import JudgeHandler
from src.utils.models import JudgeAssessment, Evidence, Citation
# Mock PydanticAI agent result
mock_result = MagicMock()
mock_result.data = JudgeAssessment(
sufficient=True,
recommendation="synthesize",
reasoning="Good",
overall_quality_score=8,
coverage_score=8
)
mock_agent = AsyncMock()
mock_agent.run = AsyncMock(return_value=mock_result)
handler = JudgeHandler(agent=mock_agent)
result = await handler.assess("q", [])
assert result.sufficient is True
@pytest.mark.asyncio
async def test_should_continue(self, mocker):
from src.agent_factory.judges import JudgeHandler
from src.utils.models import JudgeAssessment
handler = JudgeHandler(agent=AsyncMock())
# Continue case
assess1 = JudgeAssessment(
sufficient=False,
recommendation="continue",
reasoning="Need more",
overall_quality_score=5,
coverage_score=5
)
assert await handler.should_continue(assess1) is True
# Stop case
assess2 = JudgeAssessment(
sufficient=True,
recommendation="synthesize",
reasoning="Done",
overall_quality_score=8,
coverage_score=8
)
assert await handler.should_continue(assess2) is False
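Once the happy path is green, a failure-path test is worth adding: non-retryable errors from the agent should surface as JudgeError. A sketch of one more method for TestJudgeHandler, same mocking style as above:

    @pytest.mark.asyncio
    async def test_assess_wraps_errors(self):
        from src.agent_factory.judges import JudgeHandler
        from src.utils.exceptions import JudgeError

        # ValueError is not in the retry list, so assess() should wrap it
        # in JudgeError on the first call.
        mock_agent = AsyncMock()
        mock_agent.run = AsyncMock(side_effect=ValueError("boom"))

        handler = JudgeHandler(agent=mock_agent)
        with pytest.raises(JudgeError):
            await handler.assess("q", [])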
6. Implementation Checklist
- Update src/utils/models.py with Judge models
- Create src/prompts/judge.py
- Implement src/agent_factory/judges.py
- Write tests in tests/unit/agent_factory/test_judges.py
- Run uv run pytest tests/unit/agent_factory/