"""Judge prompts for evidence assessment."""
from src.config.domain import ResearchDomain, get_domain_config
from src.utils.models import Evidence
def get_system_prompt(domain: ResearchDomain | str | None = None) -> str:
    """Build the judge agent's system prompt, prefixed with the domain's own preamble.

    Args:
        domain: Research domain (enum, string key, or None for the default);
            resolved via get_domain_config().

    Returns:
        Full system prompt instructing the judge to score evidence only.
    """
    domain_config = get_domain_config(domain)
    # The domain-specific preamble comes first; the scoring contract below is
    # shared by all domains.
    prompt = f"""{domain_config.judge_system_prompt}
Your task is to SCORE evidence from biomedical literature. You do NOT decide whether to
continue searching or synthesize - that decision is made by the orchestration system
based on your scores.
## Your Role: Scoring Only
You provide objective scores. The system decides next steps based on explicit thresholds.
This separation prevents bias in the decision-making process.
## Scoring Criteria
1. **Mechanism Score (0-10)**: How well does the evidence explain the biological mechanism?
- 0-3: No clear mechanism, speculative
- 4-6: Some mechanistic insight, but gaps exist
- 7-10: Clear, well-supported mechanism of action
2. **Clinical Evidence Score (0-10)**: Strength of clinical/preclinical support?
- 0-3: No clinical data, only theoretical
- 4-6: Preclinical or early clinical data
- 7-10: Strong clinical evidence (trials, meta-analyses)
3. **Drug Candidates**: List SPECIFIC drug names mentioned in the evidence
- Only include drugs explicitly mentioned
- Do NOT hallucinate or infer drug names
- Include drug class if specific names aren't available (e.g., "SSRI antidepressants")
4. **Key Findings**: Extract 3-5 key findings from the evidence
- Focus on findings relevant to the research question
- Include mechanism insights and clinical outcomes
5. **Confidence (0.0-1.0)**: Your confidence in the scores
- Based on evidence quality and relevance
- Lower if evidence is tangential or low-quality
## Output Format
Return valid JSON with these fields:
- details.mechanism_score (int 0-10)
- details.mechanism_reasoning (string)
- details.clinical_evidence_score (int 0-10)
- details.clinical_reasoning (string)
- details.drug_candidates (list of strings)
- details.key_findings (list of strings)
- sufficient (boolean) - TRUE if scores suggest enough evidence
- confidence (float 0-1)
- recommendation ("continue" or "synthesize") - Your suggestion (system may override)
- next_search_queries (list) - If continuing, suggest FOCUSED queries
- reasoning (string)
## CRITICAL: Search Query Rules
When suggesting next_search_queries:
- STAY FOCUSED on the original research question
- Do NOT drift to tangential topics
- If question is about "female libido", do NOT suggest "bone health" or "muscle mass"
- Refine existing terms, don't explore random medical associations
"""
    return prompt
def get_scoring_prompt(domain: ResearchDomain | str | None = None) -> str:
    """Return the domain-specific scoring instructions for the judge.

    Args:
        domain: Research domain (enum, string key, or None for the default).
    """
    return get_domain_config(domain).judge_scoring_prompt
# Keep SYSTEM_PROMPT for backwards compatibility
# NOTE(review): evaluated once at import time with the default domain; callers
# needing a specific domain should call get_system_prompt(domain) instead.
SYSTEM_PROMPT = get_system_prompt()
# Cap on evidence items shown to the judge in one prompt. Keep under token limits
MAX_EVIDENCE_FOR_JUDGE = 30  # Keep under token limits
async def select_evidence_for_judge(
    evidence: list[Evidence],
    query: str,
    max_items: int = MAX_EVIDENCE_FOR_JUDGE,
) -> list[Evidence]:
    """
    Pick a diverse, relevant subset of evidence for judge evaluation.

    Applies RAG best practices: diversity-based selection over recency-only,
    relevance re-ranking, and lost-in-the-middle mitigation.
    """
    # Nothing to trim - hand everything to the judge as-is.
    if len(evidence) <= max_items:
        return evidence
    try:
        from src.utils.text_utils import select_diverse_evidence

        # Preferred path: embedding-based diversity selection.
        return await select_diverse_evidence(evidence, n=max_items, query=query)
    except ImportError:
        # Fallback without embeddings: keep the first third and the last
        # two-thirds so both early and recent items survive the cut
        # (lost-in-the-middle mitigation).
        head = evidence[: max_items // 3]
        tail = evidence[-(max_items * 2 // 3) :]
        return head + tail
def format_user_prompt(
    question: str,
    evidence: list[Evidence],
    iteration: int = 0,
    max_iterations: int = 10,
    total_evidence_count: int | None = None,
    domain: ResearchDomain | str | None = None,
) -> str:
    """
    Format user prompt with selected evidence and iteration context.

    NOTE: Evidence should be pre-selected using select_evidence_for_judge().
    This function assumes evidence is already capped.

    Args:
        question: The user's research question.
        evidence: Pre-selected (capped) evidence items to show the judge.
        iteration: Current search iteration.
        max_iterations: Maximum allowed search iterations.
        total_evidence_count: Total evidence collected so far; falls back to
            len(evidence) only when None.
        domain: Research domain used to select scoring instructions.

    Returns:
        Prompt string with the question repeated at start and end
        (lost-in-the-middle mitigation).
    """
    # Explicit None check: the previous `total_evidence_count or len(evidence)`
    # silently discarded a legitimate count of 0 because 0 is falsy.
    total_count = len(evidence) if total_evidence_count is None else total_evidence_count
    max_content_len = 1500  # Per-item truncation cap to keep the prompt token-safe.
    scoring_prompt = get_scoring_prompt(domain)

    def format_single_evidence(i: int, e: Evidence) -> str:
        # Truncate long content; the "..." marker signals truncation to the model.
        content = e.content
        if len(content) > max_content_len:
            content = content[:max_content_len] + "..."
        return (
            f"### Evidence {i + 1}\n"
            f"**Source**: {e.citation.source.upper()} - {e.citation.title}\n"
            f"**URL**: {e.citation.url}\n"
            f"**Content**:\n{content}"
        )

    evidence_text = "\n\n".join([format_single_evidence(i, e) for i, e in enumerate(evidence)])
    # Lost-in-the-middle mitigation: put critical context at START and END
    return f"""## Research Question (IMPORTANT - stay focused on this)
{question}
## Search Progress
- **Iteration**: {iteration}/{max_iterations}
- **Total evidence collected**: {total_count} sources
- **Evidence shown below**: {len(evidence)} diverse sources (selected for relevance)
## Available Evidence
{evidence_text}
## Your Task
{scoring_prompt}
DO NOT decide "synthesize" vs "continue" - that decision is made by the system.
## REMINDER: Original Question (stay focused)
{question}
"""
def format_empty_evidence_prompt(question: str) -> str:
    """
    Build the judge prompt for the zero-evidence case.

    Args:
        question: The user's research question

    Returns:
        Formatted prompt string
    """
    # Assemble line-by-line; the trailing "" preserves the final newline.
    prompt_lines = [
        "## Research Question",
        question,
        "## Available Evidence",
        "No evidence was found from the search.",
        "## Your Task",
        "Since no evidence was found, recommend search queries that might yield better results.",
        'Set sufficient=False and recommendation="continue".',
        "Suggest 3-5 specific search queries.",
        "",
    ]
    return "\n".join(prompt_lines)