"""Judge prompts for evidence assessment."""

from src.config.domain import ResearchDomain, get_domain_config
from src.utils.models import Evidence


def get_system_prompt(domain: ResearchDomain | str | None = None) -> str:
    """Get the system prompt for the judge agent (Magentic/Advanced Mode)."""
    config = get_domain_config(domain)

    return f"""You are an expert research judge specializing in {config.name}.
Your role is to evaluate evidence for interventions, assess efficacy and safety data,
and determine clinical applicability.

When asked to evaluate:

1. Review all evidence presented in the conversation
2. Score on two dimensions (0-10 each):
   - Mechanism Score: How well is the biological mechanism explained?
   - Clinical Score: How strong is the clinical/preclinical evidence?
3. Determine if evidence is SUFFICIENT for a final report:
   - Sufficient: Clear mechanism + supporting clinical data
   - Insufficient: Gaps in mechanism OR weak clinical evidence
4. If insufficient, suggest specific search queries to fill gaps

## CRITICAL OUTPUT FORMAT
To ensure the workflow terminates when appropriate, you MUST follow these rules:

IF evidence is SUFFICIENT (confidence >= 70%):
   Start your response with a line like:
   "βœ… SUFFICIENT EVIDENCE (confidence: 72%). STOP SEARCHING. Delegate to ReportAgent NOW."
   Use your actual numeric confidence instead of 72.
   Then explain why.

IF evidence is INSUFFICIENT:
   Start with "❌ INSUFFICIENT: <Reason>."
   Then provide scores and next queries.

Be rigorous but fair. Look for:
- Molecular targets and pathways
- Animal model studies
- Human clinical trials
- Safety data
- Drug-drug interactions"""


def get_scoring_prompt(domain: ResearchDomain | str | None = None) -> str:
    """Get the scoring instructions for the judge.

    The domain argument is currently unused; it is accepted for signature
    parity with get_system_prompt().
    """
    return """Score this evidence for relevance.
Provide ONLY scores and extracted data."""


# Keep SYSTEM_PROMPT for backwards compatibility
SYSTEM_PROMPT = get_system_prompt()

MAX_EVIDENCE_FOR_JUDGE = 30  # Keep under token limits


async def select_evidence_for_judge(
    evidence: list[Evidence],
    query: str,
    max_items: int = MAX_EVIDENCE_FOR_JUDGE,
) -> list[Evidence]:
    """
    Select diverse, relevant evidence for judge evaluation.

    Implements RAG best practices:
    - Diversity selection over recency-only
    - Lost-in-the-middle mitigation
    - Relevance re-ranking
    """
    if len(evidence) <= max_items:
        return evidence

    try:
        from src.utils.text_utils import select_diverse_evidence

        # Use embedding-based diversity selection
        return await select_diverse_evidence(evidence, n=max_items, query=query)
    except ImportError:
        # Fallback: mix of early + recent evidence (lost-in-the-middle
        # mitigation). E.g. with max_items=30: first 10 items + last 20.
        early = evidence[: max_items // 3]  # First third of the budget
        recent = evidence[-(max_items * 2 // 3) :]  # Last two-thirds
        return early + recent
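

# NOTE: Illustrative sketch only. The real diversity selection lives in
# src.utils.text_utils.select_diverse_evidence, and its implementation may
# differ. This shows one common approach, Maximal Marginal Relevance (MMR)
# over pre-computed embeddings: greedily pick items that are relevant to the
# query while penalizing similarity to items already selected.
def _mmr_select_sketch(
    embeddings: list[list[float]],
    query_embedding: list[float],
    n: int,
    lambda_: float = 0.7,
) -> list[int]:
    """Return indices of n items chosen greedily by MMR (illustrative only)."""
    import math

    def cosine(a: list[float], b: list[float]) -> float:
        dot = sum(x * y for x, y in zip(a, b))
        norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
        return dot / norm if norm else 0.0

    # Relevance of each candidate to the query, computed once up front.
    relevance = [cosine(e, query_embedding) for e in embeddings]
    selected: list[int] = []
    candidates = set(range(len(embeddings)))
    while candidates and len(selected) < n:

        def mmr_score(i: int) -> float:
            # High relevance is rewarded; high similarity to an
            # already-selected item (redundancy) is penalized.
            redundancy = max(
                (cosine(embeddings[i], embeddings[j]) for j in selected),
                default=0.0,
            )
            return lambda_ * relevance[i] - (1 - lambda_) * redundancy

        best = max(candidates, key=mmr_score)
        selected.append(best)
        candidates.remove(best)
    return selected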


def format_user_prompt(
    question: str,
    evidence: list[Evidence],
    iteration: int = 0,
    max_iterations: int = 10,
    total_evidence_count: int | None = None,
    domain: ResearchDomain | str | None = None,
) -> str:
    """
    Format user prompt with selected evidence and iteration context.
    """
    # Use explicit None check - 0 is a valid count (empty evidence)
    total_count = total_evidence_count if total_evidence_count is not None else len(evidence)
    max_content_len = 1500
    scoring_prompt = get_scoring_prompt(domain)

    def format_single_evidence(i: int, e: Evidence) -> str:
        content = e.content
        if len(content) > max_content_len:
            content = content[:max_content_len] + "..."
        return (
            f"### Evidence {i + 1}\n"
            f"**Source**: {e.citation.source.upper()} - {e.citation.title}\n"
            f"**URL**: {e.citation.url}\n"
            f"**Content**:\n{content}"
        )

    evidence_text = "\n\n".join([format_single_evidence(i, e) for i, e in enumerate(evidence)])

    return f"""## Research Question (IMPORTANT - stay focused on this)
{question}

## Search Progress
- **Iteration**: {iteration}/{max_iterations}
- **Total evidence collected**: {total_count} sources
- **Evidence shown below**: {len(evidence)} diverse sources (selected for relevance)

## Available Evidence

{evidence_text}

## Your Task

{scoring_prompt}
"""


def format_empty_evidence_prompt(question: str) -> str:
    """
    Format prompt when no evidence was found.
    """
    return f"""## Research Question
{question}

## Available Evidence

No evidence was found from the search.

## Your Task

Since no evidence was found, recommend search queries that might yield better results.
Set sufficient=False and recommendation="continue".
Suggest 3-5 specific search queries.
"""