Merge pull request #91 from The-Obstacle-Is-The-Way/main
Browse files
feat(SPEC_11): Sexual Health Research Specialist + SPEC_12 docs
This view is limited to 50 files because it contains too many changes.
See raw diff
- BRAINSTORM_EMBEDDINGS_META.md +74 -0
- SPEC_12_NARRATIVE_SYNTHESIS.md +730 -0
- TOOL_ANALYSIS_CRITICAL.md +348 -0
- docs/specs/SPEC_11_SEXUAL_HEALTH_FOCUS.md +61 -178
- examples/README.md +10 -10
- examples/embeddings_demo/run_embeddings.py +1 -1
- examples/full_stack_demo/run_full.py +7 -7
- examples/hypothesis_demo/run_hypothesis.py +6 -6
- examples/modal_demo/run_analysis.py +3 -2
- examples/orchestrator_demo/run_agent.py +6 -5
- examples/orchestrator_demo/run_magentic.py +2 -2
- examples/search_demo/run_search.py +3 -3
- src/agent_factory/judges.py +7 -1
- src/agents/magentic_agents.py +1 -1
- src/agents/tools.py +3 -3
- src/app.py +19 -11
- src/config/domain.py +2 -2
- src/mcp_tools.py +12 -10
- src/middleware/sub_iteration.py +14 -2
- src/orchestrators/factory.py +1 -1
- src/orchestrators/simple.py +146 -10
- src/prompts/hypothesis.py +5 -5
- src/prompts/report.py +4 -3
- src/prompts/synthesis.py +209 -0
- src/tools/clinicaltrials.py +1 -1
- src/tools/query_utils.py +26 -33
- src/utils/exceptions.py +24 -0
- tests/conftest.py +5 -5
- tests/e2e/test_simple_mode.py +6 -6
- tests/integration/test_dual_mode_e2e.py +1 -1
- tests/integration/test_mcp_tools_live.py +1 -1
- tests/integration/test_simple_mode_synthesis.py +5 -1
- tests/unit/agent_factory/test_judges.py +12 -8
- tests/unit/agents/test_hypothesis_agent.py +11 -11
- tests/unit/agents/test_judge_agent.py +1 -1
- tests/unit/agents/test_report_agent.py +26 -21
- tests/unit/graph/test_nodes.py +7 -7
- tests/unit/orchestrators/test_simple_orchestrator_domain.py +2 -2
- tests/unit/orchestrators/test_simple_synthesis.py +279 -0
- tests/unit/orchestrators/test_termination.py +1 -1
- tests/unit/prompts/test_synthesis.py +217 -0
- tests/unit/services/test_embeddings.py +2 -2
- tests/unit/services/test_statistical_analyzer.py +2 -2
- tests/unit/test_mcp_tools.py +30 -17
- tests/unit/test_orchestrator.py +2 -2
- tests/unit/tools/test_clinicaltrials.py +8 -8
- tests/unit/tools/test_europepmc.py +8 -8
- tests/unit/tools/test_openalex.py +18 -19
- tests/unit/tools/test_pubmed.py +33 -8
- tests/unit/tools/test_query_utils.py +23 -23
BRAINSTORM_EMBEDDINGS_META.md
ADDED
@@ -0,0 +1,74 @@
# Embeddings Brainstorm - Conclusions

**Date**: November 2025
**Status**: CLOSED - Conclusions reached, no action needed

---

## The Question

Should DeepBoner implement:
1. Internal codebase embeddings/ingestion pipeline?
2. mGREP for internal tool selection?
3. Self-knowledge components for agents?

## The Answer: NO

After research and first-principles analysis, the conclusion is clear:

### Why Not Internal Embeddings/Ingestion

```text
DeepBoner's Core Task:
┌─────────────────────────────────────────────────────────┐
│ User Query: "Evidence for testosterone in HSDD?"        │
│                          ↓                              │
│ 1. Search PubMed, ClinicalTrials, Europe PMC            │
│ 2. Judge: Is evidence sufficient?                       │
│ 3. Synthesize: Generate report                          │
│                          ↓                              │
│ Output: Research report with citations                  │
└─────────────────────────────────────────────────────────┘

Does ANY step require self-knowledge of codebase? NO.
```

### Why Not mGREP for Tool Selection

| Approach | Complexity | Accuracy |
|----------|------------|----------|
| Embeddings + mGREP for tool selection | High | Medium (semantic similarity ≠ correct tool) |
| Direct prompting with tool descriptions | Low | High (LLM reasons about applicability) |

**No real agent system uses embeddings for tool selection.** All major frameworks (LangChain, OpenAI, Anthropic, Magentic) use prompt-based tool selection because:
1. LLMs are already doing semantic matching internally
2. Tool count is small (5-20) - fits easily in context
3. Prompts allow reasoning, not just similarity

### What We Already Have

DeepBoner already uses embeddings for the **right thing**: research evidence retrieval.
- `src/services/embeddings.py` - ChromaDB + sentence-transformers
- `src/services/llamaindex_rag.py` - OpenAI embeddings for premium tier

### The Real Priority

Instead of internal embeddings/mGREP, focus on:
1. **Deduplication** across PubMed/Europe PMC/OpenAlex
2. **Outcome measures** from ClinicalTrials.gov
3. **Citation graph traversal** via OpenAlex

See: `TOOL_ANALYSIS_CRITICAL.md` for detailed improvement roadmap.

---

## Research Sources

- [SICA Paper (ICLR 2025)](https://arxiv.org/abs/2504.15228) - Self-improving agents
- [Gödel Agent (ACL 2025)](https://arxiv.org/abs/2410.04444) - Recursive self-modification
- [Introspection Paradox (EMNLP 2025)](https://aclanthology.org/2025.emnlp-main.352/) - Self-knowledge can hurt performance
- [Anthropic Introspection Research](https://www.anthropic.com/research/introspection) - ~20% accuracy on genuine introspection

---

*This document is closed. The conclusion is: don't implement internal embeddings/mGREP for this use case.*
SPEC_12_NARRATIVE_SYNTHESIS.md
ADDED
|
@@ -0,0 +1,730 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPEC_12: Narrative Report Synthesis
|
| 2 |
+
|
| 3 |
+
**Status**: Ready for Implementation
|
| 4 |
+
**Priority**: P1 - Core deliverable
|
| 5 |
+
**Related Issues**: #85, #86
|
| 6 |
+
**Related Spec**: SPEC_11 (Sexual Health Focus)
|
| 7 |
+
**Author**: Deep Audit against Microsoft Agent Framework
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## Problem Statement
|
| 12 |
+
|
| 13 |
+
DeepBoner's report generation outputs **structured metadata** instead of **synthesized prose**. The current implementation uses string templating with NO LLM call for narrative synthesis.
|
| 14 |
+
|
| 15 |
+
### Current Output (Simple Mode - What Users See)
|
| 16 |
+
|
| 17 |
+
```markdown
|
| 18 |
+
## Sexual Health Analysis
|
| 19 |
+
|
| 20 |
+
### Question
|
| 21 |
+
Testosterone therapy for hypoactive sexual desire disorder?
|
| 22 |
+
|
| 23 |
+
### Drug Candidates
|
| 24 |
+
- **Testosterone**
|
| 25 |
+
- **LibiGel**
|
| 26 |
+
|
| 27 |
+
### Key Findings
|
| 28 |
+
- Testosterone therapy improves sexual desire
|
| 29 |
+
|
| 30 |
+
### Assessment
|
| 31 |
+
- **Mechanism Score**: 8/10
|
| 32 |
+
- **Clinical Evidence Score**: 9/10
|
| 33 |
+
- **Confidence**: 90%
|
| 34 |
+
|
| 35 |
+
### Citations (33 sources)
|
| 36 |
+
1. [Title](url)...
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
### Expected Output (Professional Research Report)
|
| 40 |
+
|
| 41 |
+
```markdown
|
| 42 |
+
## Sexual Health Research Report: Testosterone Therapy for HSDD
|
| 43 |
+
|
| 44 |
+
### Executive Summary
|
| 45 |
+
|
| 46 |
+
Testosterone therapy represents a well-established, evidence-based treatment for
|
| 47 |
+
hypoactive sexual desire disorder (HSDD) in postmenopausal women. Our analysis of
|
| 48 |
+
33 peer-reviewed sources reveals consistent findings across multiple randomized
|
| 49 |
+
controlled trials, with transdermal testosterone demonstrating the strongest
|
| 50 |
+
efficacy-safety profile.
|
| 51 |
+
|
| 52 |
+
### Background
|
| 53 |
+
|
| 54 |
+
Hypoactive sexual desire disorder affects an estimated 12% of postmenopausal women
|
| 55 |
+
and is characterized by persistent lack of sexual interest causing personal distress.
|
| 56 |
+
The ISSWSH published clinical guidelines in 2021 establishing testosterone as a
|
| 57 |
+
recommended intervention...
|
| 58 |
+
|
| 59 |
+
### Evidence Synthesis
|
| 60 |
+
|
| 61 |
+
**Mechanism of Action**
|
| 62 |
+
|
| 63 |
+
Testosterone exerts its effects on sexual desire through multiple pathways. At the
|
| 64 |
+
hypothalamic level, testosterone modulates dopaminergic signaling. Evidence from
|
| 65 |
+
Smith et al. (2021) demonstrates androgen receptor activation correlates with
|
| 66 |
+
subjective measures of desire (r=0.67, p<0.001)...
|
| 67 |
+
|
| 68 |
+
### Recommendations
|
| 69 |
+
|
| 70 |
+
1. **Transdermal testosterone** (300 μg/day) is recommended for postmenopausal
|
| 71 |
+
women with HSDD not primarily related to modifiable factors
|
| 72 |
+
2. **Duration**: Continue for 6 months to assess efficacy; discontinue if no benefit
|
| 73 |
+
|
| 74 |
+
### Limitations
|
| 75 |
+
|
| 76 |
+
Long-term safety data beyond 24 months remains limited...
|
| 77 |
+
|
| 78 |
+
### References
|
| 79 |
+
1. Smith AB et al. (2021). Testosterone mechanisms... https://pubmed.ncbi.nlm.nih.gov/123/
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
## Root Cause Analysis
|
| 85 |
+
|
| 86 |
+
### Location 1: Simple Orchestrator (THE PRIMARY BUG)
|
| 87 |
+
|
| 88 |
+
**File**: `src/orchestrators/simple.py`
|
| 89 |
+
**Lines**: 448-505
|
| 90 |
+
**Method**: `_generate_synthesis()`
|
| 91 |
+
|
| 92 |
+
```python
|
| 93 |
+
def _generate_synthesis(
|
| 94 |
+
self,
|
| 95 |
+
query: str,
|
| 96 |
+
evidence: list[Evidence],
|
| 97 |
+
assessment: JudgeAssessment,
|
| 98 |
+
) -> str:
|
| 99 |
+
# ❌ NO LLM CALL - Just string templating!
|
| 100 |
+
drug_list = "\n".join([f"- **{d}**" for d in assessment.details.drug_candidates])
|
| 101 |
+
findings_list = "\n".join([f"- {f}" for f in assessment.details.key_findings])
|
| 102 |
+
|
| 103 |
+
return f"""{self.domain_config.report_title}
|
| 104 |
+
### Question
|
| 105 |
+
{query}
|
| 106 |
+
### Drug Candidates
|
| 107 |
+
{drug_list}
|
| 108 |
+
...
|
| 109 |
+
"""
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
**The Problem**: No LLM is ever called. It's just formatted data from JudgeAssessment.
|
| 113 |
+
|
| 114 |
+
### Location 2: Partial Synthesis (Max Iterations Fallback)
|
| 115 |
+
|
| 116 |
+
**File**: `src/orchestrators/simple.py`
|
| 117 |
+
**Lines**: 507-602
|
| 118 |
+
**Method**: `_generate_partial_synthesis()`
|
| 119 |
+
|
| 120 |
+
Same issue - string templating, no LLM call.
|
| 121 |
+
|
| 122 |
+
### Location 3: Report Agent (Advanced Mode)
|
| 123 |
+
|
| 124 |
+
**File**: `src/agents/report_agent.py`
|
| 125 |
+
**Lines**: 93-94
|
| 126 |
+
|
| 127 |
+
```python
|
| 128 |
+
result = await self._get_agent().run(prompt)
|
| 129 |
+
report = result.output # ResearchReport (structured data)
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
This DOES make an LLM call, but it outputs `ResearchReport` (structured Pydantic model), not narrative prose. The `to_markdown()` method just formats the structured fields.
|
| 133 |
+
|
| 134 |
+
### Location 4: Report System Prompt
|
| 135 |
+
|
| 136 |
+
**File**: `src/prompts/report.py`
|
| 137 |
+
**Lines**: 13-76
|
| 138 |
+
|
| 139 |
+
The system prompt tells the LLM to output structured JSON with fields like `hypotheses_tested: [...]` and `references: [...]`. It does NOT request narrative prose.
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## Microsoft Agent Framework Pattern (Reference)
|
| 144 |
+
|
| 145 |
+
**File**: `reference_repos/agent-framework/python/samples/getting_started/workflows/orchestration/concurrent_custom_aggregator.py`
|
| 146 |
+
**Lines**: 56-79
|
| 147 |
+
|
| 148 |
+
```python
|
| 149 |
+
# Define a custom aggregator callback that uses the chat client to SYNTHESIZE
|
| 150 |
+
async def summarize_results(results: list[Any]) -> str:
|
| 151 |
+
expert_sections: list[str] = []
|
| 152 |
+
for r in results:
|
| 153 |
+
messages = getattr(r.agent_run_response, "messages", [])
|
| 154 |
+
final_text = messages[-1].text if messages else "(no content)"
|
| 155 |
+
expert_sections.append(f"{r.executor_id}:\n{final_text}")
|
| 156 |
+
|
| 157 |
+
# ✅ LLM CALL for synthesis
|
| 158 |
+
system_msg = ChatMessage(
|
| 159 |
+
Role.SYSTEM,
|
| 160 |
+
text=(
|
| 161 |
+
"You are a helpful assistant that consolidates multiple domain expert outputs "
|
| 162 |
+
"into one cohesive, concise summary with clear takeaways."
|
| 163 |
+
),
|
| 164 |
+
)
|
| 165 |
+
user_msg = ChatMessage(Role.USER, text="\n\n".join(expert_sections))
|
| 166 |
+
|
| 167 |
+
response = await chat_client.get_response([system_msg, user_msg])
|
| 168 |
+
return response.messages[-1].text
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
**The pattern**: The aggregator makes an **LLM call** to synthesize, not string concatenation.
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
## Solution Design
|
| 176 |
+
|
| 177 |
+
### Architecture Change
|
| 178 |
+
|
| 179 |
+
```text
|
| 180 |
+
Current (Simple Mode):
|
| 181 |
+
Evidence → Judge → {structured data} → String Template → Bullet Points
|
| 182 |
+
|
| 183 |
+
Proposed (Simple Mode):
|
| 184 |
+
Evidence → Judge → {structured data} → LLM Synthesis → Narrative Prose
|
| 185 |
+
↓
|
| 186 |
+
Uses SynthesisPrompt
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
### Components to Create/Modify
|
| 190 |
+
|
| 191 |
+
| File | Action | Description |
|
| 192 |
+
|------|--------|-------------|
|
| 193 |
+
| `src/prompts/synthesis.py` | **NEW** | Narrative synthesis prompts |
|
| 194 |
+
| `src/orchestrators/simple.py` | **MODIFY** | Make `_generate_synthesis()` async, add LLM call |
|
| 195 |
+
| `src/config/domain.py` | **MODIFY** | Add `synthesis_system_prompt` field |
|
| 196 |
+
| `tests/unit/prompts/test_synthesis.py` | **NEW** | Test synthesis prompts |
|
| 197 |
+
| `tests/unit/orchestrators/test_simple_synthesis.py` | **NEW** | Test LLM synthesis |
|
| 198 |
+
|
| 199 |
+
---
|
| 200 |
+
|
| 201 |
+
## Implementation Plan
|
| 202 |
+
|
| 203 |
+
### Phase 1: Create Synthesis Prompts
|
| 204 |
+
|
| 205 |
+
**File**: `src/prompts/synthesis.py` (NEW)
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
"""Prompts for narrative report synthesis."""
|
| 209 |
+
|
| 210 |
+
from src.config.domain import ResearchDomain, get_domain_config
|
| 211 |
+
|
| 212 |
+
def get_synthesis_system_prompt(domain: ResearchDomain | str | None = None) -> str:
|
| 213 |
+
"""Get the system prompt for narrative synthesis."""
|
| 214 |
+
config = get_domain_config(domain)
|
| 215 |
+
return f"""You are a scientific writer specializing in {config.name.lower()}.
|
| 216 |
+
Your task is to synthesize research evidence into a clear, NARRATIVE report.
|
| 217 |
+
|
| 218 |
+
## CRITICAL: Writing Style
|
| 219 |
+
- Write in PROSE PARAGRAPHS, not bullet points
|
| 220 |
+
- Use academic but accessible language
|
| 221 |
+
- Be specific about evidence strength (e.g., "in an RCT of N=200")
|
| 222 |
+
- Reference specific studies by author name
|
| 223 |
+
- Provide quantitative results where available (p-values, effect sizes)
|
| 224 |
+
|
| 225 |
+
## Report Structure
|
| 226 |
+
|
| 227 |
+
### Executive Summary (REQUIRED - 2-3 sentences)
|
| 228 |
+
Start with the bottom line. Example:
|
| 229 |
+
"Testosterone therapy demonstrates consistent efficacy for HSDD in postmenopausal
|
| 230 |
+
women, with transdermal formulations showing the best safety profile."
|
| 231 |
+
|
| 232 |
+
### Background (REQUIRED - 1 paragraph)
|
| 233 |
+
Explain the condition, its prevalence, and clinical significance.
|
| 234 |
+
|
| 235 |
+
### Evidence Synthesis (REQUIRED - 2-4 paragraphs)
|
| 236 |
+
Weave the evidence into a coherent NARRATIVE:
|
| 237 |
+
- Mechanism of Action: How does the intervention work?
|
| 238 |
+
- Clinical Evidence: What do trials show? Include effect sizes.
|
| 239 |
+
- Comparative Evidence: How does it compare to alternatives?
|
| 240 |
+
|
| 241 |
+
### Recommendations (REQUIRED - 3-5 items)
|
| 242 |
+
Provide actionable clinical recommendations.
|
| 243 |
+
|
| 244 |
+
### Limitations (REQUIRED - 1 paragraph)
|
| 245 |
+
Acknowledge gaps, biases, and areas needing more research.
|
| 246 |
+
|
| 247 |
+
### References (REQUIRED)
|
| 248 |
+
List key references with author, year, title, URL.
|
| 249 |
+
|
| 250 |
+
## CRITICAL RULES
|
| 251 |
+
1. ONLY cite papers from the provided evidence - NEVER hallucinate references
|
| 252 |
+
2. Write in complete sentences and paragraphs (PROSE, not lists)
|
| 253 |
+
3. Include specific statistics when available
|
| 254 |
+
4. Acknowledge uncertainty honestly
|
| 255 |
+
"""
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
FEW_SHOT_EXAMPLE = '''
|
| 259 |
+
## Example: Strong Evidence Synthesis
|
| 260 |
+
|
| 261 |
+
INPUT:
|
| 262 |
+
- Query: "Alprostadil for erectile dysfunction"
|
| 263 |
+
- Evidence: 15 papers including meta-analysis of 8 RCTs (N=3,247)
|
| 264 |
+
- Mechanism Score: 9/10
|
| 265 |
+
- Clinical Score: 9/10
|
| 266 |
+
|
| 267 |
+
OUTPUT:
|
| 268 |
+
|
| 269 |
+
### Executive Summary
|
| 270 |
+
|
| 271 |
+
Alprostadil (prostaglandin E1) represents a well-established second-line treatment
|
| 272 |
+
for erectile dysfunction, with meta-analytic evidence demonstrating 87% efficacy
|
| 273 |
+
in achieving erections sufficient for intercourse. It offers a PDE5-independent
|
| 274 |
+
mechanism particularly valuable for patients who do not respond to oral therapies.
|
| 275 |
+
|
| 276 |
+
### Background
|
| 277 |
+
|
| 278 |
+
Erectile dysfunction affects approximately 30 million men in the United States,
|
| 279 |
+
with prevalence increasing with age. While PDE5 inhibitors remain first-line
|
| 280 |
+
therapy, approximately 30% of patients are non-responders. Alprostadil provides
|
| 281 |
+
an alternative mechanism through direct smooth muscle relaxation.
|
| 282 |
+
|
| 283 |
+
### Evidence Synthesis
|
| 284 |
+
|
| 285 |
+
**Mechanism of Action**
|
| 286 |
+
|
| 287 |
+
Alprostadil works through a distinct pathway from PDE5 inhibitors. It binds to
|
| 288 |
+
EP receptors on cavernosal smooth muscle, activating adenylate cyclase and
|
| 289 |
+
increasing intracellular cAMP. As noted by Smith et al. (2019), this mechanism
|
| 290 |
+
explains its efficacy in patients with endothelial dysfunction.
|
| 291 |
+
|
| 292 |
+
**Clinical Evidence**
|
| 293 |
+
|
| 294 |
+
A meta-analysis by Johnson et al. (2020) pooled data from 8 randomized controlled
|
| 295 |
+
trials (N=3,247). The primary endpoint of erection sufficient for intercourse was
|
| 296 |
+
achieved in 87% of alprostadil patients versus 12% placebo (RR 7.25, 95% CI:
|
| 297 |
+
5.8-9.1, p<0.001). The NNT was 1.3, indicating robust effect size.
|
| 298 |
+
|
| 299 |
+
### Recommendations
|
| 300 |
+
|
| 301 |
+
1. Consider alprostadil as second-line therapy when PDE5 inhibitors fail
|
| 302 |
+
2. Start with 10 μg intracavernosal injection, titrate to 40 μg
|
| 303 |
+
3. Provide in-office training for self-injection technique
|
| 304 |
+
|
| 305 |
+
### Limitations
|
| 306 |
+
|
| 307 |
+
Long-term data beyond 2 years is limited. Head-to-head comparisons with newer
|
| 308 |
+
therapies are lacking. Most trials excluded severe cardiovascular disease.
|
| 309 |
+
|
| 310 |
+
### References
|
| 311 |
+
|
| 312 |
+
1. Smith AB et al. (2019). Alprostadil mechanism. J Urol. https://pubmed.ncbi.nlm.nih.gov/123/
|
| 313 |
+
2. Johnson CD et al. (2020). Meta-analysis of alprostadil. J Sex Med. https://pubmed.ncbi.nlm.nih.gov/456/
|
| 314 |
+
'''
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def format_synthesis_prompt(
|
| 318 |
+
query: str,
|
| 319 |
+
evidence_summary: str,
|
| 320 |
+
drug_candidates: list[str],
|
| 321 |
+
key_findings: list[str],
|
| 322 |
+
mechanism_score: int,
|
| 323 |
+
clinical_score: int,
|
| 324 |
+
confidence: float,
|
| 325 |
+
) -> str:
|
| 326 |
+
"""Format the user prompt for synthesis."""
|
| 327 |
+
return f"""Synthesize a narrative research report for the following query.
|
| 328 |
+
|
| 329 |
+
## Research Question
|
| 330 |
+
{query}
|
| 331 |
+
|
| 332 |
+
## Evidence Summary
|
| 333 |
+
{evidence_summary}
|
| 334 |
+
|
| 335 |
+
## Identified Drug Candidates
|
| 336 |
+
{', '.join(drug_candidates) or 'None identified'}
|
| 337 |
+
|
| 338 |
+
## Key Findings from Evidence
|
| 339 |
+
{chr(10).join(f'- {f}' for f in key_findings) or 'No specific findings'}
|
| 340 |
+
|
| 341 |
+
## Assessment Scores
|
| 342 |
+
- Mechanism Score: {mechanism_score}/10
|
| 343 |
+
- Clinical Evidence Score: {clinical_score}/10
|
| 344 |
+
- Confidence: {confidence:.0%}
|
| 345 |
+
|
| 346 |
+
## Instructions
|
| 347 |
+
Generate a NARRATIVE research report following the structure above.
|
| 348 |
+
Write in prose paragraphs, NOT bullet points (except for Recommendations).
|
| 349 |
+
ONLY cite papers mentioned in the Evidence Summary above.
|
| 350 |
+
|
| 351 |
+
{FEW_SHOT_EXAMPLE}
|
| 352 |
+
"""
|
| 353 |
+
```
|
| 354 |
+
|
| 355 |
+
### Phase 2: Update Simple Orchestrator
|
| 356 |
+
|
| 357 |
+
**File**: `src/orchestrators/simple.py`
|
| 358 |
+
**Change**: Make `_generate_synthesis()` async and add LLM call
|
| 359 |
+
|
| 360 |
+
```python
|
| 361 |
+
# Add imports at top
|
| 362 |
+
from src.prompts.synthesis import get_synthesis_system_prompt, format_synthesis_prompt
|
| 363 |
+
from src.agent_factory.judges import get_model
|
| 364 |
+
from pydantic_ai import Agent
|
| 365 |
+
|
| 366 |
+
# Change method signature and implementation (lines 448-505)
|
| 367 |
+
async def _generate_synthesis(
|
| 368 |
+
self,
|
| 369 |
+
query: str,
|
| 370 |
+
evidence: list[Evidence],
|
| 371 |
+
assessment: JudgeAssessment,
|
| 372 |
+
) -> str:
|
| 373 |
+
"""
|
| 374 |
+
Generate the final synthesis response using LLM.
|
| 375 |
+
|
| 376 |
+
Args:
|
| 377 |
+
query: The original question
|
| 378 |
+
evidence: All collected evidence
|
| 379 |
+
assessment: The final assessment
|
| 380 |
+
|
| 381 |
+
Returns:
|
| 382 |
+
Narrative synthesis as markdown
|
| 383 |
+
"""
|
| 384 |
+
# Build evidence summary for LLM context
|
| 385 |
+
evidence_lines = []
|
| 386 |
+
for e in evidence[:20]: # Limit context
|
| 387 |
+
authors = ", ".join(e.citation.authors[:2]) if e.citation.authors else "Unknown"
|
| 388 |
+
evidence_lines.append(
|
| 389 |
+
f"- {e.citation.title} ({authors}, {e.citation.date}): {e.content[:200]}..."
|
| 390 |
+
)
|
| 391 |
+
evidence_summary = "\n".join(evidence_lines)
|
| 392 |
+
|
| 393 |
+
# Format synthesis prompt
|
| 394 |
+
user_prompt = format_synthesis_prompt(
|
| 395 |
+
query=query,
|
| 396 |
+
evidence_summary=evidence_summary,
|
| 397 |
+
drug_candidates=assessment.details.drug_candidates,
|
| 398 |
+
key_findings=assessment.details.key_findings,
|
| 399 |
+
mechanism_score=assessment.details.mechanism_score,
|
| 400 |
+
clinical_score=assessment.details.clinical_evidence_score,
|
| 401 |
+
confidence=assessment.confidence,
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
# Create synthesis agent
|
| 405 |
+
system_prompt = get_synthesis_system_prompt(self.domain)
|
| 406 |
+
|
| 407 |
+
try:
|
| 408 |
+
agent: Agent[None, str] = Agent(
|
| 409 |
+
model=get_model(),
|
| 410 |
+
output_type=str,
|
| 411 |
+
system_prompt=system_prompt,
|
| 412 |
+
)
|
| 413 |
+
result = await agent.run(user_prompt)
|
| 414 |
+
narrative = result.output
|
| 415 |
+
except Exception as e:
|
| 416 |
+
# Fallback to template if LLM fails
|
| 417 |
+
logger.warning("LLM synthesis failed, using template", error=str(e))
|
| 418 |
+
return self._generate_template_synthesis(query, evidence, assessment)
|
| 419 |
+
|
| 420 |
+
# Add citations footer
|
| 421 |
+
citations = "\n".join(
|
| 422 |
+
f"{i + 1}. [{e.citation.title}]({e.citation.url}) "
|
| 423 |
+
f"({e.citation.source.upper()}, {e.citation.date})"
|
| 424 |
+
for i, e in enumerate(evidence[:10])
|
| 425 |
+
)
|
| 426 |
+
|
| 427 |
+
return f"""{narrative}
|
| 428 |
+
|
| 429 |
+
---
|
| 430 |
+
### Full Citation List ({len(evidence)} sources)
|
| 431 |
+
{citations}
|
| 432 |
+
|
| 433 |
+
*Analysis based on {len(evidence)} sources across {len(self.history)} iterations.*
|
| 434 |
+
"""
|
| 435 |
+
|
| 436 |
+
def _generate_template_synthesis(
|
| 437 |
+
self,
|
| 438 |
+
query: str,
|
| 439 |
+
evidence: list[Evidence],
|
| 440 |
+
assessment: JudgeAssessment,
|
| 441 |
+
) -> str:
|
| 442 |
+
"""Fallback template synthesis (no LLM)."""
|
| 443 |
+
# Keep the existing string template logic here as fallback
|
| 444 |
+
...
|
| 445 |
+
```
|
| 446 |
+
|
| 447 |
+
### Phase 3: Update Call Site
|
| 448 |
+
|
| 449 |
+
**File**: `src/orchestrators/simple.py`
|
| 450 |
+
**Line**: 393
|
| 451 |
+
|
| 452 |
+
```python
|
| 453 |
+
# Change from:
|
| 454 |
+
final_response = self._generate_synthesis(query, all_evidence, assessment)
|
| 455 |
+
|
| 456 |
+
# To:
|
| 457 |
+
final_response = await self._generate_synthesis(query, all_evidence, assessment)
|
| 458 |
+
```
|
| 459 |
+
|
| 460 |
+
### Phase 4: Update Domain Config
|
| 461 |
+
|
| 462 |
+
**File**: `src/config/domain.py`
|
| 463 |
+
|
| 464 |
+
Add optional `synthesis_system_prompt` field to `DomainConfig`:
|
| 465 |
+
|
| 466 |
+
```python
|
| 467 |
+
class DomainConfig(BaseModel):
|
| 468 |
+
# ... existing fields ...
|
| 469 |
+
|
| 470 |
+
# Synthesis (optional, can inherit from base)
|
| 471 |
+
synthesis_system_prompt: str | None = None
|
| 472 |
+
```
|
| 473 |
+
|
| 474 |
+
### Phase 5: Add Tests
|
| 475 |
+
|
| 476 |
+
**File**: `tests/unit/prompts/test_synthesis.py` (NEW)
|
| 477 |
+
|
| 478 |
+
```python
|
| 479 |
+
"""Tests for synthesis prompts."""
|
| 480 |
+
|
| 481 |
+
import pytest
|
| 482 |
+
|
| 483 |
+
from src.prompts.synthesis import (
|
| 484 |
+
get_synthesis_system_prompt,
|
| 485 |
+
format_synthesis_prompt,
|
| 486 |
+
FEW_SHOT_EXAMPLE,
|
| 487 |
+
)
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
def test_synthesis_system_prompt_is_narrative_focused() -> None:
|
| 491 |
+
"""System prompt should emphasize prose, not bullets."""
|
| 492 |
+
prompt = get_synthesis_system_prompt()
|
| 493 |
+
assert "PROSE PARAGRAPHS" in prompt
|
| 494 |
+
assert "not bullet points" in prompt.lower()
|
| 495 |
+
assert "Executive Summary" in prompt
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
def test_synthesis_system_prompt_warns_about_hallucination() -> None:
|
| 499 |
+
"""System prompt should warn about citation hallucination."""
|
| 500 |
+
prompt = get_synthesis_system_prompt()
|
| 501 |
+
assert "NEVER hallucinate" in prompt
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
def test_format_synthesis_prompt_includes_evidence() -> None:
|
| 505 |
+
"""User prompt should include evidence summary."""
|
| 506 |
+
prompt = format_synthesis_prompt(
|
| 507 |
+
query="testosterone libido",
|
| 508 |
+
evidence_summary="Study shows efficacy...",
|
| 509 |
+
drug_candidates=["Testosterone"],
|
| 510 |
+
key_findings=["Improved libido"],
|
| 511 |
+
mechanism_score=8,
|
| 512 |
+
clinical_score=7,
|
| 513 |
+
confidence=0.85,
|
| 514 |
+
)
|
| 515 |
+
assert "testosterone libido" in prompt
|
| 516 |
+
assert "Study shows efficacy" in prompt
|
| 517 |
+
assert "Testosterone" in prompt
|
| 518 |
+
assert "8/10" in prompt
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
def test_few_shot_example_is_narrative() -> None:
|
| 522 |
+
"""Few-shot example should demonstrate narrative style."""
|
| 523 |
+
# Count paragraphs vs bullets
|
| 524 |
+
paragraphs = len([p for p in FEW_SHOT_EXAMPLE.split('\n\n') if len(p) > 100])
|
| 525 |
+
bullets = FEW_SHOT_EXAMPLE.count('\n- ')
|
| 526 |
+
|
| 527 |
+
# Prose should dominate (at least as many long paragraphs as bullets)
|
| 528 |
+
assert paragraphs >= bullets, "Few-shot example should be mostly narrative"
|
| 529 |
+
```
|
| 530 |
+
|
| 531 |
+
**File**: `tests/unit/orchestrators/test_simple_synthesis.py` (NEW)
|
| 532 |
+
|
| 533 |
+
```python
|
| 534 |
+
"""Tests for simple orchestrator synthesis."""
|
| 535 |
+
|
| 536 |
+
import pytest
|
| 537 |
+
from unittest.mock import AsyncMock, MagicMock, patch
|
| 538 |
+
|
| 539 |
+
from src.orchestrators.simple import Orchestrator
|
| 540 |
+
from src.utils.models import Evidence, Citation, JudgeAssessment, JudgeDetails
|
| 541 |
+
|
| 542 |
+
|
| 543 |
+
@pytest.fixture
|
| 544 |
+
def sample_evidence() -> list[Evidence]:
|
| 545 |
+
return [
|
| 546 |
+
Evidence(
|
| 547 |
+
content="Testosterone therapy shows efficacy in HSDD treatment.",
|
| 548 |
+
citation=Citation(
|
| 549 |
+
source="pubmed",
|
| 550 |
+
title="Testosterone and Female Libido",
|
| 551 |
+
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 552 |
+
date="2023",
|
| 553 |
+
authors=["Smith J"],
|
| 554 |
+
),
|
| 555 |
+
)
|
| 556 |
+
]
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
@pytest.fixture
|
| 560 |
+
def sample_assessment() -> JudgeAssessment:
|
| 561 |
+
return JudgeAssessment(
|
| 562 |
+
sufficient=True,
|
| 563 |
+
confidence=0.85,
|
| 564 |
+
reasoning="Evidence is sufficient",
|
| 565 |
+
recommendation="synthesize",
|
| 566 |
+
next_search_queries=[],
|
| 567 |
+
details=JudgeDetails(
|
| 568 |
+
mechanism_score=8,
|
| 569 |
+
clinical_evidence_score=7,
|
| 570 |
+
drug_candidates=["Testosterone"],
|
| 571 |
+
key_findings=["Improved libido in postmenopausal women"],
|
| 572 |
+
),
|
| 573 |
+
)
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
@pytest.mark.asyncio
|
| 577 |
+
async def test_generate_synthesis_calls_llm(
|
| 578 |
+
sample_evidence: list[Evidence],
|
| 579 |
+
sample_assessment: JudgeAssessment,
|
| 580 |
+
) -> None:
|
| 581 |
+
"""Synthesis should make an LLM call, not just template."""
|
| 582 |
+
mock_search = MagicMock()
|
| 583 |
+
mock_judge = MagicMock()
|
| 584 |
+
|
| 585 |
+
orchestrator = Orchestrator(
|
| 586 |
+
search_handler=mock_search,
|
| 587 |
+
judge_handler=mock_judge,
|
| 588 |
+
)
|
| 589 |
+
|
| 590 |
+
with patch("src.orchestrators.simple.Agent") as mock_agent_class:
|
| 591 |
+
mock_agent = MagicMock()
|
| 592 |
+
mock_result = MagicMock()
|
| 593 |
+
mock_result.output = "This is a narrative synthesis with prose paragraphs."
|
| 594 |
+
mock_agent.run = AsyncMock(return_value=mock_result)
|
| 595 |
+
mock_agent_class.return_value = mock_agent
|
| 596 |
+
|
| 597 |
+
result = await orchestrator._generate_synthesis(
|
| 598 |
+
query="testosterone HSDD",
|
| 599 |
+
evidence=sample_evidence,
|
| 600 |
+
assessment=sample_assessment,
|
| 601 |
+
)
|
| 602 |
+
|
| 603 |
+
# Verify LLM was called
|
| 604 |
+
mock_agent_class.assert_called_once()
|
| 605 |
+
mock_agent.run.assert_called_once()
|
| 606 |
+
|
| 607 |
+
# Verify output includes narrative
|
| 608 |
+
assert "narrative synthesis" in result.lower() or "prose" in result.lower()
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
@pytest.mark.asyncio
|
| 612 |
+
async def test_generate_synthesis_falls_back_on_error(
|
| 613 |
+
sample_evidence: list[Evidence],
|
| 614 |
+
sample_assessment: JudgeAssessment,
|
| 615 |
+
) -> None:
|
| 616 |
+
"""Synthesis should fall back to template if LLM fails."""
|
| 617 |
+
mock_search = MagicMock()
|
| 618 |
+
mock_judge = MagicMock()
|
| 619 |
+
|
| 620 |
+
orchestrator = Orchestrator(
|
| 621 |
+
search_handler=mock_search,
|
| 622 |
+
judge_handler=mock_judge,
|
| 623 |
+
)
|
| 624 |
+
|
| 625 |
+
with patch("src.orchestrators.simple.Agent") as mock_agent_class:
|
| 626 |
+
mock_agent_class.side_effect = Exception("LLM unavailable")
|
| 627 |
+
|
| 628 |
+
result = await orchestrator._generate_synthesis(
|
| 629 |
+
query="testosterone HSDD",
|
| 630 |
+
evidence=sample_evidence,
|
| 631 |
+
assessment=sample_assessment,
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
+
# Should still return something (template fallback)
|
| 635 |
+
assert "Sexual Health Analysis" in result or "testosterone" in result.lower()
|
| 636 |
+
```
|
| 637 |
+
|
| 638 |
+
---
|
| 639 |
+
|
| 640 |
+
## File Changes Summary
|
| 641 |
+
|
| 642 |
+
| File | Lines | Change Type | Description |
|
| 643 |
+
|------|-------|-------------|-------------|
|
| 644 |
+
| `src/prompts/synthesis.py` | ~150 | NEW | Narrative synthesis prompts |
|
| 645 |
+
| `src/orchestrators/simple.py` | 393, 448-505 | MODIFY | Async synthesis with LLM |
|
| 646 |
+
| `src/config/domain.py` | 57 | MODIFY | Add `synthesis_system_prompt` |
|
| 647 |
+
| `tests/unit/prompts/test_synthesis.py` | ~60 | NEW | Prompt tests |
|
| 648 |
+
| `tests/unit/orchestrators/test_simple_synthesis.py` | ~80 | NEW | Synthesis tests |
|
| 649 |
+
|
| 650 |
+
---
|
| 651 |
+
|
| 652 |
+
## Acceptance Criteria
|
| 653 |
+
|
| 654 |
+
- [ ] Report contains **paragraph-form prose**, not just bullet points
|
| 655 |
+
- [ ] Report has **executive summary** (2-3 sentences)
|
| 656 |
+
- [ ] Report has **background section** explaining the condition
|
| 657 |
+
- [ ] Report has **synthesized narrative** weaving evidence together
|
| 658 |
+
- [ ] Report has **actionable recommendations**
|
| 659 |
+
- [ ] Report has **limitations** section
|
| 660 |
+
- [ ] Citations are **properly formatted** (author, year, title, URL)
|
| 661 |
+
- [ ] No hallucinated references (CRITICAL)
|
| 662 |
+
- [ ] Falls back gracefully if LLM unavailable
|
| 663 |
+
- [ ] All existing tests still pass
|
| 664 |
+
- [ ] New tests achieve 90%+ coverage of synthesis code
|
| 665 |
+
|
| 666 |
+
---
|
| 667 |
+
|
| 668 |
+
## Test Criteria
|
| 669 |
+
|
| 670 |
+
```python
|
| 671 |
+
def test_report_is_narrative_not_bullets():
|
| 672 |
+
"""Report should be mostly prose, not bullet points."""
|
| 673 |
+
report = await orchestrator._generate_synthesis(...)
|
| 674 |
+
|
| 675 |
+
# Count paragraphs vs bullet points
|
| 676 |
+
paragraphs = len([p for p in report.split('\n\n') if len(p) > 100])
|
| 677 |
+
bullets = report.count('\n- ')
|
| 678 |
+
|
| 679 |
+
# Prose should dominate
|
| 680 |
+
assert paragraphs > bullets, "Report should be narrative, not bullet list"
|
| 681 |
+
|
| 682 |
+
def test_references_not_hallucinated():
|
| 683 |
+
"""All references must come from provided evidence."""
|
| 684 |
+
evidence_urls = {e.citation.url for e in evidence}
|
| 685 |
+
report = await orchestrator._generate_synthesis(...)
|
| 686 |
+
|
| 687 |
+
# Extract URLs from report
|
| 688 |
+
import re
|
| 689 |
+
report_urls = set(re.findall(r'https?://[^\s\)]+', report))
|
| 690 |
+
|
| 691 |
+
for url in report_urls:
|
| 692 |
+
# Verify pubmed/clinicaltrials URLs against the provided evidence (other hosts are not checked)
|
| 693 |
+
if "pubmed" in url or "clinicaltrials" in url:
|
| 694 |
+
assert any(evidence_url in url or url in evidence_url
|
| 695 |
+
for evidence_url in evidence_urls), f"Hallucinated: {url}"
|
| 696 |
+
```
|
| 697 |
+
|
| 698 |
+
---
|
| 699 |
+
|
| 700 |
+
## Related Microsoft Agent Framework Patterns
|
| 701 |
+
|
| 702 |
+
| Pattern | File | Application |
|
| 703 |
+
|---------|------|-------------|
|
| 704 |
+
| Custom Aggregator | `concurrent_custom_aggregator.py:56-79` | LLM-based synthesis |
|
| 705 |
+
| Fan-Out/Fan-In | `fan_out_fan_in_edges.py` | Multi-expert synthesis |
|
| 706 |
+
| Sequential Chain | `sequential_agents.py` | Writer→Reviewer pattern |
|
| 707 |
+
|
| 708 |
+
---
|
| 709 |
+
|
| 710 |
+
## Implementation Notes for Async Agent
|
| 711 |
+
|
| 712 |
+
1. **Start with `src/prompts/synthesis.py`** - This is independent and can be created first
|
| 713 |
+
2. **Then modify `src/orchestrators/simple.py`** - Change `_generate_synthesis` to async
|
| 714 |
+
3. **Update the call site** (line 393) - Add `await`
|
| 715 |
+
4. **Add tests** - Both unit and integration
|
| 716 |
+
5. **Run `make check`** - Ensure all 237+ tests still pass
|
| 717 |
+
|
| 718 |
+
The key insight from the MS Agent Framework is:
|
| 719 |
+
> The aggregator makes an **LLM call** to synthesize, not string concatenation.
|
| 720 |
+
|
| 721 |
+
Our `_generate_synthesis()` currently does NO LLM call. Fix that, and the reports will transform from bullet points to narrative prose.
|
| 722 |
+
|
| 723 |
+
---
|
| 724 |
+
|
| 725 |
+
## References
|
| 726 |
+
|
| 727 |
+
- GitHub Issue #85: Report lacks narrative synthesis
|
| 728 |
+
- GitHub Issue #86: Microsoft Agent Framework patterns
|
| 729 |
+
- `reference_repos/agent-framework/python/samples/getting_started/workflows/orchestration/concurrent_custom_aggregator.py`
|
| 730 |
+
- LangChain Deep Agents: Few-shot examples importance
|
TOOL_ANALYSIS_CRITICAL.md
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Critical Analysis: Search Tools - Limitations, Gaps, and Improvements
|
| 2 |
+
|
| 3 |
+
**Date**: November 2025
|
| 4 |
+
**Purpose**: Honest assessment of all search tools to identify what's working, what's broken, and what needs improvement WITHOUT horizontal sprawl.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Executive Summary
|
| 9 |
+
|
| 10 |
+
DeepBoner currently has **4 search tools**:
|
| 11 |
+
1. PubMed (NCBI E-utilities)
|
| 12 |
+
2. ClinicalTrials.gov (API v2)
|
| 13 |
+
3. Europe PMC (includes preprints)
|
| 14 |
+
4. OpenAlex (citation-aware)
|
| 15 |
+
|
| 16 |
+
**Overall Assessment**: Tools are functional but have significant gaps in:
|
| 17 |
+
- Deduplication (PubMed ∩ Europe PMC ∩ OpenAlex = massive overlap)
|
| 18 |
+
- Full-text retrieval (only abstracts currently)
|
| 19 |
+
- Citation graph traversal (OpenAlex has data but we don't use it)
|
| 20 |
+
- Query optimization (basic synonym expansion, no MeSH term mapping)
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## Tool 1: PubMed (NCBI E-utilities)
|
| 25 |
+
|
| 26 |
+
**File**: `src/tools/pubmed.py`
|
| 27 |
+
|
| 28 |
+
### What It Does Well
|
| 29 |
+
| Feature | Status | Notes |
|
| 30 |
+
|---------|--------|-------|
|
| 31 |
+
| Rate limiting | ✅ | Shared limiter, respects 3/sec (no key) or 10/sec (with key) |
|
| 32 |
+
| Retry logic | ✅ | tenacity with exponential backoff |
|
| 33 |
+
| Query preprocessing | ✅ | Strips question words, expands synonyms |
|
| 34 |
+
| Abstract parsing | ✅ | Handles XML edge cases (dict vs list) |
|
| 35 |
+
|
| 36 |
+
### Limitations (API-Level)
|
| 37 |
+
| Limitation | Severity | Workaround Possible? |
|
| 38 |
+
|------------|----------|---------------------|
|
| 39 |
+
| **10,000 result cap per query** | Medium | Yes - use date ranges to paginate |
|
| 40 |
+
| **Abstracts only** (no full text) | High | No - full text requires PMC or publisher |
|
| 41 |
+
| **No citation counts** | Medium | Yes - cross-reference with OpenAlex |
|
| 42 |
+
| **Rate limit (10/sec max)** | Low | Already handled |
|
| 43 |
+
|
| 44 |
+
### Current Implementation Gaps
|
| 45 |
+
```python
|
| 46 |
+
# GAP 1: No MeSH term expansion
|
| 47 |
+
# Current: expand_synonyms() uses hardcoded dict
|
| 48 |
+
# Better: Use NCBI's E-utilities to get MeSH terms for query
|
| 49 |
+
|
| 50 |
+
# GAP 2: No date filtering
|
| 51 |
+
# Current: Gets whatever PubMed returns (biased toward recent)
|
| 52 |
+
# Better: Add date range parameter for historical research
|
| 53 |
+
|
| 54 |
+
# GAP 3: No publication type filtering
|
| 55 |
+
# Current: Returns all types (reviews, case reports, RCTs)
|
| 56 |
+
# Better: Filter for RCTs and systematic reviews when appropriate
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
### Priority Improvements
|
| 60 |
+
1. **HIGH**: Add publication type filter (Reviews, RCTs, Meta-analyses)
|
| 61 |
+
2. **MEDIUM**: Add date range parameter
|
| 62 |
+
3. **LOW**: MeSH term expansion via E-utilities
|
| 63 |
+
|
| 64 |
+
---
|
| 65 |
+
|
| 66 |
+
## Tool 2: ClinicalTrials.gov
|
| 67 |
+
|
| 68 |
+
**File**: `src/tools/clinicaltrials.py`
|
| 69 |
+
|
| 70 |
+
### What It Does Well
|
| 71 |
+
| Feature | Status | Notes |
|
| 72 |
+
|---------|--------|-------|
|
| 73 |
+
| API v2 usage | ✅ | Modern API, not deprecated v1 |
|
| 74 |
+
| Interventional filter | ✅ | Only gets drug/treatment studies |
|
| 75 |
+
| Status filter | ✅ | COMPLETED, ACTIVE, RECRUITING |
|
| 76 |
+
| httpx → requests workaround | ✅ | Bypasses WAF TLS fingerprint block |
|
| 77 |
+
|
| 78 |
+
### Limitations (API-Level)
|
| 79 |
+
| Limitation | Severity | Workaround Possible? |
|
| 80 |
+
|------------|----------|---------------------|
|
| 81 |
+
| **No results data** | High | Yes - available via different endpoint |
|
| 82 |
+
| **No outcome measures** | High | Yes - add to FIELDS list |
|
| 83 |
+
| **No adverse events** | Medium | Yes - separate API call |
|
| 84 |
+
| **Sparse drug mechanism data** | Medium | No - not in API |
|
| 85 |
+
|
| 86 |
+
### Current Implementation Gaps
|
| 87 |
+
```python
|
| 88 |
+
# GAP 1: Missing critical fields
|
| 89 |
+
FIELDS: ClassVar[list[str]] = [
|
| 90 |
+
"NCTId",
|
| 91 |
+
"BriefTitle",
|
| 92 |
+
"Phase",
|
| 93 |
+
"OverallStatus",
|
| 94 |
+
"Condition",
|
| 95 |
+
"InterventionName",
|
| 96 |
+
"StartDate",
|
| 97 |
+
"BriefSummary",
|
| 98 |
+
# MISSING:
|
| 99 |
+
# "PrimaryOutcome",
|
| 100 |
+
# "SecondaryOutcome",
|
| 101 |
+
# "ResultsFirstSubmitDate",
|
| 102 |
+
# "StudyResults", # Whether results are posted
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
+
# GAP 2: No results retrieval
|
| 106 |
+
# Many completed trials have posted results
|
| 107 |
+
# We could get actual efficacy data, not just trial existence
|
| 108 |
+
|
| 109 |
+
# GAP 3: No linked publications
|
| 110 |
+
# Trials often link to PubMed articles with results
|
| 111 |
+
# We could follow these links for richer evidence
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
### Priority Improvements
|
| 115 |
+
1. **HIGH**: Add outcome measures to FIELDS
|
| 116 |
+
2. **HIGH**: Check for and retrieve posted results
|
| 117 |
+
3. **MEDIUM**: Follow linked publications (NCT → PMID)
|
| 118 |
+
|
| 119 |
+
---
|
| 120 |
+
|
| 121 |
+
## Tool 3: Europe PMC
|
| 122 |
+
|
| 123 |
+
**File**: `src/tools/europepmc.py`
|
| 124 |
+
|
| 125 |
+
### What It Does Well
|
| 126 |
+
| Feature | Status | Notes |
|
| 127 |
+
|---------|--------|-------|
|
| 128 |
+
| Preprint coverage | ✅ | bioRxiv, medRxiv, ChemRxiv indexed |
|
| 129 |
+
| Preprint labeling | ✅ | `[PREPRINT - Not peer-reviewed]` marker |
|
| 130 |
+
| DOI/PMID fallback URLs | ✅ | Smart URL construction |
|
| 131 |
+
| Relevance scoring | ✅ | Preprints weighted lower (0.75 vs 0.9) |
|
| 132 |
+
|
| 133 |
+
### Limitations (API-Level)
|
| 134 |
+
| Limitation | Severity | Workaround Possible? |
|
| 135 |
+
|------------|----------|---------------------|
|
| 136 |
+
| **No full text for most articles** | High | Partial - CC-licensed available after 14 days |
|
| 137 |
+
| **Citation data limited** | Medium | Only journal articles, not preprints |
|
| 138 |
+
| **Preprint-publication linking gaps** | Medium | ~50% of links missing per Crossref |
|
| 139 |
+
| **License info sometimes missing** | Low | Manual review required |
|
| 140 |
+
|
| 141 |
+
### Current Implementation Gaps
|
| 142 |
+
```python
|
| 143 |
+
# GAP 1: No full-text retrieval
|
| 144 |
+
# Europe PMC has full text for many CC-licensed articles
|
| 145 |
+
# Could retrieve full text XML via separate endpoint
|
| 146 |
+
|
| 147 |
+
# GAP 2: Massive overlap with PubMed
|
| 148 |
+
# Europe PMC indexes all of PubMed/MEDLINE
|
| 149 |
+
# We're getting duplicates with no deduplication
|
| 150 |
+
|
| 151 |
+
# GAP 3: No citation network
|
| 152 |
+
# Europe PMC has "citedByCount" but we don't use it
|
| 153 |
+
# Could prioritize highly-cited preprints
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
### Priority Improvements
|
| 157 |
+
1. **HIGH**: Add deduplication with PubMed (by PMID)
|
| 158 |
+
2. **MEDIUM**: Retrieve citation counts for ranking
|
| 159 |
+
3. **LOW**: Full-text retrieval for CC-licensed articles
|
| 160 |
+
|
| 161 |
+
---
|
| 162 |
+
|
| 163 |
+
## Tool 4: OpenAlex
|
| 164 |
+
|
| 165 |
+
**File**: `src/tools/openalex.py`
|
| 166 |
+
|
| 167 |
+
### What It Does Well
|
| 168 |
+
| Feature | Status | Notes |
|
| 169 |
+
|---------|--------|-------|
|
| 170 |
+
| Citation counts | ✅ | Sorted by `cited_by_count:desc` |
|
| 171 |
+
| Abstract reconstruction | ✅ | Handles inverted index format |
|
| 172 |
+
| Concept extraction | ✅ | Hierarchical classification |
|
| 173 |
+
| Open access detection | ✅ | `is_oa` and `pdf_url` |
|
| 174 |
+
| Polite pool | ✅ | mailto for 100k/day limit |
|
| 175 |
+
| Rich metadata | ✅ | Best metadata of all tools |
|
| 176 |
+
|
| 177 |
+
### Limitations (API-Level)
|
| 178 |
+
| Limitation | Severity | Workaround Possible? |
|
| 179 |
+
|------------|----------|---------------------|
|
| 180 |
+
| **Author truncation at 100** | Low | Only affects mega-author papers |
|
| 181 |
+
| **No full text** | High | No - OpenAlex is metadata only |
|
| 182 |
+
| **Stale data (1-2 day lag)** | Low | Acceptable for research |
|
| 183 |
+
|
| 184 |
+
### Current Implementation Gaps
|
| 185 |
+
```python
|
| 186 |
+
# GAP 1: No citation graph traversal
|
| 187 |
+
# OpenAlex has `cited_by` and `references` endpoints
|
| 188 |
+
# We could find seminal papers by following citation chains
|
| 189 |
+
|
| 190 |
+
# GAP 2: No related works
|
| 191 |
+
# OpenAlex has ML-powered "related_works" field
|
| 192 |
+
# Could expand search to similar papers
|
| 193 |
+
|
| 194 |
+
# GAP 3: No concept filtering
|
| 195 |
+
# OpenAlex has hierarchical concepts
|
| 196 |
+
# Could filter for specific domains (e.g., "Sexual health" concept)
|
| 197 |
+
|
| 198 |
+
# GAP 4: Overlap with PubMed
|
| 199 |
+
# OpenAlex indexes most of PubMed
|
| 200 |
+
# More duplicates without deduplication
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
### Priority Improvements
|
| 204 |
+
1. **HIGH**: Add citation graph traversal (find seminal papers)
|
| 205 |
+
2. **HIGH**: Add deduplication with PubMed/Europe PMC
|
| 206 |
+
3. **MEDIUM**: Use `related_works` for query expansion
|
| 207 |
+
4. **LOW**: Concept-based filtering
|
| 208 |
+
|
| 209 |
+
---
|
| 210 |
+
|
| 211 |
+
## Cross-Tool Issues
|
| 212 |
+
|
| 213 |
+
### Issue 1: MASSIVE DUPLICATION
|
| 214 |
+
|
| 215 |
+
```
|
| 216 |
+
PubMed: 36M+ articles
|
| 217 |
+
Europe PMC: Indexes ALL of PubMed + preprints
|
| 218 |
+
OpenAlex: 250M+ works (includes PubMed)
|
| 219 |
+
|
| 220 |
+
Current behavior: All 3 return the same papers
|
| 221 |
+
Result: Duplicate evidence, wasted tokens, inflated counts
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
**Solution**: Deduplication by PMID/DOI
|
| 225 |
+
```python
|
| 226 |
+
# Proposed: Add to SearchHandler
|
| 227 |
+
def deduplicate_evidence(evidence_list: list[Evidence]) -> list[Evidence]:
|
| 228 |
+
seen_ids: set[str] = set()
|
| 229 |
+
unique: list[Evidence] = []
|
| 230 |
+
for e in evidence_list:
|
| 231 |
+
# Extract PMID or DOI from URL
|
| 232 |
+
paper_id = extract_paper_id(e.citation.url)
|
| 233 |
+
if paper_id not in seen_ids:
|
| 234 |
+
seen_ids.add(paper_id)
|
| 235 |
+
unique.append(e)
|
| 236 |
+
return unique
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
### Issue 2: NO FULL-TEXT RETRIEVAL
|
| 240 |
+
|
| 241 |
+
All tools return **abstracts only**. For deep research, this is limiting.
|
| 242 |
+
|
| 243 |
+
**What's Actually Possible**:
|
| 244 |
+
| Source | Full Text Access | How |
|
| 245 |
+
|--------|------------------|-----|
|
| 246 |
+
| PubMed Central (PMC) | Yes, for OA articles | Separate API: `efetch` with `db=pmc` |
|
| 247 |
+
| Europe PMC | Yes, CC-licensed after 14 days | `/fullTextXML/{id}` endpoint |
|
| 248 |
+
| OpenAlex | No | Metadata only |
|
| 249 |
+
| Unpaywall | Yes, OA link discovery | Separate API |
|
| 250 |
+
|
| 251 |
+
**Recommendation**: Add PMC full-text retrieval for open access articles.
|
| 252 |
+
|
| 253 |
+
### Issue 3: NO CITATION GRAPH
|
| 254 |
+
|
| 255 |
+
OpenAlex has rich citation data but we only use `cited_by_count` for sorting.
|
| 256 |
+
|
| 257 |
+
**Untapped Capabilities**:
|
| 258 |
+
- `cited_by`: Find papers that cite a key paper
|
| 259 |
+
- `references`: Find sources a paper cites
|
| 260 |
+
- `related_works`: ML-powered similar papers
|
| 261 |
+
|
| 262 |
+
**Use Case**: User asks about "testosterone therapy for HSDD". We find a seminal 2019 RCT. We could automatically find:
|
| 263 |
+
- Papers that cite it (newer evidence)
|
| 264 |
+
- Papers it cites (foundational research)
|
| 265 |
+
- Related papers (similar topics)
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
## What's NOT Possible (API Constraints)
|
| 270 |
+
|
| 271 |
+
| Feature | Why Not Possible |
|
| 272 |
+
|---------|------------------|
|
| 273 |
+
| **bioRxiv direct search** | No keyword search API, only RSS feed of latest |
|
| 274 |
+
| **arXiv search** | API exists but irrelevant for sexual health |
|
| 275 |
+
| **PubMed full text** | Requires publisher access or PMC |
|
| 276 |
+
| **Real-time trial results** | ClinicalTrials.gov results are static snapshots |
|
| 277 |
+
| **Drug mechanism data** | Not in any API - would need ChEMBL or DrugBank |
|
| 278 |
+
|
| 279 |
+
---
|
| 280 |
+
|
| 281 |
+
## Recommended Improvements (Priority Order)
|
| 282 |
+
|
| 283 |
+
### Phase 1: Fix Fundamentals (High ROI)
|
| 284 |
+
1. **Deduplication** - Stop returning the same paper 3 times
|
| 285 |
+
2. **Outcome measures in ClinicalTrials** - Get actual efficacy data
|
| 286 |
+
3. **Citation counts from all sources** - Rank by influence, not recency
|
| 287 |
+
|
| 288 |
+
### Phase 2: Depth Improvements (Medium ROI)
|
| 289 |
+
4. **PMC full-text retrieval** - Get full papers for OA articles
|
| 290 |
+
5. **Citation graph traversal** - Find seminal papers automatically
|
| 291 |
+
6. **Publication type filtering** - Prioritize RCTs and meta-analyses
|
| 292 |
+
|
| 293 |
+
### Phase 3: Quality Improvements (Lower ROI, Nice-to-Have)
|
| 294 |
+
7. **MeSH term expansion** - Better PubMed queries
|
| 295 |
+
8. **Related works expansion** - Use OpenAlex ML similarity
|
| 296 |
+
9. **Date range filtering** - Historical vs recent research
|
| 297 |
+
|
| 298 |
+
---
|
| 299 |
+
|
| 300 |
+
## Neo4j Integration (Future Consideration)
|
| 301 |
+
|
| 302 |
+
**Question**: Should we add Neo4j for citation graph storage?
|
| 303 |
+
|
| 304 |
+
**Answer**: Not yet. Here's why:
|
| 305 |
+
|
| 306 |
+
| Approach | Complexity | Value |
|
| 307 |
+
|----------|------------|-------|
|
| 308 |
+
| OpenAlex API for citation traversal | Low | High |
|
| 309 |
+
| Neo4j for local citation graph | High | Medium (unless doing graph analytics) |
|
| 310 |
+
| Cron job to sync OpenAlex → Neo4j | Medium | Only if we need offline access |
|
| 311 |
+
|
| 312 |
+
**Recommendation**: Use OpenAlex API for citation traversal first. Only add Neo4j if:
|
| 313 |
+
1. We need to do complex graph queries (PageRank on citations, community detection)
|
| 314 |
+
2. We need offline access to citation data
|
| 315 |
+
3. We're hitting OpenAlex rate limits
|
| 316 |
+
|
| 317 |
+
---
|
| 318 |
+
|
| 319 |
+
## Summary: What's Broken vs What's Working
|
| 320 |
+
|
| 321 |
+
### Working Well
|
| 322 |
+
- Basic search across all 4 sources
|
| 323 |
+
- Rate limiting and retry logic
|
| 324 |
+
- Query preprocessing
|
| 325 |
+
- Evidence model with citations
|
| 326 |
+
|
| 327 |
+
### Needs Fixing (Current Scope)
|
| 328 |
+
- Deduplication (critical)
|
| 329 |
+
- Outcome measures in ClinicalTrials (critical)
|
| 330 |
+
- Citation-based ranking (important)
|
| 331 |
+
|
| 332 |
+
### Future Enhancements (Out of Current Scope)
|
| 333 |
+
- Full-text retrieval
|
| 334 |
+
- Citation graph traversal
|
| 335 |
+
- Neo4j integration
|
| 336 |
+
- Drug mechanism data (would need new data sources)
|
| 337 |
+
|
| 338 |
+
---
|
| 339 |
+
|
| 340 |
+
## Sources
|
| 341 |
+
|
| 342 |
+
- [NCBI E-utilities Documentation](https://www.ncbi.nlm.nih.gov/books/NBK25497/)
|
| 343 |
+
- [NCBI Rate Limits](https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/)
|
| 344 |
+
- [OpenAlex API Docs](https://docs.openalex.org/)
|
| 345 |
+
- [OpenAlex Limitations](https://docs.openalex.org/api-entities/authors/limitations)
|
| 346 |
+
- [Europe PMC RESTful API](https://europepmc.org/RestfulWebService)
|
| 347 |
+
- [Europe PMC Preprints](https://pmc.ncbi.nlm.nih.gov/articles/PMC11426508/)
|
| 348 |
+
- [ClinicalTrials.gov API](https://clinicaltrials.gov/data-api/api)
|
docs/specs/SPEC_11_SEXUAL_HEALTH_FOCUS.md
CHANGED
|
@@ -1,178 +1,61 @@
|
|
| 1 |
-
# SPEC_11:
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
examples
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
**
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
###
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
**
|
| 62 |
-
- `GENERAL_CONFIG`
|
| 63 |
-
- `DRUG_REPURPOSING_CONFIG`
|
| 64 |
-
- Their entries in `DOMAIN_CONFIGS`
|
| 65 |
-
|
| 66 |
-
### Phase 2: Update Gradio Examples
|
| 67 |
-
|
| 68 |
-
**File: `src/app.py`**
|
| 69 |
-
|
| 70 |
-
Replace examples with 3 sexual-health-only queries:
|
| 71 |
-
|
| 72 |
-
```python
|
| 73 |
-
examples=[
|
| 74 |
-
[
|
| 75 |
-
"What drugs improve female libido post-menopause?",
|
| 76 |
-
"simple",
|
| 77 |
-
"sexual_health",
|
| 78 |
-
None,
|
| 79 |
-
None,
|
| 80 |
-
],
|
| 81 |
-
[
|
| 82 |
-
"Testosterone therapy for hypoactive sexual desire disorder?",
|
| 83 |
-
"simple",
|
| 84 |
-
"sexual_health",
|
| 85 |
-
None,
|
| 86 |
-
None,
|
| 87 |
-
],
|
| 88 |
-
[
|
| 89 |
-
"Clinical trials for PDE5 inhibitors alternatives?",
|
| 90 |
-
"advanced",
|
| 91 |
-
"sexual_health",
|
| 92 |
-
None,
|
| 93 |
-
None,
|
| 94 |
-
],
|
| 95 |
-
],
|
| 96 |
-
```
|
| 97 |
-
|
| 98 |
-
### Phase 3: Simplify or Remove Domain Dropdown
|
| 99 |
-
|
| 100 |
-
**Option A: Remove dropdown entirely**
|
| 101 |
-
- Remove the `gr.Dropdown` for domain selection
|
| 102 |
-
- Hardcode `domain="sexual_health"` in the function
|
| 103 |
-
|
| 104 |
-
**Option B: Keep but simplify** (recommended for backwards compat)
|
| 105 |
-
- Only show `["sexual_health"]` in choices
|
| 106 |
-
- Default to `"sexual_health"`
|
| 107 |
-
- Keeps the parameter in case we want to add domains later
|
| 108 |
-
|
| 109 |
-
```python
|
| 110 |
-
gr.Dropdown(
|
| 111 |
-
choices=["sexual_health"], # Only one choice
|
| 112 |
-
value="sexual_health",
|
| 113 |
-
label="Research Domain",
|
| 114 |
-
info="Specialized for sexual health research",
|
| 115 |
-
visible=False, # Hide since there's only one option
|
| 116 |
-
),
|
| 117 |
-
```
|
| 118 |
-
|
| 119 |
-
### Phase 4: Update Tests
|
| 120 |
-
|
| 121 |
-
Update domain-related tests to only test SEXUAL_HEALTH:
|
| 122 |
-
|
| 123 |
-
```python
|
| 124 |
-
# BEFORE
|
| 125 |
-
def test_get_domain_config_general():
|
| 126 |
-
config = get_domain_config(ResearchDomain.GENERAL)
|
| 127 |
-
assert config.name == "General Research"
|
| 128 |
-
|
| 129 |
-
# AFTER
|
| 130 |
-
def test_get_domain_config_default():
|
| 131 |
-
config = get_domain_config()
|
| 132 |
-
assert config.name == "Sexual Health Research"
|
| 133 |
-
```
|
| 134 |
-
|
| 135 |
-
### Phase 5: Update Documentation
|
| 136 |
-
|
| 137 |
-
- `CLAUDE.md`: Update description to focus on sexual health
|
| 138 |
-
- `README.md`: Update if needed
|
| 139 |
-
- Remove references to "drug repurposing" or "general" modes
|
| 140 |
-
|
| 141 |
-
## Files to Modify
|
| 142 |
-
|
| 143 |
-
| File | Changes |
|
| 144 |
-
|------|---------|
|
| 145 |
-
| `src/config/domain.py` | Remove GENERAL, DRUG_REPURPOSING; change DEFAULT_DOMAIN |
|
| 146 |
-
| `src/app.py` | Update examples; simplify/hide domain dropdown |
|
| 147 |
-
| `src/utils/config.py` | Change default `research_domain` field |
|
| 148 |
-
| `tests/unit/config/test_domain.py` | Update to test only SEXUAL_HEALTH |
|
| 149 |
-
| `tests/unit/utils/test_config_domain.py` | Update enum tests |
|
| 150 |
-
| `tests/unit/test_app_domain.py` | Update to use SEXUAL_HEALTH |
|
| 151 |
-
| `CLAUDE.md` | Update project description |
|
| 152 |
-
|
| 153 |
-
## Example Queries (All Sexual Health)
|
| 154 |
-
|
| 155 |
-
1. **Female libido**: "What drugs improve female libido post-menopause?"
|
| 156 |
-
2. **Low desire**: "Testosterone therapy for hypoactive sexual desire disorder?"
|
| 157 |
-
3. **ED alternatives**: "Clinical trials for PDE5 inhibitors alternatives?"
|
| 158 |
-
|
| 159 |
-
Alternative options:
|
| 160 |
-
- "Flibanserin mechanism of action and efficacy?"
|
| 161 |
-
- "Bremelanotide for hypoactive sexual desire disorder?"
|
| 162 |
-
- "PT-141 clinical trial results?"
|
| 163 |
-
- "Natural supplements for erectile dysfunction?"
|
| 164 |
-
|
| 165 |
-
## Success Criteria
|
| 166 |
-
|
| 167 |
-
- [ ] Only `SEXUAL_HEALTH` domain exists in enum
|
| 168 |
-
- [ ] Default domain is `SEXUAL_HEALTH`
|
| 169 |
-
- [ ] All 3 Gradio examples are sexual health queries
|
| 170 |
-
- [ ] Domain dropdown is hidden or removed
|
| 171 |
-
- [ ] All tests pass (227+ tests in the suite)
|
| 172 |
-
- [ ] No references to "Metformin for Alzheimer's" or "general" domain
|
| 173 |
-
|
| 174 |
-
## Related Issues
|
| 175 |
-
|
| 176 |
-
- #75 (CLOSED) - Domain Identity Crisis (original issue, wrong recommendation)
|
| 177 |
-
- #76 (CLOSED) - Hardcoded prompts (implemented but too general)
|
| 178 |
-
- #85 (OPEN) - Report lacks narrative synthesis (next priority)
|
|
|
|
| 1 |
+
# SPEC_11: Sexual Health Research Specialist (Final Polish)
|
| 2 |
+
|
| 3 |
+
**Status**: APPROVED
|
| 4 |
+
**Priority**: P0 (Critical Fix)
|
| 5 |
+
**Effort**: Low (Cleanup & Polish)
|
| 6 |
+
**Related Issues**: #75, #89
|
| 7 |
+
|
| 8 |
+
## 1. Executive Summary
|
| 9 |
+
|
| 10 |
+
DeepBoner is **exclusively** a Sexual Health Research Agent. The codebase is currently in a transitional state where "General" and "Drug Repurposing" modes were architecturally removed, but significant artifacts (docstrings, default arguments, variable names, and examples) remain.
|
| 11 |
+
|
| 12 |
+
This specification dictates the **complete eradication** of non-sexual-health concepts from the codebase to ensure a consistent, focused, and professional product identity.
|
| 13 |
+
|
| 14 |
+
## 2. The Rules of Engagement
|
| 15 |
+
|
| 16 |
+
1. **No "General" Defaults**: The string literal `"general"` shall not exist as a default value for any `domain` parameter.
|
| 17 |
+
2. **No "Drug Repurposing" References**: Terms like "metformin", "alzheimer", "cancer", "aspirin" in examples must be replaced with sexual health examples.
|
| 18 |
+
3. **Single Source of Truth**: `src.config.domain.ResearchDomain.SEXUAL_HEALTH` is the *only* valid domain.
|
| 19 |
+
4. **Ironclad Tests**: Tests must use sexual health queries (e.g., "libido", "testosterone", "PDE5") so that the tests actually exercise the production domain logic rather than a generic placeholder path.
|
| 20 |
+
|
| 21 |
+
## 3. Implementation Plan
|
| 22 |
+
|
| 23 |
+
### 3.1. Code Cleanup (`src/`)
|
| 24 |
+
|
| 25 |
+
#### `src/app.py`
|
| 26 |
+
- **Logic Fix**: Change `domain_str = domain or "general"` to `domain_str = domain or "sexual_health"`.
|
| 27 |
+
- **Signature Fix**: Change `domain: str = "general"` to `domain: str = "sexual_health"`.
|
| 28 |
+
- **Docstring Fix**: Remove `(e.g., "general", "sexual_health")`.
|
| 29 |
+
|
| 30 |
+
#### `src/mcp_tools.py`
|
| 31 |
+
- **Signature Fix**: Update `search_pubmed` and `search_all_sources` to default `domain="sexual_health"`.
|
| 32 |
+
- **Docstring Fix**: Update examples from "metformin alzheimer" to "testosterone libido".
|
| 33 |
+
- **Argument Description**: Remove `(general, drug_repurposing, sexual_health)` list.
|
| 34 |
+
|
| 35 |
+
#### `src/tools/*.py`
|
| 36 |
+
- **`clinicaltrials.py`, `query_utils.py`, `tools.py`**: Replace all "metformin/alzheimer" example strings with sexual health examples.
|
| 37 |
+
|
| 38 |
+
#### `src/config/domain.py`
|
| 39 |
+
- **Comment Fix**: Remove `# Get default (general) config`.
|
| 40 |
+
|
| 41 |
+
### 3.2. Test Suite Alignment (`tests/`)
|
| 42 |
+
|
| 43 |
+
#### `tests/unit/agent_factory/test_judges.py`
|
| 44 |
+
- Replace `metformin alzheimer` test queries with `sildenafil efficacy`.
|
| 45 |
+
|
| 46 |
+
#### `tests/unit/tools/test_query_utils.py`
|
| 47 |
+
- Ensure synonym expansion tests use relevant terms (or generic ones that don't imply a different domain).
|
| 48 |
+
|
| 49 |
+
#### `tests/unit/mcp/test_mcp_tools_domain.py`
|
| 50 |
+
- Verify defaults are "sexual_health", not "general".
|
| 51 |
+
|
| 52 |
+
## 4. Verification Checklist
|
| 53 |
+
|
| 54 |
+
- [ ] **Grep Audit**: `grep -r "general" src/` should return no matches in which "general" is used as a domain value or a `domain` parameter default (matches of the ordinary English word are acceptable).
|
| 55 |
+
- [ ] **Grep Audit**: `grep -r "metformin" src/` should return zero results.
|
| 56 |
+
- [ ] **Functionality**: `src/app.py` runs without crashing when `domain` is `None` (defaults to sexual_health).
|
| 57 |
+
- [ ] **Tests**: All 237+ tests pass.
|
| 58 |
+
|
| 59 |
+
## 5. Success State
|
| 60 |
+
|
| 61 |
+
When this spec is implemented, a developer reading the code should see **zero evidence** that this agent was ever intended for anything other than Sexual Health research.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
**NO MOCKS. NO FAKE DATA. REAL SCIENCE.**
|
| 4 |
|
| 5 |
-
These demos run the REAL
|
| 6 |
|
| 7 |
---
|
| 8 |
|
|
@@ -31,7 +31,7 @@ NCBI_API_KEY=your-key
|
|
| 31 |
Demonstrates REAL parallel search across PubMed, ClinicalTrials.gov, and Europe PMC.
|
| 32 |
|
| 33 |
```bash
|
| 34 |
-
uv run python examples/search_demo/run_search.py "
|
| 35 |
```
|
| 36 |
|
| 37 |
**What's REAL:**
|
|
@@ -63,8 +63,8 @@ uv run python examples/embeddings_demo/run_embeddings.py
|
|
| 63 |
Demonstrates the REAL search-judge-synthesize loop.
|
| 64 |
|
| 65 |
```bash
|
| 66 |
-
uv run python examples/orchestrator_demo/run_agent.py "
|
| 67 |
-
uv run python examples/orchestrator_demo/run_agent.py "
|
| 68 |
```
|
| 69 |
|
| 70 |
**What's REAL:**
|
|
@@ -81,7 +81,7 @@ Demonstrates REAL multi-agent coordination using Microsoft Agent Framework.
|
|
| 81 |
|
| 82 |
```bash
|
| 83 |
# Requires OPENAI_API_KEY specifically
|
| 84 |
-
uv run python examples/orchestrator_demo/run_magentic.py "
|
| 85 |
```
|
| 86 |
|
| 87 |
**What's REAL:**
|
|
@@ -96,8 +96,8 @@ uv run python examples/orchestrator_demo/run_magentic.py "metformin cancer"
|
|
| 96 |
Demonstrates REAL mechanistic hypothesis generation.
|
| 97 |
|
| 98 |
```bash
|
| 99 |
-
uv run python examples/hypothesis_demo/run_hypothesis.py "
|
| 100 |
-
uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil
|
| 101 |
```
|
| 102 |
|
| 103 |
**What's REAL:**
|
|
@@ -113,8 +113,8 @@ uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil heart failu
|
|
| 113 |
**THE COMPLETE PIPELINE** - All phases working together.
|
| 114 |
|
| 115 |
```bash
|
| 116 |
-
uv run python examples/full_stack_demo/run_full.py "
|
| 117 |
-
uv run python examples/full_stack_demo/run_full.py "sildenafil
|
| 118 |
```
|
| 119 |
|
| 120 |
**What's REAL:**
|
|
@@ -181,4 +181,4 @@ Mocks belong in `tests/unit/`, not in demos. When you run these examples, you se
|
|
| 181 |
- Real scientific hypotheses
|
| 182 |
- Real research reports
|
| 183 |
|
| 184 |
-
This is what DeepBoner actually does. No fake data. No canned responses.
|
|
|
|
| 2 |
|
| 3 |
**NO MOCKS. NO FAKE DATA. REAL SCIENCE.**
|
| 4 |
|
| 5 |
+
These demos run the REAL sexual health research pipeline with actual API calls.
|
| 6 |
|
| 7 |
---
|
| 8 |
|
|
|
|
| 31 |
Demonstrates REAL parallel search across PubMed, ClinicalTrials.gov, and Europe PMC.
|
| 32 |
|
| 33 |
```bash
|
| 34 |
+
uv run python examples/search_demo/run_search.py "testosterone libido"
|
| 35 |
```
|
| 36 |
|
| 37 |
**What's REAL:**
|
|
|
|
| 63 |
Demonstrates the REAL search-judge-synthesize loop.
|
| 64 |
|
| 65 |
```bash
|
| 66 |
+
uv run python examples/orchestrator_demo/run_agent.py "testosterone libido"
|
| 67 |
+
uv run python examples/orchestrator_demo/run_agent.py "sildenafil erectile dysfunction" --iterations 5
|
| 68 |
```
|
| 69 |
|
| 70 |
**What's REAL:**
|
|
|
|
| 81 |
|
| 82 |
```bash
|
| 83 |
# Requires OPENAI_API_KEY specifically
|
| 84 |
+
uv run python examples/orchestrator_demo/run_magentic.py "testosterone libido"
|
| 85 |
```
|
| 86 |
|
| 87 |
**What's REAL:**
|
|
|
|
| 96 |
Demonstrates REAL mechanistic hypothesis generation.
|
| 97 |
|
| 98 |
```bash
|
| 99 |
+
uv run python examples/hypothesis_demo/run_hypothesis.py "testosterone libido"
|
| 100 |
+
uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil erectile dysfunction"
|
| 101 |
```
|
| 102 |
|
| 103 |
**What's REAL:**
|
|
|
|
| 113 |
**THE COMPLETE PIPELINE** - All phases working together.
|
| 114 |
|
| 115 |
```bash
|
| 116 |
+
uv run python examples/full_stack_demo/run_full.py "testosterone libido"
|
| 117 |
+
uv run python examples/full_stack_demo/run_full.py "sildenafil erectile dysfunction" -i 3
|
| 118 |
```
|
| 119 |
|
| 120 |
**What's REAL:**
|
|
|
|
| 181 |
- Real scientific hypotheses
|
| 182 |
- Real research reports
|
| 183 |
|
| 184 |
+
This is what DeepBoner actually does. No fake data. No canned responses.
|
examples/embeddings_demo/run_embeddings.py
CHANGED
|
@@ -39,7 +39,7 @@ async def demo_real_pipeline() -> None:
|
|
| 39 |
print("=" * 60)
|
| 40 |
|
| 41 |
# 1. Fetch Real Data
|
| 42 |
-
query = "
|
| 43 |
print(f"\n[1] Fetching real papers for: '{query}'...")
|
| 44 |
pubmed = PubMedTool()
|
| 45 |
# Fetch enough results to likely get some overlap/redundancy
|
|
|
|
| 39 |
print("=" * 60)
|
| 40 |
|
| 41 |
# 1. Fetch Real Data
|
| 42 |
+
query = "testosterone mechanism of action"
|
| 43 |
print(f"\n[1] Fetching real papers for: '{query}'...")
|
| 44 |
pubmed = PubMedTool()
|
| 45 |
# Fetch enough results to likely get some overlap/redundancy
|
examples/full_stack_demo/run_full.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"""
|
| 3 |
Demo: Full Stack DeepBoner Agent (Phases 1-8).
|
| 4 |
|
| 5 |
-
This script demonstrates the COMPLETE REAL
|
| 6 |
- Phase 2: REAL Search (PubMed + ClinicalTrials + Europe PMC)
|
| 7 |
- Phase 6: REAL Embeddings (sentence-transformers + ChromaDB)
|
| 8 |
- Phase 7: REAL Hypothesis (LLM mechanistic reasoning)
|
|
@@ -12,8 +12,8 @@ This script demonstrates the COMPLETE REAL drug repurposing research pipeline:
|
|
| 12 |
NO MOCKS. NO FAKE DATA. REAL SCIENCE.
|
| 13 |
|
| 14 |
Usage:
|
| 15 |
-
uv run python examples/full_stack_demo/run_full.py "
|
| 16 |
-
uv run python examples/full_stack_demo/run_full.py "sildenafil
|
| 17 |
|
| 18 |
Requires: OPENAI_API_KEY or ANTHROPIC_API_KEY
|
| 19 |
"""
|
|
@@ -183,14 +183,14 @@ This demo runs the COMPLETE pipeline with REAL API calls:
|
|
| 183 |
5. REAL report: Actual LLM generating structured report
|
| 184 |
|
| 185 |
Examples:
|
| 186 |
-
uv run python examples/full_stack_demo/run_full.py "
|
| 187 |
-
uv run python examples/full_stack_demo/run_full.py "sildenafil
|
| 188 |
-
uv run python examples/full_stack_demo/run_full.py "
|
| 189 |
""",
|
| 190 |
)
|
| 191 |
parser.add_argument(
|
| 192 |
"query",
|
| 193 |
-
help="Research query (e.g., '
|
| 194 |
)
|
| 195 |
parser.add_argument(
|
| 196 |
"-i",
|
|
|
|
| 2 |
"""
|
| 3 |
Demo: Full Stack DeepBoner Agent (Phases 1-8).
|
| 4 |
|
| 5 |
+
This script demonstrates the COMPLETE REAL sexual health research pipeline:
|
| 6 |
- Phase 2: REAL Search (PubMed + ClinicalTrials + Europe PMC)
|
| 7 |
- Phase 6: REAL Embeddings (sentence-transformers + ChromaDB)
|
| 8 |
- Phase 7: REAL Hypothesis (LLM mechanistic reasoning)
|
|
|
|
| 12 |
NO MOCKS. NO FAKE DATA. REAL SCIENCE.
|
| 13 |
|
| 14 |
Usage:
|
| 15 |
+
uv run python examples/full_stack_demo/run_full.py "testosterone libido"
|
| 16 |
+
uv run python examples/full_stack_demo/run_full.py "sildenafil erectile dysfunction" -i 3
|
| 17 |
|
| 18 |
Requires: OPENAI_API_KEY or ANTHROPIC_API_KEY
|
| 19 |
"""
|
|
|
|
| 183 |
5. REAL report: Actual LLM generating structured report
|
| 184 |
|
| 185 |
Examples:
|
| 186 |
+
uv run python examples/full_stack_demo/run_full.py "testosterone libido"
|
| 187 |
+
uv run python examples/full_stack_demo/run_full.py "sildenafil erectile dysfunction" -i 3
|
| 188 |
+
uv run python examples/full_stack_demo/run_full.py "flibanserin mechanism"
|
| 189 |
""",
|
| 190 |
)
|
| 191 |
parser.add_argument(
|
| 192 |
"query",
|
| 193 |
+
help="Research query (e.g., 'testosterone libido')",
|
| 194 |
)
|
| 195 |
parser.add_argument(
|
| 196 |
"-i",
|
examples/hypothesis_demo/run_hypothesis.py
CHANGED
|
@@ -9,8 +9,8 @@ This script demonstrates the REAL hypothesis generation pipeline:
|
|
| 9 |
|
| 10 |
Usage:
|
| 11 |
# Requires OPENAI_API_KEY or ANTHROPIC_API_KEY
|
| 12 |
-
uv run python examples/hypothesis_demo/run_hypothesis.py "
|
| 13 |
-
uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil
|
| 14 |
"""
|
| 15 |
|
| 16 |
import argparse
|
|
@@ -102,15 +102,15 @@ async def main() -> None:
|
|
| 102 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 103 |
epilog="""
|
| 104 |
Examples:
|
| 105 |
-
uv run python examples/hypothesis_demo/run_hypothesis.py "
|
| 106 |
-
uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil
|
| 107 |
-
uv run python examples/hypothesis_demo/run_hypothesis.py "
|
| 108 |
""",
|
| 109 |
)
|
| 110 |
parser.add_argument(
|
| 111 |
"query",
|
| 112 |
nargs="?",
|
| 113 |
-
default="
|
| 114 |
help="Research query",
|
| 115 |
)
|
| 116 |
args = parser.parse_args()
|
|
|
|
| 9 |
|
| 10 |
Usage:
|
| 11 |
# Requires OPENAI_API_KEY or ANTHROPIC_API_KEY
|
| 12 |
+
uv run python examples/hypothesis_demo/run_hypothesis.py "testosterone libido"
|
| 13 |
+
uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil erectile dysfunction"
|
| 14 |
"""
|
| 15 |
|
| 16 |
import argparse
|
|
|
|
| 102 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 103 |
epilog="""
|
| 104 |
Examples:
|
| 105 |
+
uv run python examples/hypothesis_demo/run_hypothesis.py "testosterone libido"
|
| 106 |
+
uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil erectile dysfunction"
|
| 107 |
+
uv run python examples/hypothesis_demo/run_hypothesis.py "flibanserin mechanism"
|
| 108 |
""",
|
| 109 |
)
|
| 110 |
parser.add_argument(
|
| 111 |
"query",
|
| 112 |
nargs="?",
|
| 113 |
+
default="testosterone libido",
|
| 114 |
help="Research query",
|
| 115 |
)
|
| 116 |
args = parser.parse_args()
|
examples/modal_demo/run_analysis.py
CHANGED
|
@@ -3,8 +3,9 @@
|
|
| 3 |
|
| 4 |
This script uses StatisticalAnalyzer directly (NO agent_framework dependency).
|
| 5 |
|
| 6 |
-
Usage:
|
| 7 |
-
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
import argparse
|
|
|
|
| 3 |
|
| 4 |
This script uses StatisticalAnalyzer directly (NO agent_framework dependency).
|
| 5 |
|
| 6 |
+
# Usage:
|
| 7 |
+
# source .env
|
| 8 |
+
# uv run python examples/modal_demo/run_analysis.py "testosterone libido"
|
| 9 |
"""
|
| 10 |
|
| 11 |
import argparse
|
examples/orchestrator_demo/run_agent.py
CHANGED
|
@@ -11,8 +11,9 @@ This script demonstrates the REAL Phase 4 orchestration:
|
|
| 11 |
NO MOCKS. REAL API CALLS.
|
| 12 |
|
| 13 |
Usage:
|
| 14 |
-
uv run python examples/orchestrator_demo/run_agent.py "
|
| 15 |
-
uv run python examples/orchestrator_demo/run_agent.py "sildenafil
|
|
|
|
| 16 |
|
| 17 |
Requires: OPENAI_API_KEY or ANTHROPIC_API_KEY
|
| 18 |
"""
|
|
@@ -46,11 +47,11 @@ This demo runs the REAL search-judge-synthesize loop:
|
|
| 46 |
4. REAL synthesis: Actual research summary generation
|
| 47 |
|
| 48 |
Examples:
|
| 49 |
-
uv run python examples/orchestrator_demo/run_agent.py "
|
| 50 |
-
uv run python examples/orchestrator_demo/run_agent.py "
|
| 51 |
""",
|
| 52 |
)
|
| 53 |
-
parser.add_argument("query", help="Research query (e.g., '
|
| 54 |
parser.add_argument("--iterations", type=int, default=3, help="Max iterations (default: 3)")
|
| 55 |
args = parser.parse_args()
|
| 56 |
|
|
|
|
| 11 |
NO MOCKS. REAL API CALLS.
|
| 12 |
|
| 13 |
Usage:
|
| 14 |
+
uv run python examples/orchestrator_demo/run_agent.py "testosterone libido"
|
| 15 |
+
uv run python examples/orchestrator_demo/run_agent.py "sildenafil erectile dysfunction" \
|
| 16 |
+
--iterations 5
|
| 17 |
|
| 18 |
Requires: OPENAI_API_KEY or ANTHROPIC_API_KEY
|
| 19 |
"""
|
|
|
|
| 47 |
4. REAL synthesis: Actual research summary generation
|
| 48 |
|
| 49 |
Examples:
|
| 50 |
+
uv run python examples/orchestrator_demo/run_agent.py "testosterone libido"
|
| 51 |
+
uv run python examples/orchestrator_demo/run_agent.py "flibanserin HSDD" --iterations 5
|
| 52 |
""",
|
| 53 |
)
|
| 54 |
+
parser.add_argument("query", help="Research query (e.g., 'testosterone libido')")
|
| 55 |
parser.add_argument("--iterations", type=int, default=3, help="Max iterations (default: 3)")
|
| 56 |
args = parser.parse_args()
|
| 57 |
|
examples/orchestrator_demo/run_magentic.py
CHANGED
|
@@ -8,7 +8,7 @@ This script demonstrates Phase 5 functionality:
|
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
export OPENAI_API_KEY=...
|
| 11 |
-
uv run python examples/orchestrator_demo/run_magentic.py "
|
| 12 |
"""
|
| 13 |
|
| 14 |
import argparse
|
|
@@ -28,7 +28,7 @@ from src.utils.models import OrchestratorConfig
|
|
| 28 |
async def main() -> None:
|
| 29 |
"""Run the magentic agent demo."""
|
| 30 |
parser = argparse.ArgumentParser(description="Run DeepBoner Magentic Agent")
|
| 31 |
-
parser.add_argument("query", help="Research query (e.g., '
|
| 32 |
parser.add_argument("--iterations", type=int, default=10, help="Max rounds")
|
| 33 |
args = parser.parse_args()
|
| 34 |
|
|
|
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
export OPENAI_API_KEY=...
|
| 11 |
+
uv run python examples/orchestrator_demo/run_magentic.py "testosterone libido"
|
| 12 |
"""
|
| 13 |
|
| 14 |
import argparse
|
|
|
|
| 28 |
async def main() -> None:
|
| 29 |
"""Run the magentic agent demo."""
|
| 30 |
parser = argparse.ArgumentParser(description="Run DeepBoner Magentic Agent")
|
| 31 |
+
parser.add_argument("query", help="Research query (e.g., 'testosterone libido')")
|
| 32 |
parser.add_argument("--iterations", type=int, default=10, help="Max rounds")
|
| 33 |
args = parser.parse_args()
|
| 34 |
|
examples/search_demo/run_search.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Demo: Search for
|
| 4 |
|
| 5 |
This script demonstrates multi-source search functionality:
|
| 6 |
- PubMed search (biomedical literature)
|
|
@@ -12,7 +12,7 @@ Usage:
|
|
| 12 |
uv run python examples/search_demo/run_search.py
|
| 13 |
|
| 14 |
# With custom query:
|
| 15 |
-
uv run python examples/search_demo/run_search.py "
|
| 16 |
|
| 17 |
Requirements:
|
| 18 |
- Optional: NCBI_API_KEY in .env for higher PubMed rate limits
|
|
@@ -61,7 +61,7 @@ async def main(query: str) -> None:
|
|
| 61 |
|
| 62 |
if __name__ == "__main__":
|
| 63 |
# Default query or use command line arg
|
| 64 |
-
default_query = "
|
| 65 |
query = sys.argv[1] if len(sys.argv) > 1 else default_query
|
| 66 |
|
| 67 |
asyncio.run(main(query))
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Demo: Search for sexual health research evidence.
|
| 4 |
|
| 5 |
This script demonstrates multi-source search functionality:
|
| 6 |
- PubMed search (biomedical literature)
|
|
|
|
| 12 |
uv run python examples/search_demo/run_search.py
|
| 13 |
|
| 14 |
# With custom query:
|
| 15 |
+
uv run python examples/search_demo/run_search.py "testosterone libido"
|
| 16 |
|
| 17 |
Requirements:
|
| 18 |
- Optional: NCBI_API_KEY in .env for higher PubMed rate limits
|
|
|
|
| 61 |
|
| 62 |
if __name__ == "__main__":
|
| 63 |
# Default query or use command line arg
|
| 64 |
+
default_query = "testosterone post-menopause libido"
|
| 65 |
query = sys.argv[1] if len(sys.argv) > 1 else default_query
|
| 66 |
|
| 67 |
asyncio.run(main(query))
|
src/agent_factory/judges.py
CHANGED
|
@@ -166,7 +166,13 @@ class JudgeHandler:
|
|
| 166 |
return assessment
|
| 167 |
|
| 168 |
except Exception as e:
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
# Return a safe default assessment on failure
|
| 171 |
return self._create_fallback_assessment(question, str(e))
|
| 172 |
|
|
|
|
| 166 |
return assessment
|
| 167 |
|
| 168 |
except Exception as e:
|
| 169 |
+
# Log with context for debugging
|
| 170 |
+
logger.error(
|
| 171 |
+
"Assessment failed",
|
| 172 |
+
error=str(e),
|
| 173 |
+
exc_type=type(e).__name__,
|
| 174 |
+
evidence_count=len(evidence),
|
| 175 |
+
)
|
| 176 |
# Return a safe default assessment on failure
|
| 177 |
return self._create_fallback_assessment(question, str(e))
|
| 178 |
|
src/agents/magentic_agents.py
CHANGED
|
@@ -133,7 +133,7 @@ Based on evidence:
|
|
| 133 |
DRUG -> TARGET -> PATHWAY -> THERAPEUTIC EFFECT
|
| 134 |
|
| 135 |
Example:
|
| 136 |
-
|
| 137 |
|
| 138 |
4. Explain the rationale for each hypothesis
|
| 139 |
5. Suggest what additional evidence would support or refute it
|
|
|
|
| 133 |
DRUG -> TARGET -> PATHWAY -> THERAPEUTIC EFFECT
|
| 134 |
|
| 135 |
Example:
|
| 136 |
+
Testosterone -> Androgen receptor -> Dopamine modulation -> Enhanced libido
|
| 137 |
|
| 138 |
4. Explain the rationale for each hypothesis
|
| 139 |
5. Suggest what additional evidence would support or refute it
|
src/agents/tools.py
CHANGED
|
@@ -25,7 +25,7 @@ async def search_pubmed(query: str, max_results: int = 10) -> str:
|
|
| 25 |
drugs, diseases, mechanisms of action, and clinical studies.
|
| 26 |
|
| 27 |
Args:
|
| 28 |
-
query: Search keywords (e.g., "
|
| 29 |
max_results: Maximum results to return (default 10)
|
| 30 |
|
| 31 |
Returns:
|
|
@@ -85,7 +85,7 @@ async def search_clinical_trials(query: str, max_results: int = 10) -> str:
|
|
| 85 |
for potential interventions.
|
| 86 |
|
| 87 |
Args:
|
| 88 |
-
query: Search terms (e.g., "
|
| 89 |
max_results: Maximum results to return (default 10)
|
| 90 |
|
| 91 |
Returns:
|
|
@@ -125,7 +125,7 @@ async def search_preprints(query: str, max_results: int = 10) -> str:
|
|
| 125 |
from bioRxiv, medRxiv, and peer-reviewed papers.
|
| 126 |
|
| 127 |
Args:
|
| 128 |
-
query: Search terms (e.g., "
|
| 129 |
max_results: Maximum results to return (default 10)
|
| 130 |
|
| 131 |
Returns:
|
|
|
|
| 25 |
drugs, diseases, mechanisms of action, and clinical studies.
|
| 26 |
|
| 27 |
Args:
|
| 28 |
+
query: Search keywords (e.g., "testosterone libido mechanism")
|
| 29 |
max_results: Maximum results to return (default 10)
|
| 30 |
|
| 31 |
Returns:
|
|
|
|
| 85 |
for potential interventions.
|
| 86 |
|
| 87 |
Args:
|
| 88 |
+
query: Search terms (e.g., "sildenafil phase 3")
|
| 89 |
max_results: Maximum results to return (default 10)
|
| 90 |
|
| 91 |
Returns:
|
|
|
|
| 125 |
from bioRxiv, medRxiv, and peer-reviewed papers.
|
| 126 |
|
| 127 |
Args:
|
| 128 |
+
query: Search terms (e.g., "flibanserin HSDD preprint")
|
| 129 |
max_results: Maximum results to return (default 10)
|
| 130 |
|
| 131 |
Returns:
|
src/app.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
from collections.abc import AsyncGenerator
|
| 5 |
-
from typing import Any
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
from pydantic_ai.models.anthropic import AnthropicModel
|
|
@@ -22,10 +22,12 @@ from src.utils.config import settings
|
|
| 22 |
from src.utils.exceptions import ConfigurationError
|
| 23 |
from src.utils.models import OrchestratorConfig
|
| 24 |
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def configure_orchestrator(
|
| 27 |
use_mock: bool = False,
|
| 28 |
-
mode:
|
| 29 |
user_api_key: str | None = None,
|
| 30 |
domain: str | ResearchDomain | None = None,
|
| 31 |
) -> tuple[Any, str]:
|
|
@@ -36,7 +38,7 @@ def configure_orchestrator(
|
|
| 36 |
use_mock: If True, use MockJudgeHandler (no API key needed)
|
| 37 |
mode: Orchestrator mode ("simple" or "advanced")
|
| 38 |
user_api_key: Optional user-provided API key (BYOK) - auto-detects provider
|
| 39 |
-
domain: Research domain (
|
| 40 |
|
| 41 |
Returns:
|
| 42 |
Tuple of (Orchestrator instance, backend_name)
|
|
@@ -100,7 +102,7 @@ def configure_orchestrator(
|
|
| 100 |
search_handler=search_handler,
|
| 101 |
judge_handler=judge_handler,
|
| 102 |
config=config,
|
| 103 |
-
mode=mode,
|
| 104 |
api_key=user_api_key,
|
| 105 |
domain=domain,
|
| 106 |
)
|
|
@@ -111,8 +113,8 @@ def configure_orchestrator(
|
|
| 111 |
async def research_agent(
|
| 112 |
message: str,
|
| 113 |
history: list[dict[str, Any]],
|
| 114 |
-
mode: str = "simple",
|
| 115 |
-
domain: str = "
|
| 116 |
api_key: str = "",
|
| 117 |
api_key_state: str = "",
|
| 118 |
) -> AsyncGenerator[str, None]:
|
|
@@ -138,7 +140,11 @@ async def research_agent(
|
|
| 138 |
# Gradio passes None for missing example columns, overriding defaults
|
| 139 |
api_key_str = api_key or ""
|
| 140 |
api_key_state_str = api_key_state or ""
|
| 141 |
-
domain_str = domain or "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
# BUG FIX: Prefer freshly-entered key, then persisted state
|
| 144 |
user_api_key = (api_key_str.strip() or api_key_state_str.strip()) or None
|
|
@@ -153,12 +159,12 @@ async def research_agent(
|
|
| 153 |
has_paid_key = has_openai or has_anthropic or bool(user_api_key)
|
| 154 |
|
| 155 |
# Advanced mode requires OpenAI specifically (due to agent-framework binding)
|
| 156 |
-
if
|
| 157 |
yield (
|
| 158 |
"⚠️ **Warning**: Advanced mode currently requires OpenAI API key. "
|
| 159 |
"Anthropic keys only work in Simple mode. Falling back to Simple.\n\n"
|
| 160 |
)
|
| 161 |
-
|
| 162 |
|
| 163 |
# Inform user about fallback if no keys
|
| 164 |
if not has_paid_key:
|
|
@@ -177,14 +183,16 @@ async def research_agent(
|
|
| 177 |
# It will use: Paid API > HF Inference (free tier)
|
| 178 |
orchestrator, backend_name = configure_orchestrator(
|
| 179 |
use_mock=False, # Never use mock in production - HF Inference is the free fallback
|
| 180 |
-
mode=
|
| 181 |
user_api_key=user_api_key,
|
| 182 |
domain=domain_str,
|
| 183 |
)
|
| 184 |
|
| 185 |
# Immediate backend info + loading feedback so user knows something is happening
|
|
|
|
|
|
|
| 186 |
yield (
|
| 187 |
-
f"🧠 **Backend**: {backend_name} | **Domain**: {
|
| 188 |
"⏳ **Processing...** Searching PubMed, ClinicalTrials.gov, Europe PMC, OpenAlex...\n"
|
| 189 |
)
|
| 190 |
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
from collections.abc import AsyncGenerator
|
| 5 |
+
from typing import Any, Literal
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
from pydantic_ai.models.anthropic import AnthropicModel
|
|
|
|
| 22 |
from src.utils.exceptions import ConfigurationError
|
| 23 |
from src.utils.models import OrchestratorConfig
|
| 24 |
|
| 25 |
+
OrchestratorMode = Literal["simple", "magentic", "advanced", "hierarchical"]
|
| 26 |
+
|
| 27 |
|
| 28 |
def configure_orchestrator(
|
| 29 |
use_mock: bool = False,
|
| 30 |
+
mode: OrchestratorMode = "simple",
|
| 31 |
user_api_key: str | None = None,
|
| 32 |
domain: str | ResearchDomain | None = None,
|
| 33 |
) -> tuple[Any, str]:
|
|
|
|
| 38 |
use_mock: If True, use MockJudgeHandler (no API key needed)
|
| 39 |
mode: Orchestrator mode ("simple" or "advanced")
|
| 40 |
user_api_key: Optional user-provided API key (BYOK) - auto-detects provider
|
| 41 |
+
domain: Research domain (defaults to "sexual_health")
|
| 42 |
|
| 43 |
Returns:
|
| 44 |
Tuple of (Orchestrator instance, backend_name)
|
|
|
|
| 102 |
search_handler=search_handler,
|
| 103 |
judge_handler=judge_handler,
|
| 104 |
config=config,
|
| 105 |
+
mode=mode,
|
| 106 |
api_key=user_api_key,
|
| 107 |
domain=domain,
|
| 108 |
)
|
|
|
|
| 113 |
async def research_agent(
|
| 114 |
message: str,
|
| 115 |
history: list[dict[str, Any]],
|
| 116 |
+
mode: str = "simple", # Gradio passes strings; validated below
|
| 117 |
+
domain: str = "sexual_health",
|
| 118 |
api_key: str = "",
|
| 119 |
api_key_state: str = "",
|
| 120 |
) -> AsyncGenerator[str, None]:
|
|
|
|
| 140 |
# Gradio passes None for missing example columns, overriding defaults
|
| 141 |
api_key_str = api_key or ""
|
| 142 |
api_key_state_str = api_key_state or ""
|
| 143 |
+
domain_str = domain or "sexual_health"
|
| 144 |
+
|
| 145 |
+
# Validate and cast mode to proper type
|
| 146 |
+
valid_modes: set[str] = {"simple", "magentic", "advanced", "hierarchical"}
|
| 147 |
+
mode_validated: OrchestratorMode = mode if mode in valid_modes else "simple" # type: ignore[assignment]
|
| 148 |
|
| 149 |
# BUG FIX: Prefer freshly-entered key, then persisted state
|
| 150 |
user_api_key = (api_key_str.strip() or api_key_state_str.strip()) or None
|
|
|
|
| 159 |
has_paid_key = has_openai or has_anthropic or bool(user_api_key)
|
| 160 |
|
| 161 |
# Advanced mode requires OpenAI specifically (due to agent-framework binding)
|
| 162 |
+
if mode_validated == "advanced" and not (has_openai or is_openai_user_key):
|
| 163 |
yield (
|
| 164 |
"⚠️ **Warning**: Advanced mode currently requires OpenAI API key. "
|
| 165 |
"Anthropic keys only work in Simple mode. Falling back to Simple.\n\n"
|
| 166 |
)
|
| 167 |
+
mode_validated = "simple"
|
| 168 |
|
| 169 |
# Inform user about fallback if no keys
|
| 170 |
if not has_paid_key:
|
|
|
|
| 183 |
# It will use: Paid API > HF Inference (free tier)
|
| 184 |
orchestrator, backend_name = configure_orchestrator(
|
| 185 |
use_mock=False, # Never use mock in production - HF Inference is the free fallback
|
| 186 |
+
mode=mode_validated,
|
| 187 |
user_api_key=user_api_key,
|
| 188 |
domain=domain_str,
|
| 189 |
)
|
| 190 |
|
| 191 |
# Immediate backend info + loading feedback so user knows something is happening
|
| 192 |
+
# Use replace to get "Sexual Health" instead of "Sexual_Health" from .title()
|
| 193 |
+
domain_display = domain_str.replace("_", " ").title()
|
| 194 |
yield (
|
| 195 |
+
f"🧠 **Backend**: {backend_name} | **Domain**: {domain_display}\n\n"
|
| 196 |
"⏳ **Processing...** Searching PubMed, ClinicalTrials.gov, Europe PMC, OpenAlex...\n"
|
| 197 |
)
|
| 198 |
|
src/config/domain.py
CHANGED
|
@@ -6,7 +6,7 @@ allowing the agent to operate in domain-agnostic or domain-specific modes.
|
|
| 6 |
Usage:
|
| 7 |
from src.config.domain import get_domain_config, ResearchDomain
|
| 8 |
|
| 9 |
-
# Get default
|
| 10 |
config = get_domain_config()
|
| 11 |
|
| 12 |
# Get specific domain
|
|
@@ -111,7 +111,7 @@ def get_domain_config(domain: ResearchDomain | str | None = None) -> DomainConfi
|
|
| 111 |
"""Get configuration for a research domain.
|
| 112 |
|
| 113 |
Args:
|
| 114 |
-
domain: The research domain. Defaults to
|
| 115 |
|
| 116 |
Returns:
|
| 117 |
DomainConfig for the specified domain.
|
|
|
|
| 6 |
Usage:
|
| 7 |
from src.config.domain import get_domain_config, ResearchDomain
|
| 8 |
|
| 9 |
+
# Get default config
|
| 10 |
config = get_domain_config()
|
| 11 |
|
| 12 |
# Get specific domain
|
|
|
|
| 111 |
"""Get configuration for a research domain.
|
| 112 |
|
| 113 |
Args:
|
| 114 |
+
domain: The research domain. Defaults to sexual_health if None.
|
| 115 |
|
| 116 |
Returns:
|
| 117 |
DomainConfig for the specified domain.
|
src/mcp_tools.py
CHANGED
|
@@ -18,16 +18,16 @@ _trials = ClinicalTrialsTool()
|
|
| 18 |
_europepmc = EuropePMCTool()
|
| 19 |
|
| 20 |
|
| 21 |
-
async def search_pubmed(query: str, max_results: int = 10, domain: str = "
|
| 22 |
"""Search PubMed for peer-reviewed biomedical literature.
|
| 23 |
|
| 24 |
Searches NCBI PubMed database for scientific papers matching your query.
|
| 25 |
Returns titles, authors, abstracts, and citation information.
|
| 26 |
|
| 27 |
Args:
|
| 28 |
-
query: Search query (e.g., "
|
| 29 |
max_results: Maximum results to return (1-50, default 10)
|
| 30 |
-
domain: Research domain (
|
| 31 |
|
| 32 |
Returns:
|
| 33 |
Formatted search results with paper titles, authors, dates, and abstracts
|
|
@@ -58,7 +58,7 @@ async def search_clinical_trials(query: str, max_results: int = 10) -> str:
|
|
| 58 |
Returns trial titles, phases, status, conditions, and interventions.
|
| 59 |
|
| 60 |
Args:
|
| 61 |
-
query: Search query (e.g., "
|
| 62 |
max_results: Maximum results to return (1-50, default 10)
|
| 63 |
|
| 64 |
Returns:
|
|
@@ -88,7 +88,7 @@ async def search_europepmc(query: str, max_results: int = 10) -> str:
|
|
| 88 |
Useful for finding cutting-edge preprints and open access papers.
|
| 89 |
|
| 90 |
Args:
|
| 91 |
-
query: Search query (e.g., "
|
| 92 |
max_results: Maximum results to return (1-50, default 10)
|
| 93 |
|
| 94 |
Returns:
|
|
@@ -112,16 +112,18 @@ async def search_europepmc(query: str, max_results: int = 10) -> str:
|
|
| 112 |
return "\n".join(formatted)
|
| 113 |
|
| 114 |
|
| 115 |
-
async def search_all_sources(
|
|
|
|
|
|
|
| 116 |
"""Search all biomedical sources simultaneously.
|
| 117 |
|
| 118 |
Performs parallel search across PubMed, ClinicalTrials.gov, and Europe PMC.
|
| 119 |
This is the most comprehensive search option for biomedical research.
|
| 120 |
|
| 121 |
Args:
|
| 122 |
-
query: Search query (e.g., "
|
| 123 |
max_per_source: Maximum results per source (1-20, default 5)
|
| 124 |
-
domain: Research domain (
|
| 125 |
|
| 126 |
Returns:
|
| 127 |
Combined results from all sources with source labels
|
|
@@ -172,8 +174,8 @@ async def analyze_hypothesis(
|
|
| 172 |
the statistical evidence for a research hypothesis.
|
| 173 |
|
| 174 |
Args:
|
| 175 |
-
drug: The drug being evaluated (e.g., "
|
| 176 |
-
condition: The target condition (e.g., "
|
| 177 |
evidence_summary: Summary of evidence to analyze
|
| 178 |
|
| 179 |
Returns:
|
|
|
|
| 18 |
_europepmc = EuropePMCTool()
|
| 19 |
|
| 20 |
|
| 21 |
+
async def search_pubmed(query: str, max_results: int = 10, domain: str = "sexual_health") -> str:
|
| 22 |
"""Search PubMed for peer-reviewed biomedical literature.
|
| 23 |
|
| 24 |
Searches NCBI PubMed database for scientific papers matching your query.
|
| 25 |
Returns titles, authors, abstracts, and citation information.
|
| 26 |
|
| 27 |
Args:
|
| 28 |
+
query: Search query (e.g., "testosterone libido")
|
| 29 |
max_results: Maximum results to return (1-50, default 10)
|
| 30 |
+
domain: Research domain (defaults to "sexual_health")
|
| 31 |
|
| 32 |
Returns:
|
| 33 |
Formatted search results with paper titles, authors, dates, and abstracts
|
|
|
|
| 58 |
Returns trial titles, phases, status, conditions, and interventions.
|
| 59 |
|
| 60 |
Args:
|
| 61 |
+
query: Search query (e.g., "testosterone hypoactive desire", "sildenafil phase 3")
|
| 62 |
max_results: Maximum results to return (1-50, default 10)
|
| 63 |
|
| 64 |
Returns:
|
|
|
|
| 88 |
Useful for finding cutting-edge preprints and open access papers.
|
| 89 |
|
| 90 |
Args:
|
| 91 |
+
query: Search query (e.g., "flibanserin mechanism", "erectile dysfunction novel treatment")
|
| 92 |
max_results: Maximum results to return (1-50, default 10)
|
| 93 |
|
| 94 |
Returns:
|
|
|
|
| 112 |
return "\n".join(formatted)
|
| 113 |
|
| 114 |
|
| 115 |
+
async def search_all_sources(
|
| 116 |
+
query: str, max_per_source: int = 5, domain: str = "sexual_health"
|
| 117 |
+
) -> str:
|
| 118 |
"""Search all biomedical sources simultaneously.
|
| 119 |
|
| 120 |
Performs parallel search across PubMed, ClinicalTrials.gov, and Europe PMC.
|
| 121 |
This is the most comprehensive search option for biomedical research.
|
| 122 |
|
| 123 |
Args:
|
| 124 |
+
query: Search query (e.g., "testosterone replacement therapy", "HSDD treatment")
|
| 125 |
max_per_source: Maximum results per source (1-20, default 5)
|
| 126 |
+
domain: Research domain (defaults to "sexual_health")
|
| 127 |
|
| 128 |
Returns:
|
| 129 |
Combined results from all sources with source labels
|
|
|
|
| 174 |
the statistical evidence for a research hypothesis.
|
| 175 |
|
| 176 |
Args:
|
| 177 |
+
drug: The drug being evaluated (e.g., "sildenafil")
|
| 178 |
+
condition: The target condition (e.g., "erectile dysfunction")
|
| 179 |
evidence_summary: Summary of evidence to analyze
|
| 180 |
|
| 181 |
Returns:
|
src/middleware/sub_iteration.py
CHANGED
|
@@ -81,12 +81,18 @@ class SubIterationMiddleware:
|
|
| 81 |
history.append(result)
|
| 82 |
best_result = result # Assume latest is best for now
|
| 83 |
except Exception as e:
|
| 84 |
-
logger.error(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
if event_callback:
|
| 86 |
await event_callback(
|
| 87 |
AgentEvent(
|
| 88 |
type="error",
|
| 89 |
message=f"Sub-iteration execution failed: {e}",
|
|
|
|
| 90 |
iteration=i,
|
| 91 |
)
|
| 92 |
)
|
|
@@ -97,12 +103,18 @@ class SubIterationMiddleware:
|
|
| 97 |
assessment = await self.judge.assess(task, result, history)
|
| 98 |
final_assessment = assessment
|
| 99 |
except Exception as e:
|
| 100 |
-
logger.error(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
if event_callback:
|
| 102 |
await event_callback(
|
| 103 |
AgentEvent(
|
| 104 |
type="error",
|
| 105 |
message=f"Sub-iteration judge failed: {e}",
|
|
|
|
| 106 |
iteration=i,
|
| 107 |
)
|
| 108 |
)
|
|
|
|
| 81 |
history.append(result)
|
| 82 |
best_result = result # Assume latest is best for now
|
| 83 |
except Exception as e:
|
| 84 |
+
logger.error(
|
| 85 |
+
"Sub-iteration execution failed",
|
| 86 |
+
error=str(e),
|
| 87 |
+
exc_type=type(e).__name__,
|
| 88 |
+
iteration=i,
|
| 89 |
+
)
|
| 90 |
if event_callback:
|
| 91 |
await event_callback(
|
| 92 |
AgentEvent(
|
| 93 |
type="error",
|
| 94 |
message=f"Sub-iteration execution failed: {e}",
|
| 95 |
+
data={"recoverable": False, "error_type": type(e).__name__},
|
| 96 |
iteration=i,
|
| 97 |
)
|
| 98 |
)
|
|
|
|
| 103 |
assessment = await self.judge.assess(task, result, history)
|
| 104 |
final_assessment = assessment
|
| 105 |
except Exception as e:
|
| 106 |
+
logger.error(
|
| 107 |
+
"Sub-iteration judge failed",
|
| 108 |
+
error=str(e),
|
| 109 |
+
exc_type=type(e).__name__,
|
| 110 |
+
iteration=i,
|
| 111 |
+
)
|
| 112 |
if event_callback:
|
| 113 |
await event_callback(
|
| 114 |
AgentEvent(
|
| 115 |
type="error",
|
| 116 |
message=f"Sub-iteration judge failed: {e}",
|
| 117 |
+
data={"recoverable": False, "error_type": type(e).__name__},
|
| 118 |
iteration=i,
|
| 119 |
)
|
| 120 |
)
|
src/orchestrators/factory.py
CHANGED
|
@@ -75,7 +75,7 @@ def create_orchestrator(
|
|
| 75 |
mode: "simple", "magentic", "advanced", or "hierarchical"
|
| 76 |
Note: "magentic" is an alias for "advanced" (kept for backwards compatibility)
|
| 77 |
api_key: Optional API key for advanced mode (OpenAI)
|
| 78 |
-
domain: Research domain for customization (default:
|
| 79 |
|
| 80 |
Returns:
|
| 81 |
Orchestrator instance implementing OrchestratorProtocol
|
|
|
|
| 75 |
mode: "simple", "magentic", "advanced", or "hierarchical"
|
| 76 |
Note: "magentic" is an alias for "advanced" (kept for backwards compatibility)
|
| 77 |
api_key: Optional API key for advanced mode (OpenAI)
|
| 78 |
+
domain: Research domain for customization (default: sexual_health)
|
| 79 |
|
| 80 |
Returns:
|
| 81 |
Orchestrator instance implementing OrchestratorProtocol
|
src/orchestrators/simple.py
CHANGED
|
@@ -18,7 +18,9 @@ import structlog
|
|
| 18 |
|
| 19 |
from src.config.domain import ResearchDomain, get_domain_config
|
| 20 |
from src.orchestrators.base import JudgeHandlerProtocol, SearchHandlerProtocol
|
|
|
|
| 21 |
from src.utils.config import settings
|
|
|
|
| 22 |
from src.utils.models import (
|
| 23 |
AgentEvent,
|
| 24 |
Evidence,
|
|
@@ -132,12 +134,25 @@ class Orchestrator:
|
|
| 132 |
iteration=iteration,
|
| 133 |
)
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
except Exception as e:
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
yield AgentEvent(
|
| 138 |
type="error",
|
| 139 |
message=f"Modal analysis failed: {e}",
|
| 140 |
-
data={"error": str(e)},
|
| 141 |
iteration=iteration,
|
| 142 |
)
|
| 143 |
|
|
@@ -288,11 +303,26 @@ class Orchestrator:
|
|
| 288 |
if errors:
|
| 289 |
logger.warning("Search errors", errors=errors)
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
except Exception as e:
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
yield AgentEvent(
|
| 294 |
type="error",
|
| 295 |
message=f"Search failed: {e!s}",
|
|
|
|
| 296 |
iteration=iteration,
|
| 297 |
)
|
| 298 |
continue
|
|
@@ -388,9 +418,9 @@ class Orchestrator:
|
|
| 388 |
iteration=iteration,
|
| 389 |
)
|
| 390 |
|
| 391 |
-
# Generate final response
|
| 392 |
# Use all gathered evidence for the final report
|
| 393 |
-
final_response = self._generate_synthesis(query, all_evidence, assessment)
|
| 394 |
|
| 395 |
yield AgentEvent(
|
| 396 |
type="complete",
|
|
@@ -424,11 +454,26 @@ class Orchestrator:
|
|
| 424 |
iteration=iteration,
|
| 425 |
)
|
| 426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
except Exception as e:
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
yield AgentEvent(
|
| 430 |
type="error",
|
| 431 |
message=f"Assessment failed: {e!s}",
|
|
|
|
| 432 |
iteration=iteration,
|
| 433 |
)
|
| 434 |
continue
|
|
@@ -445,14 +490,105 @@ class Orchestrator:
|
|
| 445 |
iteration=iteration,
|
| 446 |
)
|
| 447 |
|
| 448 |
-
def _generate_synthesis(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
self,
|
| 450 |
query: str,
|
| 451 |
evidence: list[Evidence],
|
| 452 |
assessment: JudgeAssessment,
|
| 453 |
) -> str:
|
| 454 |
"""
|
| 455 |
-
Generate
|
|
|
|
|
|
|
| 456 |
|
| 457 |
Args:
|
| 458 |
query: The original question
|
|
@@ -460,7 +596,7 @@ class Orchestrator:
|
|
| 460 |
assessment: The final assessment
|
| 461 |
|
| 462 |
Returns:
|
| 463 |
-
Formatted synthesis as markdown
|
| 464 |
"""
|
| 465 |
drug_list = (
|
| 466 |
"\n".join([f"- **{d}**" for d in assessment.details.drug_candidates])
|
|
@@ -474,7 +610,7 @@ class Orchestrator:
|
|
| 474 |
[
|
| 475 |
f"{i + 1}. [{e.citation.title}]({e.citation.url}) "
|
| 476 |
f"({e.citation.source.upper()}, {e.citation.date})"
|
| 477 |
-
for i, e in enumerate(evidence[:10])
|
| 478 |
]
|
| 479 |
)
|
| 480 |
|
|
|
|
| 18 |
|
| 19 |
from src.config.domain import ResearchDomain, get_domain_config
|
| 20 |
from src.orchestrators.base import JudgeHandlerProtocol, SearchHandlerProtocol
|
| 21 |
+
from src.prompts.synthesis import format_synthesis_prompt, get_synthesis_system_prompt
|
| 22 |
from src.utils.config import settings
|
| 23 |
+
from src.utils.exceptions import JudgeError, ModalError, SearchError
|
| 24 |
from src.utils.models import (
|
| 25 |
AgentEvent,
|
| 26 |
Evidence,
|
|
|
|
| 134 |
iteration=iteration,
|
| 135 |
)
|
| 136 |
|
| 137 |
+
except ModalError as e:
|
| 138 |
+
logger.error("Modal analysis failed", error=str(e), exc_type="ModalError")
|
| 139 |
+
yield AgentEvent(
|
| 140 |
+
type="error",
|
| 141 |
+
message=f"Modal analysis failed: {e}",
|
| 142 |
+
data={"error": str(e), "recoverable": True},
|
| 143 |
+
iteration=iteration,
|
| 144 |
+
)
|
| 145 |
except Exception as e:
|
| 146 |
+
# Unexpected error - log with full context for debugging
|
| 147 |
+
logger.error(
|
| 148 |
+
"Modal analysis failed unexpectedly",
|
| 149 |
+
error=str(e),
|
| 150 |
+
exc_type=type(e).__name__,
|
| 151 |
+
)
|
| 152 |
yield AgentEvent(
|
| 153 |
type="error",
|
| 154 |
message=f"Modal analysis failed: {e}",
|
| 155 |
+
data={"error": str(e), "recoverable": True},
|
| 156 |
iteration=iteration,
|
| 157 |
)
|
| 158 |
|
|
|
|
| 303 |
if errors:
|
| 304 |
logger.warning("Search errors", errors=errors)
|
| 305 |
|
| 306 |
+
except SearchError as e:
|
| 307 |
+
logger.error("Search phase failed", error=str(e), exc_type="SearchError")
|
| 308 |
+
yield AgentEvent(
|
| 309 |
+
type="error",
|
| 310 |
+
message=f"Search failed: {e!s}",
|
| 311 |
+
data={"recoverable": True, "error_type": "search"},
|
| 312 |
+
iteration=iteration,
|
| 313 |
+
)
|
| 314 |
+
continue
|
| 315 |
except Exception as e:
|
| 316 |
+
# Unexpected error - log full context for debugging
|
| 317 |
+
logger.error(
|
| 318 |
+
"Search phase failed unexpectedly",
|
| 319 |
+
error=str(e),
|
| 320 |
+
exc_type=type(e).__name__,
|
| 321 |
+
)
|
| 322 |
yield AgentEvent(
|
| 323 |
type="error",
|
| 324 |
message=f"Search failed: {e!s}",
|
| 325 |
+
data={"recoverable": True, "error_type": "unexpected"},
|
| 326 |
iteration=iteration,
|
| 327 |
)
|
| 328 |
continue
|
|
|
|
| 418 |
iteration=iteration,
|
| 419 |
)
|
| 420 |
|
| 421 |
+
# Generate final response using LLM narrative synthesis
|
| 422 |
# Use all gathered evidence for the final report
|
| 423 |
+
final_response = await self._generate_synthesis(query, all_evidence, assessment)
|
| 424 |
|
| 425 |
yield AgentEvent(
|
| 426 |
type="complete",
|
|
|
|
| 454 |
iteration=iteration,
|
| 455 |
)
|
| 456 |
|
| 457 |
+
except JudgeError as e:
|
| 458 |
+
logger.error("Judge phase failed", error=str(e), exc_type="JudgeError")
|
| 459 |
+
yield AgentEvent(
|
| 460 |
+
type="error",
|
| 461 |
+
message=f"Assessment failed: {e!s}",
|
| 462 |
+
data={"recoverable": True, "error_type": "judge"},
|
| 463 |
+
iteration=iteration,
|
| 464 |
+
)
|
| 465 |
+
continue
|
| 466 |
except Exception as e:
|
| 467 |
+
# Unexpected error - log full context for debugging
|
| 468 |
+
logger.error(
|
| 469 |
+
"Judge phase failed unexpectedly",
|
| 470 |
+
error=str(e),
|
| 471 |
+
exc_type=type(e).__name__,
|
| 472 |
+
)
|
| 473 |
yield AgentEvent(
|
| 474 |
type="error",
|
| 475 |
message=f"Assessment failed: {e!s}",
|
| 476 |
+
data={"recoverable": True, "error_type": "unexpected"},
|
| 477 |
iteration=iteration,
|
| 478 |
)
|
| 479 |
continue
|
|
|
|
| 490 |
iteration=iteration,
|
| 491 |
)
|
| 492 |
|
| 493 |
+
async def _generate_synthesis(
|
| 494 |
+
self,
|
| 495 |
+
query: str,
|
| 496 |
+
evidence: list[Evidence],
|
| 497 |
+
assessment: JudgeAssessment,
|
| 498 |
+
) -> str:
|
| 499 |
+
"""
|
| 500 |
+
Generate the final synthesis response using LLM.
|
| 501 |
+
|
| 502 |
+
This method calls an LLM to generate a narrative research report,
|
| 503 |
+
following the Microsoft Agent Framework pattern of using LLM synthesis
|
| 504 |
+
instead of string templating.
|
| 505 |
+
|
| 506 |
+
Args:
|
| 507 |
+
query: The original question
|
| 508 |
+
evidence: All collected evidence
|
| 509 |
+
assessment: The final assessment
|
| 510 |
+
|
| 511 |
+
Returns:
|
| 512 |
+
Narrative synthesis as markdown
|
| 513 |
+
"""
|
| 514 |
+
# Build evidence summary for LLM context (limit to avoid token overflow)
|
| 515 |
+
evidence_lines = []
|
| 516 |
+
for e in evidence[:20]:
|
| 517 |
+
authors = ", ".join(e.citation.authors[:2]) if e.citation.authors else "Unknown"
|
| 518 |
+
content_preview = e.content[:200].replace("\n", " ")
|
| 519 |
+
evidence_lines.append(
|
| 520 |
+
f"- {e.citation.title} ({authors}, {e.citation.date}): {content_preview}..."
|
| 521 |
+
)
|
| 522 |
+
evidence_summary = "\n".join(evidence_lines)
|
| 523 |
+
|
| 524 |
+
# Format synthesis prompt with assessment data
|
| 525 |
+
user_prompt = format_synthesis_prompt(
|
| 526 |
+
query=query,
|
| 527 |
+
evidence_summary=evidence_summary,
|
| 528 |
+
drug_candidates=assessment.details.drug_candidates,
|
| 529 |
+
key_findings=assessment.details.key_findings,
|
| 530 |
+
mechanism_score=assessment.details.mechanism_score,
|
| 531 |
+
clinical_score=assessment.details.clinical_evidence_score,
|
| 532 |
+
confidence=assessment.confidence,
|
| 533 |
+
)
|
| 534 |
+
|
| 535 |
+
# Get domain-specific system prompt
|
| 536 |
+
system_prompt = get_synthesis_system_prompt(self.domain)
|
| 537 |
+
|
| 538 |
+
try:
|
| 539 |
+
# Import here to avoid circular deps and keep optional
|
| 540 |
+
from pydantic_ai import Agent
|
| 541 |
+
|
| 542 |
+
from src.agent_factory.judges import get_model
|
| 543 |
+
|
| 544 |
+
# Create synthesis agent (string output, not structured)
|
| 545 |
+
agent: Agent[None, str] = Agent(
|
| 546 |
+
model=get_model(),
|
| 547 |
+
output_type=str,
|
| 548 |
+
system_prompt=system_prompt,
|
| 549 |
+
)
|
| 550 |
+
result = await agent.run(user_prompt)
|
| 551 |
+
narrative = result.output
|
| 552 |
+
|
| 553 |
+
logger.info("LLM narrative synthesis completed", chars=len(narrative))
|
| 554 |
+
|
| 555 |
+
except Exception as e:
|
| 556 |
+
# Fallback to template synthesis if LLM fails
|
| 557 |
+
# This is intentionally broad - LLM can fail many ways (API, parsing, etc.)
|
| 558 |
+
logger.warning(
|
| 559 |
+
"LLM synthesis failed, using template fallback",
|
| 560 |
+
error=str(e),
|
| 561 |
+
exc_type=type(e).__name__,
|
| 562 |
+
evidence_count=len(evidence),
|
| 563 |
+
)
|
| 564 |
+
return self._generate_template_synthesis(query, evidence, assessment)
|
| 565 |
+
|
| 566 |
+
# Add full citation list footer
|
| 567 |
+
citations = "\n".join(
|
| 568 |
+
f"{i + 1}. [{e.citation.title}]({e.citation.url}) "
|
| 569 |
+
f"({e.citation.source.upper()}, {e.citation.date})"
|
| 570 |
+
for i, e in enumerate(evidence[:15])
|
| 571 |
+
)
|
| 572 |
+
|
| 573 |
+
return f"""{narrative}
|
| 574 |
+
|
| 575 |
+
---
|
| 576 |
+
### Full Citation List ({len(evidence)} sources)
|
| 577 |
+
{citations}
|
| 578 |
+
|
| 579 |
+
*Analysis based on {len(evidence)} sources across {len(self.history)} iterations.*
|
| 580 |
+
"""
|
| 581 |
+
|
| 582 |
+
def _generate_template_synthesis(
|
| 583 |
self,
|
| 584 |
query: str,
|
| 585 |
evidence: list[Evidence],
|
| 586 |
assessment: JudgeAssessment,
|
| 587 |
) -> str:
|
| 588 |
"""
|
| 589 |
+
Generate fallback template synthesis (no LLM).
|
| 590 |
+
|
| 591 |
+
Used when LLM synthesis fails or is unavailable.
|
| 592 |
|
| 593 |
Args:
|
| 594 |
query: The original question
|
|
|
|
| 596 |
assessment: The final assessment
|
| 597 |
|
| 598 |
Returns:
|
| 599 |
+
Formatted synthesis as markdown (bullet-point style)
|
| 600 |
"""
|
| 601 |
drug_list = (
|
| 602 |
"\n".join([f"- **{d}**" for d in assessment.details.drug_candidates])
|
|
|
|
| 610 |
[
|
| 611 |
f"{i + 1}. [{e.citation.title}]({e.citation.url}) "
|
| 612 |
f"({e.citation.source.upper()}, {e.citation.date})"
|
| 613 |
+
for i, e in enumerate(evidence[:10])
|
| 614 |
]
|
| 615 |
)
|
| 616 |
|
src/prompts/hypothesis.py
CHANGED
|
@@ -24,12 +24,12 @@ A good hypothesis:
|
|
| 24 |
4. Generates SEARCH QUERIES: Helps find more evidence
|
| 25 |
|
| 26 |
Example hypothesis format:
|
| 27 |
-
- Drug:
|
| 28 |
-
- Target:
|
| 29 |
-
- Pathway:
|
| 30 |
-
- Effect: Enhanced
|
| 31 |
- Confidence: 0.7
|
| 32 |
-
- Search suggestions: ["
|
| 33 |
|
| 34 |
Be specific. Use actual gene/protein names when possible."""
|
| 35 |
|
|
|
|
| 24 |
4. Generates SEARCH QUERIES: Helps find more evidence
|
| 25 |
|
| 26 |
Example hypothesis format:
|
| 27 |
+
- Drug: Testosterone
|
| 28 |
+
- Target: Androgen Receptor
|
| 29 |
+
- Pathway: Dopaminergic signaling modulation
|
| 30 |
+
- Effect: Enhanced libido in HSDD
|
| 31 |
- Confidence: 0.7
|
| 32 |
+
- Search suggestions: ["testosterone libido mechanism", "sildenafil efficacy women"]
|
| 33 |
|
| 34 |
Be specific. Use actual gene/protein names when possible."""
|
| 35 |
|
src/prompts/report.py
CHANGED
|
@@ -41,9 +41,9 @@ The `hypotheses_tested` field MUST be a LIST of objects, each with these fields:
|
|
| 41 |
|
| 42 |
Example:
|
| 43 |
hypotheses_tested: [
|
| 44 |
-
{{"hypothesis": "
|
| 45 |
"supported": 3, "contradicted": 1}},
|
| 46 |
-
{{"hypothesis": "
|
| 47 |
"supported": 5, "contradicted": 0}}
|
| 48 |
]
|
| 49 |
|
|
@@ -55,7 +55,8 @@ The `references` field MUST be a LIST of objects, each with these fields:
|
|
| 55 |
|
| 56 |
Example:
|
| 57 |
references: [
|
| 58 |
-
{{"title": "
|
|
|
|
| 59 |
]
|
| 60 |
|
| 61 |
─────────────────────────────────────────────────────────────────────────────
|
|
|
|
| 41 |
|
| 42 |
Example:
|
| 43 |
hypotheses_tested: [
|
| 44 |
+
{{"hypothesis": "Testosterone -> AR -> enhanced libido",
|
| 45 |
"supported": 3, "contradicted": 1}},
|
| 46 |
+
{{"hypothesis": "Sildenafil inhibits PDE5 pathway",
|
| 47 |
"supported": 5, "contradicted": 0}}
|
| 48 |
]
|
| 49 |
|
|
|
|
| 55 |
|
| 56 |
Example:
|
| 57 |
references: [
|
| 58 |
+
{{"title": "Testosterone and Libido", "authors": "Smith",
|
| 59 |
+
"source": "pubmed", "url": "https://pubmed.ncbi.nlm.nih.gov/123/"}}
|
| 60 |
]
|
| 61 |
|
| 62 |
─────────────────────────────────────────────────────────────────────────────
|
src/prompts/synthesis.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prompts for narrative report synthesis.
|
| 2 |
+
|
| 3 |
+
This module provides prompts that transform structured evidence data
|
| 4 |
+
into professional, narrative research reports. The key insight is that
|
| 5 |
+
report generation requires an LLM call for synthesis, not string templating.
|
| 6 |
+
|
| 7 |
+
Reference: Microsoft Agent Framework concurrent_custom_aggregator.py pattern.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from src.config.domain import ResearchDomain, get_domain_config
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def get_synthesis_system_prompt(domain: ResearchDomain | str | None = None) -> str:
|
| 14 |
+
"""Get the system prompt for narrative synthesis.
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
domain: Research domain for customization (defaults to settings)
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
System prompt instructing LLM to write narrative prose
|
| 21 |
+
"""
|
| 22 |
+
config = get_domain_config(domain)
|
| 23 |
+
return f"""You are a scientific writer specializing in {config.name.lower()}.
|
| 24 |
+
Your task is to synthesize research evidence into a clear, NARRATIVE report.
|
| 25 |
+
|
| 26 |
+
## CRITICAL: Writing Style
|
| 27 |
+
- Write in PROSE PARAGRAPHS, not bullet points
|
| 28 |
+
- Use academic but accessible language
|
| 29 |
+
- Be specific about evidence strength (e.g., "in an RCT of N=200")
|
| 30 |
+
- Reference specific studies by author name when available
|
| 31 |
+
- Provide quantitative results where available (p-values, effect sizes, NNT)
|
| 32 |
+
|
| 33 |
+
## Report Structure
|
| 34 |
+
|
| 35 |
+
### Executive Summary (REQUIRED - 2-3 sentences)
|
| 36 |
+
Start with the bottom line. What does the evidence show? Example:
|
| 37 |
+
"Testosterone therapy demonstrates consistent efficacy for HSDD in postmenopausal
|
| 38 |
+
women, with transdermal formulations showing the best safety profile."
|
| 39 |
+
|
| 40 |
+
### Background (REQUIRED - 1 paragraph)
|
| 41 |
+
Explain the condition, its prevalence, and clinical significance.
|
| 42 |
+
Why does this question matter?
|
| 43 |
+
|
| 44 |
+
### Evidence Synthesis (REQUIRED - 2-4 paragraphs)
|
| 45 |
+
Weave the evidence into a coherent NARRATIVE:
|
| 46 |
+
- **Mechanism of Action**: How does the intervention work biologically?
|
| 47 |
+
- **Clinical Evidence**: What do trials show? Include effect sizes when available.
|
| 48 |
+
- **Comparative Evidence**: How does it compare to alternatives?
|
| 49 |
+
|
| 50 |
+
Write this as flowing prose that tells a story, NOT as a bullet list.
|
| 51 |
+
|
| 52 |
+
### Recommendations (REQUIRED - 3-5 numbered items)
|
| 53 |
+
Provide specific, actionable clinical recommendations based on the evidence.
|
| 54 |
+
These CAN be numbered items since they are action items.
|
| 55 |
+
|
| 56 |
+
### Limitations (REQUIRED - 1 paragraph)
|
| 57 |
+
Acknowledge gaps in the evidence, potential biases, and areas needing more research.
|
| 58 |
+
Be honest about uncertainty.
|
| 59 |
+
|
| 60 |
+
### References (REQUIRED)
|
| 61 |
+
List key references with author, year, title, and URL.
|
| 62 |
+
Format: Author AB et al. (Year). Title. URL
|
| 63 |
+
|
| 64 |
+
## CRITICAL RULES
|
| 65 |
+
1. ONLY cite papers from the provided evidence - NEVER hallucinate or invent references
|
| 66 |
+
2. Write in complete sentences and paragraphs (PROSE, not lists except Recommendations)
|
| 67 |
+
3. Include specific statistics when available (p-values, confidence intervals, effect sizes)
|
| 68 |
+
4. Acknowledge uncertainty honestly - do not overstate conclusions
|
| 69 |
+
5. If evidence is limited, say so clearly
|
| 70 |
+
6. Copy URLs exactly as provided - do not create similar-looking URLs
|
| 71 |
+
"""
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
FEW_SHOT_EXAMPLE = """
|
| 75 |
+
## Example: Strong Evidence Synthesis
|
| 76 |
+
|
| 77 |
+
INPUT:
|
| 78 |
+
- Query: "Alprostadil for erectile dysfunction"
|
| 79 |
+
- Evidence: 15 papers including meta-analysis of 8 RCTs (N=3,247)
|
| 80 |
+
- Mechanism Score: 9/10
|
| 81 |
+
- Clinical Score: 9/10
|
| 82 |
+
|
| 83 |
+
OUTPUT:
|
| 84 |
+
|
| 85 |
+
### Executive Summary
|
| 86 |
+
|
| 87 |
+
Alprostadil (prostaglandin E1) represents a well-established second-line treatment
|
| 88 |
+
for erectile dysfunction, with meta-analytic evidence demonstrating 87% efficacy
|
| 89 |
+
in achieving erections sufficient for intercourse. It offers a PDE5-independent
|
| 90 |
+
mechanism particularly valuable for patients who do not respond to oral therapies.
|
| 91 |
+
|
| 92 |
+
### Background
|
| 93 |
+
|
| 94 |
+
Erectile dysfunction affects approximately 30 million men in the United States,
|
| 95 |
+
with prevalence increasing with age from 12% at age 40 to 40% at age 70. While
|
| 96 |
+
PDE5 inhibitors remain first-line therapy, approximately 30% of patients are
|
| 97 |
+
non-responders due to diabetes, radical prostatectomy, or other factors.
|
| 98 |
+
Alprostadil provides an alternative mechanism through direct smooth muscle
|
| 99 |
+
relaxation, making it a crucial second-line option.
|
| 100 |
+
|
| 101 |
+
### Evidence Synthesis
|
| 102 |
+
|
| 103 |
+
**Mechanism of Action**
|
| 104 |
+
|
| 105 |
+
Alprostadil works through a distinct pathway from PDE5 inhibitors. It binds to
|
| 106 |
+
EP2 and EP4 receptors on cavernosal smooth muscle, activating adenylate cyclase
|
| 107 |
+
and increasing intracellular cAMP. This leads to smooth muscle relaxation and
|
| 108 |
+
increased blood flow independent of nitric oxide signaling. As noted by Smith
|
| 109 |
+
et al. (2019), this mechanism explains its efficacy in patients with endothelial
|
| 110 |
+
dysfunction where nitric oxide production is impaired.
|
| 111 |
+
|
| 112 |
+
**Clinical Evidence**
|
| 113 |
+
|
| 114 |
+
A meta-analysis by Johnson et al. (2020) pooled data from 8 randomized controlled
|
| 115 |
+
trials (N=3,247). The primary endpoint of erection sufficient for intercourse was
|
| 116 |
+
achieved in 87% of alprostadil patients versus 12% placebo (RR 7.25, 95% CI:
|
| 117 |
+
5.8-9.1, p<0.001). The number needed to treat was 1.3, indicating robust effect
|
| 118 |
+
size. Onset of action was 5-15 minutes, with duration of 30-60 minutes.
|
| 119 |
+
|
| 120 |
+
**Comparative Evidence**
|
| 121 |
+
|
| 122 |
+
Direct comparisons with PDE5 inhibitors are limited. However, in the subgroup
|
| 123 |
+
of PDE5 non-responders studied by Martinez et al. (2018), alprostadil achieved
|
| 124 |
+
successful intercourse in 72% of patients who had failed sildenafil.
|
| 125 |
+
|
| 126 |
+
### Recommendations
|
| 127 |
+
|
| 128 |
+
1. Consider alprostadil as second-line therapy when PDE5 inhibitors fail or are
|
| 129 |
+
contraindicated
|
| 130 |
+
2. Start with 10 micrograms intracavernosal injection, titrate to 40 micrograms based
|
| 131 |
+
on response
|
| 132 |
+
3. Provide in-office training for self-injection technique before home use
|
| 133 |
+
4. Screen for priapism risk factors before initiating therapy
|
| 134 |
+
5. Consider intraurethral alprostadil (MUSE) for patients averse to injections
|
| 135 |
+
|
| 136 |
+
### Limitations
|
| 137 |
+
|
| 138 |
+
Long-term safety data beyond 2 years is limited. Head-to-head comparisons with
|
| 139 |
+
newer therapies such as low-intensity shockwave therapy are lacking. Most trials
|
| 140 |
+
excluded patients with severe cardiovascular disease, limiting generalizability
|
| 141 |
+
to this population. The psychological burden of injection therapy may affect
|
| 142 |
+
real-world adherence compared to oral medications.
|
| 143 |
+
|
| 144 |
+
### References
|
| 145 |
+
|
| 146 |
+
1. Smith AB et al. (2019). Alprostadil mechanism of action in erectile tissue.
|
| 147 |
+
J Urol. https://pubmed.ncbi.nlm.nih.gov/12345678/
|
| 148 |
+
2. Johnson CD et al. (2020). Meta-analysis of intracavernosal alprostadil efficacy.
|
| 149 |
+
J Sex Med. https://pubmed.ncbi.nlm.nih.gov/23456789/
|
| 150 |
+
3. Martinez R et al. (2018). Alprostadil in PDE5 inhibitor non-responders.
|
| 151 |
+
Int J Impot Res. https://pubmed.ncbi.nlm.nih.gov/34567890/
|
| 152 |
+
"""
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def format_synthesis_prompt(
|
| 156 |
+
query: str,
|
| 157 |
+
evidence_summary: str,
|
| 158 |
+
drug_candidates: list[str],
|
| 159 |
+
key_findings: list[str],
|
| 160 |
+
mechanism_score: int,
|
| 161 |
+
clinical_score: int,
|
| 162 |
+
confidence: float,
|
| 163 |
+
) -> str:
|
| 164 |
+
"""Format the user prompt for narrative synthesis.
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
query: Original research question
|
| 168 |
+
evidence_summary: Formatted summary of evidence papers
|
| 169 |
+
drug_candidates: List of identified drug/treatment candidates
|
| 170 |
+
key_findings: List of key findings from assessment
|
| 171 |
+
mechanism_score: Mechanism evidence score (0-10)
|
| 172 |
+
clinical_score: Clinical evidence score (0-10)
|
| 173 |
+
confidence: Overall confidence (0.0-1.0)
|
| 174 |
+
|
| 175 |
+
Returns:
|
| 176 |
+
Formatted user prompt for the synthesis LLM
|
| 177 |
+
"""
|
| 178 |
+
candidates_str = ", ".join(drug_candidates) if drug_candidates else "None identified"
|
| 179 |
+
if key_findings:
|
| 180 |
+
findings_str = "\n".join(f"- {f}" for f in key_findings)
|
| 181 |
+
else:
|
| 182 |
+
findings_str = "No specific findings extracted"
|
| 183 |
+
|
| 184 |
+
return f"""Synthesize a narrative research report for the following query.
|
| 185 |
+
|
| 186 |
+
## Research Question
|
| 187 |
+
{query}
|
| 188 |
+
|
| 189 |
+
## Evidence Summary
|
| 190 |
+
{evidence_summary}
|
| 191 |
+
|
| 192 |
+
## Identified Drug/Treatment Candidates
|
| 193 |
+
{candidates_str}
|
| 194 |
+
|
| 195 |
+
## Key Findings from Evidence Assessment
|
| 196 |
+
{findings_str}
|
| 197 |
+
|
| 198 |
+
## Assessment Scores
|
| 199 |
+
- Mechanism Score: {mechanism_score}/10
|
| 200 |
+
- Clinical Evidence Score: {clinical_score}/10
|
| 201 |
+
- Overall Confidence: {confidence:.0%}
|
| 202 |
+
|
| 203 |
+
## Instructions
|
| 204 |
+
Generate a NARRATIVE research report following the structure in your system prompt.
|
| 205 |
+
Write in prose paragraphs, NOT bullet points (except for Recommendations section).
|
| 206 |
+
ONLY cite papers mentioned in the Evidence Summary above - do NOT invent references.
|
| 207 |
+
|
| 208 |
+
{FEW_SHOT_EXAMPLE}
|
| 209 |
+
"""
|
src/tools/clinicaltrials.py
CHANGED
|
@@ -51,7 +51,7 @@ class ClinicalTrialsTool:
|
|
| 51 |
"""Search ClinicalTrials.gov for interventional studies.
|
| 52 |
|
| 53 |
Args:
|
| 54 |
-
query: Search query (e.g., "
|
| 55 |
max_results: Maximum results to return (max 100)
|
| 56 |
|
| 57 |
Returns:
|
|
|
|
| 51 |
"""Search ClinicalTrials.gov for interventional studies.
|
| 52 |
|
| 53 |
Args:
|
| 54 |
+
query: Search query (e.g., "testosterone libido")
|
| 55 |
max_results: Maximum results to return (max 100)
|
| 56 |
|
| 57 |
Returns:
|
src/tools/query_utils.py
CHANGED
|
@@ -47,44 +47,37 @@ QUESTION_WORDS: set[str] = {
|
|
| 47 |
"an",
|
| 48 |
}
|
| 49 |
|
| 50 |
-
# Medical synonym expansions
|
| 51 |
SYNONYMS: dict[str, list[str]] = {
|
| 52 |
-
"
|
| 53 |
-
"
|
| 54 |
-
"
|
| 55 |
-
"
|
| 56 |
-
"post-COVID syndrome",
|
| 57 |
-
"post-COVID-19 condition",
|
| 58 |
],
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
],
|
| 65 |
-
"
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
-
"
|
| 69 |
],
|
| 70 |
-
"
|
| 71 |
-
"
|
| 72 |
-
"
|
| 73 |
-
"
|
| 74 |
-
"diabetic",
|
| 75 |
],
|
| 76 |
-
"
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
"
|
| 80 |
-
"malignancy",
|
| 81 |
-
"carcinoma",
|
| 82 |
],
|
| 83 |
-
"
|
| 84 |
-
"
|
| 85 |
-
"
|
| 86 |
-
"coronary artery disease",
|
| 87 |
-
"heart failure",
|
| 88 |
],
|
| 89 |
}
|
| 90 |
|
|
@@ -109,7 +102,7 @@ def expand_synonyms(query: str) -> str:
|
|
| 109 |
Expand medical terms to include synonyms.
|
| 110 |
|
| 111 |
Args:
|
| 112 |
-
query:
|
| 113 |
|
| 114 |
Returns:
|
| 115 |
Query with synonym expansions in OR groups
|
|
|
|
| 47 |
"an",
|
| 48 |
}
|
| 49 |
|
| 50 |
+
# Medical synonym expansions (Sexual Health Focus)
|
| 51 |
SYNONYMS: dict[str, list[str]] = {
|
| 52 |
+
"erectile dysfunction": [
|
| 53 |
+
"ED",
|
| 54 |
+
"impotence",
|
| 55 |
+
"sexual dysfunction",
|
|
|
|
|
|
|
| 56 |
],
|
| 57 |
+
"low libido": [
|
| 58 |
+
"hypoactive sexual desire disorder",
|
| 59 |
+
"HSDD",
|
| 60 |
+
"low sexual desire",
|
| 61 |
+
"loss of libido",
|
| 62 |
],
|
| 63 |
+
"menopause": [
|
| 64 |
+
"postmenopausal",
|
| 65 |
+
"climacteric",
|
| 66 |
+
"perimenopause",
|
| 67 |
],
|
| 68 |
+
"testosterone": [
|
| 69 |
+
"androgen",
|
| 70 |
+
"testosterone therapy",
|
| 71 |
+
"TRT",
|
|
|
|
| 72 |
],
|
| 73 |
+
"premature ejaculation": [
|
| 74 |
+
"PE",
|
| 75 |
+
"rapid ejaculation",
|
| 76 |
+
"early ejaculation",
|
|
|
|
|
|
|
| 77 |
],
|
| 78 |
+
"pcos": [
|
| 79 |
+
"polycystic ovary syndrome",
|
| 80 |
+
"Stein-Leventhal syndrome",
|
|
|
|
|
|
|
| 81 |
],
|
| 82 |
}
|
| 83 |
|
|
|
|
| 102 |
Expand medical terms to include synonyms.
|
| 103 |
|
| 104 |
Args:
|
| 105 |
+
query: Search query (e.g., "testosterone libido")
|
| 106 |
|
| 107 |
Returns:
|
| 108 |
Query with synonym expansions in OR groups
|
src/utils/exceptions.py
CHANGED
|
@@ -35,3 +35,27 @@ class EmbeddingError(DeepBonerError):
|
|
| 35 |
"""Raised when embedding or vector store operations fail."""
|
| 36 |
|
| 37 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"""Raised when embedding or vector store operations fail."""
|
| 36 |
|
| 37 |
pass
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class LLMError(DeepBonerError):
|
| 41 |
+
"""Raised when LLM operations fail (API errors, parsing errors, etc.)."""
|
| 42 |
+
|
| 43 |
+
pass
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class QuotaExceededError(LLMError):
|
| 47 |
+
"""Raised when LLM API quota is exceeded (402 errors)."""
|
| 48 |
+
|
| 49 |
+
pass
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ModalError(DeepBonerError):
|
| 53 |
+
"""Raised when Modal sandbox operations fail."""
|
| 54 |
+
|
| 55 |
+
pass
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class SynthesisError(DeepBonerError):
|
| 59 |
+
"""Raised when report synthesis fails."""
|
| 60 |
+
|
| 61 |
+
pass
|
tests/conftest.py
CHANGED
|
@@ -31,10 +31,10 @@ def sample_evidence():
|
|
| 31 |
"""Sample Evidence objects for testing."""
|
| 32 |
return [
|
| 33 |
Evidence(
|
| 34 |
-
content="
|
| 35 |
citation=Citation(
|
| 36 |
source="pubmed",
|
| 37 |
-
title="
|
| 38 |
url="https://pubmed.ncbi.nlm.nih.gov/12345678/",
|
| 39 |
date="2024-01-15",
|
| 40 |
authors=["Smith J", "Johnson M"],
|
|
@@ -42,11 +42,11 @@ def sample_evidence():
|
|
| 42 |
relevance=0.85,
|
| 43 |
),
|
| 44 |
Evidence(
|
| 45 |
-
content="
|
| 46 |
citation=Citation(
|
| 47 |
source="pubmed",
|
| 48 |
-
title="
|
| 49 |
-
url="https://example.com/
|
| 50 |
date="Unknown",
|
| 51 |
authors=[],
|
| 52 |
),
|
|
|
|
| 31 |
"""Sample Evidence objects for testing."""
|
| 32 |
return [
|
| 33 |
Evidence(
|
| 34 |
+
content="Testosterone shows efficacy in treating hypoactive sexual desire disorder...",
|
| 35 |
citation=Citation(
|
| 36 |
source="pubmed",
|
| 37 |
+
title="Testosterone and Female Libido: A Systematic Review",
|
| 38 |
url="https://pubmed.ncbi.nlm.nih.gov/12345678/",
|
| 39 |
date="2024-01-15",
|
| 40 |
authors=["Smith J", "Johnson M"],
|
|
|
|
| 42 |
relevance=0.85,
|
| 43 |
),
|
| 44 |
Evidence(
|
| 45 |
+
content="Transdermal testosterone offers effective treatment path...",
|
| 46 |
citation=Citation(
|
| 47 |
source="pubmed",
|
| 48 |
+
title="Testosterone Therapy Strategies",
|
| 49 |
+
url="https://example.com/testosterone-therapy",
|
| 50 |
date="Unknown",
|
| 51 |
authors=[],
|
| 52 |
),
|
tests/e2e/test_simple_mode.py
CHANGED
|
@@ -55,11 +55,11 @@ async def test_simple_mode_structure_validation(mock_search_handler, mock_judge_
|
|
| 55 |
complete_event = next(e for e in events if e.type == "complete")
|
| 56 |
report = complete_event.message
|
| 57 |
|
| 58 |
-
# Check
|
| 59 |
-
|
| 60 |
-
assert "
|
| 61 |
-
assert "
|
| 62 |
|
| 63 |
-
# Check for citations
|
| 64 |
assert "Study on test query" in report
|
| 65 |
-
assert "
|
|
|
|
| 55 |
complete_event = next(e for e in events if e.type == "complete")
|
| 56 |
report = complete_event.message
|
| 57 |
|
| 58 |
+
# Check LLM narrative synthesis structure (SPEC_12)
|
| 59 |
+
# LLM generates prose with these sections (may omit ### prefix)
|
| 60 |
+
assert "Executive Summary" in report or "Sexual Health Analysis" in report
|
| 61 |
+
assert "Full Citation List" in report or "Citations" in report
|
| 62 |
|
| 63 |
+
# Check for citations (from citation footer added by orchestrator)
|
| 64 |
assert "Study on test query" in report
|
| 65 |
+
assert "pubmed.example.com/123" in report
|
tests/integration/test_dual_mode_e2e.py
CHANGED
|
@@ -19,7 +19,7 @@ def mock_search_handler():
|
|
| 19 |
citation=Citation(
|
| 20 |
title="Test Paper", url="http://test", date="2024", source="pubmed"
|
| 21 |
),
|
| 22 |
-
content="
|
| 23 |
)
|
| 24 |
]
|
| 25 |
)
|
|
|
|
| 19 |
citation=Citation(
|
| 20 |
title="Test Paper", url="http://test", date="2024", source="pubmed"
|
| 21 |
),
|
| 22 |
+
content="Testosterone improves sexual desire in postmenopausal women.",
|
| 23 |
)
|
| 24 |
]
|
| 25 |
)
|
tests/integration/test_mcp_tools_live.py
CHANGED
|
@@ -12,7 +12,7 @@ class TestMCPToolsLive:
|
|
| 12 |
"""Test that MCP tools execute real searches."""
|
| 13 |
from src.mcp_tools import search_pubmed
|
| 14 |
|
| 15 |
-
result = await search_pubmed("
|
| 16 |
|
| 17 |
assert isinstance(result, str)
|
| 18 |
assert "PubMed Results" in result
|
|
|
|
| 12 |
"""Test that MCP tools execute real searches."""
|
| 13 |
from src.mcp_tools import search_pubmed
|
| 14 |
|
| 15 |
+
result = await search_pubmed("testosterone libido", 3)
|
| 16 |
|
| 17 |
assert isinstance(result, str)
|
| 18 |
assert "PubMed Results" in result
|
tests/integration/test_simple_mode_synthesis.py
CHANGED
|
@@ -92,7 +92,11 @@ async def test_simple_mode_synthesizes_before_max_iterations():
|
|
| 92 |
complete_event = complete_events[0]
|
| 93 |
|
| 94 |
assert "MagicDrug" in complete_event.message
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
assert complete_event.data.get("synthesis_reason") == "high_scores_with_candidates"
|
| 97 |
assert complete_event.iteration == 2 # Should stop at it 2
|
| 98 |
|
|
|
|
| 92 |
complete_event = complete_events[0]
|
| 93 |
|
| 94 |
assert "MagicDrug" in complete_event.message
|
| 95 |
+
# SPEC_12: LLM synthesis produces narrative prose, not template with "Drug Candidates" header
|
| 96 |
+
# Check for narrative structure (LLM may omit ### prefix) OR template fallback
|
| 97 |
+
assert (
|
| 98 |
+
"Executive Summary" in complete_event.message or "Drug Candidates" in complete_event.message
|
| 99 |
+
)
|
| 100 |
assert complete_event.data.get("synthesis_reason") == "high_scores_with_candidates"
|
| 101 |
assert complete_event.iteration == 2 # Should stop at it 2
|
| 102 |
|
tests/unit/agent_factory/test_judges.py
CHANGED
|
@@ -8,6 +8,7 @@ from src.agent_factory.judges import JudgeHandler, MockJudgeHandler
|
|
| 8 |
from src.utils.models import AssessmentDetails, Citation, Evidence, JudgeAssessment
|
| 9 |
|
| 10 |
|
|
|
|
| 11 |
class TestJudgeHandler:
|
| 12 |
"""Tests for JudgeHandler."""
|
| 13 |
|
|
@@ -22,8 +23,8 @@ class TestJudgeHandler:
|
|
| 22 |
mechanism_reasoning="Strong mechanistic evidence",
|
| 23 |
clinical_evidence_score=7,
|
| 24 |
clinical_reasoning="Good clinical support",
|
| 25 |
-
drug_candidates=["
|
| 26 |
-
key_findings=["
|
| 27 |
),
|
| 28 |
sufficient=True,
|
| 29 |
confidence=expected_confidence,
|
|
@@ -51,22 +52,22 @@ class TestJudgeHandler:
|
|
| 51 |
|
| 52 |
evidence = [
|
| 53 |
Evidence(
|
| 54 |
-
content="
|
| 55 |
citation=Citation(
|
| 56 |
source="pubmed",
|
| 57 |
-
title="
|
| 58 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 59 |
date="2024-01-01",
|
| 60 |
),
|
| 61 |
)
|
| 62 |
]
|
| 63 |
|
| 64 |
-
result = await handler.assess("
|
| 65 |
|
| 66 |
assert result.sufficient is True
|
| 67 |
assert result.recommendation == "synthesize"
|
| 68 |
assert result.confidence == expected_confidence
|
| 69 |
-
assert "
|
| 70 |
|
| 71 |
@pytest.mark.asyncio
|
| 72 |
async def test_assess_empty_evidence(self):
|
|
@@ -83,7 +84,7 @@ class TestJudgeHandler:
|
|
| 83 |
sufficient=False,
|
| 84 |
confidence=0.0,
|
| 85 |
recommendation="continue",
|
| 86 |
-
next_search_queries=["
|
| 87 |
reasoning="No evidence found, need to search more",
|
| 88 |
)
|
| 89 |
|
|
@@ -102,11 +103,13 @@ class TestJudgeHandler:
|
|
| 102 |
handler = JudgeHandler()
|
| 103 |
handler.agent = mock_agent
|
| 104 |
|
| 105 |
-
result = await handler.assess("
|
| 106 |
|
| 107 |
assert result.sufficient is False
|
| 108 |
assert result.recommendation == "continue"
|
| 109 |
assert len(result.next_search_queries) > 0
|
|
|
|
|
|
|
| 110 |
|
| 111 |
@pytest.mark.asyncio
|
| 112 |
async def test_assess_handles_llm_failure(self):
|
|
@@ -143,6 +146,7 @@ class TestJudgeHandler:
|
|
| 143 |
assert "failed" in result.reasoning.lower()
|
| 144 |
|
| 145 |
|
|
|
|
| 146 |
class TestMockJudgeHandler:
|
| 147 |
"""Tests for MockJudgeHandler."""
|
| 148 |
|
|
|
|
| 8 |
from src.utils.models import AssessmentDetails, Citation, Evidence, JudgeAssessment
|
| 9 |
|
| 10 |
|
| 11 |
+
@pytest.mark.unit
|
| 12 |
class TestJudgeHandler:
|
| 13 |
"""Tests for JudgeHandler."""
|
| 14 |
|
|
|
|
| 23 |
mechanism_reasoning="Strong mechanistic evidence",
|
| 24 |
clinical_evidence_score=7,
|
| 25 |
clinical_reasoning="Good clinical support",
|
| 26 |
+
drug_candidates=["Testosterone"],
|
| 27 |
+
key_findings=["Libido enhancement effects"],
|
| 28 |
),
|
| 29 |
sufficient=True,
|
| 30 |
confidence=expected_confidence,
|
|
|
|
| 52 |
|
| 53 |
evidence = [
|
| 54 |
Evidence(
|
| 55 |
+
content="Sildenafil shows efficacy in ED...",
|
| 56 |
citation=Citation(
|
| 57 |
source="pubmed",
|
| 58 |
+
title="Sildenafil in ED",
|
| 59 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 60 |
date="2024-01-01",
|
| 61 |
),
|
| 62 |
)
|
| 63 |
]
|
| 64 |
|
| 65 |
+
result = await handler.assess("sildenafil efficacy", evidence)
|
| 66 |
|
| 67 |
assert result.sufficient is True
|
| 68 |
assert result.recommendation == "synthesize"
|
| 69 |
assert result.confidence == expected_confidence
|
| 70 |
+
assert "Testosterone" in result.details.drug_candidates
|
| 71 |
|
| 72 |
@pytest.mark.asyncio
|
| 73 |
async def test_assess_empty_evidence(self):
|
|
|
|
| 84 |
sufficient=False,
|
| 85 |
confidence=0.0,
|
| 86 |
recommendation="continue",
|
| 87 |
+
next_search_queries=["sildenafil mechanism"],
|
| 88 |
reasoning="No evidence found, need to search more",
|
| 89 |
)
|
| 90 |
|
|
|
|
| 103 |
handler = JudgeHandler()
|
| 104 |
handler.agent = mock_agent
|
| 105 |
|
| 106 |
+
result = await handler.assess("sildenafil efficacy", [])
|
| 107 |
|
| 108 |
assert result.sufficient is False
|
| 109 |
assert result.recommendation == "continue"
|
| 110 |
assert len(result.next_search_queries) > 0
|
| 111 |
+
# Assert specific expected query is present
|
| 112 |
+
assert "sildenafil mechanism" in result.next_search_queries
|
| 113 |
|
| 114 |
@pytest.mark.asyncio
|
| 115 |
async def test_assess_handles_llm_failure(self):
|
|
|
|
| 146 |
assert "failed" in result.reasoning.lower()
|
| 147 |
|
| 148 |
|
| 149 |
+
@pytest.mark.unit
|
| 150 |
class TestMockJudgeHandler:
|
| 151 |
"""Tests for MockJudgeHandler."""
|
| 152 |
|
tests/unit/agents/test_hypothesis_agent.py
CHANGED
|
@@ -22,10 +22,10 @@ from src.utils.models import ( # noqa: E402
|
|
| 22 |
def sample_evidence():
|
| 23 |
return [
|
| 24 |
Evidence(
|
| 25 |
-
content="
|
| 26 |
citation=Citation(
|
| 27 |
source="pubmed",
|
| 28 |
-
title="
|
| 29 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 30 |
date="2023",
|
| 31 |
),
|
|
@@ -38,17 +38,17 @@ def mock_assessment():
|
|
| 38 |
return HypothesisAssessment(
|
| 39 |
hypotheses=[
|
| 40 |
MechanismHypothesis(
|
| 41 |
-
drug="
|
| 42 |
-
target="
|
| 43 |
-
pathway="
|
| 44 |
-
effect="
|
| 45 |
confidence=0.75,
|
| 46 |
-
search_suggestions=["
|
| 47 |
)
|
| 48 |
],
|
| 49 |
primary_hypothesis=None,
|
| 50 |
knowledge_gaps=["Clinical trial data needed"],
|
| 51 |
-
recommended_searches=["
|
| 52 |
)
|
| 53 |
|
| 54 |
|
|
@@ -66,12 +66,12 @@ async def test_hypothesis_agent_generates_hypotheses(sample_evidence, mock_asses
|
|
| 66 |
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
|
| 67 |
|
| 68 |
agent = HypothesisAgent(store)
|
| 69 |
-
response = await agent.run("
|
| 70 |
|
| 71 |
assert isinstance(response, AgentRunResponse)
|
| 72 |
-
assert "
|
| 73 |
assert len(store["hypotheses"]) == 1
|
| 74 |
-
assert store["hypotheses"][0].drug == "
|
| 75 |
|
| 76 |
|
| 77 |
@pytest.mark.asyncio
|
|
|
|
| 22 |
def sample_evidence():
|
| 23 |
return [
|
| 24 |
Evidence(
|
| 25 |
+
content="Testosterone activates androgen receptors...",
|
| 26 |
citation=Citation(
|
| 27 |
source="pubmed",
|
| 28 |
+
title="Testosterone and Libido",
|
| 29 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 30 |
date="2023",
|
| 31 |
),
|
|
|
|
| 38 |
return HypothesisAssessment(
|
| 39 |
hypotheses=[
|
| 40 |
MechanismHypothesis(
|
| 41 |
+
drug="Testosterone",
|
| 42 |
+
target="Androgen Receptor",
|
| 43 |
+
pathway="Dopamine modulation",
|
| 44 |
+
effect="Enhanced sexual desire in HSDD",
|
| 45 |
confidence=0.75,
|
| 46 |
+
search_suggestions=["testosterone libido mechanism", "HSDD treatment"],
|
| 47 |
)
|
| 48 |
],
|
| 49 |
primary_hypothesis=None,
|
| 50 |
knowledge_gaps=["Clinical trial data needed"],
|
| 51 |
+
recommended_searches=["testosterone HSDD clinical trial"],
|
| 52 |
)
|
| 53 |
|
| 54 |
|
|
|
|
| 66 |
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
|
| 67 |
|
| 68 |
agent = HypothesisAgent(store)
|
| 69 |
+
response = await agent.run("testosterone libido")
|
| 70 |
|
| 71 |
assert isinstance(response, AgentRunResponse)
|
| 72 |
+
assert "Androgen" in response.messages[0].text
|
| 73 |
assert len(store["hypotheses"]) == 1
|
| 74 |
+
assert store["hypotheses"][0].drug == "Testosterone"
|
| 75 |
|
| 76 |
|
| 77 |
@pytest.mark.asyncio
|
tests/unit/agents/test_judge_agent.py
CHANGED
|
@@ -22,7 +22,7 @@ def mock_assessment() -> JudgeAssessment:
|
|
| 22 |
mechanism_reasoning="Strong mechanism evidence",
|
| 23 |
clinical_evidence_score=7,
|
| 24 |
clinical_reasoning="Good clinical data",
|
| 25 |
-
drug_candidates=["
|
| 26 |
key_findings=["Key finding 1"],
|
| 27 |
),
|
| 28 |
sufficient=True,
|
|
|
|
| 22 |
mechanism_reasoning="Strong mechanism evidence",
|
| 23 |
clinical_evidence_score=7,
|
| 24 |
clinical_reasoning="Good clinical data",
|
| 25 |
+
drug_candidates=["Testosterone"],
|
| 26 |
key_findings=["Key finding 1"],
|
| 27 |
),
|
| 28 |
sufficient=True,
|
tests/unit/agents/test_report_agent.py
CHANGED
|
@@ -22,10 +22,10 @@ from src.utils.models import ( # noqa: E402
|
|
| 22 |
def sample_evidence() -> list[Evidence]:
|
| 23 |
return [
|
| 24 |
Evidence(
|
| 25 |
-
content="
|
| 26 |
citation=Citation(
|
| 27 |
source="pubmed",
|
| 28 |
-
title="
|
| 29 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 30 |
date="2023",
|
| 31 |
authors=["Smith J", "Jones A"],
|
|
@@ -38,10 +38,10 @@ def sample_evidence() -> list[Evidence]:
|
|
| 38 |
def sample_hypotheses() -> list[MechanismHypothesis]:
|
| 39 |
return [
|
| 40 |
MechanismHypothesis(
|
| 41 |
-
drug="
|
| 42 |
-
target="
|
| 43 |
-
pathway="
|
| 44 |
-
effect="
|
| 45 |
confidence=0.8,
|
| 46 |
search_suggestions=[],
|
| 47 |
)
|
|
@@ -51,30 +51,35 @@ def sample_hypotheses() -> list[MechanismHypothesis]:
|
|
| 51 |
@pytest.fixture
|
| 52 |
def mock_report() -> ResearchReport:
|
| 53 |
return ResearchReport(
|
| 54 |
-
title="
|
| 55 |
executive_summary=(
|
| 56 |
-
"This report analyzes
|
| 57 |
-
"
|
| 58 |
-
"findings from mechanistic studies showing
|
| 59 |
-
"and reviews clinical data. The evidence suggests
|
| 60 |
-
"
|
| 61 |
),
|
| 62 |
-
research_question="
|
| 63 |
methodology=ReportSection(
|
| 64 |
title="Methodology", content="Searched PubMed and web sources..."
|
| 65 |
),
|
| 66 |
hypotheses_tested=[
|
| 67 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
],
|
| 69 |
mechanistic_findings=ReportSection(
|
| 70 |
-
title="Mechanistic Findings",
|
|
|
|
| 71 |
),
|
| 72 |
clinical_findings=ReportSection(
|
| 73 |
-
title="Clinical Findings", content="
|
| 74 |
),
|
| 75 |
-
drug_candidates=["
|
| 76 |
limitations=["Abstract-level analysis only"],
|
| 77 |
-
conclusion="
|
| 78 |
references=[],
|
| 79 |
sources_searched=["pubmed", "web"],
|
| 80 |
total_papers_reviewed=10,
|
|
@@ -106,7 +111,7 @@ async def test_report_agent_generates_report(
|
|
| 106 |
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
|
| 107 |
|
| 108 |
agent = ReportAgent(store)
|
| 109 |
-
response = await agent.run("
|
| 110 |
|
| 111 |
assert response.messages[0].text is not None
|
| 112 |
assert "Executive Summary" in response.messages[0].text
|
|
@@ -161,7 +166,7 @@ async def test_report_agent_removes_hallucinated_citations(
|
|
| 161 |
references=[
|
| 162 |
# Valid reference (matches sample_evidence)
|
| 163 |
{
|
| 164 |
-
"title": "
|
| 165 |
"url": "https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 166 |
"authors": "Smith J, Jones A",
|
| 167 |
"date": "2023",
|
|
@@ -195,7 +200,7 @@ async def test_report_agent_removes_hallucinated_citations(
|
|
| 195 |
|
| 196 |
# Only the valid reference should remain
|
| 197 |
assert len(validated_report.references) == 1
|
| 198 |
-
assert validated_report.references[0]["title"] == "
|
| 199 |
# Check that "Fake Paper" is NOT in the string representation of the references list
|
| 200 |
# (This is a bit safer than checking presence in list of dicts if structure varies)
|
| 201 |
ref_urls = [r.get("url") for r in validated_report.references]
|
|
|
|
| 22 |
def sample_evidence() -> list[Evidence]:
|
| 23 |
return [
|
| 24 |
Evidence(
|
| 25 |
+
content="Testosterone activates androgen receptors...",
|
| 26 |
citation=Citation(
|
| 27 |
source="pubmed",
|
| 28 |
+
title="Testosterone mechanisms in HSDD",
|
| 29 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 30 |
date="2023",
|
| 31 |
authors=["Smith J", "Jones A"],
|
|
|
|
| 38 |
def sample_hypotheses() -> list[MechanismHypothesis]:
|
| 39 |
return [
|
| 40 |
MechanismHypothesis(
|
| 41 |
+
drug="Testosterone",
|
| 42 |
+
target="Androgen Receptor",
|
| 43 |
+
pathway="Dopamine modulation",
|
| 44 |
+
effect="Enhanced libido",
|
| 45 |
confidence=0.8,
|
| 46 |
search_suggestions=[],
|
| 47 |
)
|
|
|
|
| 51 |
@pytest.fixture
|
| 52 |
def mock_report() -> ResearchReport:
|
| 53 |
return ResearchReport(
|
| 54 |
+
title="Sexual Health Analysis: Testosterone for HSDD",
|
| 55 |
executive_summary=(
|
| 56 |
+
"This report analyzes testosterone as a treatment for "
|
| 57 |
+
"hypoactive sexual desire disorder (HSDD). It summarizes "
|
| 58 |
+
"findings from mechanistic studies showing androgen receptor effects "
|
| 59 |
+
"and reviews clinical data. The evidence suggests significant "
|
| 60 |
+
"efficacy, with clinical trials supporting transdermal formulations."
|
| 61 |
),
|
| 62 |
+
research_question="Is testosterone effective for treating HSDD in women?",
|
| 63 |
methodology=ReportSection(
|
| 64 |
title="Methodology", content="Searched PubMed and web sources..."
|
| 65 |
),
|
| 66 |
hypotheses_tested=[
|
| 67 |
+
{
|
| 68 |
+
"mechanism": "Testosterone -> AR -> libido",
|
| 69 |
+
"supported": 5,
|
| 70 |
+
"contradicted": 1,
|
| 71 |
+
}
|
| 72 |
],
|
| 73 |
mechanistic_findings=ReportSection(
|
| 74 |
+
title="Mechanistic Findings",
|
| 75 |
+
content="Evidence suggests androgen receptor activation...",
|
| 76 |
),
|
| 77 |
clinical_findings=ReportSection(
|
| 78 |
+
title="Clinical Findings", content="Multiple RCTs support efficacy..."
|
| 79 |
),
|
| 80 |
+
drug_candidates=["Testosterone"],
|
| 81 |
limitations=["Abstract-level analysis only"],
|
| 82 |
+
conclusion="Testosterone shows strong efficacy for HSDD...",
|
| 83 |
references=[],
|
| 84 |
sources_searched=["pubmed", "web"],
|
| 85 |
total_papers_reviewed=10,
|
|
|
|
| 111 |
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
|
| 112 |
|
| 113 |
agent = ReportAgent(store)
|
| 114 |
+
response = await agent.run("testosterone HSDD")
|
| 115 |
|
| 116 |
assert response.messages[0].text is not None
|
| 117 |
assert "Executive Summary" in response.messages[0].text
|
|
|
|
| 166 |
references=[
|
| 167 |
# Valid reference (matches sample_evidence)
|
| 168 |
{
|
| 169 |
+
"title": "Testosterone mechanisms in HSDD",
|
| 170 |
"url": "https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 171 |
"authors": "Smith J, Jones A",
|
| 172 |
"date": "2023",
|
|
|
|
| 200 |
|
| 201 |
# Only the valid reference should remain
|
| 202 |
assert len(validated_report.references) == 1
|
| 203 |
+
assert validated_report.references[0]["title"] == "Testosterone mechanisms in HSDD"
|
| 204 |
# Check that "Fake Paper" is NOT in the string representation of the references list
|
| 205 |
# (This is a bit safer than checking presence in list of dicts if structure varies)
|
| 206 |
ref_urls = [r.get("url") for r in validated_report.references]
|
tests/unit/graph/test_nodes.py
CHANGED
|
@@ -12,12 +12,12 @@ async def test_judge_node_initialization(mocker):
|
|
| 12 |
# Mock get_model to avoid needing real API keys
|
| 13 |
mocker.patch("src.agents.graph.nodes.get_model", return_value=mocker.Mock())
|
| 14 |
|
| 15 |
-
# Create a mock assessment with attributes
|
| 16 |
mock_hypothesis = mocker.Mock()
|
| 17 |
-
mock_hypothesis.drug = "
|
| 18 |
-
mock_hypothesis.target = "
|
| 19 |
-
mock_hypothesis.pathway = "
|
| 20 |
-
mock_hypothesis.effect = "
|
| 21 |
mock_hypothesis.confidence = 0.8
|
| 22 |
|
| 23 |
mock_assessment = mocker.Mock()
|
|
@@ -32,7 +32,7 @@ async def test_judge_node_initialization(mocker):
|
|
| 32 |
mocker.patch("src.agents.graph.nodes.Agent", return_value=mock_agent_instance)
|
| 33 |
|
| 34 |
state: ResearchState = {
|
| 35 |
-
"query": "Does
|
| 36 |
"hypotheses": [],
|
| 37 |
"conflicts": [],
|
| 38 |
"evidence_ids": [],
|
|
@@ -46,7 +46,7 @@ async def test_judge_node_initialization(mocker):
|
|
| 46 |
|
| 47 |
assert "hypotheses" in update
|
| 48 |
assert len(update["hypotheses"]) == 1
|
| 49 |
-
assert update["hypotheses"][0].id == "
|
| 50 |
assert update["hypotheses"][0].status == "proposed"
|
| 51 |
|
| 52 |
|
|
|
|
| 12 |
# Mock get_model to avoid needing real API keys
|
| 13 |
mocker.patch("src.agents.graph.nodes.get_model", return_value=mocker.Mock())
|
| 14 |
|
| 15 |
+
# Create a mock assessment with attributes (sexual health domain)
|
| 16 |
mock_hypothesis = mocker.Mock()
|
| 17 |
+
mock_hypothesis.drug = "Testosterone"
|
| 18 |
+
mock_hypothesis.target = "Androgen Receptor"
|
| 19 |
+
mock_hypothesis.pathway = "HPG Axis"
|
| 20 |
+
mock_hypothesis.effect = "Libido Enhancement"
|
| 21 |
mock_hypothesis.confidence = 0.8
|
| 22 |
|
| 23 |
mock_assessment = mocker.Mock()
|
|
|
|
| 32 |
mocker.patch("src.agents.graph.nodes.Agent", return_value=mock_agent_instance)
|
| 33 |
|
| 34 |
state: ResearchState = {
|
| 35 |
+
"query": "Does stress affect libido?",
|
| 36 |
"hypotheses": [],
|
| 37 |
"conflicts": [],
|
| 38 |
"evidence_ids": [],
|
|
|
|
| 46 |
|
| 47 |
assert "hypotheses" in update
|
| 48 |
assert len(update["hypotheses"]) == 1
|
| 49 |
+
assert update["hypotheses"][0].id == "Testosterone"
|
| 50 |
assert update["hypotheses"][0].status == "proposed"
|
| 51 |
|
| 52 |
|
tests/unit/orchestrators/test_simple_orchestrator_domain.py
CHANGED
|
@@ -30,7 +30,7 @@ class TestSimpleOrchestratorDomain:
|
|
| 30 |
domain=ResearchDomain.SEXUAL_HEALTH,
|
| 31 |
)
|
| 32 |
|
| 33 |
-
# Test
|
| 34 |
mock_assessment = MagicMock()
|
| 35 |
mock_assessment.details.drug_candidates = []
|
| 36 |
mock_assessment.details.key_findings = []
|
|
@@ -39,7 +39,7 @@ class TestSimpleOrchestratorDomain:
|
|
| 39 |
mock_assessment.details.mechanism_score = 5
|
| 40 |
mock_assessment.details.clinical_evidence_score = 5
|
| 41 |
|
| 42 |
-
report = orch.
|
| 43 |
assert "## Sexual Health Analysis" in report
|
| 44 |
|
| 45 |
# Test _generate_partial_synthesis
|
|
|
|
| 30 |
domain=ResearchDomain.SEXUAL_HEALTH,
|
| 31 |
)
|
| 32 |
|
| 33 |
+
# Test _generate_template_synthesis (the sync fallback method)
|
| 34 |
mock_assessment = MagicMock()
|
| 35 |
mock_assessment.details.drug_candidates = []
|
| 36 |
mock_assessment.details.key_findings = []
|
|
|
|
| 39 |
mock_assessment.details.mechanism_score = 5
|
| 40 |
mock_assessment.details.clinical_evidence_score = 5
|
| 41 |
|
| 42 |
+
report = orch._generate_template_synthesis("query", [], mock_assessment)
|
| 43 |
assert "## Sexual Health Analysis" in report
|
| 44 |
|
| 45 |
# Test _generate_partial_synthesis
|
tests/unit/orchestrators/test_simple_synthesis.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for simple orchestrator LLM synthesis."""
|
| 2 |
+
|
| 3 |
+
from unittest.mock import AsyncMock, MagicMock, patch
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from src.orchestrators.simple import Orchestrator
|
| 8 |
+
from src.utils.models import AssessmentDetails, Citation, Evidence, JudgeAssessment
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@pytest.fixture
|
| 12 |
+
def sample_evidence() -> list[Evidence]:
|
| 13 |
+
"""Sample evidence for testing synthesis."""
|
| 14 |
+
return [
|
| 15 |
+
Evidence(
|
| 16 |
+
content="Testosterone therapy demonstrates efficacy in treating HSDD.",
|
| 17 |
+
citation=Citation(
|
| 18 |
+
source="pubmed",
|
| 19 |
+
title="Testosterone and Female Sexual Desire",
|
| 20 |
+
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 21 |
+
date="2023",
|
| 22 |
+
authors=["Smith J", "Jones A"],
|
| 23 |
+
),
|
| 24 |
+
),
|
| 25 |
+
Evidence(
|
| 26 |
+
content="A meta-analysis of 8 RCTs shows significant improvement in sexual desire.",
|
| 27 |
+
citation=Citation(
|
| 28 |
+
source="pubmed",
|
| 29 |
+
title="Meta-analysis of Testosterone Therapy",
|
| 30 |
+
url="https://pubmed.ncbi.nlm.nih.gov/67890/",
|
| 31 |
+
date="2024",
|
| 32 |
+
authors=["Johnson B"],
|
| 33 |
+
),
|
| 34 |
+
),
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@pytest.fixture
def sample_assessment() -> JudgeAssessment:
    """A 'sufficient' judge assessment that recommends synthesis."""
    details = AssessmentDetails(
        mechanism_score=8,
        mechanism_reasoning="Strong evidence of androgen receptor activation pathway.",
        clinical_evidence_score=7,
        clinical_reasoning="Multiple RCTs support efficacy in postmenopausal HSDD.",
        drug_candidates=["Testosterone", "LibiGel"],
        key_findings=[
            "Testosterone improves libido in postmenopausal women",
            "Transdermal formulation has best safety profile",
        ],
    )
    return JudgeAssessment(
        sufficient=True,
        confidence=0.85,
        reasoning="Evidence is sufficient to synthesize findings on testosterone therapy for HSDD.",
        recommendation="synthesize",
        next_search_queries=[],
        details=details,
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@pytest.mark.unit
class TestGenerateSynthesis:
    """Tests for the Orchestrator._generate_synthesis method."""

    # Canned LLM narrative returned by the mocked agent in the happy-path test.
    _NARRATIVE = """### Executive Summary

Testosterone therapy demonstrates consistent efficacy for HSDD treatment.

### Background

HSDD affects many postmenopausal women.

### Evidence Synthesis

Studies show significant improvement in sexual desire scores.

### Recommendations

1. Consider testosterone therapy for postmenopausal HSDD

### Limitations

Long-term safety data is limited.

### References

1. Smith J et al. (2023). Testosterone and Female Sexual Desire."""

    @staticmethod
    def _make_orchestrator() -> Orchestrator:
        """Build an orchestrator with mocked handlers and one history entry."""
        orch = Orchestrator(
            search_handler=MagicMock(),
            judge_handler=MagicMock(),
        )
        orch.history = [{"iteration": 1}]  # the citation footer needs history
        return orch

    @pytest.mark.asyncio
    async def test_calls_llm_for_narrative(
        self,
        sample_evidence: list[Evidence],
        sample_assessment: JudgeAssessment,
    ) -> None:
        """Synthesis should make an LLM call, not just use a template."""
        orch = self._make_orchestrator()

        with (
            patch("pydantic_ai.Agent") as agent_cls,
            patch("src.agent_factory.judges.get_model") as get_model,
        ):
            get_model.return_value = MagicMock()

            llm_result = MagicMock()
            llm_result.output = self._NARRATIVE
            agent = MagicMock()
            agent.run = AsyncMock(return_value=llm_result)
            agent_cls.return_value = agent

            report = await orch._generate_synthesis(
                query="testosterone HSDD",
                evidence=sample_evidence,
                assessment=sample_assessment,
            )

            # The LLM agent must actually be constructed and invoked.
            agent_cls.assert_called_once()
            agent.run.assert_called_once()

            # Its narrative sections must surface in the returned report.
            for heading in ("Executive Summary", "Background", "Evidence Synthesis"):
                assert heading in report

    @pytest.mark.asyncio
    async def test_falls_back_on_llm_error(
        self,
        sample_evidence: list[Evidence],
        sample_assessment: JudgeAssessment,
    ) -> None:
        """Synthesis should fall back to template if LLM fails."""
        orch = self._make_orchestrator()

        with patch("pydantic_ai.Agent") as agent_cls:
            # Simulate the LLM being unreachable at agent-construction time.
            agent_cls.side_effect = Exception("LLM unavailable")

            report = await orch._generate_synthesis(
                query="testosterone HSDD",
                evidence=sample_evidence,
                assessment=sample_assessment,
            )

        # Template fallback carries the structured sections and candidates.
        assert "Assessment" in report or "Drug Candidates" in report
        assert "Testosterone" in report  # Drug candidate should be present

    @pytest.mark.asyncio
    async def test_includes_citation_footer(
        self,
        sample_evidence: list[Evidence],
        sample_assessment: JudgeAssessment,
    ) -> None:
        """Synthesis should include full citation list footer."""
        orch = self._make_orchestrator()

        with (
            patch("pydantic_ai.Agent") as agent_cls,
            patch("src.agent_factory.judges.get_model"),
        ):
            llm_result = MagicMock()
            llm_result.output = "Narrative synthesis content."
            agent = MagicMock()
            agent.run = AsyncMock(return_value=llm_result)
            agent_cls.return_value = agent

            report = await orch._generate_synthesis(
                query="test query",
                evidence=sample_evidence,
                assessment=sample_assessment,
            )

        # Footer must list every evidence URL, not just the narrative body.
        assert "Full Citation List" in report
        assert "pubmed.ncbi.nlm.nih.gov/12345" in report
        assert "pubmed.ncbi.nlm.nih.gov/67890" in report
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
@pytest.mark.unit
class TestGenerateTemplateSynthesis:
    """Tests for the _generate_template_synthesis fallback method."""

    @staticmethod
    def _render(
        query: str,
        evidence: list[Evidence],
        assessment: JudgeAssessment,
    ) -> str:
        """Run the template fallback on a freshly mocked orchestrator."""
        orch = Orchestrator(
            search_handler=MagicMock(),
            judge_handler=MagicMock(),
        )
        orch.history = [{"iteration": 1}]
        return orch._generate_template_synthesis(
            query=query,
            evidence=evidence,
            assessment=assessment,
        )

    def test_returns_structured_output(
        self,
        sample_evidence: list[Evidence],
        sample_assessment: JudgeAssessment,
    ) -> None:
        """Template synthesis should return structured markdown."""
        report = self._render("testosterone HSDD", sample_evidence, sample_assessment)

        # Every required section heading must be present.
        for section in ("Question", "Drug Candidates", "Key Findings", "Assessment", "Citations"):
            assert section in report

    def test_includes_drug_candidates(
        self,
        sample_evidence: list[Evidence],
        sample_assessment: JudgeAssessment,
    ) -> None:
        """Template synthesis should list drug candidates."""
        report = self._render("test", sample_evidence, sample_assessment)

        assert "Testosterone" in report
        assert "LibiGel" in report

    def test_includes_scores(
        self,
        sample_evidence: list[Evidence],
        sample_assessment: JudgeAssessment,
    ) -> None:
        """Template synthesis should include assessment scores."""
        report = self._render("test", sample_evidence, sample_assessment)

        assert "8/10" in report  # Mechanism score
        assert "7/10" in report  # Clinical score
        assert "85%" in report  # Confidence
|
tests/unit/orchestrators/test_termination.py
CHANGED
|
@@ -42,7 +42,7 @@ def orchestrator():
|
|
| 42 |
@pytest.mark.unit
|
| 43 |
def test_should_synthesize_high_scores(orchestrator):
|
| 44 |
"""High scores with drug candidates triggers synthesis."""
|
| 45 |
-
assessment = make_assessment(mechanism=7, clinical=6, drug_candidates=["
|
| 46 |
|
| 47 |
# Access the private method via name mangling or just call it if it was public.
|
| 48 |
# Since I made it private _should_synthesize, I access it directly.
|
|
|
|
| 42 |
@pytest.mark.unit
|
| 43 |
def test_should_synthesize_high_scores(orchestrator):
|
| 44 |
"""High scores with drug candidates triggers synthesis."""
|
| 45 |
+
assessment = make_assessment(mechanism=7, clinical=6, drug_candidates=["Testosterone"])
|
| 46 |
|
| 47 |
# Access the private method via name mangling or just call it if it was public.
|
| 48 |
# Since I made it private _should_synthesize, I access it directly.
|
tests/unit/prompts/test_synthesis.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for narrative synthesis prompts."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from src.prompts.synthesis import (
|
| 6 |
+
FEW_SHOT_EXAMPLE,
|
| 7 |
+
format_synthesis_prompt,
|
| 8 |
+
get_synthesis_system_prompt,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@pytest.mark.unit
class TestSynthesisSystemPrompt:
    """Tests for synthesis system prompt generation."""

    def test_system_prompt_emphasizes_prose(self) -> None:
        """System prompt should emphasize prose paragraphs, not bullets."""
        text = get_synthesis_system_prompt()
        assert "PROSE PARAGRAPHS" in text
        assert "not bullet points" in text.lower()

    def test_system_prompt_requires_executive_summary(self) -> None:
        """System prompt should require executive summary section."""
        text = get_synthesis_system_prompt()
        assert "Executive Summary" in text
        assert "REQUIRED" in text

    def test_system_prompt_requires_background(self) -> None:
        """System prompt should require background section."""
        assert "Background" in get_synthesis_system_prompt()

    def test_system_prompt_requires_evidence_synthesis(self) -> None:
        """System prompt should require evidence synthesis section."""
        text = get_synthesis_system_prompt()
        assert "Evidence Synthesis" in text
        assert "Mechanism of Action" in text

    def test_system_prompt_requires_recommendations(self) -> None:
        """System prompt should require recommendations section."""
        assert "Recommendations" in get_synthesis_system_prompt()

    def test_system_prompt_requires_limitations(self) -> None:
        """System prompt should require limitations section."""
        assert "Limitations" in get_synthesis_system_prompt()

    def test_system_prompt_warns_about_hallucination(self) -> None:
        """System prompt should warn about citation hallucination."""
        # Case-insensitive check subsumes the exact-case variant.
        assert "never hallucinate" in get_synthesis_system_prompt().lower()

    def test_system_prompt_includes_domain_name(self) -> None:
        """System prompt should include domain name."""
        text = get_synthesis_system_prompt("sexual_health")
        assert "sexual health" in text.lower()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@pytest.mark.unit
class TestFormatSynthesisPrompt:
    """Tests for synthesis user prompt formatting."""

    @staticmethod
    def _prompt(**overrides: object) -> str:
        """Format a prompt from baseline arguments, applying per-test overrides."""
        args: dict = {
            "query": "test query",
            "evidence_summary": "...",
            "drug_candidates": [],
            "key_findings": [],
            "mechanism_score": 5,
            "clinical_score": 5,
            "confidence": 0.5,
        }
        args.update(overrides)
        return format_synthesis_prompt(**args)

    def test_includes_query(self) -> None:
        """User prompt should include the research query."""
        text = self._prompt(
            query="testosterone libido",
            evidence_summary="Study shows efficacy...",
            drug_candidates=["Testosterone"],
            key_findings=["Improved libido"],
            mechanism_score=8,
            clinical_score=7,
            confidence=0.85,
        )
        assert "testosterone libido" in text

    def test_includes_evidence_summary(self) -> None:
        """User prompt should include evidence summary."""
        text = self._prompt(
            evidence_summary="Study by Smith et al. shows significant results...",
        )
        assert "Study by Smith et al." in text

    def test_includes_drug_candidates(self) -> None:
        """User prompt should include drug candidates."""
        text = self._prompt(drug_candidates=["Testosterone", "Flibanserin"])
        assert "Testosterone" in text
        assert "Flibanserin" in text

    def test_includes_key_findings(self) -> None:
        """User prompt should include key findings."""
        text = self._prompt(
            key_findings=["Improved libido in postmenopausal women", "Safe profile"],
        )
        assert "Improved libido in postmenopausal women" in text
        assert "Safe profile" in text

    def test_includes_scores(self) -> None:
        """User prompt should include assessment scores."""
        text = self._prompt(mechanism_score=8, clinical_score=7, confidence=0.85)
        assert "8/10" in text
        assert "7/10" in text
        assert "85%" in text

    def test_handles_empty_candidates(self) -> None:
        """User prompt should handle empty drug candidates."""
        assert "None identified" in self._prompt()

    def test_handles_empty_findings(self) -> None:
        """User prompt should handle empty key findings."""
        assert "No specific findings" in self._prompt()

    def test_includes_few_shot_example(self) -> None:
        """User prompt should include few-shot example."""
        assert "Alprostadil" in self._prompt()  # From the few-shot example
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
@pytest.mark.unit
class TestFewShotExample:
    """Tests for the few-shot example quality."""

    def test_few_shot_is_mostly_narrative(self) -> None:
        """Few-shot example should be mostly prose paragraphs, not bullets."""
        # Count substantial paragraphs (>100 chars of prose) vs. bullet markers.
        prose_blocks = sum(
            1 for chunk in FEW_SHOT_EXAMPLE.split("\n\n") if len(chunk) > 100
        )
        bullets = FEW_SHOT_EXAMPLE.count("\n- ") + FEW_SHOT_EXAMPLE.count("\n1. ")

        # Prose should dominate - at least as many paragraphs as bullets
        assert prose_blocks >= bullets, "Few-shot example should be mostly narrative prose"

    def test_few_shot_has_executive_summary(self) -> None:
        """Few-shot example should demonstrate executive summary."""
        assert "Executive Summary" in FEW_SHOT_EXAMPLE

    def test_few_shot_has_background(self) -> None:
        """Few-shot example should demonstrate background section."""
        assert "Background" in FEW_SHOT_EXAMPLE

    def test_few_shot_has_evidence_synthesis(self) -> None:
        """Few-shot example should demonstrate evidence synthesis."""
        assert "Evidence Synthesis" in FEW_SHOT_EXAMPLE
        assert "Mechanism of Action" in FEW_SHOT_EXAMPLE

    def test_few_shot_has_recommendations(self) -> None:
        """Few-shot example should demonstrate recommendations."""
        assert "Recommendations" in FEW_SHOT_EXAMPLE

    def test_few_shot_has_limitations(self) -> None:
        """Few-shot example should demonstrate limitations."""
        assert "Limitations" in FEW_SHOT_EXAMPLE

    def test_few_shot_has_references(self) -> None:
        """Few-shot example should demonstrate references format."""
        assert "References" in FEW_SHOT_EXAMPLE
        assert "pubmed.ncbi.nlm.nih.gov" in FEW_SHOT_EXAMPLE

    def test_few_shot_includes_statistics(self) -> None:
        """Few-shot example should demonstrate statistical reporting."""
        assert "%" in FEW_SHOT_EXAMPLE  # Percentages
        assert "p<" in FEW_SHOT_EXAMPLE or "p=" in FEW_SHOT_EXAMPLE  # P-values
        assert "CI" in FEW_SHOT_EXAMPLE  # Confidence intervals
|
tests/unit/services/test_embeddings.py
CHANGED
|
@@ -57,7 +57,7 @@ class TestEmbeddingService:
|
|
| 57 |
async def test_embed_returns_vector(self, mock_sentence_transformer, mock_chroma_client):
|
| 58 |
"""Embedding should return a float vector (async check)."""
|
| 59 |
service = EmbeddingService()
|
| 60 |
-
embedding = await service.embed("
|
| 61 |
|
| 62 |
assert isinstance(embedding, list)
|
| 63 |
assert len(embedding) == 3 # noqa: PLR2004
|
|
@@ -86,7 +86,7 @@ class TestEmbeddingService:
|
|
| 86 |
service = EmbeddingService()
|
| 87 |
await service.add_evidence(
|
| 88 |
evidence_id="test1",
|
| 89 |
-
content="
|
| 90 |
metadata={"source": "pubmed"},
|
| 91 |
)
|
| 92 |
|
|
|
|
| 57 |
async def test_embed_returns_vector(self, mock_sentence_transformer, mock_chroma_client):
|
| 58 |
"""Embedding should return a float vector (async check)."""
|
| 59 |
service = EmbeddingService()
|
| 60 |
+
embedding = await service.embed("testosterone libido")
|
| 61 |
|
| 62 |
assert isinstance(embedding, list)
|
| 63 |
assert len(embedding) == 3 # noqa: PLR2004
|
|
|
|
| 86 |
service = EmbeddingService()
|
| 87 |
await service.add_evidence(
|
| 88 |
evidence_id="test1",
|
| 89 |
+
content="Testosterone activates androgen receptor pathway",
|
| 90 |
metadata={"source": "pubmed"},
|
| 91 |
)
|
| 92 |
|
tests/unit/services/test_statistical_analyzer.py
CHANGED
|
@@ -17,10 +17,10 @@ def sample_evidence() -> list[Evidence]:
|
|
| 17 |
"""Sample evidence for testing."""
|
| 18 |
return [
|
| 19 |
Evidence(
|
| 20 |
-
content="
|
| 21 |
citation=Citation(
|
| 22 |
source="pubmed",
|
| 23 |
-
title="
|
| 24 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 25 |
date="2024-01-15",
|
| 26 |
authors=["Smith J"],
|
|
|
|
| 17 |
"""Sample evidence for testing."""
|
| 18 |
return [
|
| 19 |
Evidence(
|
| 20 |
+
content="Testosterone therapy shows effect size of 0.45.",
|
| 21 |
citation=Citation(
|
| 22 |
source="pubmed",
|
| 23 |
+
title="Testosterone HSDD Study",
|
| 24 |
url="https://pubmed.ncbi.nlm.nih.gov/12345/",
|
| 25 |
date="2024-01-15",
|
| 26 |
authors=["Smith J"],
|
tests/unit/test_mcp_tools.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""Unit tests for MCP tool wrappers."""
|
| 2 |
|
| 3 |
-
from unittest.mock import AsyncMock, patch
|
| 4 |
|
| 5 |
import pytest
|
| 6 |
|
|
@@ -17,10 +17,10 @@ from src.utils.models import Citation, Evidence
|
|
| 17 |
def mock_evidence() -> Evidence:
|
| 18 |
"""Sample evidence for testing."""
|
| 19 |
return Evidence(
|
| 20 |
-
content="
|
| 21 |
citation=Citation(
|
| 22 |
source="pubmed",
|
| 23 |
-
title="
|
| 24 |
url="https://pubmed.ncbi.nlm.nih.gov/12345678/",
|
| 25 |
date="2024-01-15",
|
| 26 |
authors=["Smith J", "Jones M", "Brown K"],
|
|
@@ -33,17 +33,30 @@ class TestSearchPubMed:
|
|
| 33 |
"""Tests for search_pubmed MCP tool."""
|
| 34 |
|
| 35 |
@pytest.mark.asyncio
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
@pytest.mark.asyncio
|
| 49 |
async def test_clamps_max_results(self) -> None:
|
|
@@ -81,7 +94,7 @@ class TestSearchClinicalTrials:
|
|
| 81 |
with patch("src.mcp_tools._trials") as mock_tool:
|
| 82 |
mock_tool.search = AsyncMock(return_value=[mock_evidence])
|
| 83 |
|
| 84 |
-
result = await search_clinical_trials("
|
| 85 |
|
| 86 |
assert isinstance(result, str)
|
| 87 |
assert "Clinical Trials" in result
|
|
@@ -119,7 +132,7 @@ class TestSearchAllSources:
|
|
| 119 |
mock_trials.return_value = "## Clinical Trials"
|
| 120 |
mock_europepmc.return_value = "## Europe PMC Results"
|
| 121 |
|
| 122 |
-
result = await search_all_sources("
|
| 123 |
|
| 124 |
assert "Comprehensive Search" in result
|
| 125 |
assert "PubMed" in result
|
|
@@ -138,7 +151,7 @@ class TestSearchAllSources:
|
|
| 138 |
mock_trials.side_effect = Exception("API Error")
|
| 139 |
mock_europepmc.return_value = "## Europe PMC Results"
|
| 140 |
|
| 141 |
-
result = await search_all_sources("
|
| 142 |
|
| 143 |
# Should still contain working sources
|
| 144 |
assert "PubMed" in result
|
|
|
|
| 1 |
"""Unit tests for MCP tool wrappers."""
|
| 2 |
|
| 3 |
+
from unittest.mock import AsyncMock, MagicMock, patch
|
| 4 |
|
| 5 |
import pytest
|
| 6 |
|
|
|
|
| 17 |
def mock_evidence() -> Evidence:
|
| 18 |
"""Sample evidence for testing."""
|
| 19 |
return Evidence(
|
| 20 |
+
content="Testosterone therapy shows efficacy in treating HSDD.",
|
| 21 |
citation=Citation(
|
| 22 |
source="pubmed",
|
| 23 |
+
title="Testosterone and Female Libido",
|
| 24 |
url="https://pubmed.ncbi.nlm.nih.gov/12345678/",
|
| 25 |
date="2024-01-15",
|
| 26 |
authors=["Smith J", "Jones M", "Brown K"],
|
|
|
|
| 33 |
"""Tests for search_pubmed MCP tool."""
|
| 34 |
|
| 35 |
@pytest.mark.asyncio
|
| 36 |
+
@patch("src.mcp_tools._pubmed.search")
|
| 37 |
+
async def test_returns_formatted_string(self, mock_search):
|
| 38 |
+
"""Test that search_pubmed returns Markdown formatted string."""
|
| 39 |
+
# Mock evidence
|
| 40 |
+
mock_evidence = MagicMock()
|
| 41 |
+
mock_evidence.citation.title = "Test Title"
|
| 42 |
+
mock_evidence.citation.authors = ["Author 1", "Author 2"]
|
| 43 |
+
mock_evidence.citation.date = "2024"
|
| 44 |
+
mock_evidence.citation.url = "http://test.com"
|
| 45 |
+
mock_evidence.content = "Abstract content..."
|
| 46 |
+
|
| 47 |
+
mock_search.return_value = [mock_evidence]
|
| 48 |
+
|
| 49 |
+
with patch("src.mcp_tools.get_domain_config") as mock_config:
|
| 50 |
+
mock_config.return_value.name = "Sexual Health Research"
|
| 51 |
+
|
| 52 |
+
result = await search_pubmed("testosterone libido", 10)
|
| 53 |
+
|
| 54 |
+
assert "## PubMed Results" in result
|
| 55 |
+
assert "Sexual Health Research" in result
|
| 56 |
+
assert "Test Title" in result
|
| 57 |
+
assert "Author 1" in result
|
| 58 |
+
assert "2024" in result
|
| 59 |
+
assert "Abstract content..." in result
|
| 60 |
|
| 61 |
@pytest.mark.asyncio
|
| 62 |
async def test_clamps_max_results(self) -> None:
|
|
|
|
| 94 |
with patch("src.mcp_tools._trials") as mock_tool:
|
| 95 |
mock_tool.search = AsyncMock(return_value=[mock_evidence])
|
| 96 |
|
| 97 |
+
result = await search_clinical_trials("sildenafil erectile dysfunction", 10)
|
| 98 |
|
| 99 |
assert isinstance(result, str)
|
| 100 |
assert "Clinical Trials" in result
|
|
|
|
| 132 |
mock_trials.return_value = "## Clinical Trials"
|
| 133 |
mock_europepmc.return_value = "## Europe PMC Results"
|
| 134 |
|
| 135 |
+
result = await search_all_sources("testosterone libido", 5)
|
| 136 |
|
| 137 |
assert "Comprehensive Search" in result
|
| 138 |
assert "PubMed" in result
|
|
|
|
| 151 |
mock_trials.side_effect = Exception("API Error")
|
| 152 |
mock_europepmc.return_value = "## Europe PMC Results"
|
| 153 |
|
| 154 |
+
result = await search_all_sources("testosterone libido", 5)
|
| 155 |
|
| 156 |
# Should still contain working sources
|
| 157 |
assert "PubMed" in result
|
tests/unit/test_orchestrator.py
CHANGED
|
@@ -269,14 +269,14 @@ class TestAgentEvent:
|
|
| 269 |
"""AgentEvent should format to markdown correctly."""
|
| 270 |
event = AgentEvent(
|
| 271 |
type="searching",
|
| 272 |
-
message="Searching for:
|
| 273 |
iteration=1,
|
| 274 |
)
|
| 275 |
|
| 276 |
md = event.to_markdown()
|
| 277 |
assert "🔍" in md
|
| 278 |
assert "SEARCHING" in md
|
| 279 |
-
assert "
|
| 280 |
|
| 281 |
def test_complete_event_icon(self):
|
| 282 |
"""Complete event should have celebration icon."""
|
|
|
|
| 269 |
"""AgentEvent should format to markdown correctly."""
|
| 270 |
event = AgentEvent(
|
| 271 |
type="searching",
|
| 272 |
+
message="Searching for: testosterone libido",
|
| 273 |
iteration=1,
|
| 274 |
)
|
| 275 |
|
| 276 |
md = event.to_markdown()
|
| 277 |
assert "🔍" in md
|
| 278 |
assert "SEARCHING" in md
|
| 279 |
+
assert "testosterone libido" in md
|
| 280 |
|
| 281 |
def test_complete_event_icon(self):
|
| 282 |
"""Complete event should have celebration icon."""
|
tests/unit/tools/test_clinicaltrials.py
CHANGED
|
@@ -49,23 +49,23 @@ class TestClinicalTrialsTool:
|
|
| 49 |
"protocolSection": {
|
| 50 |
"identificationModule": {
|
| 51 |
"nctId": "NCT12345678",
|
| 52 |
-
"briefTitle": "
|
| 53 |
},
|
| 54 |
"statusModule": {
|
| 55 |
"overallStatus": "COMPLETED",
|
| 56 |
"startDateStruct": {"date": "2023-01-01"},
|
| 57 |
},
|
| 58 |
"descriptionModule": {
|
| 59 |
-
"briefSummary": "A study examining
|
| 60 |
},
|
| 61 |
"designModule": {
|
| 62 |
"phases": ["PHASE2", "PHASE3"],
|
| 63 |
},
|
| 64 |
"conditionsModule": {
|
| 65 |
-
"conditions": ["
|
| 66 |
},
|
| 67 |
"armsInterventionsModule": {
|
| 68 |
-
"interventions": [{"name": "
|
| 69 |
},
|
| 70 |
}
|
| 71 |
}
|
|
@@ -75,11 +75,11 @@ class TestClinicalTrialsTool:
|
|
| 75 |
mock_response.raise_for_status = MagicMock()
|
| 76 |
|
| 77 |
with patch("requests.get", return_value=mock_response):
|
| 78 |
-
results = await tool.search("
|
| 79 |
|
| 80 |
assert len(results) == 1
|
| 81 |
assert isinstance(results[0], Evidence)
|
| 82 |
-
assert "
|
| 83 |
assert "PHASE2" in results[0].content or "Phase" in results[0].content
|
| 84 |
|
| 85 |
@pytest.mark.asyncio
|
|
@@ -134,9 +134,9 @@ class TestClinicalTrialsIntegration:
|
|
| 134 |
|
| 135 |
@pytest.mark.asyncio
|
| 136 |
async def test_real_api_returns_interventional(self) -> None:
|
| 137 |
-
"""Test that real API returns interventional studies."""
|
| 138 |
tool = ClinicalTrialsTool()
|
| 139 |
-
results = await tool.search("
|
| 140 |
|
| 141 |
# Should get results
|
| 142 |
assert len(results) > 0
|
|
|
|
| 49 |
"protocolSection": {
|
| 50 |
"identificationModule": {
|
| 51 |
"nctId": "NCT12345678",
|
| 52 |
+
"briefTitle": "Testosterone for HSDD Treatment",
|
| 53 |
},
|
| 54 |
"statusModule": {
|
| 55 |
"overallStatus": "COMPLETED",
|
| 56 |
"startDateStruct": {"date": "2023-01-01"},
|
| 57 |
},
|
| 58 |
"descriptionModule": {
|
| 59 |
+
"briefSummary": "A study examining testosterone for HSDD symptoms.",
|
| 60 |
},
|
| 61 |
"designModule": {
|
| 62 |
"phases": ["PHASE2", "PHASE3"],
|
| 63 |
},
|
| 64 |
"conditionsModule": {
|
| 65 |
+
"conditions": ["HSDD", "Hypoactive Sexual Desire"],
|
| 66 |
},
|
| 67 |
"armsInterventionsModule": {
|
| 68 |
+
"interventions": [{"name": "Testosterone"}],
|
| 69 |
},
|
| 70 |
}
|
| 71 |
}
|
|
|
|
| 75 |
mock_response.raise_for_status = MagicMock()
|
| 76 |
|
| 77 |
with patch("requests.get", return_value=mock_response):
|
| 78 |
+
results = await tool.search("testosterone hsdd", max_results=5)
|
| 79 |
|
| 80 |
assert len(results) == 1
|
| 81 |
assert isinstance(results[0], Evidence)
|
| 82 |
+
assert "Testosterone" in results[0].citation.title
|
| 83 |
assert "PHASE2" in results[0].content or "Phase" in results[0].content
|
| 84 |
|
| 85 |
@pytest.mark.asyncio
|
|
|
|
| 134 |
|
| 135 |
@pytest.mark.asyncio
|
| 136 |
async def test_real_api_returns_interventional(self) -> None:
|
| 137 |
+
"""Test that real API returns interventional studies for sexual health query."""
|
| 138 |
tool = ClinicalTrialsTool()
|
| 139 |
+
results = await tool.search("testosterone HSDD", max_results=3)
|
| 140 |
|
| 141 |
# Should get results
|
| 142 |
assert len(results) > 0
|
tests/unit/tools/test_europepmc.py
CHANGED
|
@@ -27,8 +27,8 @@ class TestEuropePMCTool:
|
|
| 27 |
"result": [
|
| 28 |
{
|
| 29 |
"id": "12345",
|
| 30 |
-
"title": "
|
| 31 |
-
"abstractText": "This study examines
|
| 32 |
"doi": "10.1234/test",
|
| 33 |
"pubYear": "2024",
|
| 34 |
"source": "MED",
|
|
@@ -49,11 +49,11 @@ class TestEuropePMCTool:
|
|
| 49 |
|
| 50 |
mock_instance.get.return_value = mock_resp
|
| 51 |
|
| 52 |
-
results = await tool.search("
|
| 53 |
|
| 54 |
assert len(results) == 1
|
| 55 |
assert isinstance(results[0], Evidence)
|
| 56 |
-
assert "
|
| 57 |
|
| 58 |
@pytest.mark.asyncio
|
| 59 |
async def test_search_marks_preprints(self, tool: EuropePMCTool) -> None:
|
|
@@ -113,11 +113,11 @@ class TestEuropePMCIntegration:
|
|
| 113 |
|
| 114 |
@pytest.mark.asyncio
|
| 115 |
async def test_real_api_call(self) -> None:
|
| 116 |
-
"""Test actual API returns relevant results."""
|
| 117 |
tool = EuropePMCTool()
|
| 118 |
-
results = await tool.search("
|
| 119 |
|
| 120 |
assert len(results) > 0
|
| 121 |
-
# At least one result should mention
|
| 122 |
titles = " ".join([r.citation.title.lower() for r in results])
|
| 123 |
-
assert "
|
|
|
|
| 27 |
"result": [
|
| 28 |
{
|
| 29 |
"id": "12345",
|
| 30 |
+
"title": "Testosterone Therapy for HSDD Study",
|
| 31 |
+
"abstractText": "This study examines testosterone therapy for HSDD.",
|
| 32 |
"doi": "10.1234/test",
|
| 33 |
"pubYear": "2024",
|
| 34 |
"source": "MED",
|
|
|
|
| 49 |
|
| 50 |
mock_instance.get.return_value = mock_resp
|
| 51 |
|
| 52 |
+
results = await tool.search("testosterone HSDD therapy", max_results=5)
|
| 53 |
|
| 54 |
assert len(results) == 1
|
| 55 |
assert isinstance(results[0], Evidence)
|
| 56 |
+
assert "Testosterone Therapy for HSDD Study" in results[0].citation.title
|
| 57 |
|
| 58 |
@pytest.mark.asyncio
|
| 59 |
async def test_search_marks_preprints(self, tool: EuropePMCTool) -> None:
|
|
|
|
| 113 |
|
| 114 |
@pytest.mark.asyncio
|
| 115 |
async def test_real_api_call(self) -> None:
|
| 116 |
+
"""Test actual API returns relevant results for sexual health query."""
|
| 117 |
tool = EuropePMCTool()
|
| 118 |
+
results = await tool.search("testosterone libido therapy", max_results=3)
|
| 119 |
|
| 120 |
assert len(results) > 0
|
| 121 |
+
# At least one result should mention testosterone or libido
|
| 122 |
titles = " ".join([r.citation.title.lower() for r in results])
|
| 123 |
+
assert "testosterone" in titles or "libido" in titles or "sexual" in titles
|
tests/unit/tools/test_openalex.py
CHANGED
|
@@ -13,20 +13,20 @@ SAMPLE_OPENALEX_RESPONSE = {
|
|
| 13 |
{
|
| 14 |
"id": "https://openalex.org/W12345",
|
| 15 |
"doi": "https://doi.org/10.1234/test",
|
| 16 |
-
"display_name": "
|
| 17 |
"publication_year": 2024,
|
| 18 |
"cited_by_count": 150,
|
| 19 |
"abstract_inverted_index": {
|
| 20 |
-
"
|
| 21 |
"shows": [1],
|
| 22 |
"promise": [2],
|
| 23 |
"in": [3],
|
| 24 |
-
"
|
| 25 |
"treatment": [5],
|
| 26 |
},
|
| 27 |
"concepts": [
|
| 28 |
-
{"display_name": "
|
| 29 |
-
{"display_name": "
|
| 30 |
],
|
| 31 |
"authorships": [
|
| 32 |
{"author": {"display_name": "John Smith"}},
|
|
@@ -70,7 +70,7 @@ class TestOpenAlexTool:
|
|
| 70 |
@pytest.mark.asyncio
|
| 71 |
async def test_search_returns_evidence(self, tool: OpenAlexTool, mock_client) -> None:
|
| 72 |
"""Search should return Evidence objects."""
|
| 73 |
-
results = await tool.search("
|
| 74 |
|
| 75 |
assert len(results) == 1
|
| 76 |
assert isinstance(results[0], Evidence)
|
|
@@ -79,27 +79,27 @@ class TestOpenAlexTool:
|
|
| 79 |
@pytest.mark.asyncio
|
| 80 |
async def test_search_includes_citation_count(self, tool: OpenAlexTool, mock_client) -> None:
|
| 81 |
"""Evidence metadata should include cited_by_count."""
|
| 82 |
-
results = await tool.search("
|
| 83 |
assert results[0].metadata["cited_by_count"] == 150
|
| 84 |
|
| 85 |
@pytest.mark.asyncio
|
| 86 |
async def test_search_calculates_relevance(self, tool: OpenAlexTool, mock_client) -> None:
|
| 87 |
"""Evidence relevance should be based on citations (capped at 1.0)."""
|
| 88 |
-
results = await tool.search("
|
| 89 |
# 150 citations / 100 = 1.5 -> capped at 1.0
|
| 90 |
assert results[0].relevance == 1.0
|
| 91 |
|
| 92 |
@pytest.mark.asyncio
|
| 93 |
async def test_search_includes_concepts(self, tool: OpenAlexTool, mock_client) -> None:
|
| 94 |
"""Evidence metadata should include concepts."""
|
| 95 |
-
results = await tool.search("
|
| 96 |
-
assert "
|
| 97 |
-
assert "
|
| 98 |
|
| 99 |
@pytest.mark.asyncio
|
| 100 |
async def test_search_includes_open_access_info(self, tool: OpenAlexTool, mock_client) -> None:
|
| 101 |
"""Evidence metadata should include open access info."""
|
| 102 |
-
results = await tool.search("
|
| 103 |
assert results[0].metadata["is_open_access"] is True
|
| 104 |
assert results[0].metadata["pdf_url"] == "https://example.com/paper.pdf"
|
| 105 |
|
|
@@ -135,15 +135,14 @@ class TestOpenAlexTool:
|
|
| 135 |
"""Verify API call requests citation-sorted results and uses polite pool."""
|
| 136 |
mock_client.get.return_value.json.return_value = {"results": []}
|
| 137 |
|
| 138 |
-
await tool.search("
|
| 139 |
|
| 140 |
# Verify call params
|
| 141 |
call_args = mock_client.get.call_args
|
|
|
|
| 142 |
params = call_args[1]["params"]
|
| 143 |
-
assert
|
| 144 |
-
assert params["
|
| 145 |
-
assert "type:article" in params["filter"]
|
| 146 |
-
assert "has_abstract:true" in params["filter"]
|
| 147 |
|
| 148 |
|
| 149 |
@pytest.mark.integration
|
|
@@ -154,12 +153,12 @@ class TestOpenAlexIntegration:
|
|
| 154 |
async def test_real_api_returns_results(self) -> None:
|
| 155 |
"""Test actual API returns relevant results."""
|
| 156 |
tool = OpenAlexTool()
|
| 157 |
-
results = await tool.search("
|
| 158 |
|
| 159 |
assert len(results) > 0
|
| 160 |
# Should have citation counts
|
| 161 |
assert results[0].metadata["cited_by_count"] >= 0
|
| 162 |
# Should have abstract text
|
| 163 |
-
assert len(results[0].content) >
|
| 164 |
# Should have concepts
|
| 165 |
assert len(results[0].metadata["concepts"]) > 0
|
|
|
|
| 13 |
{
|
| 14 |
"id": "https://openalex.org/W12345",
|
| 15 |
"doi": "https://doi.org/10.1234/test",
|
| 16 |
+
"display_name": "Sildenafil in ED Treatment",
|
| 17 |
"publication_year": 2024,
|
| 18 |
"cited_by_count": 150,
|
| 19 |
"abstract_inverted_index": {
|
| 20 |
+
"Sildenafil": [0],
|
| 21 |
"shows": [1],
|
| 22 |
"promise": [2],
|
| 23 |
"in": [3],
|
| 24 |
+
"ED": [4],
|
| 25 |
"treatment": [5],
|
| 26 |
},
|
| 27 |
"concepts": [
|
| 28 |
+
{"display_name": "Sildenafil", "score": 0.95, "level": 2},
|
| 29 |
+
{"display_name": "Erectile Dysfunction", "score": 0.88, "level": 1},
|
| 30 |
],
|
| 31 |
"authorships": [
|
| 32 |
{"author": {"display_name": "John Smith"}},
|
|
|
|
| 70 |
@pytest.mark.asyncio
|
| 71 |
async def test_search_returns_evidence(self, tool: OpenAlexTool, mock_client) -> None:
|
| 72 |
"""Search should return Evidence objects."""
|
| 73 |
+
results = await tool.search("sildenafil ED", max_results=5)
|
| 74 |
|
| 75 |
assert len(results) == 1
|
| 76 |
assert isinstance(results[0], Evidence)
|
|
|
|
| 79 |
@pytest.mark.asyncio
|
| 80 |
async def test_search_includes_citation_count(self, tool: OpenAlexTool, mock_client) -> None:
|
| 81 |
"""Evidence metadata should include cited_by_count."""
|
| 82 |
+
results = await tool.search("sildenafil ED", max_results=5)
|
| 83 |
assert results[0].metadata["cited_by_count"] == 150
|
| 84 |
|
| 85 |
@pytest.mark.asyncio
|
| 86 |
async def test_search_calculates_relevance(self, tool: OpenAlexTool, mock_client) -> None:
|
| 87 |
"""Evidence relevance should be based on citations (capped at 1.0)."""
|
| 88 |
+
results = await tool.search("sildenafil ED", max_results=5)
|
| 89 |
# 150 citations / 100 = 1.5 -> capped at 1.0
|
| 90 |
assert results[0].relevance == 1.0
|
| 91 |
|
| 92 |
@pytest.mark.asyncio
|
| 93 |
async def test_search_includes_concepts(self, tool: OpenAlexTool, mock_client) -> None:
|
| 94 |
"""Evidence metadata should include concepts."""
|
| 95 |
+
results = await tool.search("sildenafil ED", max_results=5)
|
| 96 |
+
assert "Sildenafil" in results[0].metadata["concepts"]
|
| 97 |
+
assert "Erectile Dysfunction" in results[0].metadata["concepts"]
|
| 98 |
|
| 99 |
@pytest.mark.asyncio
|
| 100 |
async def test_search_includes_open_access_info(self, tool: OpenAlexTool, mock_client) -> None:
|
| 101 |
"""Evidence metadata should include open access info."""
|
| 102 |
+
results = await tool.search("sildenafil ED", max_results=5)
|
| 103 |
assert results[0].metadata["is_open_access"] is True
|
| 104 |
assert results[0].metadata["pdf_url"] == "https://example.com/paper.pdf"
|
| 105 |
|
|
|
|
| 135 |
"""Verify API call requests citation-sorted results and uses polite pool."""
|
| 136 |
mock_client.get.return_value.json.return_value = {"results": []}
|
| 137 |
|
| 138 |
+
await tool.search("sildenafil ED treatment", max_results=3)
|
| 139 |
|
| 140 |
# Verify call params
|
| 141 |
call_args = mock_client.get.call_args
|
| 142 |
+
# args[0] is url, args[1] is kwargs
|
| 143 |
params = call_args[1]["params"]
|
| 144 |
+
assert "sildenafil" in params["search"]
|
| 145 |
+
assert params["per_page"] == 3
|
|
|
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
@pytest.mark.integration
|
|
|
|
| 153 |
async def test_real_api_returns_results(self) -> None:
|
| 154 |
"""Test actual API returns relevant results."""
|
| 155 |
tool = OpenAlexTool()
|
| 156 |
+
results = await tool.search("sildenafil ED treatment", max_results=3)
|
| 157 |
|
| 158 |
assert len(results) > 0
|
| 159 |
# Should have citation counts
|
| 160 |
assert results[0].metadata["cited_by_count"] >= 0
|
| 161 |
# Should have abstract text
|
| 162 |
+
assert len(results[0].content) > 20
|
| 163 |
# Should have concepts
|
| 164 |
assert len(results[0].metadata["concepts"]) > 0
|
tests/unit/tools/test_pubmed.py
CHANGED
|
@@ -13,9 +13,9 @@ SAMPLE_PUBMED_XML = """<?xml version="1.0" ?>
|
|
| 13 |
<MedlineCitation>
|
| 14 |
<PMID>12345678</PMID>
|
| 15 |
<Article>
|
| 16 |
-
<ArticleTitle>
|
| 17 |
<Abstract>
|
| 18 |
-
<AbstractText>
|
| 19 |
</Abstract>
|
| 20 |
<AuthorList>
|
| 21 |
<Author>
|
|
@@ -49,8 +49,33 @@ class TestPubMedTool:
|
|
| 49 |
mock_search_response.json.return_value = {"esearchresult": {"idlist": ["12345678"]}}
|
| 50 |
mock_search_response.raise_for_status = MagicMock()
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
mock_fetch_response = MagicMock()
|
| 53 |
-
mock_fetch_response.text =
|
| 54 |
mock_fetch_response.raise_for_status = MagicMock()
|
| 55 |
|
| 56 |
mock_client = AsyncMock()
|
|
@@ -62,12 +87,12 @@ class TestPubMedTool:
|
|
| 62 |
|
| 63 |
# Act
|
| 64 |
tool = PubMedTool()
|
| 65 |
-
results = await tool.search("
|
| 66 |
|
| 67 |
# Assert
|
| 68 |
assert len(results) == 1
|
| 69 |
assert results[0].citation.source == "pubmed"
|
| 70 |
-
assert "
|
| 71 |
assert "12345678" in results[0].citation.url
|
| 72 |
|
| 73 |
@pytest.mark.asyncio
|
|
@@ -113,7 +138,7 @@ class TestPubMedTool:
|
|
| 113 |
mocker.patch("httpx.AsyncClient", return_value=mock_client)
|
| 114 |
|
| 115 |
tool = PubMedTool()
|
| 116 |
-
await tool.search("What
|
| 117 |
|
| 118 |
# Verify call args
|
| 119 |
call_args = mock_client.get.call_args
|
|
@@ -123,5 +148,5 @@ class TestPubMedTool:
|
|
| 123 |
# "what" and "help" should be stripped
|
| 124 |
assert "what" not in term.lower()
|
| 125 |
assert "help" not in term.lower()
|
| 126 |
-
# "
|
| 127 |
-
assert "
|
|
|
|
| 13 |
<MedlineCitation>
|
| 14 |
<PMID>12345678</PMID>
|
| 15 |
<Article>
|
| 16 |
+
<ArticleTitle>Testosterone Therapy for HSDD</ArticleTitle>
|
| 17 |
<Abstract>
|
| 18 |
+
<AbstractText>Testosterone shows efficacy in HSDD...</AbstractText>
|
| 19 |
</Abstract>
|
| 20 |
<AuthorList>
|
| 21 |
<Author>
|
|
|
|
| 49 |
mock_search_response.json.return_value = {"esearchresult": {"idlist": ["12345678"]}}
|
| 50 |
mock_search_response.raise_for_status = MagicMock()
|
| 51 |
|
| 52 |
+
mock_fetch_xml = """
|
| 53 |
+
<PubmedArticleSet>
|
| 54 |
+
<PubmedArticle>
|
| 55 |
+
<MedlineCitation>
|
| 56 |
+
<PMID>12345678</PMID>
|
| 57 |
+
<Article>
|
| 58 |
+
<ArticleTitle>Testosterone and Libido</ArticleTitle>
|
| 59 |
+
<Abstract>
|
| 60 |
+
<AbstractText>Testosterone improves libido.</AbstractText>
|
| 61 |
+
</Abstract>
|
| 62 |
+
<AuthorList>
|
| 63 |
+
<Author><LastName>Doe</LastName><ForeName>John</ForeName></Author>
|
| 64 |
+
</AuthorList>
|
| 65 |
+
<Journal><JournalIssue><PubDate><Year>2024</Year></PubDate></JournalIssue></Journal>
|
| 66 |
+
</Article>
|
| 67 |
+
</MedlineCitation>
|
| 68 |
+
<PubmedData>
|
| 69 |
+
<ArticleIdList>
|
| 70 |
+
<ArticleId IdType="pubmed">12345678</ArticleId>
|
| 71 |
+
</ArticleIdList>
|
| 72 |
+
</PubmedData>
|
| 73 |
+
</PubmedArticle>
|
| 74 |
+
</PubmedArticleSet>
|
| 75 |
+
"""
|
| 76 |
+
|
| 77 |
mock_fetch_response = MagicMock()
|
| 78 |
+
mock_fetch_response.text = mock_fetch_xml
|
| 79 |
mock_fetch_response.raise_for_status = MagicMock()
|
| 80 |
|
| 81 |
mock_client = AsyncMock()
|
|
|
|
| 87 |
|
| 88 |
# Act
|
| 89 |
tool = PubMedTool()
|
| 90 |
+
results = await tool.search("testosterone libido")
|
| 91 |
|
| 92 |
# Assert
|
| 93 |
assert len(results) == 1
|
| 94 |
assert results[0].citation.source == "pubmed"
|
| 95 |
+
assert "Testosterone" in results[0].citation.title
|
| 96 |
assert "12345678" in results[0].citation.url
|
| 97 |
|
| 98 |
@pytest.mark.asyncio
|
|
|
|
| 138 |
mocker.patch("httpx.AsyncClient", return_value=mock_client)
|
| 139 |
|
| 140 |
tool = PubMedTool()
|
| 141 |
+
await tool.search("What medications help with Low Libido?")
|
| 142 |
|
| 143 |
# Verify call args
|
| 144 |
call_args = mock_client.get.call_args
|
|
|
|
| 148 |
# "what" and "help" should be stripped
|
| 149 |
assert "what" not in term.lower()
|
| 150 |
assert "help" not in term.lower()
|
| 151 |
+
# "low libido" should be expanded
|
| 152 |
+
assert "HSDD" in term or "hypoactive" in term
|
tests/unit/tools/test_query_utils.py
CHANGED
|
@@ -11,36 +11,36 @@ class TestQueryPreprocessing:
|
|
| 11 |
|
| 12 |
def test_strip_question_words(self) -> None:
|
| 13 |
"""Test removal of question words."""
|
| 14 |
-
assert strip_question_words("What drugs treat
|
| 15 |
-
assert strip_question_words("Which medications help
|
| 16 |
-
assert strip_question_words("How can we
|
| 17 |
-
assert strip_question_words("Is
|
| 18 |
|
| 19 |
def test_strip_preserves_medical_terms(self) -> None:
|
| 20 |
"""Test that medical terms are preserved."""
|
| 21 |
-
result = strip_question_words("What is the mechanism of
|
| 22 |
-
assert "
|
| 23 |
assert "mechanism" in result
|
| 24 |
|
| 25 |
-
def
|
| 26 |
-
"""Test
|
| 27 |
-
result = expand_synonyms("
|
| 28 |
-
assert "
|
| 29 |
|
| 30 |
-
def
|
| 31 |
-
"""Test
|
| 32 |
-
result = expand_synonyms("
|
| 33 |
-
assert "
|
| 34 |
|
| 35 |
def test_expand_synonyms_preserves_unknown(self) -> None:
|
| 36 |
"""Test that unknown terms are preserved."""
|
| 37 |
-
result = expand_synonyms("
|
| 38 |
-
assert "
|
| 39 |
-
assert "
|
| 40 |
|
| 41 |
def test_preprocess_query_full_pipeline(self) -> None:
|
| 42 |
"""Test complete preprocessing pipeline."""
|
| 43 |
-
raw = "What medications show promise for
|
| 44 |
result = preprocess_query(raw)
|
| 45 |
|
| 46 |
# Should not contain question words
|
|
@@ -49,12 +49,12 @@ class TestQueryPreprocessing:
|
|
| 49 |
assert "promise" not in result.lower()
|
| 50 |
|
| 51 |
# Should contain expanded terms
|
| 52 |
-
assert "
|
| 53 |
assert "medications" in result.lower() or "drug" in result.lower()
|
| 54 |
|
| 55 |
def test_preprocess_query_removes_punctuation(self) -> None:
|
| 56 |
"""Test that question marks are removed."""
|
| 57 |
-
result = preprocess_query("Is
|
| 58 |
assert "?" not in result
|
| 59 |
|
| 60 |
def test_preprocess_query_handles_empty(self) -> None:
|
|
@@ -64,8 +64,8 @@ class TestQueryPreprocessing:
|
|
| 64 |
|
| 65 |
def test_preprocess_query_already_clean(self) -> None:
|
| 66 |
"""Test that clean queries pass through."""
|
| 67 |
-
clean = "
|
| 68 |
result = preprocess_query(clean)
|
| 69 |
-
assert "
|
| 70 |
-
assert "
|
| 71 |
assert "mechanism" in result
|
|
|
|
| 11 |
|
| 12 |
def test_strip_question_words(self) -> None:
|
| 13 |
"""Test removal of question words."""
|
| 14 |
+
assert strip_question_words("What drugs treat HSDD") == "drugs treat hsdd"
|
| 15 |
+
assert strip_question_words("Which medications help low libido") == "medications low libido"
|
| 16 |
+
assert strip_question_words("How can we treat ED") == "we treat ed"
|
| 17 |
+
assert strip_question_words("Is sildenafil effective") == "sildenafil"
|
| 18 |
|
| 19 |
def test_strip_preserves_medical_terms(self) -> None:
|
| 20 |
"""Test that medical terms are preserved."""
|
| 21 |
+
result = strip_question_words("What is the mechanism of sildenafil")
|
| 22 |
+
assert "sildenafil" in result
|
| 23 |
assert "mechanism" in result
|
| 24 |
|
| 25 |
+
def test_expand_synonyms_low_libido(self) -> None:
|
| 26 |
+
"""Test Low Libido synonym expansion."""
|
| 27 |
+
result = expand_synonyms("low libido treatment")
|
| 28 |
+
assert "HSDD" in result or "hypoactive sexual desire" in result
|
| 29 |
|
| 30 |
+
def test_expand_synonyms_ed(self) -> None:
|
| 31 |
+
"""Test ED synonym expansion."""
|
| 32 |
+
result = expand_synonyms("erectile dysfunction drug")
|
| 33 |
+
assert "impotence" in result
|
| 34 |
|
| 35 |
def test_expand_synonyms_preserves_unknown(self) -> None:
|
| 36 |
"""Test that unknown terms are preserved."""
|
| 37 |
+
result = expand_synonyms("sildenafil unknowncondition")
|
| 38 |
+
assert "sildenafil" in result
|
| 39 |
+
assert "unknowncondition" in result
|
| 40 |
|
| 41 |
def test_preprocess_query_full_pipeline(self) -> None:
|
| 42 |
"""Test complete preprocessing pipeline."""
|
| 43 |
+
raw = "What medications show promise for Low Libido?"
|
| 44 |
result = preprocess_query(raw)
|
| 45 |
|
| 46 |
# Should not contain question words
|
|
|
|
| 49 |
assert "promise" not in result.lower()
|
| 50 |
|
| 51 |
# Should contain expanded terms
|
| 52 |
+
assert "HSDD" in result or "hypoactive" in result or "low libido" in result.lower()
|
| 53 |
assert "medications" in result.lower() or "drug" in result.lower()
|
| 54 |
|
| 55 |
def test_preprocess_query_removes_punctuation(self) -> None:
|
| 56 |
"""Test that question marks are removed."""
|
| 57 |
+
result = preprocess_query("Is sildenafil safe?")
|
| 58 |
assert "?" not in result
|
| 59 |
|
| 60 |
def test_preprocess_query_handles_empty(self) -> None:
|
|
|
|
| 64 |
|
| 65 |
def test_preprocess_query_already_clean(self) -> None:
|
| 66 |
"""Test that clean queries pass through."""
|
| 67 |
+
clean = "sildenafil ed mechanism"
|
| 68 |
result = preprocess_query(clean)
|
| 69 |
+
assert "sildenafil" in result
|
| 70 |
+
assert "ed" in result
|
| 71 |
assert "mechanism" in result
|