File size: 7,364 Bytes
4e2ccbf d55440b 4e2ccbf de1af88 4e2ccbf d55440b 4e2ccbf d55440b 4e2ccbf fa696e8 4e2ccbf d55440b 4e2ccbf d55440b 4e2ccbf ff16d8e 4e2ccbf de1af88 4e2ccbf de1af88 4e2ccbf de1af88 4e2ccbf de1af88 4e2ccbf de1af88 4e2ccbf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
"""ClinicalTrials.gov search tool using API v2."""
import asyncio
from typing import Any, ClassVar
import requests
from tenacity import retry, stop_after_attempt, wait_exponential
from src.utils.exceptions import SearchError
from src.utils.models import Citation, Evidence
class ClinicalTrialsTool:
    """Search tool for ClinicalTrials.gov (API v2).

    Note: Uses `requests` library instead of `httpx` because ClinicalTrials.gov's
    WAF blocks httpx's TLS fingerprint. The `requests` library is not blocked.

    See: https://clinicaltrials.gov/data-api/api
    """

    BASE_URL = "https://clinicaltrials.gov/api/v2/studies"

    # Fields requested from the API through the `fields` query parameter.
    FIELDS: ClassVar[list[str]] = [
        "NCTId",
        "BriefTitle",
        "Phase",
        "OverallStatus",
        "Condition",
        "InterventionName",
        "StartDate",
        "BriefSummary",
        # Outcome measures
        "OutcomesModule",
        "HasResults",
        # Needed because _study_to_evidence reads
        # statusModule.resultsFirstPostDateStruct.date; without requesting it
        # the "posted <date>" suffix could never be populated.
        "ResultsFirstPostDate",
    ]

    # Status filter: Only active/completed studies with potential data
    STATUS_FILTER = "COMPLETED,ACTIVE_NOT_RECRUITING,RECRUITING,ENROLLING_BY_INVITATION"

    # Study type filter: Only interventional (drug/treatment studies)
    STUDY_TYPE_FILTER = "INTERVENTIONAL"

    @property
    def name(self) -> str:
        """Return the identifier of this search tool."""
        return "clinicaltrials"

    # Up to 3 attempts with exponential backoff (1s..10s); reraise=True makes
    # the last underlying exception propagate to the caller.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        reraise=True,
    )
    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
        """Search ClinicalTrials.gov for interventional studies.

        Args:
            query: Search query (e.g., "testosterone libido")
            max_results: Maximum results to return (max 100)

        Returns:
            List of Evidence objects from clinical trials. Empty when the
            query is blank or ``max_results`` is not positive (no request is
            sent in either case).

        Raises:
            SearchError: If the HTTP request fails, the API answers with an
                error status, or the response body is not valid JSON.
        """
        user_query = query.strip()
        if not user_query or max_results <= 0:
            # Nothing meaningful to ask the API for; avoid sending a malformed
            # expression or a non-positive pageSize.
            return []

        # The study type restriction has to live in the query string (there is
        # no dedicated parameter for it). The caller's query is parenthesised
        # so that an "OR" inside it cannot escape the AND-ed restriction.
        final_query = f"({user_query}) AND AREA[StudyType]{self.STUDY_TYPE_FILTER}"

        params: dict[str, Any] = {
            "query.term": final_query,
            "pageSize": min(max_results, 100),
            "fields": ",".join(self.FIELDS),
            # FILTERS - Only active/completed studies
            "filter.overallStatus": self.STATUS_FILTER,
        }

        try:
            # Run blocking requests.get in a separate thread for async compatibility
            response = await asyncio.to_thread(
                requests.get,
                self.BASE_URL,
                params=params,
                headers={"User-Agent": "DeepBoner-Research-Agent/1.0"},
                timeout=30,
            )
            response.raise_for_status()
            data = response.json()
        except requests.HTTPError as e:
            raise SearchError(f"ClinicalTrials.gov API error: {e}") from e
        except requests.RequestException as e:
            raise SearchError(f"ClinicalTrials.gov request failed: {e}") from e
        except ValueError as e:
            # response.json() may raise a plain ValueError for a non-JSON body.
            raise SearchError(f"ClinicalTrials.gov returned invalid JSON: {e}") from e

        studies = data.get("studies") or []
        return [self._study_to_evidence(study) for study in studies[:max_results]]

    def _extract_primary_outcome(self, outcomes_module: dict[str, Any]) -> str:
        """Extract and format the first primary outcome from an outcomes module.

        Args:
            outcomes_module: The study's ``outcomesModule`` mapping.

        Returns:
            ``"<measure> (measured at <timeFrame>)"`` (or just the measure when
            no time frame is present), shortened to 200 characters with a
            trailing ellipsis; an empty string when there are no primary
            outcomes.
        """
        primary_outcomes = outcomes_module.get("primaryOutcomes", [])
        if not primary_outcomes:
            return ""

        # Only the first primary outcome is reported.
        first = primary_outcomes[0]
        measure = first.get("measure", "")
        timeframe = first.get("timeFrame", "")

        # Build the full outcome string first, then truncate.
        result = f"{measure} (measured at {timeframe})" if timeframe else measure
        if len(result) > 200:
            return result[:197] + "..."
        return result

    def _study_to_evidence(self, study: dict[str, Any]) -> Evidence:
        """Convert a clinical trial study record to Evidence.

        Args:
            study: One element of the API response's ``studies`` list.

        Returns:
            Evidence whose content summarises the trial (summary, phase,
            status, conditions, interventions, outcomes, results availability)
            and whose citation links to the study page for its NCT id.
        """
        # Navigate nested structure
        protocol = study.get("protocolSection", {})
        id_module = protocol.get("identificationModule", {})
        status_module = protocol.get("statusModule", {})
        desc_module = protocol.get("descriptionModule", {})
        design_module = protocol.get("designModule", {})
        conditions_module = protocol.get("conditionsModule", {})
        arms_module = protocol.get("armsInterventionsModule", {})
        outcomes_module = protocol.get("outcomesModule", {})

        nct_id = id_module.get("nctId", "Unknown")
        title = id_module.get("briefTitle", "Untitled Study")
        status = status_module.get("overallStatus", "Unknown")
        start_date = status_module.get("startDateStruct", {}).get("date", "Unknown")

        # Phase is a list; report every entry so combined-phase trials
        # (e.g. PHASE1/PHASE2) are not mislabelled as their first phase only.
        phases = design_module.get("phases", [])
        phase = "/".join(phases) if phases else "Not Applicable"

        # Get conditions (first three)
        conditions = conditions_module.get("conditions", [])
        conditions_str = ", ".join(conditions[:3]) if conditions else "Unknown"

        # Get interventions (first three), skipping entries without a name so
        # the joined string never contains empty segments.
        interventions = arms_module.get("interventions", [])
        intervention_names = [
            name for item in interventions[:3] if (name := item.get("name", ""))
        ]
        interventions_str = ", ".join(intervention_names) if intervention_names else "Unknown"

        # Get summary
        summary = desc_module.get("briefSummary", "No summary available.")

        # Extract outcome measures
        primary_outcome_str = self._extract_primary_outcome(outcomes_module)
        secondary_count = len(outcomes_module.get("secondaryOutcomes", []))

        # Check if results are available (hasResults is TOP-LEVEL, not in protocol!)
        has_results = study.get("hasResults", False)

        # Results date is in statusModule (nested inside date struct)
        results_date_struct = status_module.get("resultsFirstPostDateStruct", {})
        results_date = results_date_struct.get("date", "")

        # Build content with key trial info
        summary_text = summary[:400] + "..." if len(summary) > 400 else summary
        content_parts = [
            summary_text,
            f"Trial Phase: {phase}.",
            f"Status: {status}.",
            f"Conditions: {conditions_str}.",
            f"Interventions: {interventions_str}.",
        ]

        if primary_outcome_str:
            content_parts.append(f"Primary Outcome: {primary_outcome_str}.")

        if secondary_count > 0:
            content_parts.append(f"Secondary Outcomes: {secondary_count} additional endpoints.")

        if has_results:
            results_info = "Results Available: Yes"
            if results_date:
                results_info += f" (posted {results_date})"
            content_parts.append(results_info + ".")
        else:
            content_parts.append("Results Available: Not yet posted.")

        content = " ".join(content_parts)

        return Evidence(
            content=content[:2000],
            citation=Citation(
                source="clinicaltrials",
                title=title[:500],
                url=f"https://clinicaltrials.gov/study/{nct_id}",
                date=start_date,
                authors=[],  # Trials don't have traditional authors
            ),
            relevance=0.90 if has_results else 0.85,  # Boost relevance for trials with results
        )
|