Update src/model.py
Browse files- src/model.py +410 -704
src/model.py
CHANGED
|
@@ -27,7 +27,6 @@ openai.api_key = get_openai_api_key()
|
|
| 27 |
# System prompts
|
| 28 |
SYSTEM_PROMPT = """You are an advanced clinical AI assistant designed to aid healthcare professionals.
|
| 29 |
Follow these guidelines in all responses:
|
| 30 |
-
|
| 31 |
1. **Clarify First**: Before providing any diagnosis or plan, if the user's query is underspecified, ALWAYS ask relevant clarifying questions to gather necessary patient information. This includes, but is not limited to, symptoms, duration, severity, medical history, age, lifestyle factors (diet, exercise), and current medications.
|
| 32 |
2. Professional tone: Maintain a clear, respectful, and professional tone appropriate for medical consultation.
|
| 33 |
3. Evidence-based practice: Base all responses on current medical evidence and guidelines.
|
|
@@ -37,20 +36,17 @@ Follow these guidelines in all responses:
|
|
| 37 |
7. Limitations: Acknowledge the limits of AI medical advice and recommend in-person consultation when appropriate.
|
| 38 |
8. Comprehensive approach: Consider differential diagnoses and relevant contextual factors.
|
| 39 |
9. Patient-centered: Focus on clinically relevant information while maintaining respect for the patient.
|
| 40 |
-
|
| 41 |
For each consultation:
|
| 42 |
1. Ask clarifying questions if needed (as per guideline 1).
|
| 43 |
2. Provide differential diagnosis with likelihood assessment.
|
| 44 |
3. Suggest appropriate next steps (testing, treatment, referral).
|
| 45 |
4. Include reasoning for your conclusions.
|
| 46 |
5. Cite medical literature or guidelines supporting your assessment using [source_id].
|
| 47 |
-
|
| 48 |
IMPORTANT: Your primary duty is to support clinical decision-making, not replace clinical judgment.
|
| 49 |
"""
|
| 50 |
|
| 51 |
FOLLOW_UP_PROMPT = """Continue this medical consultation based on the previous discussion.
|
| 52 |
Consider the information already gathered and the tentative diagnosis/plan.
|
| 53 |
-
|
| 54 |
When responding to the follow-up:
|
| 55 |
1. Reference relevant details from the prior conversation.
|
| 56 |
2. Address the specific follow-up question with evidence-based information.
|
|
@@ -58,82 +54,13 @@ When responding to the follow-up:
|
|
| 58 |
4. Update recommendations if appropriate.
|
| 59 |
5. Maintain the same structured approach with transparent reasoning.
|
| 60 |
6. Cite additional medical literature or guidelines when relevant using [source_id].
|
| 61 |
-
|
| 62 |
Remember that this is an ongoing consultation where continuity of care is important.
|
| 63 |
"""
|
| 64 |
|
| 65 |
-
# Enhanced medical query preprocessing
|
| 66 |
-
def preprocess_medical_query(query):
|
| 67 |
-
"""
|
| 68 |
-
Use GPT-4o to extract key medical terms and concepts from the user query.
|
| 69 |
-
This improves search relevance by identifying proper medical terminology.
|
| 70 |
-
"""
|
| 71 |
-
try:
|
| 72 |
-
system_prompt = """
|
| 73 |
-
You are a medical term extraction system. Extract key medical concepts from the input query.
|
| 74 |
-
Focus on:
|
| 75 |
-
1. Symptoms (e.g., fever, pain, cough)
|
| 76 |
-
2. Conditions (e.g., diabetes, hypertension)
|
| 77 |
-
3. Anatomical structures (e.g., liver, heart)
|
| 78 |
-
4. Medications (e.g., aspirin, insulin)
|
| 79 |
-
5. Procedures (e.g., MRI, surgery)
|
| 80 |
-
|
| 81 |
-
For each term extracted, try to provide the corresponding medical terminology or MeSH term if applicable.
|
| 82 |
-
|
| 83 |
-
Return your answer in this JSON format:
|
| 84 |
-
{
|
| 85 |
-
"extracted_terms": ["term1", "term2", ...],
|
| 86 |
-
"mesh_mappings": {"term1": "MeSH term1", "term2": "MeSH term2", ...},
|
| 87 |
-
"optimized_search_query": "term1 AND term2 AND term3..."
|
| 88 |
-
}
|
| 89 |
-
|
| 90 |
-
Include only the JSON object, nothing else.
|
| 91 |
-
"""
|
| 92 |
-
|
| 93 |
-
response = openai.ChatCompletion.create(
|
| 94 |
-
model="gpt-4o-mini", # Using gpt-4o-mini for efficiency, can upgrade to gpt-4o if needed
|
| 95 |
-
messages=[
|
| 96 |
-
{"role": "system", "content": system_prompt},
|
| 97 |
-
{"role": "user", "content": f"Extract medical terms from this query: {query}"}
|
| 98 |
-
],
|
| 99 |
-
temperature=0.1, # Low temperature for consistent outputs
|
| 100 |
-
max_tokens=500,
|
| 101 |
-
)
|
| 102 |
-
|
| 103 |
-
# Extract json from response
|
| 104 |
-
result_text = response.choices[0].message['content']
|
| 105 |
-
try:
|
| 106 |
-
# Find JSON object
|
| 107 |
-
json_match = re.search(r'({[\s\S]*})', result_text)
|
| 108 |
-
if json_match:
|
| 109 |
-
json_str = json_match.group(1)
|
| 110 |
-
return json.loads(json_str)
|
| 111 |
-
else:
|
| 112 |
-
return json.loads(result_text)
|
| 113 |
-
except json.JSONDecodeError:
|
| 114 |
-
# Fallback to original query if parsing fails
|
| 115 |
-
return {
|
| 116 |
-
"extracted_terms": [query],
|
| 117 |
-
"mesh_mappings": {},
|
| 118 |
-
"optimized_search_query": query
|
| 119 |
-
}
|
| 120 |
-
except Exception as e:
|
| 121 |
-
print(f"Error in query preprocessing: {str(e)}")
|
| 122 |
-
# Fallback to original query
|
| 123 |
-
return {
|
| 124 |
-
"extracted_terms": [query],
|
| 125 |
-
"mesh_mappings": {},
|
| 126 |
-
"optimized_search_query": query
|
| 127 |
-
}
|
| 128 |
-
|
| 129 |
# Function to extract source IDs and replace them with actual links
|
| 130 |
def extract_and_link_sources(text, evidence_snippets):
|
| 131 |
-
"""
|
| 132 |
-
|
| 133 |
-
Improved to handle various citation formats and provide richer context.
|
| 134 |
-
"""
|
| 135 |
-
# Expanded pattern to handle more citation formats
|
| 136 |
-
source_pattern = r'\[([\w\d:_\-\.+]+)\]' # Basic citation format [source_id]
|
| 137 |
matches = re.findall(source_pattern, text)
|
| 138 |
|
| 139 |
source_map = {} # Map to store source_id -> source data
|
|
@@ -146,54 +73,28 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 146 |
"id": snippet["id"],
|
| 147 |
"title": snippet["title"].strip(),
|
| 148 |
"url": snippet["url"],
|
| 149 |
-
"citation": snippet["citation"]
|
| 150 |
-
"has_full_text": snippet.get("has_full_text", False),
|
| 151 |
-
"journal": snippet.get("journal", ""),
|
| 152 |
-
"year": snippet.get("year", "")
|
| 153 |
}
|
| 154 |
break
|
| 155 |
|
| 156 |
# Next, try fuzzy matching for cases where the exact ID isn't matched
|
| 157 |
for source_id_match in matches:
|
| 158 |
if source_id_match not in source_map and source_id_match != "source_id":
|
| 159 |
-
# Try multiple matching strategies
|
| 160 |
-
|
| 161 |
-
# Strategy 1: Match on ID prefix (e.g., pubmed-12345 might match pubmed-12345678)
|
| 162 |
for snippet in evidence_snippets:
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
source_map[source_id_match] = {
|
| 165 |
"id": snippet["id"],
|
| 166 |
"title": snippet["title"].strip(),
|
| 167 |
"url": snippet["url"],
|
| 168 |
-
"citation": snippet["citation"]
|
| 169 |
-
"has_full_text": snippet.get("has_full_text", False),
|
| 170 |
-
"journal": snippet.get("journal", ""),
|
| 171 |
-
"year": snippet.get("year", "")
|
| 172 |
}
|
| 173 |
break
|
| 174 |
-
|
| 175 |
-
# Strategy 2: Try to match on partial IDs broken by source type
|
| 176 |
-
if source_id_match not in source_map:
|
| 177 |
-
# Split on common delimiters
|
| 178 |
-
for snippet in evidence_snippets:
|
| 179 |
-
snippet_id_parts = re.split(r'[-_:.]', snippet["id"])
|
| 180 |
-
source_id_parts = re.split(r'[-_:.]', source_id_match)
|
| 181 |
-
|
| 182 |
-
# Check if any significant parts match
|
| 183 |
-
if (len(snippet_id_parts) > 0 and len(source_id_parts) > 0 and
|
| 184 |
-
(snippet_id_parts[0] == source_id_parts[0] or # First part matches (e.g., "pubmed")
|
| 185 |
-
(len(snippet_id_parts) > 1 and len(source_id_parts) > 1 and
|
| 186 |
-
snippet_id_parts[1] == source_id_parts[1]))): # Second part matches (e.g., the ID number)
|
| 187 |
-
source_map[source_id_match] = {
|
| 188 |
-
"id": snippet["id"],
|
| 189 |
-
"title": snippet["title"].strip(),
|
| 190 |
-
"url": snippet["url"],
|
| 191 |
-
"citation": snippet["citation"],
|
| 192 |
-
"has_full_text": snippet.get("has_full_text", False),
|
| 193 |
-
"journal": snippet.get("journal", ""),
|
| 194 |
-
"year": snippet.get("year", "")
|
| 195 |
-
}
|
| 196 |
-
break
|
| 197 |
|
| 198 |
# Handle generic [source_id] placeholder
|
| 199 |
if "source_id" in matches:
|
|
@@ -205,10 +106,7 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 205 |
"id": snippet["id"],
|
| 206 |
"title": snippet["title"].strip(),
|
| 207 |
"url": snippet["url"],
|
| 208 |
-
"citation": snippet["citation"]
|
| 209 |
-
"has_full_text": snippet.get("has_full_text", False),
|
| 210 |
-
"journal": snippet.get("journal", ""),
|
| 211 |
-
"year": snippet.get("year", "")
|
| 212 |
}
|
| 213 |
|
| 214 |
# Replace source_id placeholders with actual links in the text
|
|
@@ -216,24 +114,13 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 216 |
for source_id_key, source_data in source_map.items():
|
| 217 |
safe_id = re.escape(source_id_key)
|
| 218 |
pattern = f"\\[{safe_id}\\]"
|
| 219 |
-
|
| 220 |
-
# Create a more informative replacement that shows the title and preserves the source ID
|
| 221 |
-
title = source_data['title']
|
| 222 |
-
short_title = title[:60] + "..." if len(title) > 60 else title
|
| 223 |
-
|
| 224 |
-
# Include year if available for better context
|
| 225 |
-
year_text = f" ({source_data['year']})" if source_data.get('year') else ""
|
| 226 |
-
|
| 227 |
-
# Create the replacement with hover tooltip (works in many markdown renderers)
|
| 228 |
-
replacement = f"[{short_title}{year_text}]({source_data['url']} \"{title}\")"
|
| 229 |
-
|
| 230 |
linked_text = re.sub(pattern, replacement, linked_text)
|
| 231 |
|
| 232 |
# Handle remaining [source_id] placeholders
|
| 233 |
if "source_id" in source_map and "[source_id]" in linked_text:
|
| 234 |
generic_data = source_map["source_id"]
|
| 235 |
-
|
| 236 |
-
replacement = f"[{generic_data['title']}{year_text}]({generic_data['url']})"
|
| 237 |
linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
|
| 238 |
|
| 239 |
# Final fallback for any [source_id] not mapped at all
|
|
@@ -243,29 +130,48 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 243 |
|
| 244 |
# Implement PubMed API integration for medical evidence retrieval
|
| 245 |
def fetch_from_pubmed_api(query, max_results=3, api_key=None):
|
| 246 |
-
"""
|
| 247 |
-
Enhanced PubMed API integration using E-utilities (ESearch + EFetch)
|
| 248 |
-
to retrieve more detailed article information with better abstracts.
|
| 249 |
-
"""
|
| 250 |
results = []
|
| 251 |
-
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
| 252 |
|
| 253 |
-
#
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
else:
|
| 261 |
-
|
| 262 |
-
|
| 263 |
|
| 264 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
search_params = {
|
| 266 |
"db": "pubmed",
|
| 267 |
-
"term":
|
| 268 |
-
"retmax": max_results
|
| 269 |
"retmode": "json",
|
| 270 |
"sort": "relevance"
|
| 271 |
}
|
|
@@ -273,7 +179,7 @@ def fetch_from_pubmed_api(query, max_results=3, api_key=None):
|
|
| 273 |
# Add API key if provided (increases rate limits)
|
| 274 |
if api_key:
|
| 275 |
search_params["api_key"] = api_key
|
| 276 |
-
|
| 277 |
try:
|
| 278 |
# First get article IDs
|
| 279 |
search_response = requests.get(f"{base_url}esearch.fcgi", params=search_params)
|
|
@@ -284,132 +190,83 @@ def fetch_from_pubmed_api(query, max_results=3, api_key=None):
|
|
| 284 |
search_data = search_response.json()
|
| 285 |
|
| 286 |
if "esearchresult" in search_data and "idlist" in search_data["esearchresult"]:
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
if not pmids:
|
| 290 |
-
return []
|
| 291 |
-
|
| 292 |
-
# Step 3: Use EFetch to get article details
|
| 293 |
-
fetch_params = {
|
| 294 |
-
"db": "pubmed",
|
| 295 |
-
"id": ",".join(pmids),
|
| 296 |
-
"retmode": "xml",
|
| 297 |
-
"rettype": "abstract"
|
| 298 |
-
}
|
| 299 |
-
|
| 300 |
-
if api_key:
|
| 301 |
-
fetch_params["api_key"] = api_key
|
| 302 |
-
|
| 303 |
-
fetch_response = requests.get(f"{base_url}efetch.fcgi", params=fetch_params)
|
| 304 |
-
|
| 305 |
-
if fetch_response.status_code != 200:
|
| 306 |
-
return []
|
| 307 |
-
|
| 308 |
-
# Step 4: Parse XML to extract article details
|
| 309 |
-
root = ET.fromstring(fetch_response.text)
|
| 310 |
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
# Extract authors
|
| 321 |
-
authors = []
|
| 322 |
-
for author in article.findall(".//Author"):
|
| 323 |
-
last_name = author.findtext(".//LastName") or ""
|
| 324 |
-
initials = author.findtext(".//Initials") or ""
|
| 325 |
-
if last_name or initials:
|
| 326 |
-
authors.append(f"{last_name} {initials}".strip())
|
| 327 |
-
|
| 328 |
-
# Extract journal and publication date
|
| 329 |
-
journal = article.findtext(".//Journal/Title") or "Unknown Journal"
|
| 330 |
-
year = article.findtext(".//PubDate/Year") or "Unknown Year"
|
| 331 |
-
|
| 332 |
-
# Extract abstract with sections
|
| 333 |
-
abstract_texts = []
|
| 334 |
-
|
| 335 |
-
# Get structured abstract if available
|
| 336 |
-
abstract_sections = article.findall(".//AbstractText")
|
| 337 |
-
if abstract_sections:
|
| 338 |
-
for section in abstract_sections:
|
| 339 |
-
label = section.get("Label", "")
|
| 340 |
-
text = section.text or ""
|
| 341 |
-
if label and text:
|
| 342 |
-
abstract_texts.append(f"{label}: {text}")
|
| 343 |
-
elif text:
|
| 344 |
-
abstract_texts.append(text)
|
| 345 |
-
else:
|
| 346 |
-
# Try single abstract text
|
| 347 |
-
main_abstract = article.findtext(".//Abstract/AbstractText")
|
| 348 |
-
if main_abstract:
|
| 349 |
-
abstract_texts.append(main_abstract)
|
| 350 |
-
|
| 351 |
-
abstract = " ".join(abstract_texts) if abstract_texts else "Abstract not available"
|
| 352 |
-
|
| 353 |
-
# Check if full text is available in PMC
|
| 354 |
-
pmc_id = article.findtext(".//ArticleId[@IdType='pmc']")
|
| 355 |
-
pmc_link = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/" if pmc_id else None
|
| 356 |
-
has_full_text = bool(pmc_id)
|
| 357 |
-
|
| 358 |
-
# Build citation
|
| 359 |
-
author_citation = ""
|
| 360 |
-
if authors:
|
| 361 |
-
if len(authors) == 1:
|
| 362 |
-
author_citation = authors[0]
|
| 363 |
-
elif len(authors) == 2:
|
| 364 |
-
author_citation = f"{authors[0]} & {authors[1]}"
|
| 365 |
-
else:
|
| 366 |
-
author_citation = f"{authors[0]} et al."
|
| 367 |
-
|
| 368 |
-
citation = f"{author_citation} ({year}). {title}. {journal}."
|
| 369 |
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
"authors": authors,
|
| 375 |
-
"journal": journal,
|
| 376 |
-
"year": year,
|
| 377 |
-
"text": abstract,
|
| 378 |
-
"url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
|
| 379 |
-
"pmc_url": pmc_link,
|
| 380 |
-
"has_full_text": has_full_text,
|
| 381 |
-
"source_type": "PubMed" + (" (Full Text Available)" if has_full_text else ""),
|
| 382 |
-
"citation": citation
|
| 383 |
-
}
|
| 384 |
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
if term.lower() in title.lower():
|
| 389 |
-
relevance_score += 2 # Higher weight for terms in title
|
| 390 |
-
if term.lower() in abstract.lower():
|
| 391 |
-
relevance_score += 1 # Lower weight for terms in abstract
|
| 392 |
|
| 393 |
-
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
break
|
| 399 |
-
|
| 400 |
-
except Exception as e:
|
| 401 |
-
print(f"Error parsing article {pmid}: {str(e)}")
|
| 402 |
-
continue
|
| 403 |
-
|
| 404 |
-
# Sort by relevance score
|
| 405 |
-
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
|
| 406 |
-
return results[:max_results]
|
| 407 |
-
|
| 408 |
-
except Exception as e:
|
| 409 |
-
print(f"Error in PubMed API fetch: {str(e)}")
|
| 410 |
return []
|
| 411 |
-
|
| 412 |
-
return results
|
| 413 |
|
| 414 |
def fetch_from_pmc_api(query, max_results=2, api_key=None):
|
| 415 |
"""Fetch free full text articles from PubMed Central (PMC)"""
|
|
@@ -749,346 +606,50 @@ def fetch_from_core_api(query, max_results=2, api_key=None):
|
|
| 749 |
except Exception:
|
| 750 |
return []
|
| 751 |
|
| 752 |
-
# Europe PMC API integration for open access full-text articles
|
| 753 |
-
def fetch_from_europe_pmc(query, max_results=2):
|
| 754 |
-
"""
|
| 755 |
-
Fetch research articles from Europe PMC's API, which provides better
|
| 756 |
-
access to full-text content than regular PubMed.
|
| 757 |
-
"""
|
| 758 |
-
results = []
|
| 759 |
-
|
| 760 |
-
# Process the query with medical term extraction if it's complex enough
|
| 761 |
-
if len(query.split()) > 3:
|
| 762 |
-
query_analysis = preprocess_medical_query(query)
|
| 763 |
-
search_query = query_analysis.get("optimized_search_query", query)
|
| 764 |
-
extracted_terms = query_analysis.get("extracted_terms", [])
|
| 765 |
-
else:
|
| 766 |
-
search_query = query
|
| 767 |
-
extracted_terms = [query]
|
| 768 |
-
|
| 769 |
-
try:
|
| 770 |
-
# Europe PMC REST API URL
|
| 771 |
-
api_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
|
| 772 |
-
|
| 773 |
-
# Request parameters
|
| 774 |
-
params = {
|
| 775 |
-
"query": search_query,
|
| 776 |
-
"format": "json",
|
| 777 |
-
"resultType": "core", # Core result fields
|
| 778 |
-
"pageSize": max_results * 2, # Get more results for filtering
|
| 779 |
-
"cursorMark": "*", # Starting point for pagination
|
| 780 |
-
"sort": "relevance", # Sort by relevance
|
| 781 |
-
"synonym": "TRUE", # Include MeSH term synonyms for better matching
|
| 782 |
-
"hasTextMinedTerms": "TRUE", # Filter for text-mined terms
|
| 783 |
-
"hasLabsLinks": "TRUE" # Include links to full text
|
| 784 |
-
}
|
| 785 |
-
|
| 786 |
-
# Make the request
|
| 787 |
-
response = requests.get(api_url, params=params)
|
| 788 |
-
|
| 789 |
-
if response.status_code != 200:
|
| 790 |
-
return []
|
| 791 |
-
|
| 792 |
-
data = response.json()
|
| 793 |
-
|
| 794 |
-
# Check if results exist
|
| 795 |
-
if "resultList" not in data or "result" not in data["resultList"]:
|
| 796 |
-
return []
|
| 797 |
-
|
| 798 |
-
# Process results
|
| 799 |
-
for article in data["resultList"]["result"]:
|
| 800 |
-
try:
|
| 801 |
-
# Extract basic article info
|
| 802 |
-
pmid = article.get("pmid", "")
|
| 803 |
-
title = article.get("title", "No title available")
|
| 804 |
-
|
| 805 |
-
# Extract abstract - Europe PMC sometimes provides better abstracts
|
| 806 |
-
abstract = article.get("abstractText", "Abstract not available")
|
| 807 |
-
|
| 808 |
-
# Get author information
|
| 809 |
-
authors = []
|
| 810 |
-
if "authorList" in article and "author" in article["authorList"]:
|
| 811 |
-
for author in article["authorList"]["author"]:
|
| 812 |
-
author_name = []
|
| 813 |
-
if "lastName" in author:
|
| 814 |
-
author_name.append(author["lastName"])
|
| 815 |
-
if "initials" in author:
|
| 816 |
-
author_name.append(author["initials"])
|
| 817 |
-
if author_name:
|
| 818 |
-
authors.append(" ".join(author_name))
|
| 819 |
-
|
| 820 |
-
# Get journal info
|
| 821 |
-
journal = article.get("journalTitle", "Unknown Journal")
|
| 822 |
-
year = article.get("pubYear", "Unknown Year")
|
| 823 |
-
|
| 824 |
-
# Check if full text is available
|
| 825 |
-
has_full_text = False
|
| 826 |
-
full_text_url = None
|
| 827 |
-
|
| 828 |
-
# Europe PMC provides several indicators for full text
|
| 829 |
-
if "isOpenAccess" in article and article["isOpenAccess"] == "Y":
|
| 830 |
-
has_full_text = True
|
| 831 |
-
|
| 832 |
-
# Get PMC ID if available
|
| 833 |
-
pmc_id = article.get("pmcid", "")
|
| 834 |
-
if pmc_id:
|
| 835 |
-
has_full_text = True
|
| 836 |
-
full_text_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
|
| 837 |
-
|
| 838 |
-
# Build citation
|
| 839 |
-
author_citation = ""
|
| 840 |
-
if authors:
|
| 841 |
-
if len(authors) == 1:
|
| 842 |
-
author_citation = authors[0]
|
| 843 |
-
elif len(authors) == 2:
|
| 844 |
-
author_citation = f"{authors[0]} & {authors[1]}"
|
| 845 |
-
else:
|
| 846 |
-
author_citation = f"{authors[0]} et al."
|
| 847 |
-
|
| 848 |
-
citation = f"{author_citation} ({year}). {title}. {journal}."
|
| 849 |
-
|
| 850 |
-
# Building the URL - prefer Europe PMC links as they often have better HTML rendering
|
| 851 |
-
url = f"https://europepmc.org/article/MED/{pmid}" if pmid else ""
|
| 852 |
-
|
| 853 |
-
# If no PMID but has DOI, use DOI link
|
| 854 |
-
if not url and "doi" in article:
|
| 855 |
-
url = f"https://doi.org/{article['doi']}"
|
| 856 |
-
|
| 857 |
-
# Use PMC link if available
|
| 858 |
-
if full_text_url:
|
| 859 |
-
url = full_text_url
|
| 860 |
-
|
| 861 |
-
# Create result object
|
| 862 |
-
result = {
|
| 863 |
-
"id": f"epmc-{pmid if pmid else article.get('id', uuid.uuid4().hex[:8])}",
|
| 864 |
-
"title": title,
|
| 865 |
-
"authors": authors,
|
| 866 |
-
"journal": journal,
|
| 867 |
-
"year": year,
|
| 868 |
-
"text": abstract,
|
| 869 |
-
"url": url,
|
| 870 |
-
"has_full_text": has_full_text,
|
| 871 |
-
"source_type": "Europe PMC" + (" (Full Text Available)" if has_full_text else ""),
|
| 872 |
-
"citation": citation
|
| 873 |
-
}
|
| 874 |
-
|
| 875 |
-
# Calculate relevance score
|
| 876 |
-
relevance_score = 0
|
| 877 |
-
for term in extracted_terms:
|
| 878 |
-
if term.lower() in title.lower():
|
| 879 |
-
relevance_score += 2 # Higher weight for terms in title
|
| 880 |
-
if term.lower() in abstract.lower():
|
| 881 |
-
relevance_score += 1 # Lower weight for terms in abstract
|
| 882 |
-
|
| 883 |
-
result["relevance_score"] = relevance_score
|
| 884 |
-
results.append(result)
|
| 885 |
-
|
| 886 |
-
except Exception as e:
|
| 887 |
-
print(f"Error processing Europe PMC article: {str(e)}")
|
| 888 |
-
continue
|
| 889 |
-
|
| 890 |
-
# Sort by relevance score
|
| 891 |
-
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
|
| 892 |
-
return results[:max_results]
|
| 893 |
-
|
| 894 |
-
except Exception as e:
|
| 895 |
-
print(f"Error in Europe PMC API fetch: {str(e)}")
|
| 896 |
-
return []
|
| 897 |
-
|
| 898 |
-
return results
|
| 899 |
-
|
| 900 |
-
# Relevance filtering for medical evidence
|
| 901 |
-
def assess_evidence_relevance(query, evidence_snippets, max_results=5):
|
| 902 |
-
"""
|
| 903 |
-
Use GPT-4o to assess the relevance of evidence snippets to the user query
|
| 904 |
-
and select the most applicable ones for response generation.
|
| 905 |
-
"""
|
| 906 |
-
if not evidence_snippets:
|
| 907 |
-
return []
|
| 908 |
-
|
| 909 |
-
try:
|
| 910 |
-
# If we already have ranking from the API calls, use that first
|
| 911 |
-
if "relevance_score" in evidence_snippets[0]:
|
| 912 |
-
# Sort by the existing relevance score
|
| 913 |
-
evidence_snippets.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
|
| 914 |
-
|
| 915 |
-
# If we have less than or equal to max_results, return all
|
| 916 |
-
if len(evidence_snippets) <= max_results:
|
| 917 |
-
return evidence_snippets
|
| 918 |
-
|
| 919 |
-
# If more than 10 snippets, only run GPT filtering on the top 10
|
| 920 |
-
snippets_to_assess = evidence_snippets[:min(10, len(evidence_snippets))]
|
| 921 |
-
else:
|
| 922 |
-
snippets_to_assess = evidence_snippets
|
| 923 |
-
|
| 924 |
-
# Prepare snippets for assessment
|
| 925 |
-
snippets_text = ""
|
| 926 |
-
for i, snippet in enumerate(snippets_to_assess):
|
| 927 |
-
snippets_text += f"---ARTICLE {i+1}---\n"
|
| 928 |
-
snippets_text += f"Title: {snippet['title']}\n"
|
| 929 |
-
snippets_text += f"Source: {snippet['source_type']}\n"
|
| 930 |
-
|
| 931 |
-
# Limit text length to avoid token limits
|
| 932 |
-
text = snippet['text']
|
| 933 |
-
if len(text) > 800:
|
| 934 |
-
text = text[:800] + "..."
|
| 935 |
-
|
| 936 |
-
snippets_text += f"Content: {text}\n\n"
|
| 937 |
-
|
| 938 |
-
# Create system prompt for assessment
|
| 939 |
-
system_prompt = """
|
| 940 |
-
You are an expert medical research assistant helping to select the most relevant medical evidence for a patient query.
|
| 941 |
-
|
| 942 |
-
Review the provided medical articles and evaluate their relevance to the patient's query.
|
| 943 |
-
Focus on these factors:
|
| 944 |
-
1. Direct relevance to the medical condition or symptoms described
|
| 945 |
-
2. Quality and comprehensiveness of the information
|
| 946 |
-
3. Whether it covers diagnosis, treatment, or management aspects needed
|
| 947 |
-
4. Recency and reliability of the source
|
| 948 |
-
5. Presence of actionable information that would help answer the query
|
| 949 |
-
|
| 950 |
-
For each article, assign a relevance score from 1-10 (10 being most relevant).
|
| 951 |
-
|
| 952 |
-
Return your assessment as a JSON object:
|
| 953 |
-
{
|
| 954 |
-
"article_rankings": [
|
| 955 |
-
{"article_index": 1, "relevance_score": 8, "reason": "Directly addresses the primary symptom with treatment options"},
|
| 956 |
-
{"article_index": 2, "relevance_score": 4, "reason": "Tangentially related but not focused on the main condition"},
|
| 957 |
-
...
|
| 958 |
-
],
|
| 959 |
-
"recommended_indices": [1, 3, 5] // Indices of the most relevant articles to use, in order of relevance
|
| 960 |
-
}
|
| 961 |
-
|
| 962 |
-
Include only the JSON in your response.
|
| 963 |
-
"""
|
| 964 |
-
|
| 965 |
-
# Call GPT-4o
|
| 966 |
-
response = openai.ChatCompletion.create(
|
| 967 |
-
model="gpt-4o-mini",
|
| 968 |
-
messages=[
|
| 969 |
-
{"role": "system", "content": system_prompt},
|
| 970 |
-
{"role": "user", "content": f"Patient query: {query}\n\nArticles to assess:\n{snippets_text}"}
|
| 971 |
-
],
|
| 972 |
-
temperature=0.1,
|
| 973 |
-
max_tokens=1000,
|
| 974 |
-
)
|
| 975 |
-
|
| 976 |
-
result_text = response.choices[0].message['content']
|
| 977 |
-
|
| 978 |
-
# Extract the JSON
|
| 979 |
-
try:
|
| 980 |
-
# Find JSON object
|
| 981 |
-
json_match = re.search(r'({[\s\S]*})', result_text)
|
| 982 |
-
if json_match:
|
| 983 |
-
json_str = json_match.group(1)
|
| 984 |
-
assessment = json.loads(json_str)
|
| 985 |
-
else:
|
| 986 |
-
assessment = json.loads(result_text)
|
| 987 |
-
|
| 988 |
-
# Get recommended indices
|
| 989 |
-
recommended_indices = assessment.get("recommended_indices", [])
|
| 990 |
-
|
| 991 |
-
# If no recommendations, fall back to top 5 from original list
|
| 992 |
-
if not recommended_indices and len(evidence_snippets) > max_results:
|
| 993 |
-
return evidence_snippets[:max_results]
|
| 994 |
-
|
| 995 |
-
# Filter evidence snippets based on recommended indices
|
| 996 |
-
filtered_snippets = []
|
| 997 |
-
for idx in recommended_indices:
|
| 998 |
-
if 0 <= idx-1 < len(snippets_to_assess):
|
| 999 |
-
filtered_snippets.append(snippets_to_assess[idx-1])
|
| 1000 |
-
|
| 1001 |
-
# If we have less than max_results, add more from original sorted list
|
| 1002 |
-
if len(filtered_snippets) < max_results and len(evidence_snippets) > len(filtered_snippets):
|
| 1003 |
-
# Get indices of snippets already included
|
| 1004 |
-
included_indices = set()
|
| 1005 |
-
for snippet in filtered_snippets:
|
| 1006 |
-
for i, original in enumerate(evidence_snippets):
|
| 1007 |
-
if snippet["id"] == original["id"]:
|
| 1008 |
-
included_indices.add(i)
|
| 1009 |
-
break
|
| 1010 |
-
|
| 1011 |
-
# Add more snippets that weren't already included
|
| 1012 |
-
for i, snippet in enumerate(evidence_snippets):
|
| 1013 |
-
if i not in included_indices and len(filtered_snippets) < max_results:
|
| 1014 |
-
filtered_snippets.append(snippet)
|
| 1015 |
-
|
| 1016 |
-
return filtered_snippets[:max_results]
|
| 1017 |
-
|
| 1018 |
-
except json.JSONDecodeError:
|
| 1019 |
-
# Fallback to sorted snippets if parsing fails
|
| 1020 |
-
return evidence_snippets[:max_results]
|
| 1021 |
-
|
| 1022 |
-
except Exception as e:
|
| 1023 |
-
print(f"Error in evidence relevance assessment: {str(e)}")
|
| 1024 |
-
# Fallback to the original sorting
|
| 1025 |
-
return evidence_snippets[:max_results]
|
| 1026 |
-
|
| 1027 |
# Enhanced RAG System with real medical sources
|
| 1028 |
def fetch_medical_evidence(query, max_results=5):
|
| 1029 |
-
"""Fetch medical evidence from multiple sources using real APIs
|
| 1030 |
-
|
| 1031 |
|
| 1032 |
# Define API keys
|
| 1033 |
pubmed_api_key = os.environ.get("PUBMED_API_KEY")
|
| 1034 |
core_api_key = os.environ.get("CORE_API_KEY")
|
| 1035 |
|
| 1036 |
-
# Step 1: Query preprocessing with GPT-4o to extract medical terms
|
| 1037 |
-
query_analysis = preprocess_medical_query(query)
|
| 1038 |
-
processed_query = query_analysis.get("optimized_search_query", query)
|
| 1039 |
-
|
| 1040 |
-
# Step 2: Gather evidence from multiple sources
|
| 1041 |
-
|
| 1042 |
# Source 1: PubMed API - prioritize for relevant medical research
|
| 1043 |
-
pubmed_results = fetch_from_pubmed_api(query, max_results=max(
|
| 1044 |
if pubmed_results:
|
| 1045 |
-
|
| 1046 |
|
| 1047 |
-
# Source 2:
|
| 1048 |
-
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
# Source 3: PubMed Central - free full text articles
|
| 1053 |
-
if len(all_results) < max_results * 2: # Get more than needed for filtering
|
| 1054 |
-
remaining = (max_results * 2) - len(all_results)
|
| 1055 |
-
pmc_results = fetch_from_pmc_api(processed_query, max_results=remaining, api_key=pubmed_api_key)
|
| 1056 |
if pmc_results:
|
| 1057 |
-
|
| 1058 |
|
| 1059 |
-
# Source
|
| 1060 |
-
if len(
|
| 1061 |
-
remaining =
|
| 1062 |
-
core_results = fetch_from_core_api(
|
| 1063 |
if core_results:
|
| 1064 |
-
|
| 1065 |
|
| 1066 |
-
# Source
|
| 1067 |
-
if len(
|
| 1068 |
-
remaining = max_results - len(
|
| 1069 |
-
who_results = fetch_from_who_api(
|
| 1070 |
if who_results:
|
| 1071 |
-
|
| 1072 |
|
| 1073 |
-
# Step 3: Initial sorting by source quality and full text availability
|
| 1074 |
# Prioritize sources with full text for better diagnosis
|
| 1075 |
-
|
| 1076 |
-
x.get("
|
| 1077 |
-
"
|
| 1078 |
-
"
|
| 1079 |
-
"
|
| 1080 |
-
"PMC" in x.get("source_type", ""), # PMC for free full text
|
| 1081 |
-
"PubMed" in x.get("source_type", "") # Regular PubMed last
|
| 1082 |
), reverse=True)
|
| 1083 |
|
| 1084 |
-
#
|
| 1085 |
-
# Only run this if we have more results than needed
|
| 1086 |
-
if len(all_results) > max_results:
|
| 1087 |
-
filtered_results = assess_evidence_relevance(query, all_results, max_results)
|
| 1088 |
-
else:
|
| 1089 |
-
filtered_results = all_results
|
| 1090 |
-
|
| 1091 |
-
return filtered_results # Return the filtered and ranked results
|
| 1092 |
|
| 1093 |
# Function to parse doctor agent responses
|
| 1094 |
def parse_doctor_response(response_text):
|
|
@@ -1158,7 +719,7 @@ def doctor_agent(messages):
|
|
| 1158 |
|
| 1159 |
# Single orchestrator turn with enhanced reasoning and citation tracking
|
| 1160 |
def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
| 1161 |
-
"""Handle a single turn of conversation with the doctor agent
|
| 1162 |
# Select appropriate system prompt based on whether this is a follow-up
|
| 1163 |
if is_follow_up:
|
| 1164 |
system = {"role": "system", "content": FOLLOW_UP_PROMPT}
|
|
@@ -1167,97 +728,30 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
|
| 1167 |
|
| 1168 |
msgs = [system] + history
|
| 1169 |
|
| 1170 |
-
# Evidence gathering
|
| 1171 |
evidence_snippets = []
|
| 1172 |
if use_rag:
|
| 1173 |
# Only fetch and format evidence if RAG is enabled
|
| 1174 |
evidence_snippets = fetch_medical_evidence(query)
|
| 1175 |
|
| 1176 |
-
# Format evidence for the model
|
| 1177 |
if evidence_snippets:
|
| 1178 |
evidence_text = "MEDICAL EVIDENCE FROM AUTHORITATIVE SOURCES:\n\n"
|
| 1179 |
|
| 1180 |
for i, snippet in enumerate(evidence_snippets):
|
| 1181 |
-
|
| 1182 |
-
evidence_text += f"
|
| 1183 |
-
evidence_text += f"
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
if "authors" in snippet and snippet["authors"]:
|
| 1187 |
-
authors_text = ", ".join(snippet["authors"][:3])
|
| 1188 |
-
if len(snippet["authors"]) > 3:
|
| 1189 |
-
authors_text += " et al."
|
| 1190 |
-
evidence_text += f"AUTHORS: {authors_text}\n"
|
| 1191 |
-
|
| 1192 |
-
# Add journal and year if available
|
| 1193 |
-
journal_info = []
|
| 1194 |
-
if "journal" in snippet and snippet["journal"]:
|
| 1195 |
-
journal_info.append(snippet["journal"])
|
| 1196 |
-
if "year" in snippet and snippet["year"]:
|
| 1197 |
-
journal_info.append(snippet["year"])
|
| 1198 |
-
if journal_info:
|
| 1199 |
-
evidence_text += f"PUBLICATION: {', '.join(journal_info)}\n"
|
| 1200 |
-
|
| 1201 |
-
# Format the source type with emphasis on full text availability
|
| 1202 |
-
source_type = snippet.get("source_type", "Unknown Source")
|
| 1203 |
-
evidence_text += f"SOURCE TYPE: {source_type}\n"
|
| 1204 |
-
|
| 1205 |
-
# Format the text with section labels if available
|
| 1206 |
-
text = snippet.get("text", "").strip()
|
| 1207 |
-
# Split by section labels if they exist (e.g., "METHODS:", "RESULTS:")
|
| 1208 |
-
sections = re.split(r'([A-Z][A-Z\s]+:)', text)
|
| 1209 |
-
|
| 1210 |
-
if len(sections) > 1:
|
| 1211 |
-
formatted_text = ""
|
| 1212 |
-
current_section = None
|
| 1213 |
-
for section in sections:
|
| 1214 |
-
if re.match(r'[A-Z][A-Z\s]+:', section):
|
| 1215 |
-
current_section = section
|
| 1216 |
-
formatted_text += f"\n{current_section}\n"
|
| 1217 |
-
elif current_section is not None:
|
| 1218 |
-
formatted_text += section.strip() + "\n"
|
| 1219 |
-
evidence_text += f"CONTENT:\n{formatted_text}\n"
|
| 1220 |
-
else:
|
| 1221 |
-
evidence_text += f"CONTENT:\n{text}\n"
|
| 1222 |
-
|
| 1223 |
-
# Add URL for verification
|
| 1224 |
-
evidence_text += f"URL: {snippet.get('url', 'No URL available')}\n"
|
| 1225 |
-
|
| 1226 |
-
# Additional link to full text if available
|
| 1227 |
-
if snippet.get("has_full_text", False) and snippet.get("pmc_url"):
|
| 1228 |
-
evidence_text += f"FULL TEXT: {snippet.get('pmc_url')}\n"
|
| 1229 |
-
|
| 1230 |
-
# Add citation
|
| 1231 |
-
evidence_text += f"CITATION: {snippet.get('citation', 'Citation not available')}\n\n"
|
| 1232 |
-
|
| 1233 |
-
# Add a separator between articles
|
| 1234 |
-
evidence_text += "----------------------------------------------\n\n"
|
| 1235 |
|
| 1236 |
# Enhanced instructions for better source utilization
|
| 1237 |
-
evidence_text += """CITATION
|
| 1238 |
-
|
| 1239 |
-
|
| 1240 |
-
|
| 1241 |
-
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
3. When multiple sources support a claim, cite all of them for stronger evidence.
|
| 1245 |
-
Example: "This treatment approach is supported by multiple studies [pubmed-12345678][epmc-87654321]."
|
| 1246 |
-
|
| 1247 |
-
4. For each diagnostic or treatment recommendation, provide at least one citation.
|
| 1248 |
-
|
| 1249 |
-
5. Read the CONTENT sections carefully and extract specific details - don't just cite generally.
|
| 1250 |
-
|
| 1251 |
-
6. If sources have conflicting information, acknowledge this and present both perspectives with citations.
|
| 1252 |
-
|
| 1253 |
-
7. Use the most recent sources when available, especially for treatment recommendations.
|
| 1254 |
-
|
| 1255 |
-
8. For each recommendation, try to provide evidence on:
|
| 1256 |
-
- Efficacy (how well it works)
|
| 1257 |
-
- Safety (potential side effects)
|
| 1258 |
-
- Appropriateness for this specific patient scenario
|
| 1259 |
-
|
| 1260 |
-
9. If full text is available, prioritize information from those sources as they contain more complete data.
|
| 1261 |
"""
|
| 1262 |
|
| 1263 |
msgs.append({"role": "system", "content": evidence_text})
|
|
@@ -1272,38 +766,31 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
|
| 1272 |
if use_rag:
|
| 1273 |
output_instructions = """
|
| 1274 |
Please structure your response clearly.
|
| 1275 |
-
|
| 1276 |
**Priority 1: Ask Clarifying Questions**
|
| 1277 |
If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
|
| 1278 |
-
|
| 1279 |
**Priority 2: Main Response (After Clarification)**
|
| 1280 |
Once sufficient information is available (either initially or after asking questions), provide:
|
| 1281 |
1. A direct answer to the patient's concerns.
|
| 1282 |
2. If appropriate, a clear diagnosis or differential diagnosis.
|
| 1283 |
3. Recommendations for a treatment plan or next steps.
|
| 1284 |
4. Ensure you cite medical evidence using the [source_id] format for any claims or information taken from the provided MEDICAL EVIDENCE snippets.
|
| 1285 |
-
|
| 1286 |
**After your main response, ALWAYS include these sections:**
|
| 1287 |
- **Reasoning**: Bullet points detailing your clinical reasoning.
|
| 1288 |
-
- **Sources**: A list of all references cited in your main response,
|
| 1289 |
"""
|
| 1290 |
else:
|
| 1291 |
# Different instructions when RAG is disabled - no mention of sources or citations
|
| 1292 |
output_instructions = """
|
| 1293 |
Please structure your response clearly.
|
| 1294 |
-
|
| 1295 |
**Priority 1: Ask Clarifying Questions**
|
| 1296 |
If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
|
| 1297 |
-
|
| 1298 |
**Priority 2: Main Response (After Clarification)**
|
| 1299 |
Once sufficient information is available (either initially or after asking questions), provide:
|
| 1300 |
1. A direct answer to the patient's concerns.
|
| 1301 |
2. If appropriate, a clear diagnosis or differential diagnosis.
|
| 1302 |
3. Recommendations for a treatment plan or next steps.
|
| 1303 |
-
|
| 1304 |
**After your main response, ALWAYS include this section:**
|
| 1305 |
- **Reasoning**: Bullet points detailing your clinical reasoning.
|
| 1306 |
-
|
| 1307 |
IMPORTANT: Since database search is disabled, do not include citations or sources in your response.
|
| 1308 |
"""
|
| 1309 |
|
|
@@ -1313,42 +800,261 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
|
| 1313 |
# Get response from doctor agent
|
| 1314 |
response = doctor_agent(msgs)
|
| 1315 |
|
| 1316 |
-
#
|
| 1317 |
-
explanation = None
|
| 1318 |
-
evidence = None
|
| 1319 |
-
|
| 1320 |
if use_rag:
|
| 1321 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1322 |
parsed_response = parse_doctor_response(response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1323 |
|
| 1324 |
-
#
|
| 1325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1326 |
|
| 1327 |
-
#
|
| 1328 |
-
|
| 1329 |
-
|
| 1330 |
-
|
| 1331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1332 |
else:
|
| 1333 |
-
|
| 1334 |
|
| 1335 |
-
#
|
| 1336 |
-
|
| 1337 |
-
|
| 1338 |
-
|
| 1339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1340 |
|
| 1341 |
-
#
|
| 1342 |
-
|
| 1343 |
-
|
| 1344 |
-
|
| 1345 |
-
|
| 1346 |
-
|
| 1347 |
-
|
| 1348 |
-
|
| 1349 |
-
|
| 1350 |
-
|
| 1351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1352 |
|
| 1353 |
# Enhanced interactive loop with better handling of consultations
|
| 1354 |
def run_consultation(use_rag=True):
|
|
|
|
| 27 |
# System prompts
|
| 28 |
SYSTEM_PROMPT = """You are an advanced clinical AI assistant designed to aid healthcare professionals.
|
| 29 |
Follow these guidelines in all responses:
|
|
|
|
| 30 |
1. **Clarify First**: Before providing any diagnosis or plan, if the user's query is underspecified, ALWAYS ask relevant clarifying questions to gather necessary patient information. This includes, but is not limited to, symptoms, duration, severity, medical history, age, lifestyle factors (diet, exercise), and current medications.
|
| 31 |
2. Professional tone: Maintain a clear, respectful, and professional tone appropriate for medical consultation.
|
| 32 |
3. Evidence-based practice: Base all responses on current medical evidence and guidelines.
|
|
|
|
| 36 |
7. Limitations: Acknowledge the limits of AI medical advice and recommend in-person consultation when appropriate.
|
| 37 |
8. Comprehensive approach: Consider differential diagnoses and relevant contextual factors.
|
| 38 |
9. Patient-centered: Focus on clinically relevant information while maintaining respect for the patient.
|
|
|
|
| 39 |
For each consultation:
|
| 40 |
1. Ask clarifying questions if needed (as per guideline 1).
|
| 41 |
2. Provide differential diagnosis with likelihood assessment.
|
| 42 |
3. Suggest appropriate next steps (testing, treatment, referral).
|
| 43 |
4. Include reasoning for your conclusions.
|
| 44 |
5. Cite medical literature or guidelines supporting your assessment using [source_id].
|
|
|
|
| 45 |
IMPORTANT: Your primary duty is to support clinical decision-making, not replace clinical judgment.
|
| 46 |
"""
|
| 47 |
|
| 48 |
FOLLOW_UP_PROMPT = """Continue this medical consultation based on the previous discussion.
|
| 49 |
Consider the information already gathered and the tentative diagnosis/plan.
|
|
|
|
| 50 |
When responding to the follow-up:
|
| 51 |
1. Reference relevant details from the prior conversation.
|
| 52 |
2. Address the specific follow-up question with evidence-based information.
|
|
|
|
| 54 |
4. Update recommendations if appropriate.
|
| 55 |
5. Maintain the same structured approach with transparent reasoning.
|
| 56 |
6. Cite additional medical literature or guidelines when relevant using [source_id].
|
|
|
|
| 57 |
Remember that this is an ongoing consultation where continuity of care is important.
|
| 58 |
"""
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
# Function to extract source IDs and replace them with actual links
|
| 61 |
def extract_and_link_sources(text, evidence_snippets):
|
| 62 |
+
"""Replace [source_id] placeholders with actual source information"""
|
| 63 |
+
source_pattern = r'\[([\w\d:_\-\.+]+)\]' # Expanded to handle more characters including +
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
matches = re.findall(source_pattern, text)
|
| 65 |
|
| 66 |
source_map = {} # Map to store source_id -> source data
|
|
|
|
| 73 |
"id": snippet["id"],
|
| 74 |
"title": snippet["title"].strip(),
|
| 75 |
"url": snippet["url"],
|
| 76 |
+
"citation": snippet["citation"]
|
|
|
|
|
|
|
|
|
|
| 77 |
}
|
| 78 |
break
|
| 79 |
|
| 80 |
# Next, try fuzzy matching for cases where the exact ID isn't matched
|
| 81 |
for source_id_match in matches:
|
| 82 |
if source_id_match not in source_map and source_id_match != "source_id":
|
|
|
|
|
|
|
|
|
|
| 83 |
for snippet in evidence_snippets:
|
| 84 |
+
# Try to match on partial IDs (e.g. part before a hyphen)
|
| 85 |
+
snippet_id_parts = snippet["id"].split("-")
|
| 86 |
+
source_id_parts = source_id_match.split("-")
|
| 87 |
+
|
| 88 |
+
# Check if the first parts match (journal name)
|
| 89 |
+
if (snippet_id_parts and source_id_parts and
|
| 90 |
+
snippet_id_parts[0] == source_id_parts[0]):
|
| 91 |
source_map[source_id_match] = {
|
| 92 |
"id": snippet["id"],
|
| 93 |
"title": snippet["title"].strip(),
|
| 94 |
"url": snippet["url"],
|
| 95 |
+
"citation": snippet["citation"]
|
|
|
|
|
|
|
|
|
|
| 96 |
}
|
| 97 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
# Handle generic [source_id] placeholder
|
| 100 |
if "source_id" in matches:
|
|
|
|
| 106 |
"id": snippet["id"],
|
| 107 |
"title": snippet["title"].strip(),
|
| 108 |
"url": snippet["url"],
|
| 109 |
+
"citation": snippet["citation"]
|
|
|
|
|
|
|
|
|
|
| 110 |
}
|
| 111 |
|
| 112 |
# Replace source_id placeholders with actual links in the text
|
|
|
|
| 114 |
for source_id_key, source_data in source_map.items():
|
| 115 |
safe_id = re.escape(source_id_key)
|
| 116 |
pattern = f"\\[{safe_id}\\]"
|
| 117 |
+
replacement = f"[{source_data['title']}]({source_data['url']})"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
linked_text = re.sub(pattern, replacement, linked_text)
|
| 119 |
|
| 120 |
# Handle remaining [source_id] placeholders
|
| 121 |
if "source_id" in source_map and "[source_id]" in linked_text:
|
| 122 |
generic_data = source_map["source_id"]
|
| 123 |
+
replacement = f"[{generic_data['title']}]({generic_data['url']})"
|
|
|
|
| 124 |
linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
|
| 125 |
|
| 126 |
# Final fallback for any [source_id] not mapped at all
|
|
|
|
| 130 |
|
| 131 |
# Implement PubMed API integration for medical evidence retrieval
def fetch_from_pubmed_api(query, max_results=3, api_key=None):
    """Fetch medical evidence from PubMed via the NCBI E-utilities API.

    The raw consultation query is first stripped of conversational filler
    (greetings, "I am a 42-year-old", etc.) and scanned for known symptom
    phrases; recognized symptoms are AND-joined to focus the search, with the
    cleaned free-text query OR'd in as a fallback.

    Parameters:
        query (str): Free-text patient/consultation query.
        max_results (int): Maximum number of articles to return.
        api_key (str | None): Optional NCBI API key (raises rate limits).

    Returns:
        list[dict]: One dict per article with keys: id, title, text (abstract,
        truncated to ~800 chars), citation, url, source_type, is_open_access.
        Returns [] on any network, HTTP, or parse failure (best-effort).
    """
    results = []

    # Clean up the query for better results: drop greetings and
    # first-person framing that would pollute a literature search.
    cleaned_query = re.sub(r'^(hi|hello|hey|greetings|good morning|good afternoon|good evening)[,\.]?\s+', '', query.lower())
    cleaned_query = re.sub(r"(i'?m|i am)\s+a\s+\d+[-\s]year[-\s]old", '', cleaned_query)
    cleaned_query = re.sub(r'(my name is|i am|i have been|i\'ve been|i was|i have|i\'ve had|i feel|i\'m feeling|i experienced)', '', cleaned_query)

    # Try to extract key medical symptoms from the original query.
    symptom_patterns = [
        r'(muscle weakness)', r'(fatigue)', r'(rash)', r'(pain)', r'(swelling)',
        r'(difficulty breathing|shortness of breath)', r'(fever)', r'(headache)',
        r'(nausea|vomiting)', r'(dizziness)', r'(numbness)', r'(tingling)'
    ]

    medical_terms = []
    for pattern in symptom_patterns:
        matches = re.findall(pattern, query.lower())
        if matches:
            medical_terms.extend(matches)

    # If we found medical terms, prioritize them in the search.
    if medical_terms:
        search_query = " AND ".join(medical_terms)
        # Add the complete cleaned query as a less weighted part.
        if cleaned_query:
            search_query = f"({search_query}) OR ({cleaned_query})"
    else:
        # If no medical terms found, use the cleaned query.
        search_query = cleaned_query

    # Base URL for PubMed E-utilities.
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Search parameters.
    # BUG FIX: pass the raw query string here. The previous code ran it
    # through urllib.parse.quote() first, but requests percent-encodes
    # params itself, so the term reached the API double-encoded.
    search_params = {
        "db": "pubmed",
        "term": search_query,
        "retmax": max_results,
        "retmode": "json",
        "sort": "relevance"
    }

    # Add API key if provided (increases rate limits).
    if api_key:
        search_params["api_key"] = api_key

    try:
        # First get article IDs.
        search_response = requests.get(f"{base_url}esearch.fcgi", params=search_params)
        if search_response.status_code != 200:
            return []

        search_data = search_response.json()

        if "esearchresult" in search_data and "idlist" in search_data["esearchresult"]:
            ids = search_data["esearchresult"]["idlist"]

            if ids:
                # Fetch article details.
                fetch_params = {
                    "db": "pubmed",
                    "id": ",".join(ids),
                    "retmode": "xml"
                }
                if api_key:
                    fetch_params["api_key"] = api_key

                fetch_response = requests.get(f"{base_url}efetch.fcgi", params=fetch_params)

                if fetch_response.status_code != 200:
                    return []

                try:
                    # Parse XML response.
                    root = ET.fromstring(fetch_response.text)

                    for article in root.findall(".//PubmedArticle"):
                        try:
                            pmid = article.findtext(".//PMID")
                            title = article.findtext(".//ArticleTitle") or "No title available"

                            # Extract abstract (may be split across sections).
                            abstract_elements = article.findall(".//AbstractText")
                            abstract = " ".join([(elem.text or "") for elem in abstract_elements])

                            # Extract authors.
                            authors = []
                            for author in article.findall(".//Author"):
                                last_name = author.findtext(".//LastName") or ""
                                initials = author.findtext(".//Initials") or ""
                                if last_name and initials:
                                    authors.append(f"{last_name} {initials}")

                            author_str = ", ".join(authors[:3])
                            if len(authors) > 3:
                                author_str += " et al."

                            # Extract journal and date.
                            journal = article.findtext(".//Journal/Title") or "Journal not specified"
                            year = article.findtext(".//PubDate/Year") or "N/A"

                            # Create citation.
                            citation = f"{author_str}. ({year}). {title}. {journal}. PMID: {pmid}"

                            # Create direct access URL.
                            url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"

                            # Check if free full text is available via PMC.
                            pmc_id = article.findtext(".//ArticleId[@IdType='pmc']")
                            has_free_text = bool(pmc_id) or article.findtext(".//PublicationStatus") == "epublish"

                            # If PMC ID is available, use that URL instead as it provides full text.
                            if pmc_id:
                                url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"

                            results.append({
                                "id": f"pubmed:{pmid}",
                                "title": title,
                                "text": abstract[:800] + "..." if len(abstract) > 800 else abstract,
                                "citation": citation,
                                "url": url,
                                "source_type": "PubMed" + (" (Free Full Text)" if has_free_text else ""),
                                "is_open_access": has_free_text
                            })
                        except Exception:
                            # Best-effort: skip any article that fails to parse.
                            continue
                except ET.ParseError:
                    return []

        return results
    except Exception:
        # Network or unexpected failure: degrade gracefully to "no evidence".
        return []
|
|
|
|
|
|
|
| 270 |
|
| 271 |
def fetch_from_pmc_api(query, max_results=2, api_key=None):
|
| 272 |
"""Fetch free full text articles from PubMed Central (PMC)"""
|
|
|
|
| 606 |
except Exception:
|
| 607 |
return []
|
| 608 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
# Enhanced RAG System with real medical sources
def fetch_medical_evidence(query, max_results=5):
    """Fetch medical evidence from multiple real APIs, best sources first.

    PubMed is queried first (reserved at least 2 slots); PMC, CORE and WHO
    are then tried in order, each only filling whatever shortfall remains.
    Results are finally sorted to prefer full-text sources.

    Parameters:
        query (str): Free-text consultation query.
        max_results (int): Maximum number of evidence snippets to return.

    Returns:
        list[dict]: At most max_results snippet dicts as produced by the
        individual fetchers (each fetcher returns [] on failure).
    """
    results = []

    # API keys are read from the environment; fetchers accept None.
    pubmed_api_key = os.environ.get("PUBMED_API_KEY")
    core_api_key = os.environ.get("CORE_API_KEY")

    # Source 1: PubMed API - prioritize for relevant medical research.
    pubmed_results = fetch_from_pubmed_api(query, max_results=max(2, max_results // 2), api_key=pubmed_api_key)
    if pubmed_results:
        results.extend(pubmed_results)

    # Sources 2-4 in priority order (PMC free full text, CORE open access,
    # WHO guidelines). Previously this was three copy-pasted
    # "if shortfall / fetch / extend" stanzas; a data-driven loop keeps the
    # behavior identical while removing the duplication.
    fallback_sources = (
        (fetch_from_pmc_api, {"api_key": pubmed_api_key}),
        (fetch_from_core_api, {"api_key": core_api_key}),
        (fetch_from_who_api, {}),
    )
    for fetch_fn, extra_kwargs in fallback_sources:
        if len(results) >= max_results:
            break
        remaining = max_results - len(results)
        fetched = fetch_fn(query, max_results=remaining, **extra_kwargs)
        if fetched:
            results.extend(fetched)

    # Prioritize sources with full text for better diagnosis.
    results.sort(key=lambda x: (
        "Full Text" in x.get("source_type", ""),
        "CORE" in x.get("source_type", ""),
        "PMC" in x.get("source_type", ""),
        "PubMed" in x.get("source_type", "")
    ), reverse=True)

    return results[:max_results]  # Limit to requested number after sorting
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
|
| 654 |
# Function to parse doctor agent responses
|
| 655 |
def parse_doctor_response(response_text):
|
|
|
|
| 719 |
|
| 720 |
# Single orchestrator turn with enhanced reasoning and citation tracking
|
| 721 |
def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
| 722 |
+
"""Handle a single turn of conversation with the doctor agent"""
|
| 723 |
# Select appropriate system prompt based on whether this is a follow-up
|
| 724 |
if is_follow_up:
|
| 725 |
system = {"role": "system", "content": FOLLOW_UP_PROMPT}
|
|
|
|
| 728 |
|
| 729 |
msgs = [system] + history
|
| 730 |
|
| 731 |
+
# Evidence gathering
|
| 732 |
evidence_snippets = []
|
| 733 |
if use_rag:
|
| 734 |
# Only fetch and format evidence if RAG is enabled
|
| 735 |
evidence_snippets = fetch_medical_evidence(query)
|
| 736 |
|
| 737 |
+
# Format evidence for the model
|
| 738 |
if evidence_snippets:
|
| 739 |
evidence_text = "MEDICAL EVIDENCE FROM AUTHORITATIVE SOURCES:\n\n"
|
| 740 |
|
| 741 |
for i, snippet in enumerate(evidence_snippets):
|
| 742 |
+
evidence_text += f"[{snippet['id']}] {snippet['title']}\n"
|
| 743 |
+
evidence_text += f"Source: {snippet['source_type']}\n"
|
| 744 |
+
evidence_text += f"Content: {snippet['text']}\n"
|
| 745 |
+
evidence_text += f"Citation: {snippet['citation']}\n"
|
| 746 |
+
evidence_text += f"URL: {snippet['url']}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
|
| 748 |
# Enhanced instructions for better source utilization
|
| 749 |
+
evidence_text += """CITATION INSTRUCTIONS:
|
| 750 |
+
1. When referencing these sources in your response, use the format [source_id] to cite them.
|
| 751 |
+
2. Prioritize information from sources marked with "Full Text Available" as they provide more comprehensive data.
|
| 752 |
+
3. CORE API sources provide open access full text articles that are particularly valuable for diagnosis.
|
| 753 |
+
4. Use the most relevant medical evidence to support your diagnostic reasoning.
|
| 754 |
+
5. Try to cite multiple sources to provide a well-rounded assessment.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 755 |
"""
|
| 756 |
|
| 757 |
msgs.append({"role": "system", "content": evidence_text})
|
|
|
|
| 766 |
if use_rag:
|
| 767 |
output_instructions = """
|
| 768 |
Please structure your response clearly.
|
|
|
|
| 769 |
**Priority 1: Ask Clarifying Questions**
|
| 770 |
If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
|
|
|
|
| 771 |
**Priority 2: Main Response (After Clarification)**
|
| 772 |
Once sufficient information is available (either initially or after asking questions), provide:
|
| 773 |
1. A direct answer to the patient's concerns.
|
| 774 |
2. If appropriate, a clear diagnosis or differential diagnosis.
|
| 775 |
3. Recommendations for a treatment plan or next steps.
|
| 776 |
4. Ensure you cite medical evidence using the [source_id] format for any claims or information taken from the provided MEDICAL EVIDENCE snippets.
|
|
|
|
| 777 |
**After your main response, ALWAYS include these sections:**
|
| 778 |
- **Reasoning**: Bullet points detailing your clinical reasoning.
|
| 779 |
+
- **Sources**: A list of all references cited in your main response, using their full titles and corresponding URLs if they were linked (e.g., [Title of Source](URL)). If a source was just an ID without a direct link in the text, list its ID or citation.
|
| 780 |
"""
|
| 781 |
else:
|
| 782 |
# Different instructions when RAG is disabled - no mention of sources or citations
|
| 783 |
output_instructions = """
|
| 784 |
Please structure your response clearly.
|
|
|
|
| 785 |
**Priority 1: Ask Clarifying Questions**
|
| 786 |
If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
|
|
|
|
| 787 |
**Priority 2: Main Response (After Clarification)**
|
| 788 |
Once sufficient information is available (either initially or after asking questions), provide:
|
| 789 |
1. A direct answer to the patient's concerns.
|
| 790 |
2. If appropriate, a clear diagnosis or differential diagnosis.
|
| 791 |
3. Recommendations for a treatment plan or next steps.
|
|
|
|
| 792 |
**After your main response, ALWAYS include this section:**
|
| 793 |
- **Reasoning**: Bullet points detailing your clinical reasoning.
|
|
|
|
| 794 |
IMPORTANT: Since database search is disabled, do not include citations or sources in your response.
|
| 795 |
"""
|
| 796 |
|
|
|
|
| 800 |
# Get response from doctor agent
|
| 801 |
response = doctor_agent(msgs)
|
| 802 |
|
| 803 |
+
# Process the response based on whether RAG is enabled
|
|
|
|
|
|
|
|
|
|
| 804 |
if use_rag:
|
| 805 |
+
# Process the response to replace source placeholders with actual links
|
| 806 |
+
linked_response, source_map = extract_and_link_sources(response, evidence_snippets)
|
| 807 |
+
|
| 808 |
+
# Parse the response
|
| 809 |
+
parsed_response = parse_doctor_response(linked_response)
|
| 810 |
+
|
| 811 |
+
# Enhance source information with evidence snippets data
|
| 812 |
+
enhanced_sources = []
|
| 813 |
+
# Use the source_map from extract_and_link_sources as the primary guide for cited sources
|
| 814 |
+
for source_id_key, mapped_data in source_map.items():
|
| 815 |
+
enhanced_sources.append({
|
| 816 |
+
"id": mapped_data["id"], # This is the original ID from the snippet
|
| 817 |
+
"title": mapped_data["title"],
|
| 818 |
+
"citation": mapped_data["citation"],
|
| 819 |
+
"url": mapped_data["url"],
|
| 820 |
+
"source_type": "Referenced Source" # Or derive from snippet if available
|
| 821 |
+
})
|
| 822 |
+
|
| 823 |
+
# Get source types and open access status from original snippets
|
| 824 |
+
for es in enhanced_sources:
|
| 825 |
+
for snippet in evidence_snippets:
|
| 826 |
+
if es["id"] == snippet["id"]:
|
| 827 |
+
es["source_type"] = snippet.get("source_type", "Referenced Source")
|
| 828 |
+
es["is_open_access"] = snippet.get("is_open_access", False)
|
| 829 |
+
break
|
| 830 |
+
|
| 831 |
+
# If there are sources in parsed_response["sources"] that are not in source_map
|
| 832 |
+
# (e.g., LLM hallucinated an ID or cited something not in snippets), add them.
|
| 833 |
+
current_enhanced_ids = {es['id'] for es in enhanced_sources}
|
| 834 |
+
|
| 835 |
+
for source_text in parsed_response["sources"]: # source_text could be "[id]", "title (url)", or just "citation"
|
| 836 |
+
source_id_candidate = source_text.strip("[]") # Basic extraction
|
| 837 |
+
|
| 838 |
+
# Check if this source_id_candidate was part of the original evidence
|
| 839 |
+
found_in_evidence = False
|
| 840 |
+
for snippet in evidence_snippets:
|
| 841 |
+
if source_id_candidate == snippet["id"]:
|
| 842 |
+
if source_id_candidate not in current_enhanced_ids:
|
| 843 |
+
enhanced_sources.append({
|
| 844 |
+
"id": snippet["id"],
|
| 845 |
+
"title": snippet["title"],
|
| 846 |
+
"citation": snippet["citation"],
|
| 847 |
+
"url": snippet["url"],
|
| 848 |
+
"source_type": snippet["source_type"],
|
| 849 |
+
"is_open_access": snippet.get("is_open_access", False)
|
| 850 |
+
})
|
| 851 |
+
current_enhanced_ids.add(snippet["id"]) # Add to set to avoid re-adding
|
| 852 |
+
found_in_evidence = True
|
| 853 |
+
break
|
| 854 |
+
|
| 855 |
+
if not found_in_evidence:
|
| 856 |
+
# If it's not in source_map and not directly in evidence_snippets by a simple ID match,
|
| 857 |
+
# it might be a raw citation or a URL. Add it with available info.
|
| 858 |
+
is_duplicate = False
|
| 859 |
+
for es_item in enhanced_sources:
|
| 860 |
+
if es_item["title"] == source_text or es_item["url"] == source_text or es_item["citation"] == source_text:
|
| 861 |
+
is_duplicate = True
|
| 862 |
+
break
|
| 863 |
+
if not is_duplicate and source_text not in current_enhanced_ids:
|
| 864 |
+
# Try to extract a URL if present in markdown format
|
| 865 |
+
url_match = re.search(r'\[(.*?)\]\((https?://[^)]+)\)', source_text)
|
| 866 |
+
if url_match:
|
| 867 |
+
title = url_match.group(1)
|
| 868 |
+
url = url_match.group(2)
|
| 869 |
+
else:
|
| 870 |
+
title = source_text # Could be a citation string or a plain title
|
| 871 |
+
url = "" # No URL found directly
|
| 872 |
+
|
| 873 |
+
enhanced_sources.append({
|
| 874 |
+
"id": source_id_candidate, # Use the candidate, might be a simple title or part of citation
|
| 875 |
+
"title": title,
|
| 876 |
+
"citation": source_text, # The original text from LLM's source list
|
| 877 |
+
"url": url,
|
| 878 |
+
"source_type": "Referenced Source (uncategorized)"
|
| 879 |
+
})
|
| 880 |
+
current_enhanced_ids.add(source_id_candidate)
|
| 881 |
+
|
| 882 |
+
# Add the enhanced sources back to the parsed response
|
| 883 |
+
parsed_response["enhanced_sources"] = enhanced_sources
|
| 884 |
+
main_response = linked_response
|
| 885 |
+
else:
|
| 886 |
+
# If RAG is disabled, just parse the response without source processing
|
| 887 |
parsed_response = parse_doctor_response(response)
|
| 888 |
+
parsed_response["enhanced_sources"] = []
|
| 889 |
+
main_response = response
|
| 890 |
+
|
| 891 |
+
# Create detailed explanation with reasoning and sources
|
| 892 |
+
explanation = []
|
| 893 |
+
|
| 894 |
+
# Add reasoning section
|
| 895 |
+
if parsed_response["reasoning"]:
|
| 896 |
+
explanation.append("## REASONING")
|
| 897 |
+
for i, reason in enumerate(parsed_response["reasoning"]):
|
| 898 |
+
explanation.append(f"{i+1}. {reason}")
|
| 899 |
+
explanation.append("")
|
| 900 |
+
|
| 901 |
+
# Only add sources section if RAG is enabled
|
| 902 |
+
if use_rag and parsed_response["enhanced_sources"]:
|
| 903 |
+
explanation.append("## SOURCES USED")
|
| 904 |
+
|
| 905 |
+
# Add enhanced sources first (these are the ones actually cited in the response)
|
| 906 |
+
source_added_count = 0
|
| 907 |
|
| 908 |
+
unique_sources_for_display = {} # id: {title, url, citation, source_type}
|
| 909 |
+
for source in parsed_response["enhanced_sources"]:
|
| 910 |
+
# Prefer using the mapped title and URL from extract_and_link_sources if available
|
| 911 |
+
display_id = source.get('id', source.get('title', 'Unknown Source'))
|
| 912 |
+
|
| 913 |
+
if display_id not in unique_sources_for_display:
|
| 914 |
+
unique_sources_for_display[display_id] = {
|
| 915 |
+
"title": source.get('title', 'N/A'),
|
| 916 |
+
"url": source.get('url', ''),
|
| 917 |
+
"citation": source.get('citation', ''),
|
| 918 |
+
"source_type": source.get('source_type', 'Referenced Source'),
|
| 919 |
+
"is_open_access": source.get('is_open_access', False)
|
| 920 |
+
}
|
| 921 |
+
|
| 922 |
+
# Create a categorized display of sources
|
| 923 |
+
source_categories = {
|
| 924 |
+
"CORE": [], # CORE API full text
|
| 925 |
+
"PMC": [], # PubMed Central full text
|
| 926 |
+
"PubMed": [], # PubMed abstracts
|
| 927 |
+
"WHO": [], # WHO guidelines
|
| 928 |
+
"Other": [] # Uncategorized
|
| 929 |
+
}
|
| 930 |
|
| 931 |
+
# Categorize sources
|
| 932 |
+
for key, src_data in unique_sources_for_display.items():
|
| 933 |
+
source_type = src_data['source_type']
|
| 934 |
+
|
| 935 |
+
if "CORE" in source_type:
|
| 936 |
+
source_categories["CORE"].append((key, src_data))
|
| 937 |
+
elif "PMC" in source_type:
|
| 938 |
+
source_categories["PMC"].append((key, src_data))
|
| 939 |
+
elif "PubMed" in source_type:
|
| 940 |
+
source_categories["PubMed"].append((key, src_data))
|
| 941 |
+
elif "WHO" in source_type:
|
| 942 |
+
source_categories["WHO"].append((key, src_data))
|
| 943 |
else:
|
| 944 |
+
source_categories["Other"].append((key, src_data))
|
| 945 |
|
| 946 |
+
# Display sources by category
|
| 947 |
+
for category, sources in source_categories.items():
|
| 948 |
+
if sources:
|
| 949 |
+
if category != "Other": # Skip category header for Other
|
| 950 |
+
explanation.append(f"### {category} Sources:")
|
| 951 |
+
|
| 952 |
+
for key, src_data in sources:
|
| 953 |
+
title = src_data['title']
|
| 954 |
+
url = src_data['url']
|
| 955 |
+
is_open_access = src_data.get('is_open_access', False)
|
| 956 |
+
|
| 957 |
+
if url: # If URL exists, make it a markdown link
|
| 958 |
+
explanation.append(f"- [{title}]({url}) {' π' if is_open_access else ''}")
|
| 959 |
+
else: # Otherwise, just list the title or ID
|
| 960 |
+
explanation.append(f"- {title}")
|
| 961 |
+
|
| 962 |
+
if src_data['source_type']:
|
| 963 |
+
explanation.append(f" Source Type: {src_data['source_type']}")
|
| 964 |
+
if src_data['citation']: # Always show citation if available
|
| 965 |
+
explanation.append(f" Citation: {src_data['citation']}")
|
| 966 |
+
explanation.append("") # Add a blank line for spacing
|
| 967 |
+
source_added_count += 1
|
| 968 |
+
|
| 969 |
+
if source_added_count == 0 and parsed_response["sources"]: # Fallback to raw sources if enhanced list is empty but LLM listed some
|
| 970 |
+
explanation.append("## SOURCES MENTIONED (Raw)") # Indicate these are less processed
|
| 971 |
+
for source_text in parsed_response["sources"]:
|
| 972 |
+
explanation.append(f"- {source_text.strip()}")
|
| 973 |
+
explanation.append("")
|
| 974 |
+
source_added_count +=1
|
| 975 |
+
|
| 976 |
+
# If we still have no sources, remove the header
|
| 977 |
+
if source_added_count == 0: # Check if any sources were actually added to explanation
|
| 978 |
+
# Remove "## SOURCES USED" header if it was added but no sources followed
|
| 979 |
+
if explanation and explanation[-1] == "## SOURCES USED":
|
| 980 |
+
explanation.pop()
|
| 981 |
+
|
| 982 |
+
# Enhanced version to display clickable article links
|
| 983 |
+
# Check if we have evidence snippets but no sources in the explanation
|
| 984 |
+
if evidence_snippets and "## SOURCES USED" not in "\n".join(explanation):
|
| 985 |
+
# If AI didn't explicitly cite sources, show available evidence anyway
|
| 986 |
+
additional_explanation = ["\n## AVAILABLE MEDICAL SOURCES"]
|
| 987 |
|
| 988 |
+
# Create categorized display of all available sources
|
| 989 |
+
categorized_snippets = {
|
| 990 |
+
"CORE Open Access": [], # CORE API full text
|
| 991 |
+
"PubMed Central": [], # PMC full text
|
| 992 |
+
"PubMed": [], # PubMed abstracts
|
| 993 |
+
"WHO Guidelines": [], # WHO guidelines
|
| 994 |
+
"Other": [] # Uncategorized
|
| 995 |
+
}
|
| 996 |
+
|
| 997 |
+
# Categorize snippets
|
| 998 |
+
for snippet in evidence_snippets:
|
| 999 |
+
source_type = snippet.get("source_type", "")
|
| 1000 |
+
|
| 1001 |
+
if "CORE" in source_type:
|
| 1002 |
+
categorized_snippets["CORE Open Access"].append(snippet)
|
| 1003 |
+
elif "PMC" in source_type:
|
| 1004 |
+
categorized_snippets["PubMed Central"].append(snippet)
|
| 1005 |
+
elif "PubMed" in source_type and "PMC" not in source_type:
|
| 1006 |
+
categorized_snippets["PubMed"].append(snippet)
|
| 1007 |
+
elif "WHO" in source_type:
|
| 1008 |
+
categorized_snippets["WHO Guidelines"].append(snippet)
|
| 1009 |
+
else:
|
| 1010 |
+
categorized_snippets["Other"].append(snippet)
|
| 1011 |
+
|
| 1012 |
+
# Display snippets by category
|
| 1013 |
+
for category, snippets in categorized_snippets.items():
|
| 1014 |
+
if snippets:
|
| 1015 |
+
if category != "Other": # Skip category header for Other
|
| 1016 |
+
additional_explanation.append(f"### {category}:")
|
| 1017 |
+
|
| 1018 |
+
for snippet in snippets:
|
| 1019 |
+
title = snippet.get("title", "Unknown Title")
|
| 1020 |
+
url = snippet.get("url", "")
|
| 1021 |
+
source_type = snippet.get("source_type", "Medical Source")
|
| 1022 |
+
is_open_access = snippet.get("is_open_access", False)
|
| 1023 |
+
|
| 1024 |
+
if url:
|
| 1025 |
+
# Format as clickable markdown link with open access indicator
|
| 1026 |
+
additional_explanation.append(f"- [{title}]({url}) {' π' if is_open_access else ''}")
|
| 1027 |
+
else:
|
| 1028 |
+
additional_explanation.append(f"- {title} {' π' if is_open_access else ''}")
|
| 1029 |
+
|
| 1030 |
+
if "source_type" in snippet:
|
| 1031 |
+
additional_explanation.append(f" Source Type: {snippet['source_type']}")
|
| 1032 |
+
if "citation" in snippet:
|
| 1033 |
+
additional_explanation.append(f" Citation: {snippet['citation']}")
|
| 1034 |
+
additional_explanation.append("")
|
| 1035 |
+
|
| 1036 |
+
# Add to the main explanation
|
| 1037 |
+
explanation.extend(additional_explanation)
|
| 1038 |
+
|
| 1039 |
+
# Add a note about data availability
|
| 1040 |
+
data_availability_note = [
|
| 1041 |
+
"\n## DATA AVAILABILITY NOTE",
|
| 1042 |
+
"- PubMed sources typically provide abstracts only, unless marked as free full text",
|
| 1043 |
+
"- PubMed Central (PMC) sources provide complete free full text articles",
|
| 1044 |
+
"- CORE Open Access sources provide full text content from research repositories",
|
| 1045 |
+
"- WHO Guidelines provide official medical recommendations and protocols",
|
| 1046 |
+
"- Sources marked with π indicate open access content with full text available"
|
| 1047 |
+
]
|
| 1048 |
+
explanation.extend(data_availability_note)
|
| 1049 |
+
|
| 1050 |
+
# Format explanation as string
|
| 1051 |
+
explanation_text = "\n".join(explanation)
|
| 1052 |
+
|
| 1053 |
+
# Update conversation history
|
| 1054 |
+
history.append({"role": "user", "content": query})
|
| 1055 |
+
history.append({"role": "assistant", "content": main_response})
|
| 1056 |
+
|
| 1057 |
+
return main_response, explanation_text, evidence_snippets
|
| 1058 |
|
| 1059 |
# Enhanced interactive loop with better handling of consultations
|
| 1060 |
def run_consultation(use_rag=True):
|