Update src/model.py
Browse files- src/model.py +440 -187
src/model.py
CHANGED
|
@@ -67,7 +67,7 @@ Remember that this is an ongoing consultation where continuity of care is import
|
|
| 67 |
def extract_and_link_sources(text, evidence_snippets):
|
| 68 |
"""
|
| 69 |
Replace [PMID:123456] citation placeholders with actual links to PubMed articles.
|
| 70 |
-
Also handles other citation formats for compatibility.
|
| 71 |
|
| 72 |
Args:
|
| 73 |
text (str): Text containing citations
|
|
@@ -78,16 +78,20 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 78 |
"""
|
| 79 |
# Look for [PMID:123456] format first (preferred)
|
| 80 |
pmid_pattern = r'\[PMID:(\d+)\]'
|
|
|
|
|
|
|
| 81 |
# Also look for older [source_id] format for compatibility
|
| 82 |
source_pattern = r'\[([\w\d:_\-\.+]+)\]'
|
| 83 |
|
| 84 |
# Find all PMID citations
|
| 85 |
pmid_matches = re.findall(pmid_pattern, text)
|
|
|
|
|
|
|
| 86 |
# Find all other citation formats
|
| 87 |
source_matches = re.findall(source_pattern, text)
|
| 88 |
|
| 89 |
-
# Remove PMID matches from source matches to avoid duplicates
|
| 90 |
-
source_matches = [s for s in source_matches if not s.startswith('PMID:')]
|
| 91 |
|
| 92 |
# Create source map
|
| 93 |
source_map = {}
|
|
@@ -116,6 +120,30 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 116 |
}
|
| 117 |
break
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# Process other citation formats for backward compatibility
|
| 120 |
for source_id_match in source_matches:
|
| 121 |
if source_id_match not in source_map and source_id_match != "source_id":
|
|
@@ -126,7 +154,8 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 126 |
"title": snippet["title"].strip(),
|
| 127 |
"url": snippet["url"],
|
| 128 |
"citation": snippet["citation"],
|
| 129 |
-
"pmid": snippet.get("pmid", "")
|
|
|
|
| 130 |
}
|
| 131 |
break
|
| 132 |
|
|
@@ -144,9 +173,22 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 144 |
|
| 145 |
linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
# Replace other citation formats
|
| 148 |
for source_id_key, source_data in source_map.items():
|
| 149 |
-
if not source_id_key.startswith("PMID:"):
|
| 150 |
safe_id = re.escape(source_id_key)
|
| 151 |
pattern = f"\\[{safe_id}\\]"
|
| 152 |
replacement = f"[{source_data['title']}]({source_data['url']})"
|
|
@@ -163,7 +205,8 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 163 |
"title": snippet["title"].strip(),
|
| 164 |
"url": snippet["url"],
|
| 165 |
"citation": snippet["citation"],
|
| 166 |
-
"pmid": snippet.get("pmid", "")
|
|
|
|
| 167 |
}
|
| 168 |
replacement = f"[{snippet['title']}]({snippet['url']})"
|
| 169 |
linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
|
|
@@ -171,6 +214,7 @@ def extract_and_link_sources(text, evidence_snippets):
|
|
| 171 |
# Final fallback for any remaining placeholders
|
| 172 |
linked_text = re.sub(r'\[source_id\]', "[Medical Reference]", linked_text)
|
| 173 |
linked_text = re.sub(r'\[PMID:(\d+)\]', r'[PubMed Article]', linked_text)
|
|
|
|
| 174 |
|
| 175 |
return linked_text, source_map
|
| 176 |
|
|
@@ -652,21 +696,333 @@ def fetch_from_core_api(query, max_results=2, api_key=None):
|
|
| 652 |
except Exception:
|
| 653 |
return []
|
| 654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
# Enhanced RAG System with focused PubMed searches
|
| 656 |
def fetch_medical_evidence(query, max_results=3):
|
| 657 |
"""
|
| 658 |
-
Fetch medical evidence using a
|
| 659 |
-
1. Search with extracted medical terms
|
| 660 |
-
2. Search with the original query
|
|
|
|
| 661 |
|
| 662 |
-
This provides better coverage and relevance
|
| 663 |
|
| 664 |
Args:
|
| 665 |
query (str): The user's original query
|
| 666 |
max_results (int): Maximum number of results to return (now set to 3)
|
| 667 |
|
| 668 |
Returns:
|
| 669 |
-
list: Combined and deduplicated results from
|
| 670 |
"""
|
| 671 |
# Define API key if available
|
| 672 |
pubmed_api_key = os.environ.get("PUBMED_API_KEY")
|
|
@@ -682,18 +1038,28 @@ def fetch_medical_evidence(query, max_results=3):
|
|
| 682 |
|
| 683 |
# Search with extracted terms (Search A)
|
| 684 |
# Increase from 2 to 3 results from this search
|
| 685 |
-
terms_results = enhanced_search_pubmed(terms_query, retmax=
|
| 686 |
else:
|
| 687 |
terms_results = []
|
| 688 |
|
| 689 |
# Step 2: Search with the full original query (Search B)
|
| 690 |
# Increase from 2 to 3 results from this search
|
| 691 |
print(f"Searching PubMed with full query")
|
| 692 |
-
full_query_results = enhanced_search_pubmed(query, retmax=
|
| 693 |
|
| 694 |
-
# Step 3:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
all_results = []
|
| 696 |
seen_pmids = set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
|
| 698 |
# Add results from terms search first (often more relevant)
|
| 699 |
for result in terms_results:
|
|
@@ -712,6 +1078,46 @@ def fetch_medical_evidence(query, max_results=3):
|
|
| 712 |
"pmid": pmid # Keep the original PMID for direct access
|
| 713 |
})
|
| 714 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 715 |
# Then add results from full query search
|
| 716 |
for result in full_query_results:
|
| 717 |
pmid = result["pmid"]
|
|
@@ -729,11 +1135,11 @@ def fetch_medical_evidence(query, max_results=3):
|
|
| 729 |
"pmid": pmid # Keep the original PMID for direct access
|
| 730 |
})
|
| 731 |
|
| 732 |
-
# Step
|
| 733 |
if not all_results:
|
| 734 |
-
print("No relevant medical evidence found
|
| 735 |
else:
|
| 736 |
-
print(f"Found {len(all_results)} relevant medical articles")
|
| 737 |
|
| 738 |
return all_results
|
| 739 |
|
|
@@ -840,8 +1246,12 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
|
| 840 |
|
| 841 |
1. IMPORTANT: You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
|
| 842 |
|
| 843 |
-
2. When citing information from these articles, use the
|
|
|
|
|
|
|
|
|
|
| 844 |
Example: "Recent studies have shown improved outcomes with early intervention [PMID:34567890]."
|
|
|
|
| 845 |
|
| 846 |
3. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
|
| 847 |
|
|
@@ -854,7 +1264,9 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
|
| 854 |
|
| 855 |
7. Use the most recent sources when available, especially for treatment recommendations.
|
| 856 |
|
| 857 |
-
8. If full text is available, prioritize information from those sources as they contain more complete data.
|
|
|
|
|
|
|
| 858 |
"""
|
| 859 |
|
| 860 |
msgs.append({"role": "system", "content": evidence_text})
|
|
@@ -878,15 +1290,21 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
|
|
| 878 |
1. A direct answer to the patient's concerns.
|
| 879 |
2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
|
| 880 |
3. Recommendations for a treatment plan or next steps.
|
| 881 |
-
4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using
|
|
|
|
|
|
|
|
|
|
|
|
|
| 882 |
|
| 883 |
**After your main response, ALWAYS include these sections:**
|
| 884 |
- **Reasoning**: Bullet points detailing your clinical reasoning.
|
| 885 |
-
- **Sources**: A list of all
|
| 886 |
- PMID: 12345678 - Author et al. (Year). Title. Journal.
|
| 887 |
URL: https://pubmed.ncbi.nlm.nih.gov/12345678/
|
|
|
|
|
|
|
| 888 |
|
| 889 |
-
IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references or
|
| 890 |
"""
|
| 891 |
else:
|
| 892 |
# Different instructions when RAG is disabled - no mention of sources or citations
|
|
@@ -1097,171 +1515,6 @@ def extract_medical_terms(query, max_terms=5):
|
|
| 1097 |
result = list(medical_terms)[:max_terms]
|
| 1098 |
return result
|
| 1099 |
|
| 1100 |
-
# Enhanced PubMed search function
|
| 1101 |
-
def enhanced_search_pubmed(query, retmax=2, api_key=None):
|
| 1102 |
-
"""
|
| 1103 |
-
Enhanced PubMed search using E-utilities API with improved parsing and error handling.
|
| 1104 |
-
|
| 1105 |
-
Args:
|
| 1106 |
-
query (str): Search query string
|
| 1107 |
-
retmax (int): Maximum number of results to return
|
| 1108 |
-
api_key (str, optional): NCBI API key for higher rate limits
|
| 1109 |
-
|
| 1110 |
-
Returns:
|
| 1111 |
-
list: List of article dictionaries with title, abstract, PMID, URL
|
| 1112 |
-
"""
|
| 1113 |
-
results = []
|
| 1114 |
-
|
| 1115 |
-
# Base URLs for PubMed E-utilities
|
| 1116 |
-
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
| 1117 |
-
|
| 1118 |
-
# Rate limiting - sleep to avoid hitting rate limits
|
| 1119 |
-
# NCBI allows 3 requests/second without API key, 10 with key
|
| 1120 |
-
time.sleep(0.33 if api_key is None else 0.1)
|
| 1121 |
-
|
| 1122 |
-
try:
|
| 1123 |
-
# Step 1: Use ESearch to get PMIDs
|
| 1124 |
-
search_params = {
|
| 1125 |
-
"db": "pubmed",
|
| 1126 |
-
"term": query,
|
| 1127 |
-
"retmax": retmax,
|
| 1128 |
-
"retmode": "json",
|
| 1129 |
-
"sort": "relevance"
|
| 1130 |
-
}
|
| 1131 |
-
|
| 1132 |
-
if api_key:
|
| 1133 |
-
search_params["api_key"] = api_key
|
| 1134 |
-
|
| 1135 |
-
search_response = requests.get(f"{base_url}esearch.fcgi", params=search_params)
|
| 1136 |
-
|
| 1137 |
-
if search_response.status_code != 200:
|
| 1138 |
-
print(f"PubMed search error: {search_response.status_code}")
|
| 1139 |
-
return []
|
| 1140 |
-
|
| 1141 |
-
search_data = search_response.json()
|
| 1142 |
-
|
| 1143 |
-
if "esearchresult" not in search_data or "idlist" not in search_data["esearchresult"]:
|
| 1144 |
-
print("No results found or invalid response format")
|
| 1145 |
-
return []
|
| 1146 |
-
|
| 1147 |
-
pmids = search_data["esearchresult"]["idlist"]
|
| 1148 |
-
|
| 1149 |
-
if not pmids:
|
| 1150 |
-
print("No PMIDs found for the query")
|
| 1151 |
-
return []
|
| 1152 |
-
|
| 1153 |
-
# Rate limiting before second request
|
| 1154 |
-
time.sleep(0.33 if api_key is None else 0.1)
|
| 1155 |
-
|
| 1156 |
-
# Step 2: Use EFetch to get article details with abstracts
|
| 1157 |
-
fetch_params = {
|
| 1158 |
-
"db": "pubmed",
|
| 1159 |
-
"id": ",".join(pmids),
|
| 1160 |
-
"retmode": "xml",
|
| 1161 |
-
"rettype": "abstract"
|
| 1162 |
-
}
|
| 1163 |
-
|
| 1164 |
-
if api_key:
|
| 1165 |
-
fetch_params["api_key"] = api_key
|
| 1166 |
-
|
| 1167 |
-
fetch_response = requests.get(f"{base_url}efetch.fcgi", params=fetch_params)
|
| 1168 |
-
|
| 1169 |
-
if fetch_response.status_code != 200:
|
| 1170 |
-
print(f"PubMed fetch error: {fetch_response.status_code}")
|
| 1171 |
-
return []
|
| 1172 |
-
|
| 1173 |
-
# Step 3: Parse XML response
|
| 1174 |
-
root = ET.fromstring(fetch_response.text)
|
| 1175 |
-
|
| 1176 |
-
for article in root.findall(".//PubmedArticle"):
|
| 1177 |
-
try:
|
| 1178 |
-
# Extract PMID
|
| 1179 |
-
pmid = article.findtext(".//PMID")
|
| 1180 |
-
if not pmid:
|
| 1181 |
-
continue
|
| 1182 |
-
|
| 1183 |
-
# Extract title
|
| 1184 |
-
title = article.findtext(".//ArticleTitle") or "No title available"
|
| 1185 |
-
|
| 1186 |
-
# Extract abstract sections with labels if available
|
| 1187 |
-
abstract_sections = []
|
| 1188 |
-
for abstract_text in article.findall(".//AbstractText"):
|
| 1189 |
-
label = abstract_text.get("Label", "")
|
| 1190 |
-
text = abstract_text.text or ""
|
| 1191 |
-
|
| 1192 |
-
if label and text:
|
| 1193 |
-
abstract_sections.append(f"{label}: {text}")
|
| 1194 |
-
elif text:
|
| 1195 |
-
abstract_sections.append(text)
|
| 1196 |
-
|
| 1197 |
-
# If no structured abstract, try to get the plain abstract
|
| 1198 |
-
if not abstract_sections:
|
| 1199 |
-
abstract_text = article.findtext(".//Abstract/AbstractText")
|
| 1200 |
-
if abstract_text:
|
| 1201 |
-
abstract_sections.append(abstract_text)
|
| 1202 |
-
|
| 1203 |
-
# Join all abstract sections
|
| 1204 |
-
abstract = " ".join(abstract_sections) or "Abstract not available"
|
| 1205 |
-
|
| 1206 |
-
# Extract authors
|
| 1207 |
-
authors = []
|
| 1208 |
-
for author in article.findall(".//Author"):
|
| 1209 |
-
last_name = author.findtext(".//LastName") or ""
|
| 1210 |
-
initials = author.findtext(".//Initials") or ""
|
| 1211 |
-
if last_name and initials:
|
| 1212 |
-
authors.append(f"{last_name} {initials}")
|
| 1213 |
-
|
| 1214 |
-
# Format authors for citation
|
| 1215 |
-
author_text = ""
|
| 1216 |
-
if authors:
|
| 1217 |
-
if len(authors) == 1:
|
| 1218 |
-
author_text = authors[0]
|
| 1219 |
-
elif len(authors) == 2:
|
| 1220 |
-
author_text = f"{authors[0]} & {authors[1]}"
|
| 1221 |
-
else:
|
| 1222 |
-
author_text = f"{authors[0]} et al."
|
| 1223 |
-
|
| 1224 |
-
# Extract journal and publication year
|
| 1225 |
-
journal = article.findtext(".//Journal/Title") or "Unknown Journal"
|
| 1226 |
-
year = article.findtext(".//PubDate/Year") or ""
|
| 1227 |
-
|
| 1228 |
-
# Create direct URL to PubMed article
|
| 1229 |
-
url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
|
| 1230 |
-
|
| 1231 |
-
# Create citation
|
| 1232 |
-
citation = f"{author_text}{' ' if author_text else ''}({year}). {title}. {journal}. PMID: {pmid}"
|
| 1233 |
-
|
| 1234 |
-
# Check for full text availability
|
| 1235 |
-
pmc_id = article.findtext(".//ArticleId[@IdType='pmc']")
|
| 1236 |
-
has_full_text = bool(pmc_id)
|
| 1237 |
-
full_text_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/" if pmc_id else None
|
| 1238 |
-
|
| 1239 |
-
# Create result object
|
| 1240 |
-
result = {
|
| 1241 |
-
"pmid": pmid,
|
| 1242 |
-
"title": title,
|
| 1243 |
-
"abstract": abstract,
|
| 1244 |
-
"authors": authors,
|
| 1245 |
-
"journal": journal,
|
| 1246 |
-
"year": year,
|
| 1247 |
-
"url": url,
|
| 1248 |
-
"full_text_url": full_text_url,
|
| 1249 |
-
"has_full_text": has_full_text,
|
| 1250 |
-
"citation": citation
|
| 1251 |
-
}
|
| 1252 |
-
|
| 1253 |
-
results.append(result)
|
| 1254 |
-
|
| 1255 |
-
except Exception as e:
|
| 1256 |
-
print(f"Error parsing article {pmid}: {str(e)}")
|
| 1257 |
-
continue
|
| 1258 |
-
|
| 1259 |
-
return results
|
| 1260 |
-
|
| 1261 |
-
except Exception as e:
|
| 1262 |
-
print(f"Error in PubMed search: {str(e)}")
|
| 1263 |
-
return []
|
| 1264 |
-
|
| 1265 |
# JSON schema for the search_pubmed function for API documentation
|
| 1266 |
SEARCH_PUBMED_SCHEMA = {
|
| 1267 |
"name": "search_pubmed",
|
|
|
|
| 67 |
def extract_and_link_sources(text, evidence_snippets):
|
| 68 |
"""
|
| 69 |
Replace [PMID:123456] citation placeholders with actual links to PubMed articles.
|
| 70 |
+
Also handles DOI citations and other citation formats for compatibility.
|
| 71 |
|
| 72 |
Args:
|
| 73 |
text (str): Text containing citations
|
|
|
|
| 78 |
"""
|
| 79 |
# Look for [PMID:123456] format first (preferred)
|
| 80 |
pmid_pattern = r'\[PMID:(\d+)\]'
|
| 81 |
+
# Look for [DOI:10.xxxx/yyyy] format for Europe PMC articles
|
| 82 |
+
doi_pattern = r'\[DOI:(10\.\d+\/[^\]]+)\]'
|
| 83 |
# Also look for older [source_id] format for compatibility
|
| 84 |
source_pattern = r'\[([\w\d:_\-\.+]+)\]'
|
| 85 |
|
| 86 |
# Find all PMID citations
|
| 87 |
pmid_matches = re.findall(pmid_pattern, text)
|
| 88 |
+
# Find all DOI citations
|
| 89 |
+
doi_matches = re.findall(doi_pattern, text)
|
| 90 |
# Find all other citation formats
|
| 91 |
source_matches = re.findall(source_pattern, text)
|
| 92 |
|
| 93 |
+
# Remove PMID and DOI matches from source matches to avoid duplicates
|
| 94 |
+
source_matches = [s for s in source_matches if not (s.startswith('PMID:') or s.startswith('DOI:'))]
|
| 95 |
|
| 96 |
# Create source map
|
| 97 |
source_map = {}
|
|
|
|
| 120 |
}
|
| 121 |
break
|
| 122 |
|
| 123 |
+
# Process DOI citations
|
| 124 |
+
for doi in doi_matches:
|
| 125 |
+
for snippet in evidence_snippets:
|
| 126 |
+
# Check if this is a direct DOI match
|
| 127 |
+
if 'doi' in snippet and snippet['doi'] == doi:
|
| 128 |
+
source_map[f"DOI:{doi}"] = {
|
| 129 |
+
"id": snippet.get("id", f"DOI:{doi}"),
|
| 130 |
+
"title": snippet["title"].strip(),
|
| 131 |
+
"url": snippet["url"],
|
| 132 |
+
"citation": snippet["citation"],
|
| 133 |
+
"doi": doi
|
| 134 |
+
}
|
| 135 |
+
break
|
| 136 |
+
# Also check the ID field which might contain DOI
|
| 137 |
+
elif snippet.get("id") == f"DOI:{doi}":
|
| 138 |
+
source_map[f"DOI:{doi}"] = {
|
| 139 |
+
"id": snippet["id"],
|
| 140 |
+
"title": snippet["title"].strip(),
|
| 141 |
+
"url": snippet["url"],
|
| 142 |
+
"citation": snippet["citation"],
|
| 143 |
+
"doi": doi
|
| 144 |
+
}
|
| 145 |
+
break
|
| 146 |
+
|
| 147 |
# Process other citation formats for backward compatibility
|
| 148 |
for source_id_match in source_matches:
|
| 149 |
if source_id_match not in source_map and source_id_match != "source_id":
|
|
|
|
| 154 |
"title": snippet["title"].strip(),
|
| 155 |
"url": snippet["url"],
|
| 156 |
"citation": snippet["citation"],
|
| 157 |
+
"pmid": snippet.get("pmid", ""),
|
| 158 |
+
"doi": snippet.get("doi", "")
|
| 159 |
}
|
| 160 |
break
|
| 161 |
|
|
|
|
| 173 |
|
| 174 |
linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
|
| 175 |
|
| 176 |
+
# Replace DOI citations with links
|
| 177 |
+
for doi_key in [f"DOI:{doi}" for doi in doi_matches]:
|
| 178 |
+
if doi_key in source_map:
|
| 179 |
+
source_data = source_map[doi_key]
|
| 180 |
+
safe_key = re.escape(doi_key)
|
| 181 |
+
pattern = f"\\[{safe_key}\\]"
|
| 182 |
+
|
| 183 |
+
# Create a replacement with title and URL
|
| 184 |
+
short_title = source_data['title'][:60] + "..." if len(source_data['title']) > 60 else source_data['title']
|
| 185 |
+
replacement = f"[{short_title}]({source_data['url']})"
|
| 186 |
+
|
| 187 |
+
linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
|
| 188 |
+
|
| 189 |
# Replace other citation formats
|
| 190 |
for source_id_key, source_data in source_map.items():
|
| 191 |
+
if not (source_id_key.startswith("PMID:") or source_id_key.startswith("DOI:")):
|
| 192 |
safe_id = re.escape(source_id_key)
|
| 193 |
pattern = f"\\[{safe_id}\\]"
|
| 194 |
replacement = f"[{source_data['title']}]({source_data['url']})"
|
|
|
|
| 205 |
"title": snippet["title"].strip(),
|
| 206 |
"url": snippet["url"],
|
| 207 |
"citation": snippet["citation"],
|
| 208 |
+
"pmid": snippet.get("pmid", ""),
|
| 209 |
+
"doi": snippet.get("doi", "")
|
| 210 |
}
|
| 211 |
replacement = f"[{snippet['title']}]({snippet['url']})"
|
| 212 |
linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
|
|
|
|
| 214 |
# Final fallback for any remaining placeholders
|
| 215 |
linked_text = re.sub(r'\[source_id\]', "[Medical Reference]", linked_text)
|
| 216 |
linked_text = re.sub(r'\[PMID:(\d+)\]', r'[PubMed Article]', linked_text)
|
| 217 |
+
linked_text = re.sub(r'\[DOI:(10\.\d+\/[^\]]+)\]', r'[Europe PMC Article]', linked_text)
|
| 218 |
|
| 219 |
return linked_text, source_map
|
| 220 |
|
|
|
|
| 696 |
except Exception:
|
| 697 |
return []
|
| 698 |
|
| 699 |
+
# Enhanced PubMed search function
|
| 700 |
+
def enhanced_search_pubmed(query, retmax=2, api_key=None, timeout=10):
    """
    Search PubMed through the NCBI E-utilities API and return parsed articles.

    Two HTTP calls are made: ESearch (JSON) to resolve the query into PMIDs,
    then EFetch (XML) to download the article records for those PMIDs.
    Any failure (network error, timeout, non-200 status, malformed payload)
    is reported with print() and results in an empty list, so callers can
    treat this function as best-effort.

    Args:
        query (str): Search query string
        retmax (int): Maximum number of results to return
        api_key (str, optional): NCBI API key for higher rate limits
        timeout (float): Seconds to wait for each HTTP request before giving
            up. Without a timeout a stalled connection would block forever.

    Returns:
        list: One dict per article with the keys "pmid", "title", "abstract",
            "authors", "journal", "year", "url", "full_text_url",
            "has_full_text" and "citation". Empty on error or no hits.
    """
    results = []

    # Base URL for PubMed E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Rate limiting - sleep to avoid hitting rate limits
    # NCBI allows 3 requests/second without API key, 10 with key
    pause = 0.33 if api_key is None else 0.1
    time.sleep(pause)

    try:
        # Step 1: Use ESearch to get PMIDs
        search_params = {
            "db": "pubmed",
            "term": query,
            "retmax": retmax,
            "retmode": "json",
            "sort": "relevance",
        }
        if api_key:
            search_params["api_key"] = api_key

        search_response = requests.get(
            f"{base_url}esearch.fcgi", params=search_params, timeout=timeout
        )
        if search_response.status_code != 200:
            print(f"PubMed search error: {search_response.status_code}")
            return []

        search_data = search_response.json()
        if "esearchresult" not in search_data or "idlist" not in search_data["esearchresult"]:
            print("No results found or invalid response format")
            return []

        pmids = search_data["esearchresult"]["idlist"]
        if not pmids:
            print("No PMIDs found for the query")
            return []

        # Rate limiting before second request
        time.sleep(pause)

        # Step 2: Use EFetch to get article details with abstracts
        fetch_params = {
            "db": "pubmed",
            "id": ",".join(pmids),
            "retmode": "xml",
            "rettype": "abstract",
        }
        if api_key:
            fetch_params["api_key"] = api_key

        fetch_response = requests.get(
            f"{base_url}efetch.fcgi", params=fetch_params, timeout=timeout
        )
        if fetch_response.status_code != 200:
            print(f"PubMed fetch error: {fetch_response.status_code}")
            return []

        # Step 3: Parse XML response
        root = ET.fromstring(fetch_response.text)

        for article in root.findall(".//PubmedArticle"):
            # Resolve the PMID before entering the try block so the error
            # message below can always reference it.
            pmid = article.findtext(".//PMID")
            if not pmid:
                continue
            try:
                results.append(_parse_pubmed_article(article, pmid))
            except Exception as e:
                # One malformed record must not discard the other articles.
                print(f"Error parsing article {pmid}: {str(e)}")
                continue

        return results

    except Exception as e:
        # Deliberate catch-all: evidence retrieval is optional for the caller,
        # so network/JSON/XML failures degrade to "no results".
        print(f"Error in PubMed search: {str(e)}")
        return []


def _pubmed_element_text(element):
    """Return all text inside *element*, including text of nested child elements."""
    if element is None:
        return ""
    # Element.text stops at the first child element; itertext() also yields
    # the text inside and after nested inline elements.
    return "".join(element.itertext()).strip()


def _parse_pubmed_article(article, pmid):
    """Convert one <PubmedArticle> element into the result dict for *pmid*."""
    title = _pubmed_element_text(article.find(".//ArticleTitle")) or "No title available"

    # Collect every abstract section, prefixing it with its Label when present.
    abstract_sections = []
    for section in article.findall(".//AbstractText"):
        section_text = _pubmed_element_text(section)
        if not section_text:
            continue
        label = section.get("Label", "")
        abstract_sections.append(f"{label}: {section_text}" if label else section_text)
    abstract = " ".join(abstract_sections) or "Abstract not available"

    # Only authors that have both a last name and initials are listed.
    authors = []
    for author in article.findall(".//Author"):
        last_name = author.findtext(".//LastName") or ""
        initials = author.findtext(".//Initials") or ""
        if last_name and initials:
            authors.append(f"{last_name} {initials}")

    # Format authors for citation
    if not authors:
        author_text = ""
    elif len(authors) == 1:
        author_text = authors[0]
    elif len(authors) == 2:
        author_text = f"{authors[0]} & {authors[1]}"
    else:
        author_text = f"{authors[0]} et al."

    journal = article.findtext(".//Journal/Title") or "Unknown Journal"
    year = article.findtext(".//PubDate/Year") or ""

    # Direct URL to the PubMed record
    url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"

    citation = f"{author_text}{' ' if author_text else ''}({year}). {title}. {journal}. PMID: {pmid}"

    # A PMC article id is used as the signal that full text is available.
    pmc_id = article.findtext(".//ArticleId[@IdType='pmc']")
    full_text_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/" if pmc_id else None

    return {
        "pmid": pmid,
        "title": title,
        "abstract": abstract,
        "authors": authors,
        "journal": journal,
        "year": year,
        "url": url,
        "full_text_url": full_text_url,
        "has_full_text": bool(pmc_id),
        "citation": citation,
    }
|
| 863 |
+
|
| 864 |
+
# Europe PMC search function
|
| 865 |
+
def search_europe_pmc(query, max_results=2, timeout=10):
    """
    Search Europe PMC for biomedical articles, preferring open-access records.

    The query is first run restricted to open-access articles. If that request
    fails or yields zero hits, the same query is retried without the
    open-access restriction. Any failure is reported with print() and results
    in an empty list, so callers can treat this function as best-effort.

    Args:
        query (str): Search query string
        max_results (int): Maximum number of results to return
        timeout (float): Seconds to wait for each HTTP request before giving
            up. Without a timeout a stalled connection would block forever.

    Returns:
        list: One dict per article with the keys "pmid", "doi", "title",
            "abstract", "authors", "journal", "year", "url", "full_text_url",
            "has_full_text", "citation", "source_type" and "is_open_access".
            "pmid" and "doi" may each be None. Empty on error or no hits.
    """
    results = []

    # Conservative pause before each search to stay under the service's rate limit
    time.sleep(2.0)

    # Europe PMC API base URL
    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    # Search parameters - specifically looking for open access when possible
    search_params = {
        "query": f"({query}) AND OPEN_ACCESS:y",  # Prioritize open access
        "format": "json",
        "pageSize": max_results,
        "resultType": "core",  # Get core metadata
    }

    try:
        print(f"Searching Europe PMC with query: {query}")
        data = _europe_pmc_fetch_json(base_url, search_params, timeout)

        # Retry without the open-access restriction when the restricted search
        # failed OR simply found nothing.
        if data is None or data.get("hitCount", 0) == 0:
            search_params["query"] = query
            data = _europe_pmc_fetch_json(base_url, search_params, timeout)
            if data is None:
                return []

        if data.get("hitCount", 0) == 0:
            print("No Europe PMC results found")
            return []

        articles = (data.get("resultList") or {}).get("result") or []

        for article in articles:
            try:
                results.append(_parse_europe_pmc_article(article))
            except Exception as e:
                # One malformed record must not discard the other articles.
                print(f"Error parsing Europe PMC article: {str(e)}")
                continue

        print(f"Found {len(results)} Europe PMC articles")
        return results

    except Exception as e:
        # Deliberate catch-all: evidence retrieval is optional for the caller,
        # so network/JSON failures degrade to "no results".
        print(f"Error in Europe PMC search: {str(e)}")
        return []


def _europe_pmc_fetch_json(base_url, params, timeout):
    """Run one Europe PMC search request; return decoded JSON, or None on a non-200 status."""
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        print(f"Europe PMC search error: {response.status_code}")
        return None
    return response.json()


def _parse_europe_pmc_article(article):
    """Convert one Europe PMC search result (a dict) into the shared result dict."""
    pmid = article.get("pmid")
    doi = article.get("doi")
    # `or` also covers keys that are present but empty/None.
    title = article.get("title") or "No title available"
    abstract = article.get("abstractText") or "Abstract not available"
    journal = article.get("journalTitle") or "Unknown Journal"
    pub_year = article.get("pubYear") or ""

    is_open_access = article.get("isOpenAccess") == "Y"

    # First link that is flagged open access or is a PDF, if any.
    direct_full_text_url = None
    for url_entry in (article.get("fullTextUrlList") or {}).get("fullTextUrl") or []:
        if url_entry.get("availability") == "Open access" or url_entry.get("documentStyle") == "pdf":
            direct_full_text_url = url_entry.get("url")
            break

    # Decide full-text availability BEFORE falling back to a landing page;
    # otherwise every article with a PMID or DOI would be reported as having
    # full text.
    has_full_text = is_open_access or bool(direct_full_text_url)

    # Landing page used when no direct full-text link exists.
    if pmid:
        landing_url = f"https://europepmc.org/article/MED/{pmid}"
    elif doi:
        landing_url = f"https://doi.org/{doi}"
    else:
        landing_url = None

    # "full_text_url" keeps its previous meaning (best available link,
    # possibly a landing page) so existing consumers keep working.
    full_text_url = direct_full_text_url or landing_url
    url = full_text_url or ""

    # Authors: last name is required, initials are optional.
    authors = []
    for author in (article.get("authorList") or {}).get("author") or []:
        last_name = author.get("lastName") or ""
        initials = author.get("initials") or ""
        if last_name:
            # strip() avoids a trailing space when initials are missing.
            authors.append(f"{last_name} {initials}".strip())

    # Format author citation
    if not authors:
        author_text = ""
    elif len(authors) == 1:
        author_text = authors[0]
    elif len(authors) == 2:
        author_text = f"{authors[0]} & {authors[1]}"
    else:
        author_text = f"{authors[0]} et al."

    citation = f"{author_text}{' ' if author_text else ''}({pub_year}). {title}. {journal}."
    if pmid:
        citation += f" PMID: {pmid}"
    if doi:
        citation += f" DOI: {doi}"

    # Source label with open-access indicator
    source_type = "Europe PMC" + (" (Open Access)" if is_open_access else "")

    # Same shape as the PubMed results, plus DOI / open-access fields.
    return {
        "pmid": pmid,  # May be None for some articles
        "doi": doi,  # Alternative identifier
        "title": title,
        "abstract": abstract,
        "authors": authors,
        "journal": journal,
        "year": pub_year,
        "url": url,
        "full_text_url": full_text_url,
        "has_full_text": has_full_text,
        "citation": citation,
        "source_type": source_type,
        "is_open_access": is_open_access,
    }
|
| 1009 |
+
|
| 1010 |
# Enhanced RAG system: focused PubMed searches plus a Europe PMC search
|
| 1011 |
def fetch_medical_evidence(query, max_results=3):
|
| 1012 |
"""
|
| 1013 |
+
Fetch medical evidence using a multi-source approach:
|
| 1014 |
+
1. Search with extracted medical terms in PubMed
|
| 1015 |
+
2. Search with the original query in PubMed
|
| 1016 |
+
3. Search in Europe PMC for additional full-text articles
|
| 1017 |
|
| 1018 |
+
This provides better coverage and relevance from multiple sources.
|
| 1019 |
|
| 1020 |
Args:
|
| 1021 |
query (str): The user's original query
|
| 1022 |
max_results (int): Maximum number of results to return (default: 3)
|
| 1023 |
|
| 1024 |
Returns:
|
| 1025 |
+
list: Combined and deduplicated results from all searches
|
| 1026 |
"""
|
| 1027 |
# Define API key if available
|
| 1028 |
pubmed_api_key = os.environ.get("PUBMED_API_KEY")
|
|
|
|
| 1038 |
|
| 1039 |
# Search with extracted terms (Search A)
|
| 1040 |
# Request up to 2 results from this search
|
| 1041 |
+
terms_results = enhanced_search_pubmed(terms_query, retmax=2, api_key=pubmed_api_key)
|
| 1042 |
else:
|
| 1043 |
terms_results = []
|
| 1044 |
|
| 1045 |
# Step 2: Search with the full original query (Search B)
|
| 1046 |
# Request up to 2 results from this search
|
| 1047 |
print(f"Searching PubMed with full query")
|
| 1048 |
+
full_query_results = enhanced_search_pubmed(query, retmax=2, api_key=pubmed_api_key)
|
| 1049 |
|
| 1050 |
+
# Step 3: Search in Europe PMC for additional results with full text
|
| 1051 |
+
print(f"Searching Europe PMC")
|
| 1052 |
+
europepmc_results = search_europe_pmc(query, max_results=2)
|
| 1053 |
+
|
| 1054 |
+
# Step 4: Combine results, ensuring no duplicates by PMID or DOI
|
| 1055 |
all_results = []
|
| 1056 |
seen_pmids = set()
|
| 1057 |
+
seen_dois = set()
|
| 1058 |
+
|
| 1059 |
+
# Process results in order of preference:
|
| 1060 |
+
# 1. Terms search from PubMed
|
| 1061 |
+
# 2. Europe PMC results (likely to have more full text)
|
| 1062 |
+
# 3. Full query search from PubMed
|
| 1063 |
|
| 1064 |
# Add results from terms search first (often more relevant)
|
| 1065 |
for result in terms_results:
|
|
|
|
| 1078 |
"pmid": pmid # Keep the original PMID for direct access
|
| 1079 |
})
|
| 1080 |
|
| 1081 |
+
# Add Europe PMC results next (prioritizing full text articles)
|
| 1082 |
+
for result in europepmc_results:
|
| 1083 |
+
# Some Europe PMC articles may not have a PMID, use DOI as fallback
|
| 1084 |
+
pmid = result.get("pmid")
|
| 1085 |
+
doi = result.get("doi")
|
| 1086 |
+
|
| 1087 |
+
# Skip if we've already seen this article via PMID
|
| 1088 |
+
if pmid and pmid in seen_pmids:
|
| 1089 |
+
continue
|
| 1090 |
+
|
| 1091 |
+
# Skip if we've already seen this article via DOI
|
| 1092 |
+
if doi and doi in seen_dois:
|
| 1093 |
+
continue
|
| 1094 |
+
|
| 1095 |
+
# Skip if we've reached our max
|
| 1096 |
+
if len(all_results) >= max_results:
|
| 1097 |
+
break
|
| 1098 |
+
|
| 1099 |
+
# Add to seen IDs
|
| 1100 |
+
if pmid:
|
| 1101 |
+
seen_pmids.add(pmid)
|
| 1102 |
+
if doi:
|
| 1103 |
+
seen_dois.add(doi)
|
| 1104 |
+
|
| 1105 |
+
# Create identifier
|
| 1106 |
+
identifier = f"PMID:{pmid}" if pmid else f"DOI:{doi}"
|
| 1107 |
+
|
| 1108 |
+
# Add to results
|
| 1109 |
+
all_results.append({
|
| 1110 |
+
"id": identifier,
|
| 1111 |
+
"title": result["title"],
|
| 1112 |
+
"text": result["abstract"],
|
| 1113 |
+
"citation": result["citation"],
|
| 1114 |
+
"url": result["url"],
|
| 1115 |
+
"source_type": result["source_type"],
|
| 1116 |
+
"is_open_access": result["is_open_access"],
|
| 1117 |
+
"pmid": pmid, # May be None
|
| 1118 |
+
"doi": doi # Alternative identifier
|
| 1119 |
+
})
|
| 1120 |
+
|
| 1121 |
# Then add results from full query search
|
| 1122 |
for result in full_query_results:
|
| 1123 |
pmid = result["pmid"]
|
|
|
|
| 1135 |
"pmid": pmid # Keep the original PMID for direct access
|
| 1136 |
})
|
| 1137 |
|
| 1138 |
+
# Step 5: Ensure we have at least some results
|
| 1139 |
if not all_results:
|
| 1140 |
+
print("No relevant medical evidence found")
|
| 1141 |
else:
|
| 1142 |
+
print(f"Found {len(all_results)} relevant medical articles across all sources")
|
| 1143 |
|
| 1144 |
return all_results
|
| 1145 |
|
|
|
|
| 1246 |
|
| 1247 |
1. IMPORTANT: You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
|
| 1248 |
|
| 1249 |
+
2. When citing information from these articles, use the following formats:
|
| 1250 |
+
• For PubMed articles: [PMID:123456] where 123456 is the actual PubMed ID
|
| 1251 |
+
• For Europe PMC articles without PMID: [DOI:10.xxxx/yyyy] where 10.xxxx/yyyy is the DOI
|
| 1252 |
+
|
| 1253 |
Example: "Recent studies have shown improved outcomes with early intervention [PMID:34567890]."
|
| 1254 |
+
Example: "Current guidelines recommend a multidisciplinary approach [DOI:10.1234/abcd]."
|
| 1255 |
|
| 1256 |
3. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
|
| 1257 |
|
|
|
|
| 1264 |
|
| 1265 |
7. Use the most recent sources when available, especially for treatment recommendations.
|
| 1266 |
|
| 1267 |
+
8. If full text is available (marked as "Open Access" or "Full Text Available"), prioritize information from those sources as they contain more complete data.
|
| 1268 |
+
|
| 1269 |
+
9. Europe PMC sources often provide more complete full text access, so give them equal consideration to PubMed sources.
|
| 1270 |
"""
|
| 1271 |
|
| 1272 |
msgs.append({"role": "system", "content": evidence_text})
|
|
|
|
| 1290 |
1. A direct answer to the patient's concerns.
|
| 1291 |
2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
|
| 1292 |
3. Recommendations for a treatment plan or next steps.
|
| 1293 |
+
4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using either:
|
| 1294 |
+
• [PMID:123456] format for PubMed articles
|
| 1295 |
+
• [DOI:10.xxxx/yyyy] format for Europe PMC articles without PMID
|
| 1296 |
+
|
| 1297 |
+
Use no more than 3 sources and no fewer than 2 sources.
|
| 1298 |
|
| 1299 |
**After your main response, ALWAYS include these sections:**
|
| 1300 |
- **Reasoning**: Bullet points detailing your clinical reasoning.
|
| 1301 |
+
- **Sources**: A list of all references cited in your main response (2-3 sources), formatted as:
|
| 1302 |
- PMID: 12345678 - Author et al. (Year). Title. Journal.
|
| 1303 |
URL: https://pubmed.ncbi.nlm.nih.gov/12345678/
|
| 1304 |
+
- DOI: 10.xxxx/yyyy - Author et al. (Year). Title. Journal.
|
| 1305 |
+
URL: https://doi.org/10.xxxx/yyyy
|
| 1306 |
|
| 1307 |
+
IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references, PMIDs, or DOIs.
|
| 1308 |
"""
|
| 1309 |
else:
|
| 1310 |
# Different instructions when RAG is disabled - no mention of sources or citations
|
|
|
|
| 1515 |
result = list(medical_terms)[:max_terms]
|
| 1516 |
return result
|
| 1517 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1518 |
# JSON schema for the search_pubmed function for API documentation
|
| 1519 |
SEARCH_PUBMED_SCHEMA = {
|
| 1520 |
"name": "search_pubmed",
|