Dhruv-Ty committed on
Commit
0a6ef9c
·
verified ·
1 Parent(s): bbf8466

Update src/model.py

Browse files
Files changed (1) hide show
  1. src/model.py +440 -187
src/model.py CHANGED
@@ -67,7 +67,7 @@ Remember that this is an ongoing consultation where continuity of care is import
67
  def extract_and_link_sources(text, evidence_snippets):
68
  """
69
  Replace [PMID:123456] citation placeholders with actual links to PubMed articles.
70
- Also handles other citation formats for compatibility.
71
 
72
  Args:
73
  text (str): Text containing citations
@@ -78,16 +78,20 @@ def extract_and_link_sources(text, evidence_snippets):
78
  """
79
  # Look for [PMID:123456] format first (preferred)
80
  pmid_pattern = r'\[PMID:(\d+)\]'
 
 
81
  # Also look for older [source_id] format for compatibility
82
  source_pattern = r'\[([\w\d:_\-\.+]+)\]'
83
 
84
  # Find all PMID citations
85
  pmid_matches = re.findall(pmid_pattern, text)
 
 
86
  # Find all other citation formats
87
  source_matches = re.findall(source_pattern, text)
88
 
89
- # Remove PMID matches from source matches to avoid duplicates
90
- source_matches = [s for s in source_matches if not s.startswith('PMID:')]
91
 
92
  # Create source map
93
  source_map = {}
@@ -116,6 +120,30 @@ def extract_and_link_sources(text, evidence_snippets):
116
  }
117
  break
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # Process other citation formats for backward compatibility
120
  for source_id_match in source_matches:
121
  if source_id_match not in source_map and source_id_match != "source_id":
@@ -126,7 +154,8 @@ def extract_and_link_sources(text, evidence_snippets):
126
  "title": snippet["title"].strip(),
127
  "url": snippet["url"],
128
  "citation": snippet["citation"],
129
- "pmid": snippet.get("pmid", "")
 
130
  }
131
  break
132
 
@@ -144,9 +173,22 @@ def extract_and_link_sources(text, evidence_snippets):
144
 
145
  linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  # Replace other citation formats
148
  for source_id_key, source_data in source_map.items():
149
- if not source_id_key.startswith("PMID:"):
150
  safe_id = re.escape(source_id_key)
151
  pattern = f"\\[{safe_id}\\]"
152
  replacement = f"[{source_data['title']}]({source_data['url']})"
@@ -163,7 +205,8 @@ def extract_and_link_sources(text, evidence_snippets):
163
  "title": snippet["title"].strip(),
164
  "url": snippet["url"],
165
  "citation": snippet["citation"],
166
- "pmid": snippet.get("pmid", "")
 
167
  }
168
  replacement = f"[{snippet['title']}]({snippet['url']})"
169
  linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
@@ -171,6 +214,7 @@ def extract_and_link_sources(text, evidence_snippets):
171
  # Final fallback for any remaining placeholders
172
  linked_text = re.sub(r'\[source_id\]', "[Medical Reference]", linked_text)
173
  linked_text = re.sub(r'\[PMID:(\d+)\]', r'[PubMed Article]', linked_text)
 
174
 
175
  return linked_text, source_map
176
 
@@ -652,21 +696,333 @@ def fetch_from_core_api(query, max_results=2, api_key=None):
652
  except Exception:
653
  return []
654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
  # Enhanced RAG System with focused PubMed searches
656
  def fetch_medical_evidence(query, max_results=3):
657
  """
658
- Fetch medical evidence using a dual-search approach:
659
- 1. Search with extracted medical terms
660
- 2. Search with the original query
 
661
 
662
- This provides better coverage and relevance while focusing exclusively on PubMed.
663
 
664
  Args:
665
  query (str): The user's original query
666
  max_results (int): Maximum number of results to return (now set to 3)
667
 
668
  Returns:
669
- list: Combined and deduplicated results from both searches
670
  """
671
  # Define API key if available
672
  pubmed_api_key = os.environ.get("PUBMED_API_KEY")
@@ -682,18 +1038,28 @@ def fetch_medical_evidence(query, max_results=3):
682
 
683
  # Search with extracted terms (Search A)
684
  # Increase from 2 to 3 results from this search
685
- terms_results = enhanced_search_pubmed(terms_query, retmax=3, api_key=pubmed_api_key)
686
  else:
687
  terms_results = []
688
 
689
  # Step 2: Search with the full original query (Search B)
690
  # Increase from 2 to 3 results from this search
691
  print(f"Searching PubMed with full query")
692
- full_query_results = enhanced_search_pubmed(query, retmax=3, api_key=pubmed_api_key)
693
 
694
- # Step 3: Combine results, ensuring no duplicates by PMID
 
 
 
 
695
  all_results = []
696
  seen_pmids = set()
 
 
 
 
 
 
697
 
698
  # Add results from terms search first (often more relevant)
699
  for result in terms_results:
@@ -712,6 +1078,46 @@ def fetch_medical_evidence(query, max_results=3):
712
  "pmid": pmid # Keep the original PMID for direct access
713
  })
714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  # Then add results from full query search
716
  for result in full_query_results:
717
  pmid = result["pmid"]
@@ -729,11 +1135,11 @@ def fetch_medical_evidence(query, max_results=3):
729
  "pmid": pmid # Keep the original PMID for direct access
730
  })
731
 
732
- # Step 4: Ensure we have at least some results
733
  if not all_results:
734
- print("No relevant medical evidence found in PubMed")
735
  else:
736
- print(f"Found {len(all_results)} relevant medical articles")
737
 
738
  return all_results
739
 
@@ -840,8 +1246,12 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
840
 
841
  1. IMPORTANT: You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
842
 
843
- 2. When citing information from these articles, use the format [PMID:123456] where 123456 is the actual PubMed ID.
 
 
 
844
  Example: "Recent studies have shown improved outcomes with early intervention [PMID:34567890]."
 
845
 
846
  3. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
847
 
@@ -854,7 +1264,9 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
854
 
855
  7. Use the most recent sources when available, especially for treatment recommendations.
856
 
857
- 8. If full text is available, prioritize information from those sources as they contain more complete data.
 
 
858
  """
859
 
860
  msgs.append({"role": "system", "content": evidence_text})
@@ -878,15 +1290,21 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
878
  1. A direct answer to the patient's concerns.
879
  2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
880
  3. Recommendations for a treatment plan or next steps.
881
- 4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using the [PMID:123456] format for claims or information taken from the provided PubMed articles. Use no more than 3 sources and no fewer than 2 sources.
 
 
 
 
882
 
883
  **After your main response, ALWAYS include these sections:**
884
  - **Reasoning**: Bullet points detailing your clinical reasoning.
885
- - **Sources**: A list of all PubMed references cited in your main response (2-3 sources), formatted as:
886
  - PMID: 12345678 - Author et al. (Year). Title. Journal.
887
  URL: https://pubmed.ncbi.nlm.nih.gov/12345678/
 
 
888
 
889
- IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references or PMIDs.
890
  """
891
  else:
892
  # Different instructions when RAG is disabled - no mention of sources or citations
@@ -1097,171 +1515,6 @@ def extract_medical_terms(query, max_terms=5):
1097
  result = list(medical_terms)[:max_terms]
1098
  return result
1099
 
1100
- # Enhanced PubMed search function
1101
- def enhanced_search_pubmed(query, retmax=2, api_key=None):
1102
- """
1103
- Enhanced PubMed search using E-utilities API with improved parsing and error handling.
1104
-
1105
- Args:
1106
- query (str): Search query string
1107
- retmax (int): Maximum number of results to return
1108
- api_key (str, optional): NCBI API key for higher rate limits
1109
-
1110
- Returns:
1111
- list: List of article dictionaries with title, abstract, PMID, URL
1112
- """
1113
- results = []
1114
-
1115
- # Base URLs for PubMed E-utilities
1116
- base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
1117
-
1118
- # Rate limiting - sleep to avoid hitting rate limits
1119
- # NCBI allows 3 requests/second without API key, 10 with key
1120
- time.sleep(0.33 if api_key is None else 0.1)
1121
-
1122
- try:
1123
- # Step 1: Use ESearch to get PMIDs
1124
- search_params = {
1125
- "db": "pubmed",
1126
- "term": query,
1127
- "retmax": retmax,
1128
- "retmode": "json",
1129
- "sort": "relevance"
1130
- }
1131
-
1132
- if api_key:
1133
- search_params["api_key"] = api_key
1134
-
1135
- search_response = requests.get(f"{base_url}esearch.fcgi", params=search_params)
1136
-
1137
- if search_response.status_code != 200:
1138
- print(f"PubMed search error: {search_response.status_code}")
1139
- return []
1140
-
1141
- search_data = search_response.json()
1142
-
1143
- if "esearchresult" not in search_data or "idlist" not in search_data["esearchresult"]:
1144
- print("No results found or invalid response format")
1145
- return []
1146
-
1147
- pmids = search_data["esearchresult"]["idlist"]
1148
-
1149
- if not pmids:
1150
- print("No PMIDs found for the query")
1151
- return []
1152
-
1153
- # Rate limiting before second request
1154
- time.sleep(0.33 if api_key is None else 0.1)
1155
-
1156
- # Step 2: Use EFetch to get article details with abstracts
1157
- fetch_params = {
1158
- "db": "pubmed",
1159
- "id": ",".join(pmids),
1160
- "retmode": "xml",
1161
- "rettype": "abstract"
1162
- }
1163
-
1164
- if api_key:
1165
- fetch_params["api_key"] = api_key
1166
-
1167
- fetch_response = requests.get(f"{base_url}efetch.fcgi", params=fetch_params)
1168
-
1169
- if fetch_response.status_code != 200:
1170
- print(f"PubMed fetch error: {fetch_response.status_code}")
1171
- return []
1172
-
1173
- # Step 3: Parse XML response
1174
- root = ET.fromstring(fetch_response.text)
1175
-
1176
- for article in root.findall(".//PubmedArticle"):
1177
- try:
1178
- # Extract PMID
1179
- pmid = article.findtext(".//PMID")
1180
- if not pmid:
1181
- continue
1182
-
1183
- # Extract title
1184
- title = article.findtext(".//ArticleTitle") or "No title available"
1185
-
1186
- # Extract abstract sections with labels if available
1187
- abstract_sections = []
1188
- for abstract_text in article.findall(".//AbstractText"):
1189
- label = abstract_text.get("Label", "")
1190
- text = abstract_text.text or ""
1191
-
1192
- if label and text:
1193
- abstract_sections.append(f"{label}: {text}")
1194
- elif text:
1195
- abstract_sections.append(text)
1196
-
1197
- # If no structured abstract, try to get the plain abstract
1198
- if not abstract_sections:
1199
- abstract_text = article.findtext(".//Abstract/AbstractText")
1200
- if abstract_text:
1201
- abstract_sections.append(abstract_text)
1202
-
1203
- # Join all abstract sections
1204
- abstract = " ".join(abstract_sections) or "Abstract not available"
1205
-
1206
- # Extract authors
1207
- authors = []
1208
- for author in article.findall(".//Author"):
1209
- last_name = author.findtext(".//LastName") or ""
1210
- initials = author.findtext(".//Initials") or ""
1211
- if last_name and initials:
1212
- authors.append(f"{last_name} {initials}")
1213
-
1214
- # Format authors for citation
1215
- author_text = ""
1216
- if authors:
1217
- if len(authors) == 1:
1218
- author_text = authors[0]
1219
- elif len(authors) == 2:
1220
- author_text = f"{authors[0]} & {authors[1]}"
1221
- else:
1222
- author_text = f"{authors[0]} et al."
1223
-
1224
- # Extract journal and publication year
1225
- journal = article.findtext(".//Journal/Title") or "Unknown Journal"
1226
- year = article.findtext(".//PubDate/Year") or ""
1227
-
1228
- # Create direct URL to PubMed article
1229
- url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
1230
-
1231
- # Create citation
1232
- citation = f"{author_text}{' ' if author_text else ''}({year}). {title}. {journal}. PMID: {pmid}"
1233
-
1234
- # Check for full text availability
1235
- pmc_id = article.findtext(".//ArticleId[@IdType='pmc']")
1236
- has_full_text = bool(pmc_id)
1237
- full_text_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/" if pmc_id else None
1238
-
1239
- # Create result object
1240
- result = {
1241
- "pmid": pmid,
1242
- "title": title,
1243
- "abstract": abstract,
1244
- "authors": authors,
1245
- "journal": journal,
1246
- "year": year,
1247
- "url": url,
1248
- "full_text_url": full_text_url,
1249
- "has_full_text": has_full_text,
1250
- "citation": citation
1251
- }
1252
-
1253
- results.append(result)
1254
-
1255
- except Exception as e:
1256
- print(f"Error parsing article {pmid}: {str(e)}")
1257
- continue
1258
-
1259
- return results
1260
-
1261
- except Exception as e:
1262
- print(f"Error in PubMed search: {str(e)}")
1263
- return []
1264
-
1265
  # JSON schema for the search_pubmed function for API documentation
1266
  SEARCH_PUBMED_SCHEMA = {
1267
  "name": "search_pubmed",
 
67
  def extract_and_link_sources(text, evidence_snippets):
68
  """
69
  Replace [PMID:123456] citation placeholders with actual links to PubMed articles.
70
+ Also handles DOI citations and other citation formats for compatibility.
71
 
72
  Args:
73
  text (str): Text containing citations
 
78
  """
79
  # Look for [PMID:123456] format first (preferred)
80
  pmid_pattern = r'\[PMID:(\d+)\]'
81
+ # Look for [DOI:10.xxxx/yyyy] format for Europe PMC articles
82
+ doi_pattern = r'\[DOI:(10\.\d+\/[^\]]+)\]'
83
  # Also look for older [source_id] format for compatibility
84
  source_pattern = r'\[([\w\d:_\-\.+]+)\]'
85
 
86
  # Find all PMID citations
87
  pmid_matches = re.findall(pmid_pattern, text)
88
+ # Find all DOI citations
89
+ doi_matches = re.findall(doi_pattern, text)
90
  # Find all other citation formats
91
  source_matches = re.findall(source_pattern, text)
92
 
93
+ # Remove PMID and DOI matches from source matches to avoid duplicates
94
+ source_matches = [s for s in source_matches if not (s.startswith('PMID:') or s.startswith('DOI:'))]
95
 
96
  # Create source map
97
  source_map = {}
 
120
  }
121
  break
122
 
123
+ # Process DOI citations
124
+ for doi in doi_matches:
125
+ for snippet in evidence_snippets:
126
+ # Check if this is a direct DOI match
127
+ if 'doi' in snippet and snippet['doi'] == doi:
128
+ source_map[f"DOI:{doi}"] = {
129
+ "id": snippet.get("id", f"DOI:{doi}"),
130
+ "title": snippet["title"].strip(),
131
+ "url": snippet["url"],
132
+ "citation": snippet["citation"],
133
+ "doi": doi
134
+ }
135
+ break
136
+ # Also check the ID field which might contain DOI
137
+ elif snippet.get("id") == f"DOI:{doi}":
138
+ source_map[f"DOI:{doi}"] = {
139
+ "id": snippet["id"],
140
+ "title": snippet["title"].strip(),
141
+ "url": snippet["url"],
142
+ "citation": snippet["citation"],
143
+ "doi": doi
144
+ }
145
+ break
146
+
147
  # Process other citation formats for backward compatibility
148
  for source_id_match in source_matches:
149
  if source_id_match not in source_map and source_id_match != "source_id":
 
154
  "title": snippet["title"].strip(),
155
  "url": snippet["url"],
156
  "citation": snippet["citation"],
157
+ "pmid": snippet.get("pmid", ""),
158
+ "doi": snippet.get("doi", "")
159
  }
160
  break
161
 
 
173
 
174
  linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
175
 
176
+ # Replace DOI citations with links
177
+ for doi_key in [f"DOI:{doi}" for doi in doi_matches]:
178
+ if doi_key in source_map:
179
+ source_data = source_map[doi_key]
180
+ safe_key = re.escape(doi_key)
181
+ pattern = f"\\[{safe_key}\\]"
182
+
183
+ # Create a replacement with title and URL
184
+ short_title = source_data['title'][:60] + "..." if len(source_data['title']) > 60 else source_data['title']
185
+ replacement = f"[{short_title}]({source_data['url']})"
186
+
187
+ linked_text = re.sub(f"\\[{safe_key}\\]", replacement, linked_text)
188
+
189
  # Replace other citation formats
190
  for source_id_key, source_data in source_map.items():
191
+ if not (source_id_key.startswith("PMID:") or source_id_key.startswith("DOI:")):
192
  safe_id = re.escape(source_id_key)
193
  pattern = f"\\[{safe_id}\\]"
194
  replacement = f"[{source_data['title']}]({source_data['url']})"
 
205
  "title": snippet["title"].strip(),
206
  "url": snippet["url"],
207
  "citation": snippet["citation"],
208
+ "pmid": snippet.get("pmid", ""),
209
+ "doi": snippet.get("doi", "")
210
  }
211
  replacement = f"[{snippet['title']}]({snippet['url']})"
212
  linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
 
214
  # Final fallback for any remaining placeholders
215
  linked_text = re.sub(r'\[source_id\]', "[Medical Reference]", linked_text)
216
  linked_text = re.sub(r'\[PMID:(\d+)\]', r'[PubMed Article]', linked_text)
217
+ linked_text = re.sub(r'\[DOI:(10\.\d+\/[^\]]+)\]', r'[Europe PMC Article]', linked_text)
218
 
219
  return linked_text, source_map
220
 
 
696
  except Exception:
697
  return []
698
 
699
+ # Enhanced PubMed search function
700
def enhanced_search_pubmed(query, retmax=2, api_key=None):
    """
    Search PubMed through the NCBI E-utilities API and return parsed articles.

    The search is a two-step exchange: ESearch resolves the query to a list of
    PMIDs, then EFetch downloads the XML records for those PMIDs. Any network,
    HTTP, JSON or XML failure is reported with ``print`` and results in an
    empty list; a failure while parsing a single article only skips that
    article.

    Args:
        query (str): Search query string
        retmax (int): Maximum number of results to return
        api_key (str, optional): NCBI API key for higher rate limits

    Returns:
        list: List of article dictionaries with the keys ``pmid``, ``title``,
            ``abstract``, ``authors``, ``journal``, ``year``, ``url``,
            ``full_text_url``, ``has_full_text`` and ``citation``
    """
    # Base URL for PubMed E-utilities
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    # Without a timeout a stalled connection would block the caller forever.
    request_timeout = 15

    # NCBI allows 3 requests/second without API key, 10 with key
    pause = 0.33 if api_key is None else 0.1

    # Nothing to search for: skip the network round trips entirely.
    if not query or not str(query).strip():
        return []

    # Rate limiting - sleep to avoid hitting rate limits
    time.sleep(pause)

    try:
        # Step 1: Use ESearch to get PMIDs
        search_params = {
            "db": "pubmed",
            "term": query,
            "retmax": retmax,
            "retmode": "json",
            "sort": "relevance"
        }
        if api_key:
            search_params["api_key"] = api_key

        search_response = requests.get(
            f"{base_url}esearch.fcgi",
            params=search_params,
            timeout=request_timeout,
        )
        if search_response.status_code != 200:
            print(f"PubMed search error: {search_response.status_code}")
            return []

        search_data = search_response.json()
        if "esearchresult" not in search_data or "idlist" not in search_data["esearchresult"]:
            print("No results found or invalid response format")
            return []

        pmids = search_data["esearchresult"]["idlist"]
        if not pmids:
            print("No PMIDs found for the query")
            return []

        # Rate limiting before second request
        time.sleep(pause)

        # Step 2: Use EFetch to get article details with abstracts
        fetch_params = {
            "db": "pubmed",
            "id": ",".join(pmids),
            "retmode": "xml",
            "rettype": "abstract"
        }
        if api_key:
            fetch_params["api_key"] = api_key

        fetch_response = requests.get(
            f"{base_url}efetch.fcgi",
            params=fetch_params,
            timeout=request_timeout,
        )
        if fetch_response.status_code != 200:
            print(f"PubMed fetch error: {fetch_response.status_code}")
            return []

        # Step 3: Parse XML response. Parsing the raw bytes lets the XML
        # parser honour the encoding declared in the document instead of the
        # charset guessed from the HTTP headers.
        root = ET.fromstring(fetch_response.content)

        results = []
        for article in root.findall(".//PubmedArticle"):
            # Bound before the try so the error message below can always
            # reference it, even if the PMID lookup itself fails.
            pmid = None
            try:
                # The article's own PMID lives directly under MedlineCitation;
                # other PMID elements further down belong to related records.
                pmid = (
                    article.findtext("./MedlineCitation/PMID")
                    or article.findtext(".//PMID")
                )
                if not pmid:
                    continue
                results.append(_parse_pubmed_article(article, pmid))
            except Exception as e:
                print(f"Error parsing article {pmid}: {str(e)}")
                continue

        return results

    except Exception as e:
        print(f"Error in PubMed search: {str(e)}")
        return []


def _pubmed_element_text(element):
    """Return the whitespace-normalised text of an element, including text inside inline markup."""
    if element is None:
        return ""
    # itertext() also yields text nested in child tags such as <i> or <sub>,
    # which a plain ``.text`` / ``findtext`` lookup would cut off.
    return " ".join("".join(element.itertext()).split())


def _parse_pubmed_article(article, pmid):
    """Convert one ``PubmedArticle`` XML element into the result dictionary."""
    # Extract title
    title = _pubmed_element_text(article.find(".//ArticleTitle")) or "No title available"

    # Extract abstract sections with labels if available. Only the primary
    # <Abstract> is used; <OtherAbstract> (e.g. translations) is a fallback.
    abstract_nodes = (
        article.findall(".//Abstract/AbstractText")
        or article.findall(".//OtherAbstract/AbstractText")
    )
    abstract_sections = []
    for node in abstract_nodes:
        label = node.get("Label", "")
        text = _pubmed_element_text(node)
        if label and text:
            abstract_sections.append(f"{label}: {text}")
        elif text:
            abstract_sections.append(text)

    # Join all abstract sections
    abstract = " ".join(abstract_sections) or "Abstract not available"

    # Extract authors (entries lacking a last name or initials are skipped)
    authors = []
    for author in article.findall(".//Author"):
        last_name = author.findtext(".//LastName") or ""
        initials = author.findtext(".//Initials") or ""
        if last_name and initials:
            authors.append(f"{last_name} {initials}")

    # Format authors for citation
    if not authors:
        author_text = ""
    elif len(authors) == 1:
        author_text = authors[0]
    elif len(authors) == 2:
        author_text = f"{authors[0]} & {authors[1]}"
    else:
        author_text = f"{authors[0]} et al."

    # Extract journal and publication year
    journal = article.findtext(".//Journal/Title") or "Unknown Journal"
    year = article.findtext(".//PubDate/Year") or ""
    if not year:
        # Some records carry only a free-form MedlineDate such as
        # "1998 Dec-1999 Jan"; use its leading four digits when present.
        medline_date = (article.findtext(".//PubDate/MedlineDate") or "").strip()
        if medline_date[:4].isdigit():
            year = medline_date[:4]

    # Create direct URL to PubMed article
    url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"

    # Create citation
    citation = f"{author_text}{' ' if author_text else ''}({year}). {title}. {journal}. PMID: {pmid}"

    # Check for full text availability. The lookup is anchored to the
    # article's own ArticleIdList so that PMC IDs of cited references
    # (under ReferenceList) are not mistaken for this article's PMC ID.
    pmc_id = article.findtext("./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']")
    has_full_text = bool(pmc_id)
    full_text_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/" if pmc_id else None

    return {
        "pmid": pmid,
        "title": title,
        "abstract": abstract,
        "authors": authors,
        "journal": journal,
        "year": year,
        "url": url,
        "full_text_url": full_text_url,
        "has_full_text": has_full_text,
        "citation": citation
    }
863
+
864
+ # Europe PMC search function
865
def search_europe_pmc(query, max_results=2):
    """
    Search Europe PMC for biomedical articles, preferring open-access records.

    The query is first sent with an ``OPEN_ACCESS:y`` filter. If that request
    fails or returns no articles, the search is retried once without the
    filter. Any network or JSON failure is reported with ``print`` and results
    in an empty list; a failure while parsing a single article only skips that
    article.

    Args:
        query (str): Search query string
        max_results (int): Maximum number of results to return

    Returns:
        list: List of article dictionaries with the keys ``pmid``, ``doi``,
            ``title``, ``abstract``, ``authors``, ``journal``, ``year``,
            ``url``, ``full_text_url``, ``has_full_text``, ``citation``,
            ``source_type`` and ``is_open_access``
    """
    # Europe PMC API base URL
    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    # Without a timeout a stalled connection would block the caller forever.
    request_timeout = 15

    # Nothing to search for: skip the delay and the network round trips.
    if not query or not str(query).strip():
        return []

    # Conservative rate limiting before contacting the service.
    # NOTE(review): the original comment cites a 30 requests/minute/IP limit;
    # confirm against the current Europe PMC usage policy.
    time.sleep(2.0)

    try:
        # Search parameters - specifically looking for open access when possible
        search_params = {
            "query": f"({query}) AND OPEN_ACCESS:y",  # Prioritize open access
            "format": "json",
            "pageSize": max_results,
            "resultType": "core"  # Get core metadata
        }

        print(f"Searching Europe PMC with query: {query}")
        articles = _europe_pmc_fetch_articles(base_url, search_params, request_timeout)

        if not articles:
            # Try again without the open access restriction, both when the
            # first request failed and when it simply matched nothing.
            search_params["query"] = query
            articles = _europe_pmc_fetch_articles(base_url, search_params, request_timeout)

        if not articles:
            print("No Europe PMC results found")
            return []

        # Process results
        results = []
        for article in articles:
            try:
                results.append(_parse_europe_pmc_article(article))
            except Exception as e:
                print(f"Error parsing Europe PMC article: {str(e)}")
                continue

        print(f"Found {len(results)} Europe PMC articles")
        return results

    except Exception as e:
        print(f"Error in Europe PMC search: {str(e)}")
        return []


def _europe_pmc_fetch_articles(base_url, params, timeout):
    """Run one Europe PMC search request and return its list of raw article records."""
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        print(f"Europe PMC search error: {response.status_code}")
        return []

    data = response.json()
    if not data.get("hitCount", 0):
        return []
    # ``or`` guards against explicit JSON nulls as well as missing keys.
    return (data.get("resultList") or {}).get("result") or []


def _parse_europe_pmc_article(article):
    """Convert one Europe PMC JSON record into the result dictionary."""
    # Extract basic metadata (``or`` also covers explicit JSON nulls)
    pmid = article.get("pmid")
    doi = article.get("doi")
    title = article.get("title") or "No title available"
    abstract = article.get("abstractText") or "Abstract not available"
    journal = article.get("journalTitle") or "Unknown Journal"
    pub_year = article.get("pubYear") or ""

    # Check if it's open access
    is_open_access = article.get("isOpenAccess") == "Y"

    # Look for a genuine full-text link among the listed URLs
    direct_full_text_url = None
    url_entries = (article.get("fullTextUrlList") or {}).get("fullTextUrl") or []
    for url_entry in url_entries:
        if url_entry.get("availability") == "Open access" or url_entry.get("documentStyle") == "pdf":
            direct_full_text_url = url_entry.get("url")
            break

    # Fall back to a landing page when no specific full-text link exists:
    # the Europe PMC record if we have a PMID, otherwise the DOI resolver.
    if direct_full_text_url:
        full_text_url = direct_full_text_url
    elif pmid:
        full_text_url = f"https://europepmc.org/article/MED/{pmid}"
    elif doi:
        full_text_url = f"https://doi.org/{doi}"
    else:
        full_text_url = None

    # Get authors; strip() avoids a trailing space when initials are missing
    authors = []
    for author in (article.get("authorList") or {}).get("author") or []:
        last_name = author.get("lastName") or ""
        initials = author.get("initials") or ""
        if last_name:
            authors.append(f"{last_name} {initials}".strip())

    # Format author citation
    if not authors:
        author_text = ""
    elif len(authors) == 1:
        author_text = authors[0]
    elif len(authors) == 2:
        author_text = f"{authors[0]} & {authors[1]}"
    else:
        author_text = f"{authors[0]} et al."

    # Create citation
    citation = f"{author_text}{' ' if author_text else ''}({pub_year}). {title}. {journal}."
    if pmid:
        citation += f" PMID: {pmid}"
    if doi:
        citation += f" DOI: {doi}"

    # Create source type with OA indicator
    source_type = "Europe PMC" + (" (Open Access)" if is_open_access else "")

    # Format for compatibility with existing code
    return {
        "pmid": pmid,  # May be None for some articles
        "doi": doi,  # Alternative identifier
        "title": title,
        "abstract": abstract,
        "authors": authors,
        "journal": journal,
        "year": pub_year,
        # full_text_url already includes the PMID/DOI landing-page fallbacks
        "url": full_text_url or "",
        "full_text_url": full_text_url,
        # Only a real full-text link or the open-access flag counts here; the
        # landing-page fallback alone does not mean full text is available.
        "has_full_text": is_open_access or bool(direct_full_text_url),
        "citation": citation,
        "source_type": source_type,
        "is_open_access": is_open_access
    }
1009
+
1010
  # Enhanced RAG system combining PubMed and Europe PMC searches
1011
  def fetch_medical_evidence(query, max_results=3):
1012
  """
1013
+ Fetch medical evidence using a multi-source approach:
1014
+ 1. Search with extracted medical terms in PubMed
1015
+ 2. Search with the original query in PubMed
1016
+ 3. Search in Europe PMC for additional full-text articles
1017
 
1018
+ This provides better coverage and relevance from multiple sources.
1019
 
1020
  Args:
1021
  query (str): The user's original query
1022
  max_results (int): Maximum number of results to return (default: 3)
1023
 
1024
  Returns:
1025
+ list: Combined and deduplicated results from all searches
1026
  """
1027
  # Define API key if available
1028
  pubmed_api_key = os.environ.get("PUBMED_API_KEY")
 
1038
 
1039
  # Search with extracted terms (Search A)
1040
  # Request up to 2 results from this search
1041
+ terms_results = enhanced_search_pubmed(terms_query, retmax=2, api_key=pubmed_api_key)
1042
  else:
1043
  terms_results = []
1044
 
1045
  # Step 2: Search with the full original query (Search B)
1046
  # Request up to 2 results from this search
1047
  print(f"Searching PubMed with full query")
1048
+ full_query_results = enhanced_search_pubmed(query, retmax=2, api_key=pubmed_api_key)
1049
 
1050
+ # Step 3: Search in Europe PMC for additional results with full text
1051
+ print(f"Searching Europe PMC")
1052
+ europepmc_results = search_europe_pmc(query, max_results=2)
1053
+
1054
+ # Step 4: Combine results, ensuring no duplicates by PMID or DOI
1055
  all_results = []
1056
  seen_pmids = set()
1057
+ seen_dois = set()
1058
+
1059
+ # Process results in order of preference:
1060
+ # 1. Terms search from PubMed
1061
+ # 2. Europe PMC results (likely to have more full text)
1062
+ # 3. Full query search from PubMed
1063
 
1064
  # Add results from terms search first (often more relevant)
1065
  for result in terms_results:
 
1078
  "pmid": pmid # Keep the original PMID for direct access
1079
  })
1080
 
1081
+ # Add Europe PMC results next (prioritizing full text articles)
1082
+ for result in europepmc_results:
1083
+ # Some Europe PMC articles may not have a PMID, use DOI as fallback
1084
+ pmid = result.get("pmid")
1085
+ doi = result.get("doi")
1086
+
1087
+ # Skip if we've already seen this article via PMID
1088
+ if pmid and pmid in seen_pmids:
1089
+ continue
1090
+
1091
+ # Skip if we've already seen this article via DOI
1092
+ if doi and doi in seen_dois:
1093
+ continue
1094
+
1095
+ # Skip if we've reached our max
1096
+ if len(all_results) >= max_results:
1097
+ break
1098
+
1099
+ # Add to seen IDs
1100
+ if pmid:
1101
+ seen_pmids.add(pmid)
1102
+ if doi:
1103
+ seen_dois.add(doi)
1104
+
1105
+ # Create identifier
1106
+ identifier = f"PMID:{pmid}" if pmid else f"DOI:{doi}"
1107
+
1108
+ # Add to results
1109
+ all_results.append({
1110
+ "id": identifier,
1111
+ "title": result["title"],
1112
+ "text": result["abstract"],
1113
+ "citation": result["citation"],
1114
+ "url": result["url"],
1115
+ "source_type": result["source_type"],
1116
+ "is_open_access": result["is_open_access"],
1117
+ "pmid": pmid, # May be None
1118
+ "doi": doi # Alternative identifier
1119
+ })
1120
+
1121
  # Then add results from full query search
1122
  for result in full_query_results:
1123
  pmid = result["pmid"]
 
1135
  "pmid": pmid # Keep the original PMID for direct access
1136
  })
1137
 
1138
+ # Step 5: Ensure we have at least some results
1139
  if not all_results:
1140
+ print("No relevant medical evidence found")
1141
  else:
1142
+ print(f"Found {len(all_results)} relevant medical articles across all sources")
1143
 
1144
  return all_results
1145
 
 
1246
 
1247
  1. IMPORTANT: You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
1248
 
1249
+ 2. When citing information from these articles, use the following formats:
1250
+ • For PubMed articles: [PMID:123456] where 123456 is the actual PubMed ID
1251
+ • For Europe PMC articles without PMID: [DOI:10.xxxx/yyyy] where 10.xxxx/yyyy is the DOI
1252
+
1253
  Example: "Recent studies have shown improved outcomes with early intervention [PMID:34567890]."
1254
+ Example: "Current guidelines recommend a multidisciplinary approach [DOI:10.1234/abcd]."
1255
 
1256
  3. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
1257
 
 
1264
 
1265
  7. Use the most recent sources when available, especially for treatment recommendations.
1266
 
1267
+ 8. If full text is available (marked as "Open Access" or "Full Text Available"), prioritize information from those sources as they contain more complete data.
1268
+
1269
+ 9. Europe PMC sources often provide more complete full text access, so give them equal consideration to PubMed sources.
1270
  """
1271
 
1272
  msgs.append({"role": "system", "content": evidence_text})
 
1290
  1. A direct answer to the patient's concerns.
1291
  2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
1292
  3. Recommendations for a treatment plan or next steps.
1293
+ 4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using either:
1294
+ • [PMID:123456] format for PubMed articles
1295
+ • [DOI:10.xxxx/yyyy] format for Europe PMC articles without PMID
1296
+
1297
+ Use no more than 3 sources and no fewer than 2 sources.
1298
 
1299
  **After your main response, ALWAYS include these sections:**
1300
  - **Reasoning**: Bullet points detailing your clinical reasoning.
1301
+ - **Sources**: A list of all references cited in your main response (2-3 sources), formatted as:
1302
  - PMID: 12345678 - Author et al. (Year). Title. Journal.
1303
  URL: https://pubmed.ncbi.nlm.nih.gov/12345678/
1304
+ - DOI: 10.xxxx/yyyy - Author et al. (Year). Title. Journal.
1305
+ URL: https://doi.org/10.xxxx/yyyy
1306
 
1307
+ IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references, PMIDs, or DOIs.
1308
  """
1309
  else:
1310
  # Different instructions when RAG is disabled - no mention of sources or citations
 
1515
  result = list(medical_terms)[:max_terms]
1516
  return result
1517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1518
  # JSON schema for the search_pubmed function for API documentation
1519
  SEARCH_PUBMED_SCHEMA = {
1520
  "name": "search_pubmed",