Dhruv-Ty committed on
Commit
5566537
·
verified ·
1 Parent(s): d118374

Update src/model.py

Browse files
Files changed (1) hide show
  1. src/model.py +410 -704
src/model.py CHANGED
@@ -27,7 +27,6 @@ openai.api_key = get_openai_api_key()
27
  # System prompts
28
  SYSTEM_PROMPT = """You are an advanced clinical AI assistant designed to aid healthcare professionals.
29
  Follow these guidelines in all responses:
30
-
31
  1. **Clarify First**: Before providing any diagnosis or plan, if the user's query is underspecified, ALWAYS ask relevant clarifying questions to gather necessary patient information. This includes, but is not limited to, symptoms, duration, severity, medical history, age, lifestyle factors (diet, exercise), and current medications.
32
  2. Professional tone: Maintain a clear, respectful, and professional tone appropriate for medical consultation.
33
  3. Evidence-based practice: Base all responses on current medical evidence and guidelines.
@@ -37,20 +36,17 @@ Follow these guidelines in all responses:
37
  7. Limitations: Acknowledge the limits of AI medical advice and recommend in-person consultation when appropriate.
38
  8. Comprehensive approach: Consider differential diagnoses and relevant contextual factors.
39
  9. Patient-centered: Focus on clinically relevant information while maintaining respect for the patient.
40
-
41
  For each consultation:
42
  1. Ask clarifying questions if needed (as per guideline 1).
43
  2. Provide differential diagnosis with likelihood assessment.
44
  3. Suggest appropriate next steps (testing, treatment, referral).
45
  4. Include reasoning for your conclusions.
46
  5. Cite medical literature or guidelines supporting your assessment using [source_id].
47
-
48
  IMPORTANT: Your primary duty is to support clinical decision-making, not replace clinical judgment.
49
  """
50
 
51
  FOLLOW_UP_PROMPT = """Continue this medical consultation based on the previous discussion.
52
  Consider the information already gathered and the tentative diagnosis/plan.
53
-
54
  When responding to the follow-up:
55
  1. Reference relevant details from the prior conversation.
56
  2. Address the specific follow-up question with evidence-based information.
@@ -58,82 +54,13 @@ When responding to the follow-up:
58
  4. Update recommendations if appropriate.
59
  5. Maintain the same structured approach with transparent reasoning.
60
  6. Cite additional medical literature or guidelines when relevant using [source_id].
61
-
62
  Remember that this is an ongoing consultation where continuity of care is important.
63
  """
64
 
65
- # Enhanced medical query preprocessing
66
- def preprocess_medical_query(query):
67
- """
68
- Use GPT-4o to extract key medical terms and concepts from the user query.
69
- This improves search relevance by identifying proper medical terminology.
70
- """
71
- try:
72
- system_prompt = """
73
- You are a medical term extraction system. Extract key medical concepts from the input query.
74
- Focus on:
75
- 1. Symptoms (e.g., fever, pain, cough)
76
- 2. Conditions (e.g., diabetes, hypertension)
77
- 3. Anatomical structures (e.g., liver, heart)
78
- 4. Medications (e.g., aspirin, insulin)
79
- 5. Procedures (e.g., MRI, surgery)
80
-
81
- For each term extracted, try to provide the corresponding medical terminology or MeSH term if applicable.
82
-
83
- Return your answer in this JSON format:
84
- {
85
- "extracted_terms": ["term1", "term2", ...],
86
- "mesh_mappings": {"term1": "MeSH term1", "term2": "MeSH term2", ...},
87
- "optimized_search_query": "term1 AND term2 AND term3..."
88
- }
89
-
90
- Include only the JSON object, nothing else.
91
- """
92
-
93
- response = openai.ChatCompletion.create(
94
- model="gpt-4o-mini", # Using gpt-4o-mini for efficiency, can upgrade to gpt-4o if needed
95
- messages=[
96
- {"role": "system", "content": system_prompt},
97
- {"role": "user", "content": f"Extract medical terms from this query: {query}"}
98
- ],
99
- temperature=0.1, # Low temperature for consistent outputs
100
- max_tokens=500,
101
- )
102
-
103
- # Extract json from response
104
- result_text = response.choices[0].message['content']
105
- try:
106
- # Find JSON object
107
- json_match = re.search(r'({[\s\S]*})', result_text)
108
- if json_match:
109
- json_str = json_match.group(1)
110
- return json.loads(json_str)
111
- else:
112
- return json.loads(result_text)
113
- except json.JSONDecodeError:
114
- # Fallback to original query if parsing fails
115
- return {
116
- "extracted_terms": [query],
117
- "mesh_mappings": {},
118
- "optimized_search_query": query
119
- }
120
- except Exception as e:
121
- print(f"Error in query preprocessing: {str(e)}")
122
- # Fallback to original query
123
- return {
124
- "extracted_terms": [query],
125
- "mesh_mappings": {},
126
- "optimized_search_query": query
127
- }
128
-
129
  # Function to extract source IDs and replace them with actual links
130
  def extract_and_link_sources(text, evidence_snippets):
131
- """
132
- Enhanced function to replace [source_id] placeholders with actual source information.
133
- Improved to handle various citation formats and provide richer context.
134
- """
135
- # Expanded pattern to handle more citation formats
136
- source_pattern = r'\[([\w\d:_\-\.+]+)\]' # Basic citation format [source_id]
137
  matches = re.findall(source_pattern, text)
138
 
139
  source_map = {} # Map to store source_id -> source data
@@ -146,54 +73,28 @@ def extract_and_link_sources(text, evidence_snippets):
146
  "id": snippet["id"],
147
  "title": snippet["title"].strip(),
148
  "url": snippet["url"],
149
- "citation": snippet["citation"],
150
- "has_full_text": snippet.get("has_full_text", False),
151
- "journal": snippet.get("journal", ""),
152
- "year": snippet.get("year", "")
153
  }
154
  break
155
 
156
  # Next, try fuzzy matching for cases where the exact ID isn't matched
157
  for source_id_match in matches:
158
  if source_id_match not in source_map and source_id_match != "source_id":
159
- # Try multiple matching strategies
160
-
161
- # Strategy 1: Match on ID prefix (e.g., pubmed-12345 might match pubmed-12345678)
162
  for snippet in evidence_snippets:
163
- if snippet["id"].startswith(source_id_match) or source_id_match.startswith(snippet["id"]):
 
 
 
 
 
 
164
  source_map[source_id_match] = {
165
  "id": snippet["id"],
166
  "title": snippet["title"].strip(),
167
  "url": snippet["url"],
168
- "citation": snippet["citation"],
169
- "has_full_text": snippet.get("has_full_text", False),
170
- "journal": snippet.get("journal", ""),
171
- "year": snippet.get("year", "")
172
  }
173
  break
174
-
175
- # Strategy 2: Try to match on partial IDs broken by source type
176
- if source_id_match not in source_map:
177
- # Split on common delimiters
178
- for snippet in evidence_snippets:
179
- snippet_id_parts = re.split(r'[-_:.]', snippet["id"])
180
- source_id_parts = re.split(r'[-_:.]', source_id_match)
181
-
182
- # Check if any significant parts match
183
- if (len(snippet_id_parts) > 0 and len(source_id_parts) > 0 and
184
- (snippet_id_parts[0] == source_id_parts[0] or # First part matches (e.g., "pubmed")
185
- (len(snippet_id_parts) > 1 and len(source_id_parts) > 1 and
186
- snippet_id_parts[1] == source_id_parts[1]))): # Second part matches (e.g., the ID number)
187
- source_map[source_id_match] = {
188
- "id": snippet["id"],
189
- "title": snippet["title"].strip(),
190
- "url": snippet["url"],
191
- "citation": snippet["citation"],
192
- "has_full_text": snippet.get("has_full_text", False),
193
- "journal": snippet.get("journal", ""),
194
- "year": snippet.get("year", "")
195
- }
196
- break
197
 
198
  # Handle generic [source_id] placeholder
199
  if "source_id" in matches:
@@ -205,10 +106,7 @@ def extract_and_link_sources(text, evidence_snippets):
205
  "id": snippet["id"],
206
  "title": snippet["title"].strip(),
207
  "url": snippet["url"],
208
- "citation": snippet["citation"],
209
- "has_full_text": snippet.get("has_full_text", False),
210
- "journal": snippet.get("journal", ""),
211
- "year": snippet.get("year", "")
212
  }
213
 
214
  # Replace source_id placeholders with actual links in the text
@@ -216,24 +114,13 @@ def extract_and_link_sources(text, evidence_snippets):
216
  for source_id_key, source_data in source_map.items():
217
  safe_id = re.escape(source_id_key)
218
  pattern = f"\\[{safe_id}\\]"
219
-
220
- # Create a more informative replacement that shows the title and preserves the source ID
221
- title = source_data['title']
222
- short_title = title[:60] + "..." if len(title) > 60 else title
223
-
224
- # Include year if available for better context
225
- year_text = f" ({source_data['year']})" if source_data.get('year') else ""
226
-
227
- # Create the replacement with hover tooltip (works in many markdown renderers)
228
- replacement = f"[{short_title}{year_text}]({source_data['url']} \"{title}\")"
229
-
230
  linked_text = re.sub(pattern, replacement, linked_text)
231
 
232
  # Handle remaining [source_id] placeholders
233
  if "source_id" in source_map and "[source_id]" in linked_text:
234
  generic_data = source_map["source_id"]
235
- year_text = f" ({generic_data['year']})" if generic_data.get('year') else ""
236
- replacement = f"[{generic_data['title']}{year_text}]({generic_data['url']})"
237
  linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
238
 
239
  # Final fallback for any [source_id] not mapped at all
@@ -243,29 +130,48 @@ def extract_and_link_sources(text, evidence_snippets):
243
 
244
  # Implement PubMed API integration for medical evidence retrieval
245
  def fetch_from_pubmed_api(query, max_results=3, api_key=None):
246
- """
247
- Enhanced PubMed API integration using E-utilities (ESearch + EFetch)
248
- to retrieve more detailed article information with better abstracts.
249
- """
250
  results = []
251
- base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
252
 
253
- # Step 1: Process the query with medical term extraction if it's complex enough
254
- if len(query.split()) > 3: # Only process complex queries
255
- query_analysis = preprocess_medical_query(query)
256
- search_query = query_analysis.get("optimized_search_query", query)
257
-
258
- # Get extracted terms for relevance scoring later
259
- extracted_terms = query_analysis.get("extracted_terms", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  else:
261
- search_query = query
262
- extracted_terms = [query]
263
 
264
- # Step 2: Use ESearch to find article IDs
 
 
 
 
 
 
265
  search_params = {
266
  "db": "pubmed",
267
- "term": search_query,
268
- "retmax": max_results * 2, # Get more results than needed for filtering
269
  "retmode": "json",
270
  "sort": "relevance"
271
  }
@@ -273,7 +179,7 @@ def fetch_from_pubmed_api(query, max_results=3, api_key=None):
273
  # Add API key if provided (increases rate limits)
274
  if api_key:
275
  search_params["api_key"] = api_key
276
-
277
  try:
278
  # First get article IDs
279
  search_response = requests.get(f"{base_url}esearch.fcgi", params=search_params)
@@ -284,132 +190,83 @@ def fetch_from_pubmed_api(query, max_results=3, api_key=None):
284
  search_data = search_response.json()
285
 
286
  if "esearchresult" in search_data and "idlist" in search_data["esearchresult"]:
287
- pmids = search_data["esearchresult"]["idlist"]
288
-
289
- if not pmids:
290
- return []
291
-
292
- # Step 3: Use EFetch to get article details
293
- fetch_params = {
294
- "db": "pubmed",
295
- "id": ",".join(pmids),
296
- "retmode": "xml",
297
- "rettype": "abstract"
298
- }
299
-
300
- if api_key:
301
- fetch_params["api_key"] = api_key
302
-
303
- fetch_response = requests.get(f"{base_url}efetch.fcgi", params=fetch_params)
304
-
305
- if fetch_response.status_code != 200:
306
- return []
307
-
308
- # Step 4: Parse XML to extract article details
309
- root = ET.fromstring(fetch_response.text)
310
 
311
- # Process each article
312
- for i, article in enumerate(root.findall(".//PubmedArticle")):
313
- try:
314
- # Extract PMID
315
- pmid = article.findtext(".//PMID")
316
-
317
- # Extract title
318
- title = article.findtext(".//ArticleTitle") or "No title available"
319
-
320
- # Extract authors
321
- authors = []
322
- for author in article.findall(".//Author"):
323
- last_name = author.findtext(".//LastName") or ""
324
- initials = author.findtext(".//Initials") or ""
325
- if last_name or initials:
326
- authors.append(f"{last_name} {initials}".strip())
327
-
328
- # Extract journal and publication date
329
- journal = article.findtext(".//Journal/Title") or "Unknown Journal"
330
- year = article.findtext(".//PubDate/Year") or "Unknown Year"
331
-
332
- # Extract abstract with sections
333
- abstract_texts = []
334
-
335
- # Get structured abstract if available
336
- abstract_sections = article.findall(".//AbstractText")
337
- if abstract_sections:
338
- for section in abstract_sections:
339
- label = section.get("Label", "")
340
- text = section.text or ""
341
- if label and text:
342
- abstract_texts.append(f"{label}: {text}")
343
- elif text:
344
- abstract_texts.append(text)
345
- else:
346
- # Try single abstract text
347
- main_abstract = article.findtext(".//Abstract/AbstractText")
348
- if main_abstract:
349
- abstract_texts.append(main_abstract)
350
-
351
- abstract = " ".join(abstract_texts) if abstract_texts else "Abstract not available"
352
-
353
- # Check if full text is available in PMC
354
- pmc_id = article.findtext(".//ArticleId[@IdType='pmc']")
355
- pmc_link = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/" if pmc_id else None
356
- has_full_text = bool(pmc_id)
357
-
358
- # Build citation
359
- author_citation = ""
360
- if authors:
361
- if len(authors) == 1:
362
- author_citation = authors[0]
363
- elif len(authors) == 2:
364
- author_citation = f"{authors[0]} & {authors[1]}"
365
- else:
366
- author_citation = f"{authors[0]} et al."
367
-
368
- citation = f"{author_citation} ({year}). {title}. {journal}."
369
 
370
- # Build result object
371
- result = {
372
- "id": f"pubmed-{pmid}",
373
- "title": title,
374
- "authors": authors,
375
- "journal": journal,
376
- "year": year,
377
- "text": abstract,
378
- "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
379
- "pmc_url": pmc_link,
380
- "has_full_text": has_full_text,
381
- "source_type": "PubMed" + (" (Full Text Available)" if has_full_text else ""),
382
- "citation": citation
383
- }
384
 
385
- # Calculate relevance score
386
- relevance_score = 0
387
- for term in extracted_terms:
388
- if term.lower() in title.lower():
389
- relevance_score += 2 # Higher weight for terms in title
390
- if term.lower() in abstract.lower():
391
- relevance_score += 1 # Lower weight for terms in abstract
392
 
393
- result["relevance_score"] = relevance_score
394
- results.append(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
- # If we have enough results, break
397
- if len(results) >= max_results:
398
- break
399
-
400
- except Exception as e:
401
- print(f"Error parsing article {pmid}: {str(e)}")
402
- continue
403
-
404
- # Sort by relevance score
405
- results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
406
- return results[:max_results]
407
-
408
- except Exception as e:
409
- print(f"Error in PubMed API fetch: {str(e)}")
410
  return []
411
-
412
- return results
413
 
414
  def fetch_from_pmc_api(query, max_results=2, api_key=None):
415
  """Fetch free full text articles from PubMed Central (PMC)"""
@@ -749,346 +606,50 @@ def fetch_from_core_api(query, max_results=2, api_key=None):
749
  except Exception:
750
  return []
751
 
752
- # Europe PMC API integration for open access full-text articles
753
- def fetch_from_europe_pmc(query, max_results=2):
754
- """
755
- Fetch research articles from Europe PMC's API, which provides better
756
- access to full-text content than regular PubMed.
757
- """
758
- results = []
759
-
760
- # Process the query with medical term extraction if it's complex enough
761
- if len(query.split()) > 3:
762
- query_analysis = preprocess_medical_query(query)
763
- search_query = query_analysis.get("optimized_search_query", query)
764
- extracted_terms = query_analysis.get("extracted_terms", [])
765
- else:
766
- search_query = query
767
- extracted_terms = [query]
768
-
769
- try:
770
- # Europe PMC REST API URL
771
- api_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
772
-
773
- # Request parameters
774
- params = {
775
- "query": search_query,
776
- "format": "json",
777
- "resultType": "core", # Core result fields
778
- "pageSize": max_results * 2, # Get more results for filtering
779
- "cursorMark": "*", # Starting point for pagination
780
- "sort": "relevance", # Sort by relevance
781
- "synonym": "TRUE", # Include MeSH term synonyms for better matching
782
- "hasTextMinedTerms": "TRUE", # Filter for text-mined terms
783
- "hasLabsLinks": "TRUE" # Include links to full text
784
- }
785
-
786
- # Make the request
787
- response = requests.get(api_url, params=params)
788
-
789
- if response.status_code != 200:
790
- return []
791
-
792
- data = response.json()
793
-
794
- # Check if results exist
795
- if "resultList" not in data or "result" not in data["resultList"]:
796
- return []
797
-
798
- # Process results
799
- for article in data["resultList"]["result"]:
800
- try:
801
- # Extract basic article info
802
- pmid = article.get("pmid", "")
803
- title = article.get("title", "No title available")
804
-
805
- # Extract abstract - Europe PMC sometimes provides better abstracts
806
- abstract = article.get("abstractText", "Abstract not available")
807
-
808
- # Get author information
809
- authors = []
810
- if "authorList" in article and "author" in article["authorList"]:
811
- for author in article["authorList"]["author"]:
812
- author_name = []
813
- if "lastName" in author:
814
- author_name.append(author["lastName"])
815
- if "initials" in author:
816
- author_name.append(author["initials"])
817
- if author_name:
818
- authors.append(" ".join(author_name))
819
-
820
- # Get journal info
821
- journal = article.get("journalTitle", "Unknown Journal")
822
- year = article.get("pubYear", "Unknown Year")
823
-
824
- # Check if full text is available
825
- has_full_text = False
826
- full_text_url = None
827
-
828
- # Europe PMC provides several indicators for full text
829
- if "isOpenAccess" in article and article["isOpenAccess"] == "Y":
830
- has_full_text = True
831
-
832
- # Get PMC ID if available
833
- pmc_id = article.get("pmcid", "")
834
- if pmc_id:
835
- has_full_text = True
836
- full_text_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
837
-
838
- # Build citation
839
- author_citation = ""
840
- if authors:
841
- if len(authors) == 1:
842
- author_citation = authors[0]
843
- elif len(authors) == 2:
844
- author_citation = f"{authors[0]} & {authors[1]}"
845
- else:
846
- author_citation = f"{authors[0]} et al."
847
-
848
- citation = f"{author_citation} ({year}). {title}. {journal}."
849
-
850
- # Building the URL - prefer Europe PMC links as they often have better HTML rendering
851
- url = f"https://europepmc.org/article/MED/{pmid}" if pmid else ""
852
-
853
- # If no PMID but has DOI, use DOI link
854
- if not url and "doi" in article:
855
- url = f"https://doi.org/{article['doi']}"
856
-
857
- # Use PMC link if available
858
- if full_text_url:
859
- url = full_text_url
860
-
861
- # Create result object
862
- result = {
863
- "id": f"epmc-{pmid if pmid else article.get('id', uuid.uuid4().hex[:8])}",
864
- "title": title,
865
- "authors": authors,
866
- "journal": journal,
867
- "year": year,
868
- "text": abstract,
869
- "url": url,
870
- "has_full_text": has_full_text,
871
- "source_type": "Europe PMC" + (" (Full Text Available)" if has_full_text else ""),
872
- "citation": citation
873
- }
874
-
875
- # Calculate relevance score
876
- relevance_score = 0
877
- for term in extracted_terms:
878
- if term.lower() in title.lower():
879
- relevance_score += 2 # Higher weight for terms in title
880
- if term.lower() in abstract.lower():
881
- relevance_score += 1 # Lower weight for terms in abstract
882
-
883
- result["relevance_score"] = relevance_score
884
- results.append(result)
885
-
886
- except Exception as e:
887
- print(f"Error processing Europe PMC article: {str(e)}")
888
- continue
889
-
890
- # Sort by relevance score
891
- results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
892
- return results[:max_results]
893
-
894
- except Exception as e:
895
- print(f"Error in Europe PMC API fetch: {str(e)}")
896
- return []
897
-
898
- return results
899
-
900
- # Relevance filtering for medical evidence
901
- def assess_evidence_relevance(query, evidence_snippets, max_results=5):
902
- """
903
- Use GPT-4o to assess the relevance of evidence snippets to the user query
904
- and select the most applicable ones for response generation.
905
- """
906
- if not evidence_snippets:
907
- return []
908
-
909
- try:
910
- # If we already have ranking from the API calls, use that first
911
- if "relevance_score" in evidence_snippets[0]:
912
- # Sort by the existing relevance score
913
- evidence_snippets.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
914
-
915
- # If we have less than or equal to max_results, return all
916
- if len(evidence_snippets) <= max_results:
917
- return evidence_snippets
918
-
919
- # If more than 10 snippets, only run GPT filtering on the top 10
920
- snippets_to_assess = evidence_snippets[:min(10, len(evidence_snippets))]
921
- else:
922
- snippets_to_assess = evidence_snippets
923
-
924
- # Prepare snippets for assessment
925
- snippets_text = ""
926
- for i, snippet in enumerate(snippets_to_assess):
927
- snippets_text += f"---ARTICLE {i+1}---\n"
928
- snippets_text += f"Title: {snippet['title']}\n"
929
- snippets_text += f"Source: {snippet['source_type']}\n"
930
-
931
- # Limit text length to avoid token limits
932
- text = snippet['text']
933
- if len(text) > 800:
934
- text = text[:800] + "..."
935
-
936
- snippets_text += f"Content: {text}\n\n"
937
-
938
- # Create system prompt for assessment
939
- system_prompt = """
940
- You are an expert medical research assistant helping to select the most relevant medical evidence for a patient query.
941
-
942
- Review the provided medical articles and evaluate their relevance to the patient's query.
943
- Focus on these factors:
944
- 1. Direct relevance to the medical condition or symptoms described
945
- 2. Quality and comprehensiveness of the information
946
- 3. Whether it covers diagnosis, treatment, or management aspects needed
947
- 4. Recency and reliability of the source
948
- 5. Presence of actionable information that would help answer the query
949
-
950
- For each article, assign a relevance score from 1-10 (10 being most relevant).
951
-
952
- Return your assessment as a JSON object:
953
- {
954
- "article_rankings": [
955
- {"article_index": 1, "relevance_score": 8, "reason": "Directly addresses the primary symptom with treatment options"},
956
- {"article_index": 2, "relevance_score": 4, "reason": "Tangentially related but not focused on the main condition"},
957
- ...
958
- ],
959
- "recommended_indices": [1, 3, 5] // Indices of the most relevant articles to use, in order of relevance
960
- }
961
-
962
- Include only the JSON in your response.
963
- """
964
-
965
- # Call GPT-4o
966
- response = openai.ChatCompletion.create(
967
- model="gpt-4o-mini",
968
- messages=[
969
- {"role": "system", "content": system_prompt},
970
- {"role": "user", "content": f"Patient query: {query}\n\nArticles to assess:\n{snippets_text}"}
971
- ],
972
- temperature=0.1,
973
- max_tokens=1000,
974
- )
975
-
976
- result_text = response.choices[0].message['content']
977
-
978
- # Extract the JSON
979
- try:
980
- # Find JSON object
981
- json_match = re.search(r'({[\s\S]*})', result_text)
982
- if json_match:
983
- json_str = json_match.group(1)
984
- assessment = json.loads(json_str)
985
- else:
986
- assessment = json.loads(result_text)
987
-
988
- # Get recommended indices
989
- recommended_indices = assessment.get("recommended_indices", [])
990
-
991
- # If no recommendations, fall back to top 5 from original list
992
- if not recommended_indices and len(evidence_snippets) > max_results:
993
- return evidence_snippets[:max_results]
994
-
995
- # Filter evidence snippets based on recommended indices
996
- filtered_snippets = []
997
- for idx in recommended_indices:
998
- if 0 <= idx-1 < len(snippets_to_assess):
999
- filtered_snippets.append(snippets_to_assess[idx-1])
1000
-
1001
- # If we have less than max_results, add more from original sorted list
1002
- if len(filtered_snippets) < max_results and len(evidence_snippets) > len(filtered_snippets):
1003
- # Get indices of snippets already included
1004
- included_indices = set()
1005
- for snippet in filtered_snippets:
1006
- for i, original in enumerate(evidence_snippets):
1007
- if snippet["id"] == original["id"]:
1008
- included_indices.add(i)
1009
- break
1010
-
1011
- # Add more snippets that weren't already included
1012
- for i, snippet in enumerate(evidence_snippets):
1013
- if i not in included_indices and len(filtered_snippets) < max_results:
1014
- filtered_snippets.append(snippet)
1015
-
1016
- return filtered_snippets[:max_results]
1017
-
1018
- except json.JSONDecodeError:
1019
- # Fallback to sorted snippets if parsing fails
1020
- return evidence_snippets[:max_results]
1021
-
1022
- except Exception as e:
1023
- print(f"Error in evidence relevance assessment: {str(e)}")
1024
- # Fallback to the original sorting
1025
- return evidence_snippets[:max_results]
1026
-
1027
  # Enhanced RAG System with real medical sources
1028
  def fetch_medical_evidence(query, max_results=5):
1029
- """Fetch medical evidence from multiple sources using real APIs with improved relevance assessment"""
1030
- all_results = []
1031
 
1032
  # Define API keys
1033
  pubmed_api_key = os.environ.get("PUBMED_API_KEY")
1034
  core_api_key = os.environ.get("CORE_API_KEY")
1035
 
1036
- # Step 1: Query preprocessing with GPT-4o to extract medical terms
1037
- query_analysis = preprocess_medical_query(query)
1038
- processed_query = query_analysis.get("optimized_search_query", query)
1039
-
1040
- # Step 2: Gather evidence from multiple sources
1041
-
1042
  # Source 1: PubMed API - prioritize for relevant medical research
1043
- pubmed_results = fetch_from_pubmed_api(query, max_results=max(3, max_results//2), api_key=pubmed_api_key)
1044
  if pubmed_results:
1045
- all_results.extend(pubmed_results)
1046
 
1047
- # Source 2: Europe PMC - for better full text access
1048
- epmc_results = fetch_from_europe_pmc(query, max_results=max(2, max_results//3))
1049
- if epmc_results:
1050
- all_results.extend(epmc_results)
1051
-
1052
- # Source 3: PubMed Central - free full text articles
1053
- if len(all_results) < max_results * 2: # Get more than needed for filtering
1054
- remaining = (max_results * 2) - len(all_results)
1055
- pmc_results = fetch_from_pmc_api(processed_query, max_results=remaining, api_key=pubmed_api_key)
1056
  if pmc_results:
1057
- all_results.extend(pmc_results)
1058
 
1059
- # Source 4: CORE API - open access research papers
1060
- if len(all_results) < max_results * 2:
1061
- remaining = (max_results * 2) - len(all_results)
1062
- core_results = fetch_from_core_api(processed_query, max_results=remaining, api_key=core_api_key)
1063
  if core_results:
1064
- all_results.extend(core_results)
1065
 
1066
- # Source 5: WHO Guidelines - if still need more results
1067
- if len(all_results) < max_results:
1068
- remaining = max_results - len(all_results)
1069
- who_results = fetch_from_who_api(processed_query, max_results=remaining)
1070
  if who_results:
1071
- all_results.extend(who_results)
1072
 
1073
- # Step 3: Initial sorting by source quality and full text availability
1074
  # Prioritize sources with full text for better diagnosis
1075
- all_results.sort(key=lambda x: (
1076
- x.get("relevance_score", 0), # First by relevance score if available
1077
- "Full Text" in x.get("source_type", ""), # Then by full text availability
1078
- "Europe PMC" in x.get("source_type", ""), # Europe PMC often has better full text access
1079
- "CORE" in x.get("source_type", ""), # CORE for open access
1080
- "PMC" in x.get("source_type", ""), # PMC for free full text
1081
- "PubMed" in x.get("source_type", "") # Regular PubMed last
1082
  ), reverse=True)
1083
 
1084
- # Step 4: Relevance assessment with GPT-4o
1085
- # Only run this if we have more results than needed
1086
- if len(all_results) > max_results:
1087
- filtered_results = assess_evidence_relevance(query, all_results, max_results)
1088
- else:
1089
- filtered_results = all_results
1090
-
1091
- return filtered_results # Return the filtered and ranked results
1092
 
1093
  # Function to parse doctor agent responses
1094
  def parse_doctor_response(response_text):
@@ -1158,7 +719,7 @@ def doctor_agent(messages):
1158
 
1159
  # Single orchestrator turn with enhanced reasoning and citation tracking
1160
  def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1161
- """Handle a single turn of conversation with the doctor agent with improved evidence utilization"""
1162
  # Select appropriate system prompt based on whether this is a follow-up
1163
  if is_follow_up:
1164
  system = {"role": "system", "content": FOLLOW_UP_PROMPT}
@@ -1167,97 +728,30 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1167
 
1168
  msgs = [system] + history
1169
 
1170
- # Evidence gathering with improved relevance
1171
  evidence_snippets = []
1172
  if use_rag:
1173
  # Only fetch and format evidence if RAG is enabled
1174
  evidence_snippets = fetch_medical_evidence(query)
1175
 
1176
- # Format evidence for the model with enhanced structure
1177
  if evidence_snippets:
1178
  evidence_text = "MEDICAL EVIDENCE FROM AUTHORITATIVE SOURCES:\n\n"
1179
 
1180
  for i, snippet in enumerate(evidence_snippets):
1181
- # Create a more readable format with clear section separation
1182
- evidence_text += f"SOURCE {i+1}: [{snippet['id']}]\n"
1183
- evidence_text += f"TITLE: {snippet['title']}\n"
1184
-
1185
- # Add author information if available
1186
- if "authors" in snippet and snippet["authors"]:
1187
- authors_text = ", ".join(snippet["authors"][:3])
1188
- if len(snippet["authors"]) > 3:
1189
- authors_text += " et al."
1190
- evidence_text += f"AUTHORS: {authors_text}\n"
1191
-
1192
- # Add journal and year if available
1193
- journal_info = []
1194
- if "journal" in snippet and snippet["journal"]:
1195
- journal_info.append(snippet["journal"])
1196
- if "year" in snippet and snippet["year"]:
1197
- journal_info.append(snippet["year"])
1198
- if journal_info:
1199
- evidence_text += f"PUBLICATION: {', '.join(journal_info)}\n"
1200
-
1201
- # Format the source type with emphasis on full text availability
1202
- source_type = snippet.get("source_type", "Unknown Source")
1203
- evidence_text += f"SOURCE TYPE: {source_type}\n"
1204
-
1205
- # Format the text with section labels if available
1206
- text = snippet.get("text", "").strip()
1207
- # Split by section labels if they exist (e.g., "METHODS:", "RESULTS:")
1208
- sections = re.split(r'([A-Z][A-Z\s]+:)', text)
1209
-
1210
- if len(sections) > 1:
1211
- formatted_text = ""
1212
- current_section = None
1213
- for section in sections:
1214
- if re.match(r'[A-Z][A-Z\s]+:', section):
1215
- current_section = section
1216
- formatted_text += f"\n{current_section}\n"
1217
- elif current_section is not None:
1218
- formatted_text += section.strip() + "\n"
1219
- evidence_text += f"CONTENT:\n{formatted_text}\n"
1220
- else:
1221
- evidence_text += f"CONTENT:\n{text}\n"
1222
-
1223
- # Add URL for verification
1224
- evidence_text += f"URL: {snippet.get('url', 'No URL available')}\n"
1225
-
1226
- # Additional link to full text if available
1227
- if snippet.get("has_full_text", False) and snippet.get("pmc_url"):
1228
- evidence_text += f"FULL TEXT: {snippet.get('pmc_url')}\n"
1229
-
1230
- # Add citation
1231
- evidence_text += f"CITATION: {snippet.get('citation', 'Citation not available')}\n\n"
1232
-
1233
- # Add a separator between articles
1234
- evidence_text += "----------------------------------------------\n\n"
1235
 
1236
  # Enhanced instructions for better source utilization
1237
- evidence_text += """CITATION AND EVIDENCE USE INSTRUCTIONS:
1238
-
1239
- 1. IMPORTANT: When referencing these sources in your response, use the format [source_id] to cite them.
1240
- Example: "Recent studies have shown improved outcomes with early intervention [pubmed-12345678]."
1241
-
1242
- 2. Focus on articles marked "Full Text Available" as they provide more comprehensive information.
1243
-
1244
- 3. When multiple sources support a claim, cite all of them for stronger evidence.
1245
- Example: "This treatment approach is supported by multiple studies [pubmed-12345678][epmc-87654321]."
1246
-
1247
- 4. For each diagnostic or treatment recommendation, provide at least one citation.
1248
-
1249
- 5. Read the CONTENT sections carefully and extract specific details - don't just cite generally.
1250
-
1251
- 6. If sources have conflicting information, acknowledge this and present both perspectives with citations.
1252
-
1253
- 7. Use the most recent sources when available, especially for treatment recommendations.
1254
-
1255
- 8. For each recommendation, try to provide evidence on:
1256
- - Efficacy (how well it works)
1257
- - Safety (potential side effects)
1258
- - Appropriateness for this specific patient scenario
1259
-
1260
- 9. If full text is available, prioritize information from those sources as they contain more complete data.
1261
  """
1262
 
1263
  msgs.append({"role": "system", "content": evidence_text})
@@ -1272,38 +766,31 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1272
  if use_rag:
1273
  output_instructions = """
1274
  Please structure your response clearly.
1275
-
1276
  **Priority 1: Ask Clarifying Questions**
1277
  If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
1278
-
1279
  **Priority 2: Main Response (After Clarification)**
1280
  Once sufficient information is available (either initially or after asking questions), provide:
1281
  1. A direct answer to the patient's concerns.
1282
  2. If appropriate, a clear diagnosis or differential diagnosis.
1283
  3. Recommendations for a treatment plan or next steps.
1284
  4. Ensure you cite medical evidence using the [source_id] format for any claims or information taken from the provided MEDICAL EVIDENCE snippets.
1285
-
1286
  **After your main response, ALWAYS include these sections:**
1287
  - **Reasoning**: Bullet points detailing your clinical reasoning.
1288
- - **Sources**: A list of all references cited in your main response, with their full titles and publication information.
1289
  """
1290
  else:
1291
  # Different instructions when RAG is disabled - no mention of sources or citations
1292
  output_instructions = """
1293
  Please structure your response clearly.
1294
-
1295
  **Priority 1: Ask Clarifying Questions**
1296
  If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
1297
-
1298
  **Priority 2: Main Response (After Clarification)**
1299
  Once sufficient information is available (either initially or after asking questions), provide:
1300
  1. A direct answer to the patient's concerns.
1301
  2. If appropriate, a clear diagnosis or differential diagnosis.
1302
  3. Recommendations for a treatment plan or next steps.
1303
-
1304
  **After your main response, ALWAYS include this section:**
1305
  - **Reasoning**: Bullet points detailing your clinical reasoning.
1306
-
1307
  IMPORTANT: Since database search is disabled, do not include citations or sources in your response.
1308
  """
1309
 
@@ -1313,42 +800,261 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1313
  # Get response from doctor agent
1314
  response = doctor_agent(msgs)
1315
 
1316
- # Extract and process sources
1317
- explanation = None
1318
- evidence = None
1319
-
1320
  if use_rag:
1321
- # Parse the doctor's response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1322
  parsed_response = parse_doctor_response(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1323
 
1324
- # Main response with embedded citations
1325
- response = parsed_response.get("main_response", response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1326
 
1327
- # Extract reasoning for separate display
1328
- reasoning = parsed_response.get("reasoning", [])
1329
- if reasoning:
1330
- if isinstance(reasoning, list):
1331
- explanation = "\n".join([f"- {r}" for r in reasoning])
 
 
 
 
 
 
 
1332
  else:
1333
- explanation = reasoning
1334
 
1335
- # Process source citations
1336
- if evidence_snippets:
1337
- # Replace source IDs with actual links
1338
- linked_response, source_map = extract_and_link_sources(response, evidence_snippets)
1339
- response = linked_response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1340
 
1341
- # Collect evidence for display
1342
- evidence = []
1343
- for source_id, source_data in source_map.items():
1344
- evidence.append({
1345
- "id": source_data["id"],
1346
- "title": source_data["title"],
1347
- "url": source_data["url"],
1348
- "citation": source_data["citation"]
1349
- })
1350
-
1351
- return response, explanation, evidence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1352
 
1353
  # Enhanced interactive loop with better handling of consultations
1354
  def run_consultation(use_rag=True):
 
27
  # System prompts
28
  SYSTEM_PROMPT = """You are an advanced clinical AI assistant designed to aid healthcare professionals.
29
  Follow these guidelines in all responses:
 
30
  1. **Clarify First**: Before providing any diagnosis or plan, if the user's query is underspecified, ALWAYS ask relevant clarifying questions to gather necessary patient information. This includes, but is not limited to, symptoms, duration, severity, medical history, age, lifestyle factors (diet, exercise), and current medications.
31
  2. Professional tone: Maintain a clear, respectful, and professional tone appropriate for medical consultation.
32
  3. Evidence-based practice: Base all responses on current medical evidence and guidelines.
 
36
  7. Limitations: Acknowledge the limits of AI medical advice and recommend in-person consultation when appropriate.
37
  8. Comprehensive approach: Consider differential diagnoses and relevant contextual factors.
38
  9. Patient-centered: Focus on clinically relevant information while maintaining respect for the patient.
 
39
  For each consultation:
40
  1. Ask clarifying questions if needed (as per guideline 1).
41
  2. Provide differential diagnosis with likelihood assessment.
42
  3. Suggest appropriate next steps (testing, treatment, referral).
43
  4. Include reasoning for your conclusions.
44
  5. Cite medical literature or guidelines supporting your assessment using [source_id].
 
45
  IMPORTANT: Your primary duty is to support clinical decision-making, not replace clinical judgment.
46
  """
47
 
48
  FOLLOW_UP_PROMPT = """Continue this medical consultation based on the previous discussion.
49
  Consider the information already gathered and the tentative diagnosis/plan.
 
50
  When responding to the follow-up:
51
  1. Reference relevant details from the prior conversation.
52
  2. Address the specific follow-up question with evidence-based information.
 
54
  4. Update recommendations if appropriate.
55
  5. Maintain the same structured approach with transparent reasoning.
56
  6. Cite additional medical literature or guidelines when relevant using [source_id].
 
57
  Remember that this is an ongoing consultation where continuity of care is important.
58
  """
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  # Function to extract source IDs and replace them with actual links
61
  def extract_and_link_sources(text, evidence_snippets):
62
+ """Replace [source_id] placeholders with actual source information"""
63
+ source_pattern = r'\[([\w\d:_\-\.+]+)\]' # Expanded to handle more characters including +
 
 
 
 
64
  matches = re.findall(source_pattern, text)
65
 
66
  source_map = {} # Map to store source_id -> source data
 
73
  "id": snippet["id"],
74
  "title": snippet["title"].strip(),
75
  "url": snippet["url"],
76
+ "citation": snippet["citation"]
 
 
 
77
  }
78
  break
79
 
80
  # Next, try fuzzy matching for cases where the exact ID isn't matched
81
  for source_id_match in matches:
82
  if source_id_match not in source_map and source_id_match != "source_id":
 
 
 
83
  for snippet in evidence_snippets:
84
+ # Try to match on partial IDs (e.g. part before a hyphen)
85
+ snippet_id_parts = snippet["id"].split("-")
86
+ source_id_parts = source_id_match.split("-")
87
+
88
+ # Check if the first parts match (journal name)
89
+ if (snippet_id_parts and source_id_parts and
90
+ snippet_id_parts[0] == source_id_parts[0]):
91
  source_map[source_id_match] = {
92
  "id": snippet["id"],
93
  "title": snippet["title"].strip(),
94
  "url": snippet["url"],
95
+ "citation": snippet["citation"]
 
 
 
96
  }
97
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  # Handle generic [source_id] placeholder
100
  if "source_id" in matches:
 
106
  "id": snippet["id"],
107
  "title": snippet["title"].strip(),
108
  "url": snippet["url"],
109
+ "citation": snippet["citation"]
 
 
 
110
  }
111
 
112
  # Replace source_id placeholders with actual links in the text
 
114
  for source_id_key, source_data in source_map.items():
115
  safe_id = re.escape(source_id_key)
116
  pattern = f"\\[{safe_id}\\]"
117
+ replacement = f"[{source_data['title']}]({source_data['url']})"
 
 
 
 
 
 
 
 
 
 
118
  linked_text = re.sub(pattern, replacement, linked_text)
119
 
120
  # Handle remaining [source_id] placeholders
121
  if "source_id" in source_map and "[source_id]" in linked_text:
122
  generic_data = source_map["source_id"]
123
+ replacement = f"[{generic_data['title']}]({generic_data['url']})"
 
124
  linked_text = re.sub(r'\[source_id\]', replacement, linked_text)
125
 
126
  # Final fallback for any [source_id] not mapped at all
 
130
 
131
  # Implement PubMed API integration for medical evidence retrieval
132
  def fetch_from_pubmed_api(query, max_results=3, api_key=None):
133
+ """Fetch medical evidence from PubMed API using E-utilities"""
 
 
 
134
  results = []
 
135
 
136
+ # Clean up the query for better results
137
+ cleaned_query = re.sub(r'^(hi|hello|hey|greetings|good morning|good afternoon|good evening)[,\.]?\s+', '', query.lower())
138
+ cleaned_query = re.sub(r"(i'?m|i am)\s+a\s+\d+[-\s]year[-\s]old", '', cleaned_query)
139
+ cleaned_query = re.sub(r'(my name is|i am|i have been|i\'ve been|i was|i have|i\'ve had|i feel|i\'m feeling|i experienced)', '', cleaned_query)
140
+
141
+ # Try to extract key medical symptoms
142
+ symptom_patterns = [
143
+ r'(muscle weakness)', r'(fatigue)', r'(rash)', r'(pain)', r'(swelling)',
144
+ r'(difficulty breathing|shortness of breath)', r'(fever)', r'(headache)',
145
+ r'(nausea|vomiting)', r'(dizziness)', r'(numbness)', r'(tingling)'
146
+ ]
147
+
148
+ medical_terms = []
149
+ for pattern in symptom_patterns:
150
+ matches = re.findall(pattern, query.lower())
151
+ if matches:
152
+ medical_terms.extend(matches)
153
+
154
+ # If we found medical terms, prioritize them in the search
155
+ if medical_terms:
156
+ search_query = " AND ".join(medical_terms)
157
+ # Add the complete cleaned query as a less weighted part
158
+ if cleaned_query:
159
+ search_query = f"({search_query}) OR ({cleaned_query})"
160
  else:
161
+ # If no medical terms found, use the cleaned query
162
+ search_query = cleaned_query
163
 
164
+ # Encode the query for the API
165
+ encoded_query = urllib.parse.quote(search_query)
166
+
167
+ # Base URL for PubMed E-utilities
168
+ base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
169
+
170
+ # Search parameters
171
  search_params = {
172
  "db": "pubmed",
173
+ "term": encoded_query,
174
+ "retmax": max_results,
175
  "retmode": "json",
176
  "sort": "relevance"
177
  }
 
179
  # Add API key if provided (increases rate limits)
180
  if api_key:
181
  search_params["api_key"] = api_key
182
+
183
  try:
184
  # First get article IDs
185
  search_response = requests.get(f"{base_url}esearch.fcgi", params=search_params)
 
190
  search_data = search_response.json()
191
 
192
  if "esearchresult" in search_data and "idlist" in search_data["esearchresult"]:
193
+ ids = search_data["esearchresult"]["idlist"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ if ids:
196
+ # Fetch article details
197
+ fetch_params = {
198
+ "db": "pubmed",
199
+ "id": ",".join(ids),
200
+ "retmode": "xml"
201
+ }
202
+ if api_key:
203
+ fetch_params["api_key"] = api_key
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ fetch_response = requests.get(f"{base_url}efetch.fcgi", params=fetch_params)
206
+
207
+ if fetch_response.status_code != 200:
208
+ return []
 
 
 
 
 
 
 
 
 
 
209
 
210
+ try:
211
+ # Parse XML response
212
+ root = ET.fromstring(fetch_response.text)
 
 
 
 
213
 
214
+ for article in root.findall(".//PubmedArticle"):
215
+ try:
216
+ pmid = article.findtext(".//PMID")
217
+ title = article.findtext(".//ArticleTitle") or "No title available"
218
+
219
+ # Extract abstract
220
+ abstract_elements = article.findall(".//AbstractText")
221
+ abstract = " ".join([(elem.text or "") for elem in abstract_elements])
222
+
223
+ # Extract authors
224
+ authors = []
225
+ for author in article.findall(".//Author"):
226
+ last_name = author.findtext(".//LastName") or ""
227
+ initials = author.findtext(".//Initials") or ""
228
+ if last_name and initials:
229
+ authors.append(f"{last_name} {initials}")
230
+
231
+ author_str = ", ".join(authors[:3])
232
+ if len(authors) > 3:
233
+ author_str += " et al."
234
+
235
+ # Extract journal and date
236
+ journal = article.findtext(".//Journal/Title") or "Journal not specified"
237
+ year = article.findtext(".//PubDate/Year") or "N/A"
238
+
239
+ # Create citation
240
+ citation = f"{author_str}. ({year}). {title}. {journal}. PMID: {pmid}"
241
+
242
+ # Create direct access URL
243
+ url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
244
+
245
+ # Check if free full text is available via PMC
246
+ pmc_id = article.findtext(".//ArticleId[@IdType='pmc']")
247
+ has_free_text = bool(pmc_id) or article.findtext(".//PublicationStatus") == "epublish"
248
+
249
+ # If PMC ID is available, use that URL instead as it provides full text
250
+ if pmc_id:
251
+ url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
252
+
253
+ results.append({
254
+ "id": f"pubmed:{pmid}",
255
+ "title": title,
256
+ "text": abstract[:800] + "..." if len(abstract) > 800 else abstract,
257
+ "citation": citation,
258
+ "url": url,
259
+ "source_type": "PubMed" + (" (Free Full Text)" if has_free_text else ""),
260
+ "is_open_access": has_free_text
261
+ })
262
+ except Exception:
263
+ continue
264
+ except ET.ParseError:
265
+ return []
266
 
267
+ return results
268
+ except Exception:
 
 
 
 
 
 
 
 
 
 
 
 
269
  return []
 
 
270
 
271
  def fetch_from_pmc_api(query, max_results=2, api_key=None):
272
  """Fetch free full text articles from PubMed Central (PMC)"""
 
606
  except Exception:
607
  return []
608
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
  # Enhanced RAG System with real medical sources
610
  def fetch_medical_evidence(query, max_results=5):
611
+ """Fetch medical evidence from multiple sources using real APIs"""
612
+ results = []
613
 
614
  # Define API keys
615
  pubmed_api_key = os.environ.get("PUBMED_API_KEY")
616
  core_api_key = os.environ.get("CORE_API_KEY")
617
 
 
 
 
 
 
 
618
  # Source 1: PubMed API - prioritize for relevant medical research
619
+ pubmed_results = fetch_from_pubmed_api(query, max_results=max(2, max_results//2), api_key=pubmed_api_key)
620
  if pubmed_results:
621
+ results.extend(pubmed_results)
622
 
623
+ # Source 2: PubMed Central - free full text articles
624
+ if len(results) < max_results:
625
+ remaining = max_results - len(results)
626
+ pmc_results = fetch_from_pmc_api(query, max_results=remaining, api_key=pubmed_api_key)
 
 
 
 
 
627
  if pmc_results:
628
+ results.extend(pmc_results)
629
 
630
+ # Source 3: CORE API - open access research papers
631
+ if len(results) < max_results:
632
+ remaining = max_results - len(results)
633
+ core_results = fetch_from_core_api(query, max_results=remaining, api_key=core_api_key)
634
  if core_results:
635
+ results.extend(core_results)
636
 
637
+ # Source 4: WHO Guidelines - if still need more results
638
+ if len(results) < max_results:
639
+ remaining = max_results - len(results)
640
+ who_results = fetch_from_who_api(query, max_results=remaining)
641
  if who_results:
642
+ results.extend(who_results)
643
 
 
644
  # Prioritize sources with full text for better diagnosis
645
+ results.sort(key=lambda x: (
646
+ "Full Text" in x.get("source_type", ""),
647
+ "CORE" in x.get("source_type", ""),
648
+ "PMC" in x.get("source_type", ""),
649
+ "PubMed" in x.get("source_type", "")
 
 
650
  ), reverse=True)
651
 
652
+ return results[:max_results] # Limit to requested number after sorting
 
 
 
 
 
 
 
653
 
654
  # Function to parse doctor agent responses
655
  def parse_doctor_response(response_text):
 
719
 
720
  # Single orchestrator turn with enhanced reasoning and citation tracking
721
  def orchestrator_chat(history, query, use_rag, is_follow_up=False):
722
+ """Handle a single turn of conversation with the doctor agent"""
723
  # Select appropriate system prompt based on whether this is a follow-up
724
  if is_follow_up:
725
  system = {"role": "system", "content": FOLLOW_UP_PROMPT}
 
728
 
729
  msgs = [system] + history
730
 
731
+ # Evidence gathering
732
  evidence_snippets = []
733
  if use_rag:
734
  # Only fetch and format evidence if RAG is enabled
735
  evidence_snippets = fetch_medical_evidence(query)
736
 
737
+ # Format evidence for the model
738
  if evidence_snippets:
739
  evidence_text = "MEDICAL EVIDENCE FROM AUTHORITATIVE SOURCES:\n\n"
740
 
741
  for i, snippet in enumerate(evidence_snippets):
742
+ evidence_text += f"[{snippet['id']}] {snippet['title']}\n"
743
+ evidence_text += f"Source: {snippet['source_type']}\n"
744
+ evidence_text += f"Content: {snippet['text']}\n"
745
+ evidence_text += f"Citation: {snippet['citation']}\n"
746
+ evidence_text += f"URL: {snippet['url']}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
747
 
748
  # Enhanced instructions for better source utilization
749
+ evidence_text += """CITATION INSTRUCTIONS:
750
+ 1. When referencing these sources in your response, use the format [source_id] to cite them.
751
+ 2. Prioritize information from sources marked with "Full Text Available" as they provide more comprehensive data.
752
+ 3. CORE API sources provide open access full text articles that are particularly valuable for diagnosis.
753
+ 4. Use the most relevant medical evidence to support your diagnostic reasoning.
754
+ 5. Try to cite multiple sources to provide a well-rounded assessment.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755
  """
756
 
757
  msgs.append({"role": "system", "content": evidence_text})
 
766
  if use_rag:
767
  output_instructions = """
768
  Please structure your response clearly.
 
769
  **Priority 1: Ask Clarifying Questions**
770
  If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
 
771
  **Priority 2: Main Response (After Clarification)**
772
  Once sufficient information is available (either initially or after asking questions), provide:
773
  1. A direct answer to the patient's concerns.
774
  2. If appropriate, a clear diagnosis or differential diagnosis.
775
  3. Recommendations for a treatment plan or next steps.
776
  4. Ensure you cite medical evidence using the [source_id] format for any claims or information taken from the provided MEDICAL EVIDENCE snippets.
 
777
  **After your main response, ALWAYS include these sections:**
778
  - **Reasoning**: Bullet points detailing your clinical reasoning.
779
+ - **Sources**: A list of all references cited in your main response, using their full titles and corresponding URLs if they were linked (e.g., [Title of Source](URL)). If a source was just an ID without a direct link in the text, list its ID or citation.
780
  """
781
  else:
782
  # Different instructions when RAG is disabled - no mention of sources or citations
783
  output_instructions = """
784
  Please structure your response clearly.
 
785
  **Priority 1: Ask Clarifying Questions**
786
  If the user's query lacks detail for a proper assessment (e.g., age, specific symptoms, medical history, duration, severity), your HIGHEST priority is to ask these questions first. Do not provide a diagnosis or plan until sufficient information is gathered.
 
787
  **Priority 2: Main Response (After Clarification)**
788
  Once sufficient information is available (either initially or after asking questions), provide:
789
  1. A direct answer to the patient's concerns.
790
  2. If appropriate, a clear diagnosis or differential diagnosis.
791
  3. Recommendations for a treatment plan or next steps.
 
792
  **After your main response, ALWAYS include this section:**
793
  - **Reasoning**: Bullet points detailing your clinical reasoning.
 
794
  IMPORTANT: Since database search is disabled, do not include citations or sources in your response.
795
  """
796
 
 
800
  # Get response from doctor agent
801
  response = doctor_agent(msgs)
802
 
803
+ # Process the response based on whether RAG is enabled
 
 
 
804
  if use_rag:
805
+ # Process the response to replace source placeholders with actual links
806
+ linked_response, source_map = extract_and_link_sources(response, evidence_snippets)
807
+
808
+ # Parse the response
809
+ parsed_response = parse_doctor_response(linked_response)
810
+
811
+ # Enhance source information with evidence snippets data
812
+ enhanced_sources = []
813
+ # Use the source_map from extract_and_link_sources as the primary guide for cited sources
814
+ for source_id_key, mapped_data in source_map.items():
815
+ enhanced_sources.append({
816
+ "id": mapped_data["id"], # This is the original ID from the snippet
817
+ "title": mapped_data["title"],
818
+ "citation": mapped_data["citation"],
819
+ "url": mapped_data["url"],
820
+ "source_type": "Referenced Source" # Or derive from snippet if available
821
+ })
822
+
823
+ # Get source types and open access status from original snippets
824
+ for es in enhanced_sources:
825
+ for snippet in evidence_snippets:
826
+ if es["id"] == snippet["id"]:
827
+ es["source_type"] = snippet.get("source_type", "Referenced Source")
828
+ es["is_open_access"] = snippet.get("is_open_access", False)
829
+ break
830
+
831
+ # If there are sources in parsed_response["sources"] that are not in source_map
832
+ # (e.g., LLM hallucinated an ID or cited something not in snippets), add them.
833
+ current_enhanced_ids = {es['id'] for es in enhanced_sources}
834
+
835
+ for source_text in parsed_response["sources"]: # source_text could be "[id]", "title (url)", or just "citation"
836
+ source_id_candidate = source_text.strip("[]") # Basic extraction
837
+
838
+ # Check if this source_id_candidate was part of the original evidence
839
+ found_in_evidence = False
840
+ for snippet in evidence_snippets:
841
+ if source_id_candidate == snippet["id"]:
842
+ if source_id_candidate not in current_enhanced_ids:
843
+ enhanced_sources.append({
844
+ "id": snippet["id"],
845
+ "title": snippet["title"],
846
+ "citation": snippet["citation"],
847
+ "url": snippet["url"],
848
+ "source_type": snippet["source_type"],
849
+ "is_open_access": snippet.get("is_open_access", False)
850
+ })
851
+ current_enhanced_ids.add(snippet["id"]) # Add to set to avoid re-adding
852
+ found_in_evidence = True
853
+ break
854
+
855
+ if not found_in_evidence:
856
+ # If it's not in source_map and not directly in evidence_snippets by a simple ID match,
857
+ # it might be a raw citation or a URL. Add it with available info.
858
+ is_duplicate = False
859
+ for es_item in enhanced_sources:
860
+ if es_item["title"] == source_text or es_item["url"] == source_text or es_item["citation"] == source_text:
861
+ is_duplicate = True
862
+ break
863
+ if not is_duplicate and source_text not in current_enhanced_ids:
864
+ # Try to extract a URL if present in markdown format
865
+ url_match = re.search(r'\[(.*?)\]\((https?://[^)]+)\)', source_text)
866
+ if url_match:
867
+ title = url_match.group(1)
868
+ url = url_match.group(2)
869
+ else:
870
+ title = source_text # Could be a citation string or a plain title
871
+ url = "" # No URL found directly
872
+
873
+ enhanced_sources.append({
874
+ "id": source_id_candidate, # Use the candidate, might be a simple title or part of citation
875
+ "title": title,
876
+ "citation": source_text, # The original text from LLM's source list
877
+ "url": url,
878
+ "source_type": "Referenced Source (uncategorized)"
879
+ })
880
+ current_enhanced_ids.add(source_id_candidate)
881
+
882
+ # Add the enhanced sources back to the parsed response
883
+ parsed_response["enhanced_sources"] = enhanced_sources
884
+ main_response = linked_response
885
+ else:
886
+ # If RAG is disabled, just parse the response without source processing
887
  parsed_response = parse_doctor_response(response)
888
+ parsed_response["enhanced_sources"] = []
889
+ main_response = response
890
+
891
+ # Create detailed explanation with reasoning and sources
892
+ explanation = []
893
+
894
+ # Add reasoning section
895
+ if parsed_response["reasoning"]:
896
+ explanation.append("## REASONING")
897
+ for i, reason in enumerate(parsed_response["reasoning"]):
898
+ explanation.append(f"{i+1}. {reason}")
899
+ explanation.append("")
900
+
901
+ # Only add sources section if RAG is enabled
902
+ if use_rag and parsed_response["enhanced_sources"]:
903
+ explanation.append("## SOURCES USED")
904
+
905
+ # Add enhanced sources first (these are the ones actually cited in the response)
906
+ source_added_count = 0
907
 
908
+ unique_sources_for_display = {} # id: {title, url, citation, source_type}
909
+ for source in parsed_response["enhanced_sources"]:
910
+ # Prefer using the mapped title and URL from extract_and_link_sources if available
911
+ display_id = source.get('id', source.get('title', 'Unknown Source'))
912
+
913
+ if display_id not in unique_sources_for_display:
914
+ unique_sources_for_display[display_id] = {
915
+ "title": source.get('title', 'N/A'),
916
+ "url": source.get('url', ''),
917
+ "citation": source.get('citation', ''),
918
+ "source_type": source.get('source_type', 'Referenced Source'),
919
+ "is_open_access": source.get('is_open_access', False)
920
+ }
921
+
922
+ # Create a categorized display of sources
923
+ source_categories = {
924
+ "CORE": [], # CORE API full text
925
+ "PMC": [], # PubMed Central full text
926
+ "PubMed": [], # PubMed abstracts
927
+ "WHO": [], # WHO guidelines
928
+ "Other": [] # Uncategorized
929
+ }
930
 
931
+ # Categorize sources
932
+ for key, src_data in unique_sources_for_display.items():
933
+ source_type = src_data['source_type']
934
+
935
+ if "CORE" in source_type:
936
+ source_categories["CORE"].append((key, src_data))
937
+ elif "PMC" in source_type:
938
+ source_categories["PMC"].append((key, src_data))
939
+ elif "PubMed" in source_type:
940
+ source_categories["PubMed"].append((key, src_data))
941
+ elif "WHO" in source_type:
942
+ source_categories["WHO"].append((key, src_data))
943
  else:
944
+ source_categories["Other"].append((key, src_data))
945
 
946
+ # Display sources by category
947
+ for category, sources in source_categories.items():
948
+ if sources:
949
+ if category != "Other": # Skip category header for Other
950
+ explanation.append(f"### {category} Sources:")
951
+
952
+ for key, src_data in sources:
953
+ title = src_data['title']
954
+ url = src_data['url']
955
+ is_open_access = src_data.get('is_open_access', False)
956
+
957
+ if url: # If URL exists, make it a markdown link
958
+ explanation.append(f"- [{title}]({url}) {' 🔓' if is_open_access else ''}")
959
+ else: # Otherwise, just list the title or ID
960
+ explanation.append(f"- {title}")
961
+
962
+ if src_data['source_type']:
963
+ explanation.append(f" Source Type: {src_data['source_type']}")
964
+ if src_data['citation']: # Always show citation if available
965
+ explanation.append(f" Citation: {src_data['citation']}")
966
+ explanation.append("") # Add a blank line for spacing
967
+ source_added_count += 1
968
+
969
+ if source_added_count == 0 and parsed_response["sources"]: # Fallback to raw sources if enhanced list is empty but LLM listed some
970
+ explanation.append("## SOURCES MENTIONED (Raw)") # Indicate these are less processed
971
+ for source_text in parsed_response["sources"]:
972
+ explanation.append(f"- {source_text.strip()}")
973
+ explanation.append("")
974
+ source_added_count +=1
975
+
976
+ # If we still have no sources, remove the header
977
+ if source_added_count == 0: # Check if any sources were actually added to explanation
978
+ # Remove "## SOURCES USED" header if it was added but no sources followed
979
+ if explanation and explanation[-1] == "## SOURCES USED":
980
+ explanation.pop()
981
+
982
+ # Enhanced version to display clickable article links
983
+ # Check if we have evidence snippets but no sources in the explanation
984
+ if evidence_snippets and "## SOURCES USED" not in "\n".join(explanation):
985
+ # If AI didn't explicitly cite sources, show available evidence anyway
986
+ additional_explanation = ["\n## AVAILABLE MEDICAL SOURCES"]
987
 
988
+ # Create categorized display of all available sources
989
+ categorized_snippets = {
990
+ "CORE Open Access": [], # CORE API full text
991
+ "PubMed Central": [], # PMC full text
992
+ "PubMed": [], # PubMed abstracts
993
+ "WHO Guidelines": [], # WHO guidelines
994
+ "Other": [] # Uncategorized
995
+ }
996
+
997
+ # Categorize snippets
998
+ for snippet in evidence_snippets:
999
+ source_type = snippet.get("source_type", "")
1000
+
1001
+ if "CORE" in source_type:
1002
+ categorized_snippets["CORE Open Access"].append(snippet)
1003
+ elif "PMC" in source_type:
1004
+ categorized_snippets["PubMed Central"].append(snippet)
1005
+ elif "PubMed" in source_type and "PMC" not in source_type:
1006
+ categorized_snippets["PubMed"].append(snippet)
1007
+ elif "WHO" in source_type:
1008
+ categorized_snippets["WHO Guidelines"].append(snippet)
1009
+ else:
1010
+ categorized_snippets["Other"].append(snippet)
1011
+
1012
+ # Display snippets by category
1013
+ for category, snippets in categorized_snippets.items():
1014
+ if snippets:
1015
+ if category != "Other": # Skip category header for Other
1016
+ additional_explanation.append(f"### {category}:")
1017
+
1018
+ for snippet in snippets:
1019
+ title = snippet.get("title", "Unknown Title")
1020
+ url = snippet.get("url", "")
1021
+ source_type = snippet.get("source_type", "Medical Source")
1022
+ is_open_access = snippet.get("is_open_access", False)
1023
+
1024
+ if url:
1025
+ # Format as clickable markdown link with open access indicator
1026
+ additional_explanation.append(f"- [{title}]({url}) {' 🔓' if is_open_access else ''}")
1027
+ else:
1028
+ additional_explanation.append(f"- {title} {' 🔓' if is_open_access else ''}")
1029
+
1030
+ if "source_type" in snippet:
1031
+ additional_explanation.append(f" Source Type: {snippet['source_type']}")
1032
+ if "citation" in snippet:
1033
+ additional_explanation.append(f" Citation: {snippet['citation']}")
1034
+ additional_explanation.append("")
1035
+
1036
+ # Add to the main explanation
1037
+ explanation.extend(additional_explanation)
1038
+
1039
+ # Add a note about data availability
1040
+ data_availability_note = [
1041
+ "\n## DATA AVAILABILITY NOTE",
1042
+ "- PubMed sources typically provide abstracts only, unless marked as free full text",
1043
+ "- PubMed Central (PMC) sources provide complete free full text articles",
1044
+ "- CORE Open Access sources provide full text content from research repositories",
1045
+ "- WHO Guidelines provide official medical recommendations and protocols",
1046
+ "- Sources marked with 🔓 indicate open access content with full text available"
1047
+ ]
1048
+ explanation.extend(data_availability_note)
1049
+
1050
+ # Format explanation as string
1051
+ explanation_text = "\n".join(explanation)
1052
+
1053
+ # Update conversation history
1054
+ history.append({"role": "user", "content": query})
1055
+ history.append({"role": "assistant", "content": main_response})
1056
+
1057
+ return main_response, explanation_text, evidence_snippets
1058
 
1059
  # Enhanced interactive loop with better handling of consultations
1060
  def run_consultation(use_rag=True):