Dhruv-Ty committed on
Commit
312d099
·
verified ·
1 Parent(s): f7fc61a

Update src/model.py

Browse files
Files changed (1) hide show
  1. src/model.py +167 -64
src/model.py CHANGED
@@ -692,7 +692,7 @@ def fetch_from_core_api(query, max_results=2, api_key=None):
692
  return []
693
 
694
  # Enhanced PubMed search function
695
- def enhanced_search_pubmed(query, retmax=2, api_key=None):
696
  """
697
  Enhanced PubMed search using E-utilities API with improved parsing and error handling.
698
 
@@ -857,7 +857,7 @@ def enhanced_search_pubmed(query, retmax=2, api_key=None):
857
  return []
858
 
859
  # Europe PMC search function
860
- def search_europe_pmc(query, max_results=2):
861
  """
862
  Search Europe PMC for biomedical articles, with a focus on retrieving full text when available.
863
  Europe PMC provides more open access content than standard PubMed.
@@ -865,6 +865,8 @@ def search_europe_pmc(query, max_results=2):
865
  Args:
866
  query (str): Search query string
867
  max_results (int): Maximum number of results to return
 
 
868
 
869
  Returns:
870
  list: List of article dictionaries with title, abstract, PMID, URL, and full text URL
@@ -878,21 +880,29 @@ def search_europe_pmc(query, max_results=2):
878
  # Europe PMC API base URL
879
  base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
880
 
 
 
 
 
 
 
 
 
881
  # Search parameters - specifically looking for open access when possible
882
  search_params = {
883
- "query": f"({query}) AND OPEN_ACCESS:y", # Prioritize open access
884
  "format": "json",
885
  "pageSize": max_results,
886
  "resultType": "core" # Get core metadata
887
  }
888
 
889
- print(f"Searching Europe PMC with query: {query}")
890
  response = requests.get(base_url, params=search_params)
891
 
892
  if response.status_code != 200:
893
  print(f"Europe PMC search error: {response.status_code}")
894
  # Try again without open access restriction if no results
895
- search_params["query"] = query
896
  response = requests.get(base_url, params=search_params)
897
  if response.status_code != 200:
898
  return []
@@ -903,6 +913,10 @@ def search_europe_pmc(query, max_results=2):
903
  hit_count = data.get("hitCount", 0)
904
  if hit_count == 0:
905
  print("No Europe PMC results found")
 
 
 
 
906
  return []
907
 
908
  # Process results
@@ -1007,8 +1021,9 @@ def fetch_medical_evidence(query, max_results=3):
1007
  """
1008
  Fetch medical evidence using a multi-source approach:
1009
  1. Search with extracted medical terms in PubMed
1010
- 2. Search with the original query in PubMed
1011
- 3. Search in Europe PMC for additional full-text articles
 
1012
 
1013
  This provides better coverage and relevance from multiple sources.
1014
 
@@ -1024,42 +1039,51 @@ def fetch_medical_evidence(query, max_results=3):
1024
 
1025
  # Step 1: Extract medical terms from the query
1026
  medical_terms = extract_medical_terms(query)
 
 
 
 
 
 
 
1027
 
1028
  # Only use extracted terms if we found any
1029
- if medical_terms:
1030
- # Join terms with commas for better search
1031
  terms_query = ", ".join(medical_terms)
1032
  print(f"Searching PubMed with extracted terms: {terms_query}")
1033
 
1034
- # Search with extracted terms (Search A)
1035
- # Increase from 2 to 3 results from this search
1036
- terms_results = enhanced_search_pubmed(terms_query, retmax=2, api_key=pubmed_api_key)
1037
- else:
1038
- terms_results = []
 
 
 
1039
 
1040
- # Step 2: Search with the full original query (Search B)
1041
- # Increase from 2 to 3 results from this search
1042
  print(f"Searching PubMed with full query")
1043
- full_query_results = enhanced_search_pubmed(query, retmax=2, api_key=pubmed_api_key)
1044
 
1045
- # Step 3: Search in Europe PMC for additional results with full text
1046
- print(f"Searching Europe PMC")
1047
- europepmc_results = search_europe_pmc(query, max_results=2)
1048
 
1049
- # Step 4: Combine results, ensuring no duplicates by PMID
1050
  all_results = []
1051
  seen_pmids = set()
1052
  seen_dois = set()
1053
 
1054
  # Process results in order of preference:
1055
- # 1. Terms search from PubMed
1056
- # 2. Europe PMC results (likely to have more full text)
1057
- # 3. Full query search from PubMed
 
1058
 
1059
  # Add results from terms search first (often more relevant)
1060
- for result in terms_results:
1061
- pmid = result["pmid"]
1062
- if pmid not in seen_pmids and len(all_results) < max_results:
1063
  seen_pmids.add(pmid)
1064
  # Format for compatibility with existing code
1065
  all_results.append({
@@ -1068,23 +1092,19 @@ def fetch_medical_evidence(query, max_results=3):
1068
  "text": result["abstract"],
1069
  "citation": result["citation"],
1070
  "url": result["url"],
1071
- "source_type": "PubMed" + (" (Full Text Available)" if result["has_full_text"] else ""),
1072
- "is_open_access": result["has_full_text"],
1073
  "pmid": pmid # Keep the original PMID for direct access
1074
  })
1075
 
1076
- # Add Europe PMC results next (prioritizing full text articles)
1077
- for result in europepmc_results:
1078
  # Some Europe PMC articles may not have a PMID, use DOI as fallback
1079
  pmid = result.get("pmid")
1080
  doi = result.get("doi")
1081
 
1082
- # Skip if we've already seen this article via PMID
1083
- if pmid and pmid in seen_pmids:
1084
- continue
1085
-
1086
- # Skip if we've already seen this article via DOI
1087
- if doi and doi in seen_dois:
1088
  continue
1089
 
1090
  # Skip if we've reached our max
@@ -1097,12 +1117,12 @@ def fetch_medical_evidence(query, max_results=3):
1097
  if doi:
1098
  seen_dois.add(doi)
1099
 
1100
- # Create identifier
1101
- identifier = f"PMID:{pmid}" if pmid else f"DOI:{doi}"
1102
 
1103
  # Add to results
1104
  all_results.append({
1105
- "id": identifier,
1106
  "title": result["title"],
1107
  "text": result["abstract"],
1108
  "citation": result["citation"],
@@ -1110,33 +1130,62 @@ def fetch_medical_evidence(query, max_results=3):
1110
  "source_type": result["source_type"],
1111
  "is_open_access": result["is_open_access"],
1112
  "pmid": pmid, # May be None
1113
- "doi": doi # Alternative identifier
1114
  })
1115
 
1116
- # Then add results from full query search
1117
- for result in full_query_results:
1118
- pmid = result["pmid"]
1119
- if pmid not in seen_pmids and len(all_results) < max_results:
1120
  seen_pmids.add(pmid)
1121
- # Format for compatibility with existing code
1122
  all_results.append({
1123
  "id": f"PMID:{pmid}",
1124
  "title": result["title"],
1125
  "text": result["abstract"],
1126
  "citation": result["citation"],
1127
  "url": result["url"],
1128
- "source_type": "PubMed" + (" (Full Text Available)" if result["has_full_text"] else ""),
1129
- "is_open_access": result["has_full_text"],
1130
- "pmid": pmid # Keep the original PMID for direct access
1131
  })
1132
 
1133
- # Step 5: Ensure we have at least some results
1134
- if not all_results:
1135
- print("No relevant medical evidence found")
1136
- else:
1137
- print(f"Found {len(all_results)} relevant medical articles across all sources")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1138
 
1139
- return all_results
 
1140
 
1141
  # Function to parse doctor agent responses
1142
  def parse_doctor_response(response_text):
@@ -1223,13 +1272,21 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1223
 
1224
  # Format evidence for the model
1225
  if evidence_snippets:
1226
- evidence_text = "MEDICAL EVIDENCE FROM PUBMED:\n\n"
1227
 
1228
  for i, snippet in enumerate(evidence_snippets):
1229
- # Format the evidence with clear PMID for citation
1230
  pmid = snippet.get("pmid", "")
 
 
1231
  evidence_text += f"--- ARTICLE {i+1} ---\n"
1232
- evidence_text += f"PMID: {pmid}\n"
 
 
 
 
 
 
1233
  evidence_text += f"Title: {snippet['title']}\n"
1234
  evidence_text += f"Source: {snippet['source_type']}\n"
1235
  evidence_text += f"Content: {snippet['text']}\n"
@@ -1258,7 +1315,7 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1258
  msgs.append({"role": "system", "content": evidence_text})
1259
  else:
1260
  # If no evidence was found, inform the model
1261
- no_evidence_msg = ("Note: No specific medical evidence was found in PubMed for this query. "
1262
  "Please rely on your general medical knowledge and be sure to recommend "
1263
  "appropriate diagnostic steps and medical consultation.")
1264
  msgs.append({"role": "system", "content": no_evidence_msg})
@@ -1355,7 +1412,7 @@ def run_consultation(use_rag=True):
1355
  print("Type 'exit' to end or 'next' for a new case.\n")
1356
 
1357
  if use_rag:
1358
- print("Using medical evidence from: PubMed, PMC, CORE, and WHO")
1359
  print("Sources marked with 🔓 provide full text access\n")
1360
 
1361
  consultation_id = str(uuid.uuid4())[:8]
@@ -1506,8 +1563,8 @@ SEARCH_PUBMED_SCHEMA = {
1506
  },
1507
  "retmax": {
1508
  "type": "integer",
1509
- "description": "Maximum number of results to return (default: 2)",
1510
- "default": 2
1511
  },
1512
  "api_key": {
1513
  "type": "string",
@@ -1530,6 +1587,52 @@ EXAMPLE_FUNCTION_CALL = {
1530
  "name": "search_pubmed",
1531
  "arguments": {
1532
  "query": "headaches, fatigue, dizziness",
1533
- "retmax": 2
1534
  }
1535
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
692
  return []
693
 
694
  # Enhanced PubMed search function
695
+ def enhanced_search_pubmed(query, retmax=3, api_key=None):
696
  """
697
  Enhanced PubMed search using E-utilities API with improved parsing and error handling.
698
 
 
857
  return []
858
 
859
  # Europe PMC search function
860
+ def search_europe_pmc(query, max_results=3, use_extracted_terms=False, extracted_terms=None):
861
  """
862
  Search Europe PMC for biomedical articles, with a focus on retrieving full text when available.
863
  Europe PMC provides more open access content than standard PubMed.
 
865
  Args:
866
  query (str): Search query string
867
  max_results (int): Maximum number of results to return
868
+ use_extracted_terms (bool): Whether to use the extracted medical terms
869
+ extracted_terms (list): List of extracted medical terms from the query
870
 
871
  Returns:
872
  list: List of article dictionaries with title, abstract, PMID, URL, and full text URL
 
880
  # Europe PMC API base URL
881
  base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
882
 
883
+ # Construct search query based on parameters
884
+ search_query = query
885
+ if use_extracted_terms and extracted_terms and len(extracted_terms) > 0:
886
+ # Join terms with AND for better search
887
+ terms_query = " AND ".join(extracted_terms)
888
+ search_query = terms_query
889
+ print(f"Searching Europe PMC with extracted terms: {terms_query}")
890
+
891
  # Search parameters - specifically looking for open access when possible
892
  search_params = {
893
+ "query": f"({search_query}) AND OPEN_ACCESS:y", # Prioritize open access
894
  "format": "json",
895
  "pageSize": max_results,
896
  "resultType": "core" # Get core metadata
897
  }
898
 
899
+ print(f"Searching Europe PMC with query: {search_query}")
900
  response = requests.get(base_url, params=search_params)
901
 
902
  if response.status_code != 200:
903
  print(f"Europe PMC search error: {response.status_code}")
904
  # Try again without open access restriction if no results
905
+ search_params["query"] = search_query
906
  response = requests.get(base_url, params=search_params)
907
  if response.status_code != 200:
908
  return []
 
913
  hit_count = data.get("hitCount", 0)
914
  if hit_count == 0:
915
  print("No Europe PMC results found")
916
+ # If we used extracted terms and got no results, try with the original query
917
+ if use_extracted_terms and extracted_terms:
918
+ print("Retrying Europe PMC search with original query")
919
+ return search_europe_pmc(query, max_results, False, None)
920
  return []
921
 
922
  # Process results
 
1021
  """
1022
  Fetch medical evidence using a multi-source approach:
1023
  1. Search with extracted medical terms in PubMed
1024
+ 2. Search with extracted medical terms in Europe PMC
1025
+ 3. Search with the original query in PubMed
1026
+ 4. Search with the original query in Europe PMC
1027
 
1028
  This provides better coverage and relevance from multiple sources.
1029
 
 
1039
 
1040
  # Step 1: Extract medical terms from the query
1041
  medical_terms = extract_medical_terms(query)
1042
+ has_medical_terms = len(medical_terms) > 0
1043
+
1044
+ # Initialize results containers
1045
+ terms_pubmed_results = []
1046
+ full_pubmed_results = []
1047
+ terms_europepmc_results = []
1048
+ full_europepmc_results = []
1049
 
1050
  # Only use extracted terms if we found any
1051
+ if has_medical_terms:
1052
+ # Join terms with commas for PubMed
1053
  terms_query = ", ".join(medical_terms)
1054
  print(f"Searching PubMed with extracted terms: {terms_query}")
1055
 
1056
+ # Search PubMed with extracted terms
1057
+ terms_pubmed_results = enhanced_search_pubmed(terms_query, retmax=2, api_key=pubmed_api_key)
1058
+
1059
+ # Search Europe PMC with extracted terms
1060
+ print(f"Searching Europe PMC with extracted terms")
1061
+ terms_europepmc_results = search_europe_pmc(query, max_results=2,
1062
+ use_extracted_terms=True,
1063
+ extracted_terms=medical_terms)
1064
 
1065
+ # Search with the full original query in both sources
 
1066
  print(f"Searching PubMed with full query")
1067
+ full_pubmed_results = enhanced_search_pubmed(query, retmax=2, api_key=pubmed_api_key)
1068
 
1069
+ print(f"Searching Europe PMC with full query")
1070
+ full_europepmc_results = search_europe_pmc(query, max_results=2)
 
1071
 
1072
+ # Step 3: Combine results, ensuring no duplicates by PMID or DOI
1073
  all_results = []
1074
  seen_pmids = set()
1075
  seen_dois = set()
1076
 
1077
  # Process results in order of preference:
1078
+ # 1. Terms search from PubMed (if available)
1079
+ # 2. Terms search from Europe PMC (if available)
1080
+ # 3. Full query from PubMed
1081
+ # 4. Full query from Europe PMC
1082
 
1083
  # Add results from terms search first (often more relevant)
1084
+ for result in terms_pubmed_results:
1085
+ pmid = result.get("pmid")
1086
+ if pmid and pmid not in seen_pmids and len(all_results) < max_results:
1087
  seen_pmids.add(pmid)
1088
  # Format for compatibility with existing code
1089
  all_results.append({
 
1092
  "text": result["abstract"],
1093
  "citation": result["citation"],
1094
  "url": result["url"],
1095
+ "source_type": "PubMed" + (" (Full Text Available)" if result.get("has_full_text") else ""),
1096
+ "is_open_access": result.get("has_full_text", False),
1097
  "pmid": pmid # Keep the original PMID for direct access
1098
  })
1099
 
1100
+ # Add Europe PMC terms results
1101
+ for result in terms_europepmc_results:
1102
  # Some Europe PMC articles may not have a PMID, use DOI as fallback
1103
  pmid = result.get("pmid")
1104
  doi = result.get("doi")
1105
 
1106
+ # Skip if we've already seen this article via PMID or DOI
1107
+ if (pmid and pmid in seen_pmids) or (doi and doi in seen_dois):
 
 
 
 
1108
  continue
1109
 
1110
  # Skip if we've reached our max
 
1117
  if doi:
1118
  seen_dois.add(doi)
1119
 
1120
+ # Determine ID format (prefer PMID if available, fall back to DOI)
1121
+ article_id = f"PMID:{pmid}" if pmid else (f"DOI:{doi}" if doi else str(uuid.uuid4())[:8])
1122
 
1123
  # Add to results
1124
  all_results.append({
1125
+ "id": article_id,
1126
  "title": result["title"],
1127
  "text": result["abstract"],
1128
  "citation": result["citation"],
 
1130
  "source_type": result["source_type"],
1131
  "is_open_access": result["is_open_access"],
1132
  "pmid": pmid, # May be None
1133
+ "doi": doi # May be None
1134
  })
1135
 
1136
+ # Add full query PubMed results if we still need more
1137
+ for result in full_pubmed_results:
1138
+ pmid = result.get("pmid")
1139
+ if pmid and pmid not in seen_pmids and len(all_results) < max_results:
1140
  seen_pmids.add(pmid)
 
1141
  all_results.append({
1142
  "id": f"PMID:{pmid}",
1143
  "title": result["title"],
1144
  "text": result["abstract"],
1145
  "citation": result["citation"],
1146
  "url": result["url"],
1147
+ "source_type": "PubMed" + (" (Full Text Available)" if result.get("has_full_text") else ""),
1148
+ "is_open_access": result.get("has_full_text", False),
1149
+ "pmid": pmid
1150
  })
1151
 
1152
+ # Add full query Europe PMC results if we still need more
1153
+ for result in full_europepmc_results:
1154
+ pmid = result.get("pmid")
1155
+ doi = result.get("doi")
1156
+
1157
+ # Skip if we've already seen this article via PMID or DOI
1158
+ if (pmid and pmid in seen_pmids) or (doi and doi in seen_dois):
1159
+ continue
1160
+
1161
+ # Skip if we've reached our max
1162
+ if len(all_results) >= max_results:
1163
+ break
1164
+
1165
+ # Add to seen IDs
1166
+ if pmid:
1167
+ seen_pmids.add(pmid)
1168
+ if doi:
1169
+ seen_dois.add(doi)
1170
+
1171
+ # Determine ID format (prefer PMID if available, fall back to DOI)
1172
+ article_id = f"PMID:{pmid}" if pmid else (f"DOI:{doi}" if doi else str(uuid.uuid4())[:8])
1173
+
1174
+ # Add to results
1175
+ all_results.append({
1176
+ "id": article_id,
1177
+ "title": result["title"],
1178
+ "text": result["abstract"],
1179
+ "citation": result["citation"],
1180
+ "url": result["url"],
1181
+ "source_type": result["source_type"],
1182
+ "is_open_access": result["is_open_access"],
1183
+ "pmid": pmid, # May be None
1184
+ "doi": doi # May be None
1185
+ })
1186
 
1187
+ # Ensure we have exactly max_results results (or fewer if not enough found)
1188
+ return all_results[:max_results]
1189
 
1190
  # Function to parse doctor agent responses
1191
  def parse_doctor_response(response_text):
 
1272
 
1273
  # Format evidence for the model
1274
  if evidence_snippets:
1275
+ evidence_text = "MEDICAL EVIDENCE FROM MULTIPLE SOURCES:\n\n"
1276
 
1277
  for i, snippet in enumerate(evidence_snippets):
1278
+ # Format the evidence with clear PMID or DOI for citation
1279
  pmid = snippet.get("pmid", "")
1280
+ doi = snippet.get("doi", "")
1281
+
1282
  evidence_text += f"--- ARTICLE {i+1} ---\n"
1283
+
1284
+ # Include the appropriate identifiers
1285
+ if pmid:
1286
+ evidence_text += f"PMID: {pmid}\n"
1287
+ if doi:
1288
+ evidence_text += f"DOI: {doi}\n"
1289
+
1290
  evidence_text += f"Title: {snippet['title']}\n"
1291
  evidence_text += f"Source: {snippet['source_type']}\n"
1292
  evidence_text += f"Content: {snippet['text']}\n"
 
1315
  msgs.append({"role": "system", "content": evidence_text})
1316
  else:
1317
  # If no evidence was found, inform the model
1318
+ no_evidence_msg = ("Note: No specific medical evidence was found for this query in PubMed or Europe PMC. "
1319
  "Please rely on your general medical knowledge and be sure to recommend "
1320
  "appropriate diagnostic steps and medical consultation.")
1321
  msgs.append({"role": "system", "content": no_evidence_msg})
 
1412
  print("Type 'exit' to end or 'next' for a new case.\n")
1413
 
1414
  if use_rag:
1415
+ print("Using medical evidence from: PubMed, Europe PMC, and other medical databases")
1416
  print("Sources marked with 🔓 provide full text access\n")
1417
 
1418
  consultation_id = str(uuid.uuid4())[:8]
 
1563
  },
1564
  "retmax": {
1565
  "type": "integer",
1566
+ "description": "Maximum number of results to return (default: 3)",
1567
+ "default": 3
1568
  },
1569
  "api_key": {
1570
  "type": "string",
 
1587
  "name": "search_pubmed",
1588
  "arguments": {
1589
  "query": "headaches, fatigue, dizziness",
1590
+ "retmax": 3
1591
  }
1592
+ }
1593
+
1594
+ # Function to enhance medical queries using LLM
1595
def enhance_medical_query(original_query):
    """
    Use an LLM to rewrite a medical query so it searches better.

    This function is prepared for future use but is not currently enabled.
    The rewrite is best-effort: the caller always gets a usable query back,
    because every failure path returns the original query unchanged.

    Args:
        original_query (str): The original user query

    Returns:
        str: The enhanced query, or ``original_query`` unchanged when the
            query is blank, the model returns no text, or the request fails.
    """
    # A blank query has nothing to optimize; skip the API call entirely.
    if not original_query or not original_query.strip():
        return original_query

    # System prompt for query enhancement
    system_prompt = """You are a medical search query optimizer.
        Your job is to take a user's medical question and rewrite it to be more effective for searching
        medical databases like PubMed and Europe PMC.

        Guidelines:
        1. Extract key medical terms, conditions, symptoms, and treatments
        2. Use proper medical terminology where possible
        3. Structure the query for optimal search performance
        4. Return ONLY the enhanced query without explanation
        5. Keep the query concise but comprehensive
        """

    try:
        # Call OpenAI to enhance the query.
        # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK
        # interface - confirm against the pinned openai version.
        enhanced_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  # Using a smaller model for speed and cost efficiency
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Optimize this medical query for database search: {original_query}"},
            ],
            temperature=0.3,
            max_tokens=100,
        )

        # The content may be None or whitespace-only; normalize before use.
        content = enhanced_response.choices[0].message['content']
        enhanced_query = (content or "").strip()
    except Exception as e:
        # Deliberate catch-all: query enhancement must never break a search.
        print(f"Error enhancing query: {str(e)}")
        # Fall back to original query if there's an error
        return original_query

    if not enhanced_query:
        # An empty rewrite would turn into an empty search; keep the original.
        print("Enhanced query was empty, using original query")
        return original_query

    print(f"Enhanced query: {enhanced_query}")
    return enhanced_query