Dhruv-Ty committed on
Commit
1672f00
·
verified ·
1 Parent(s): 88b24b5

Added support for more sources

Browse files
Files changed (1) hide show
  1. src/model.py +51 -198
src/model.py CHANGED
@@ -10,6 +10,8 @@ import openai
10
  import urllib.parse
11
  from dotenv import load_dotenv
12
  import time
 
 
13
 
14
  # Load environment variables
15
  load_dotenv()
@@ -25,40 +27,14 @@ def get_openai_api_key():
25
  # Set OpenAI API key
26
  openai.api_key = get_openai_api_key()
27
 
28
- # System prompts
29
- SYSTEM_PROMPT = """You are an advanced clinical AI assistant designed to aid healthcare professionals.
30
- Follow these guidelines in all responses:
31
- 1. **Answer Directly First**: Begin by providing your best answer based on available information. If information is limited, provide your assessment based on what is known, and indicate areas of uncertainty.
32
- 2. **Follow with Clarifying Questions**: After giving your initial assessment, include specific follow-up questions that would help refine your answer. These should be clearly labeled in a separate "Follow-up Questions:" section.
33
- 3. Professional tone: Maintain a clear, respectful, and professional tone appropriate for medical consultation.
34
- 4. Evidence-based practice: Base all responses on current medical evidence and guidelines.
35
- 5. Transparency: Clearly distinguish between established medical facts, clinical guidance, and areas of uncertainty.
36
- 6. Structured analysis: Present information in a clear, organized manner following clinical reasoning patterns.
37
- 7. Citation: Always cite specific sources for medical claims when available using the [PMID:123456] format, where 123456 is the actual PubMed ID number.
38
- 8. Limitations: Acknowledge the limits of AI medical advice and recommend in-person consultation when appropriate.
39
- 9. Comprehensive approach: Consider differential diagnoses and relevant contextual factors.
40
- 10. Patient-centered: Focus on clinically relevant information while maintaining respect for the patient.
41
- For each consultation:
42
- 1. Provide an initial assessment based on available information (as per guideline 1).
43
- 2. Include specific follow-up questions (as per guideline 2).
44
- 3. Provide differential diagnosis with likelihood assessment.
45
- 4. Suggest appropriate next steps (testing, treatment, referral).
46
- 5. Include reasoning for your conclusions.
47
- 6. Cite medical literature or guidelines supporting your assessment using [PMID:123456].
48
- IMPORTANT: Your primary duty is to support clinical decision-making, not replace clinical judgment.
49
- """
50
-
51
- FOLLOW_UP_PROMPT = """Continue this medical consultation based on the previous discussion.
52
- Consider the information already gathered and the tentative diagnosis/plan.
53
- When responding to the follow-up:
54
- 1. Directly address the follow-up question with evidence-based information.
55
- 2. Reference relevant details from the prior conversation.
56
- 3. If additional information would be helpful, include specific follow-up questions in a clearly labeled "Follow-up Questions:" section.
57
- 4. Update recommendations if appropriate based on new information.
58
- 5. Maintain the same structured approach with transparent reasoning.
59
- 6. Cite additional medical literature or guidelines when relevant using [PMID:123456].
60
- Remember that this is an ongoing consultation where continuity of care is important.
61
- """
62
 
63
  # Function to extract source IDs and replace them with actual links
64
  def extract_and_link_sources(text, evidence_snippets):
@@ -355,7 +331,7 @@ def fetch_from_pubmed_api(query, max_results=3, api_key=None):
355
  except Exception:
356
  return []
357
 
358
- def fetch_from_pmc_api(query, max_results=2, api_key=None):
359
  """Fetch free full text articles from PubMed Central (PMC)"""
360
  results = []
361
 
@@ -487,7 +463,7 @@ def fetch_from_pmc_api(query, max_results=2, api_key=None):
487
  except Exception:
488
  return []
489
 
490
- def fetch_from_who_api(query, max_results=1):
491
  """Fetch information from WHO guidelines - using web scraping as alternative to API"""
492
  try:
493
  # WHO search URL (as they don't have a public API, we use web scraping)
@@ -532,7 +508,7 @@ def fetch_from_who_api(query, max_results=1):
532
  except Exception:
533
  return []
534
 
535
- def fetch_from_core_api(query, max_results=2, api_key=None):
536
  """Fetch open access research papers from CORE API"""
537
  results = []
538
 
@@ -1018,8 +994,8 @@ def search_europe_pmc(query, max_results=3, use_extracted_terms=False, extracted
1018
  print(f"Error in Europe PMC search: {str(e)}")
1019
  return []
1020
 
1021
- # Enhanced RAG System with focused PubMed searches
1022
- def fetch_medical_evidence(query, max_results=3):
1023
  """
1024
  Fetch medical evidence using a multi-source approach:
1025
  1. Search with extracted medical terms in PubMed
@@ -1031,7 +1007,7 @@ def fetch_medical_evidence(query, max_results=3):
1031
 
1032
  Args:
1033
  query (str): The user's original query
1034
- max_results (int): Maximum number of results to return (now set to 3)
1035
 
1036
  Returns:
1037
  list: Combined and deduplicated results from all searches
@@ -1056,20 +1032,20 @@ def fetch_medical_evidence(query, max_results=3):
1056
  print(f"Searching PubMed with extracted terms: {terms_query}")
1057
 
1058
  # Search PubMed with extracted terms
1059
- terms_pubmed_results = enhanced_search_pubmed(terms_query, retmax=2, api_key=pubmed_api_key)
1060
 
1061
  # Search Europe PMC with extracted terms
1062
  print(f"Searching Europe PMC with extracted terms")
1063
- terms_europepmc_results = search_europe_pmc(query, max_results=2,
1064
  use_extracted_terms=True,
1065
  extracted_terms=medical_terms)
1066
 
1067
  # Search with the full original query in both sources
1068
  print(f"Searching PubMed with full query")
1069
- full_pubmed_results = enhanced_search_pubmed(query, retmax=2, api_key=pubmed_api_key)
1070
 
1071
  print(f"Searching Europe PMC with full query")
1072
- full_europepmc_results = search_europe_pmc(query, max_results=2)
1073
 
1074
  # Step 3: Combine results, ensuring no duplicates by PMID or DOI
1075
  all_results = []
@@ -1358,50 +1334,26 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1358
  if use_rag:
1359
  # Only fetch and format evidence if RAG is enabled
1360
  evidence_snippets = fetch_medical_evidence(query)
1361
-
1362
  # Format evidence for the model
1363
  if evidence_snippets:
1364
- evidence_text = "MEDICAL EVIDENCE FROM MULTIPLE SOURCES:\n\n"
1365
-
1366
- for i, snippet in enumerate(evidence_snippets):
1367
- # Format the evidence with clear PMID or DOI for citation
1368
- pmid = snippet.get("pmid", "")
1369
- doi = snippet.get("doi", "")
 
1370
 
1371
- evidence_text += f"--- ARTICLE {i+1} ---\n"
1372
-
1373
- # Include the appropriate identifiers
1374
- if pmid:
1375
- evidence_text += f"PMID: {pmid}\n"
1376
- if doi:
1377
- evidence_text += f"DOI: {doi}\n"
1378
-
1379
- evidence_text += f"Title: {snippet['title']}\n"
1380
- evidence_text += f"Source: {snippet['source_type']}\n"
1381
- evidence_text += f"Content: {snippet['text']}\n"
1382
- evidence_text += f"Citation: {snippet['citation']}\n"
1383
- evidence_text += f"URL: {snippet['url']}\n\n"
1384
 
1385
- # Enhanced instructions for better source utilization
1386
- evidence_text += """CITATION INSTRUCTIONS:
1387
- 1. IMPORTANT: Provide a direct answer first before asking follow-up questions. Even with limited information, give your best assessment.
1388
- 2. You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
1389
- 3. When citing information from these articles, use the following formats:
1390
- • For PubMed articles: [PMID:123456] where 123456 is the actual PubMed ID
1391
- • For Europe PMC articles without PMID: [DOI:10.xxxx/yyyy] where 10.xxxx/yyyy is the DOI
1392
-
1393
- Example: "Recent studies have shown improved outcomes with early intervention [PMID:34567890]."
1394
- Example: "Current guidelines recommend a multidisciplinary approach [DOI:10.1234/abcd]."
1395
- 4. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
1396
- 5. When multiple sources support a claim, cite all of them for stronger evidence.
1397
- Example: "This approach is supported by multiple studies [PMID:12345678][PMID:87654321]."
1398
- 6. Include full citations in your Sources section with clickable URLs.
1399
- 7. If the abstracts have conflicting information, acknowledge this and present both perspectives with citations.
1400
- 8. Use the most recent sources when available, especially for treatment recommendations.
1401
- 9. If full text is available (marked as "Open Access" or "Full Text Available"), prioritize information from those sources as they contain more complete data.
1402
- 10. Europe PMC sources often provide more complete full text access, so give them equal consideration to PubMed sources.
1403
- 11. After your direct answer, include specific follow-up questions in a clearly labeled "Follow-up Questions:" section.
1404
- """
1405
 
1406
  msgs.append({"role": "system", "content": evidence_text})
1407
  else:
@@ -1413,75 +1365,7 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1413
 
1414
  # Add instructions for structured output
1415
  if use_rag:
1416
- output_instructions = """
1417
- Please structure your response clearly.
1418
- **Priority 1: Direct Answer First**
1419
- Begin by providing your best assessment based on the available information without using "Direct Answer:" as a heading. Just start your response directly with the answer. If the query lacks some details, offer your initial thoughts based on what is known, while acknowledging areas of uncertainty.
1420
-
1421
- **Priority 2: Follow-up Questions**
1422
- After your direct answer, include a clearly labeled "Follow-up Questions:" section with specific questions that would help refine your assessment.
1423
-
1424
- **Main Response Structure:**
1425
- 1. A direct answer to the patient's concerns WITHOUT the heading "Direct Answer:".
1426
- 2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
1427
- 3. Recommendations for a treatment plan or next steps.
1428
- 4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using either:
1429
- • [PMID:123456] format for PubMed articles
1430
- • [DOI:10.xxxx/yyyy] format for Europe PMC articles without PMID
1431
-
1432
- Use no more than 3 sources and no fewer than 2 sources.
1433
-
1434
- **After your main response, ALWAYS include these sections:**
1435
- - **Follow-up Questions**: Specific numbered questions starting from 1, not bullets.
1436
- Do NOT start the first question with asterisks (**). Format each question properly with just a number.
1437
- - **Reasoning**: Provide a detailed, in-depth explanation of your clinical reasoning. Use bullet points for clarity. Aim for comprehensive insights that would be valuable to a healthcare professional.
1438
- Do NOT start the first point with asterisks (**). Format each bullet point properly.
1439
- - **Sources**: A list of all references cited in your main response (2-3 sources), formatted as:
1440
- - PMID: 12345678 - Author et al. (Year). Title. Journal.
1441
- URL: https://pubmed.ncbi.nlm.nih.gov/12345678/
1442
- - DOI: 10.xxxx/yyyy - Author et al. (Year). Title. Journal.
1443
- URL: https://doi.org/10.xxxx/yyyy
1444
-
1445
- **IMPORTANT FORMATTING NOTES:**
1446
- 1. Do NOT include technical information like URLs, PMIDs or DOIs in the main answer - these belong in the Sources section only.
1447
- 2. For follow-up questions, use numbered format (1. 2. 3.) not bullet points.
1448
- 3. Number the follow-up questions starting from 1, not from any other number.
1449
- 4. NEVER use markdown formatting like ** (asterisks) at the beginning of any points, questions, or lines.
1450
- 5. Make sure all bullet points and numbered items are clean, with no markdown formatting.
1451
-
1452
- IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references, PMIDs, or DOIs.
1453
- """
1454
- else:
1455
- # Different instructions when RAG is disabled - no mention of sources or citations
1456
- output_instructions = """
1457
- Please structure your response clearly.
1458
- **Priority 1: Direct Answer First**
1459
- Begin by providing your best assessment based on the available information without using "Direct Answer:" as a heading. Just start your response directly with the answer. If the query lacks some details, offer your initial thoughts based on what is known, while acknowledging areas of uncertainty.
1460
-
1461
- **Priority 2: Follow-up Questions**
1462
- After your direct answer, include a clearly labeled "Follow-up Questions:" section with specific questions that would help refine your assessment.
1463
-
1464
- **Main Response Structure:**
1465
- 1. A direct answer to the patient's concerns WITHOUT the heading "Direct Answer:".
1466
- 2. If appropriate, a clear diagnosis or differential diagnosis.
1467
- 3. Recommendations for a treatment plan or next steps.
1468
-
1469
- **After your main response, ALWAYS include these sections:**
1470
- - **Follow-up Questions**: Specific questions to gather additional information, numbered starting from 1 (not bullet points).
1471
- Do NOT start the first question with asterisks (**). Format each question properly with just a number.
1472
- - **Reasoning**: Provide a detailed, in-depth explanation of your clinical reasoning. Use bullet points for clarity. Aim for comprehensive insights that would be valuable to a healthcare professional.
1473
- Do NOT start the first bullet point with asterisks (**). Format each point properly.
1474
-
1475
- **IMPORTANT FORMATTING NOTES:**
1476
- 1. For follow-up questions, use numbered format (1. 2. 3.) not bullet points.
1477
- 2. Number the follow-up questions starting from 1, not from any other number.
1478
- 3. NEVER use markdown formatting like ** (asterisks) at the beginning of any points, questions, or lines.
1479
- 4. Make sure all bullet points and numbered items are clean, with no markdown formatting.
1480
-
1481
- IMPORTANT: Since database search is disabled, do not include citations or sources in your response.
1482
- """
1483
-
1484
- msgs.append({"role": "system", "content": output_instructions})
1485
  msgs.append({"role": "user", "content": query})
1486
 
1487
  # Get response from doctor agent
@@ -1528,21 +1412,15 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1528
  questions = parsed_response.get("follow_up_questions", [])
1529
  if questions:
1530
  if isinstance(questions, list):
1531
- # Format as a numbered list but check if already numbered
1532
- formatted_questions = []
1533
- for i, q in enumerate(questions):
1534
- if q:
1535
- # Check if question already starts with a number
1536
- if re.match(r'^\d+\.', q.strip()):
1537
- formatted_questions.append(q)
1538
- else:
1539
- formatted_questions.append(f"{i+1}. {q}")
1540
- follow_up_questions = "\n".join(formatted_questions)
1541
  else:
1542
  follow_up_questions = questions
1543
-
1544
- # Debug: Print follow-up questions
1545
- print(f"Follow-up questions generated: {follow_up_questions}")
 
 
1546
  else:
1547
  # If RAG is disabled, just parse the response without source processing
1548
  parsed_response = parse_doctor_response(response)
@@ -1569,26 +1447,12 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
1569
  questions = parsed_response.get("follow_up_questions", [])
1570
  if questions:
1571
  if isinstance(questions, list):
1572
- # Format as a numbered list starting with 1, but check if already numbered
1573
- formatted_questions = []
1574
- for i, q in enumerate(questions):
1575
- if q: # Ensure 'q' is not None or empty
1576
- # Check if question already starts with a number
1577
- if re.match(r'^\s*\d+\.\s*', q.strip()):
1578
- formatted_questions.append(q)
1579
- else:
1580
- # Remove any leading bullet points before adding numbers
1581
- q_cleaned = re.sub(r'^\s*[-•*]\s*', '', q.strip())
1582
- formatted_questions.append(f"{i+1}. {q_cleaned}")
1583
- follow_up_questions = "\n".join(formatted_questions)
1584
  else:
1585
  follow_up_questions = questions
1586
-
1587
- # Debug: Print follow-up questions
1588
- print(f"Follow-up questions generated: {follow_up_questions}")
1589
-
1590
- # Return four values: main response, explanation, follow-up questions, and evidence
1591
- return main_response, explanation, follow_up_questions, evidence_snippets
1592
 
1593
  # Enhanced interactive loop with better handling of consultations
1594
  def run_consultation(use_rag=True):
@@ -1797,18 +1661,8 @@ def enhance_medical_query(original_query):
1797
  str: An enhanced query optimized for medical search
1798
  """
1799
  try:
1800
- # System prompt for query enhancement
1801
- system_prompt = """You are a medical search query optimizer.
1802
- Your job is to take a user's medical question and rewrite it to be more effective for searching
1803
- medical databases like PubMed and Europe PMC.
1804
-
1805
- Guidelines:
1806
- 1. Extract key medical terms, conditions, symptoms, and treatments
1807
- 2. Use proper medical terminology where possible
1808
- 3. Structure the query for optimal search performance
1809
- 4. Return ONLY the enhanced query without explanation
1810
- 5. Keep the query concise but comprehensive
1811
- """
1812
 
1813
  # Call OpenAI to enhance the query
1814
  enhanced_response = openai.ChatCompletion.create(
@@ -1828,5 +1682,4 @@ def enhance_medical_query(original_query):
1828
  except Exception as e:
1829
  print(f"Error enhancing query: {str(e)}")
1830
  # Fall back to original query if there's an error
1831
- return original_query
1832
-
 
10
  import urllib.parse
11
  from dotenv import load_dotenv
12
  import time
13
+ from typing import List, Dict, Any, Tuple
14
+ import streamlit as st
15
 
16
  # Load environment variables
17
  load_dotenv()
 
27
  # Set OpenAI API key
28
  openai.api_key = get_openai_api_key()
29
 
30
+ # Prompt templates are defined in the separate prompts module and imported here
31
+ from prompts import (
32
+ SYSTEM_PROMPT,
33
+ FOLLOW_UP_PROMPT,
34
+ RAG_OUTPUT_INSTRUCTIONS,
35
+ CITATION_INSTRUCTIONS,
36
+ QUERY_ENHANCEMENT_PROMPT
37
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  # Function to extract source IDs and replace them with actual links
40
  def extract_and_link_sources(text, evidence_snippets):
 
331
  except Exception:
332
  return []
333
 
334
+ def fetch_from_pmc_api(query, max_results=3, api_key=None):
335
  """Fetch free full text articles from PubMed Central (PMC)"""
336
  results = []
337
 
 
463
  except Exception:
464
  return []
465
 
466
+ def fetch_from_who_api(query, max_results=2):
467
  """Fetch information from WHO guidelines - using web scraping as alternative to API"""
468
  try:
469
  # WHO search URL (as they don't have a public API, we use web scraping)
 
508
  except Exception:
509
  return []
510
 
511
+ def fetch_from_core_api(query, max_results=3, api_key=None):
512
  """Fetch open access research papers from CORE API"""
513
  results = []
514
 
 
994
  print(f"Error in Europe PMC search: {str(e)}")
995
  return []
996
 
997
+ # Function to fetch medical evidence from multiple sources
998
+ def fetch_medical_evidence(query, max_results=5):
999
  """
1000
  Fetch medical evidence using a multi-source approach:
1001
  1. Search with extracted medical terms in PubMed
 
1007
 
1008
  Args:
1009
  query (str): The user's original query
1010
+ max_results (int): Maximum number of results to return (now set to 5)
1011
 
1012
  Returns:
1013
  list: Combined and deduplicated results from all searches
 
1032
  print(f"Searching PubMed with extracted terms: {terms_query}")
1033
 
1034
  # Search PubMed with extracted terms
1035
+ terms_pubmed_results = enhanced_search_pubmed(terms_query, retmax=3, api_key=pubmed_api_key)
1036
 
1037
  # Search Europe PMC with extracted terms
1038
  print(f"Searching Europe PMC with extracted terms")
1039
+ terms_europepmc_results = search_europe_pmc(query, max_results=3,
1040
  use_extracted_terms=True,
1041
  extracted_terms=medical_terms)
1042
 
1043
  # Search with the full original query in both sources
1044
  print(f"Searching PubMed with full query")
1045
+ full_pubmed_results = enhanced_search_pubmed(query, retmax=3, api_key=pubmed_api_key)
1046
 
1047
  print(f"Searching Europe PMC with full query")
1048
+ full_europepmc_results = search_europe_pmc(query, max_results=3)
1049
 
1050
  # Step 3: Combine results, ensuring no duplicates by PMID or DOI
1051
  all_results = []
 
1334
  if use_rag:
1335
  # Only fetch and format evidence if RAG is enabled
1336
  evidence_snippets = fetch_medical_evidence(query)
1337
+
1338
  # Format evidence for the model
1339
  if evidence_snippets:
1340
+ evidence_text = "Here are relevant medical evidence snippets to incorporate:\n\n"
1341
+ for i, evidence in enumerate(evidence_snippets, 1):
1342
+ title = evidence.get("title", "No title")
1343
+ abstract = evidence.get("abstract", "No abstract")
1344
+ source_id = evidence.get("source_id", "")
1345
+ source_type = evidence.get("source_type", "Unknown Source")
1346
+ is_open_access = evidence.get("is_open_access", False)
1347
 
1348
+ evidence_text += f"Evidence {i}:\n"
1349
+ evidence_text += f"Title: {title}\n"
1350
+ evidence_text += f"Source ID: {source_id}\n"
1351
+ evidence_text += f"Source Type: {source_type}\n"
1352
+ evidence_text += f"Open Access: {'🔓 Yes' if is_open_access else 'No'}\n"
1353
+ evidence_text += f"Abstract: {abstract}\n\n"
 
 
 
 
 
 
 
1354
 
1355
+ # Add citation instructions
1356
+ evidence_text += CITATION_INSTRUCTIONS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1357
 
1358
  msgs.append({"role": "system", "content": evidence_text})
1359
  else:
 
1365
 
1366
  # Add instructions for structured output
1367
  if use_rag:
1368
+ msgs.append({"role": "system", "content": RAG_OUTPUT_INSTRUCTIONS})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1369
  msgs.append({"role": "user", "content": query})
1370
 
1371
  # Get response from doctor agent
 
1412
  questions = parsed_response.get("follow_up_questions", [])
1413
  if questions:
1414
  if isinstance(questions, list):
1415
+ # Format questions with numbers
1416
+ follow_up_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
 
 
 
 
 
 
 
 
1417
  else:
1418
  follow_up_questions = questions
1419
+
1420
+ # Set evidence from source map
1421
+ evidence = list(source_map.values()) if source_map else None
1422
+
1423
+ return main_response, explanation, follow_up_questions, evidence
1424
  else:
1425
  # If RAG is disabled, just parse the response without source processing
1426
  parsed_response = parse_doctor_response(response)
 
1447
  questions = parsed_response.get("follow_up_questions", [])
1448
  if questions:
1449
  if isinstance(questions, list):
1450
+ # Format questions with numbers
1451
+ follow_up_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
 
 
 
 
 
 
 
 
 
 
1452
  else:
1453
  follow_up_questions = questions
1454
+
1455
+ return main_response, explanation, follow_up_questions, None
 
 
 
 
1456
 
1457
  # Enhanced interactive loop with better handling of consultations
1458
  def run_consultation(use_rag=True):
 
1661
  str: An enhanced query optimized for medical search
1662
  """
1663
  try:
1664
+ # Use imported prompt
1665
+ system_prompt = QUERY_ENHANCEMENT_PROMPT
 
 
 
 
 
 
 
 
 
 
1666
 
1667
  # Call OpenAI to enhance the query
1668
  enhanced_response = openai.ChatCompletion.create(
 
1682
  except Exception as e:
1683
  print(f"Error enhancing query: {str(e)}")
1684
  # Fall back to original query if there's an error
1685
+ return original_query