Spaces:

Dhruv-Ty
/

chat

Sleeping

App Files Community

Dhruv-Ty committed on May 26, 2025

Commit

1672f00

verified ·

1 Parent(s): 88b24b5

added more sources support

Browse files

Files changed (1) hide show

src/model.py +51 -198

src/model.py CHANGED Viewed

@@ -10,6 +10,8 @@ import openai
 import urllib.parse
 from dotenv import load_dotenv
 import time
 # Load environment variables
 load_dotenv()
@@ -25,40 +27,14 @@ def get_openai_api_key():
 # Set OpenAI API key
 openai.api_key = get_openai_api_key()
-# System prompts
-SYSTEM_PROMPT = """You are an advanced clinical AI assistant designed to aid healthcare professionals.
-Follow these guidelines in all responses:
-1.  **Answer Directly First**: Begin by providing your best answer based on available information. If information is limited, provide your assessment based on what is known, and indicate areas of uncertainty.
-2.  **Follow with Clarifying Questions**: After giving your initial assessment, include specific follow-up questions that would help refine your answer. These should be clearly labeled in a separate "Follow-up Questions:" section.
-3.  Professional tone: Maintain a clear, respectful, and professional tone appropriate for medical consultation.
-4.  Evidence-based practice: Base all responses on current medical evidence and guidelines.
-5.  Transparency: Clearly distinguish between established medical facts, clinical guidance, and areas of uncertainty.
-6.  Structured analysis: Present information in a clear, organized manner following clinical reasoning patterns.
-7.  Citation: Always cite specific sources for medical claims when available using the [PMID:123456] format, where 123456 is the actual PubMed ID number.
-8.  Limitations: Acknowledge the limits of AI medical advice and recommend in-person consultation when appropriate.
-9.  Comprehensive approach: Consider differential diagnoses and relevant contextual factors.
-10. Patient-centered: Focus on clinically relevant information while maintaining respect for the patient.
-For each consultation:
-1.  Provide an initial assessment based on available information (as per guideline 1).
-2.  Include specific follow-up questions (as per guideline 2).
-3.  Provide differential diagnosis with likelihood assessment.
-4.  Suggest appropriate next steps (testing, treatment, referral).
-5.  Include reasoning for your conclusions.
-6.  Cite medical literature or guidelines supporting your assessment using [PMID:123456].
-IMPORTANT: Your primary duty is to support clinical decision-making, not replace clinical judgment.
-"""
-FOLLOW_UP_PROMPT = """Continue this medical consultation based on the previous discussion.
-Consider the information already gathered and the tentative diagnosis/plan.
-When responding to the follow-up:
-1.  Directly address the follow-up question with evidence-based information.
-2.  Reference relevant details from the prior conversation.
-3.  If additional information would be helpful, include specific follow-up questions in a clearly labeled "Follow-up Questions:" section.
-4.  Update recommendations if appropriate based on new information.
-5.  Maintain the same structured approach with transparent reasoning.
-6.  Cite additional medical literature or guidelines when relevant using [PMID:123456].
-Remember that this is an ongoing consultation where continuity of care is important.
-"""
 # Function to extract source IDs and replace them with actual links
 def extract_and_link_sources(text, evidence_snippets):
@@ -355,7 +331,7 @@ def fetch_from_pubmed_api(query, max_results=3, api_key=None):
     except Exception:
         return []
-def fetch_from_pmc_api(query, max_results=2, api_key=None):
     """Fetch free full text articles from PubMed Central (PMC)"""
     results = []
@@ -487,7 +463,7 @@ def fetch_from_pmc_api(query, max_results=2, api_key=None):
     except Exception:
         return []
-def fetch_from_who_api(query, max_results=1):
     """Fetch information from WHO guidelines - using web scraping as alternative to API"""
     try:
         # WHO search URL (as they don't have a public API, we use web scraping)
@@ -532,7 +508,7 @@ def fetch_from_who_api(query, max_results=1):
     except Exception:
         return []
-def fetch_from_core_api(query, max_results=2, api_key=None):
     """Fetch open access research papers from CORE API"""
     results = []
@@ -1018,8 +994,8 @@ def search_europe_pmc(query, max_results=3, use_extracted_terms=False, extracted
         print(f"Error in Europe PMC search: {str(e)}")
         return []
-# Enhanced RAG System with focused PubMed searches
-def fetch_medical_evidence(query, max_results=3):
     """
     Fetch medical evidence using a multi-source approach:
     1. Search with extracted medical terms in PubMed
@@ -1031,7 +1007,7 @@ def fetch_medical_evidence(query, max_results=3):
     Args:
         query (str): The user's original query
-        max_results (int): Maximum number of results to return (now set to 3)
     Returns:
         list: Combined and deduplicated results from all searches
@@ -1056,20 +1032,20 @@ def fetch_medical_evidence(query, max_results=3):
         print(f"Searching PubMed with extracted terms: {terms_query}")
         # Search PubMed with extracted terms
-        terms_pubmed_results = enhanced_search_pubmed(terms_query, retmax=2, api_key=pubmed_api_key)
         # Search Europe PMC with extracted terms
         print(f"Searching Europe PMC with extracted terms")
-        terms_europepmc_results = search_europe_pmc(query, max_results=2,
                                                     use_extracted_terms=True,
                                                     extracted_terms=medical_terms)
     # Search with the full original query in both sources
     print(f"Searching PubMed with full query")
-    full_pubmed_results = enhanced_search_pubmed(query, retmax=2, api_key=pubmed_api_key)
     print(f"Searching Europe PMC with full query")
-    full_europepmc_results = search_europe_pmc(query, max_results=2)
     # Step 3: Combine results, ensuring no duplicates by PMID or DOI
     all_results = []
@@ -1358,50 +1334,26 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
     if use_rag:
         # Only fetch and format evidence if RAG is enabled
         evidence_snippets = fetch_medical_evidence(query)
         # Format evidence for the model
         if evidence_snippets:
-            evidence_text = "MEDICAL EVIDENCE FROM MULTIPLE SOURCES:\n\n"
-            for i, snippet in enumerate(evidence_snippets):
-                # Format the evidence with clear PMID or DOI for citation
-                pmid = snippet.get("pmid", "")
-                doi = snippet.get("doi", "")
-                evidence_text += f"--- ARTICLE {i+1} ---\n"
-                # Include the appropriate identifiers
-                if pmid:
-                    evidence_text += f"PMID: {pmid}\n"
-                if doi:
-                    evidence_text += f"DOI: {doi}\n"
-                evidence_text += f"Title: {snippet['title']}\n"
-                evidence_text += f"Source: {snippet['source_type']}\n"
-                evidence_text += f"Content: {snippet['text']}\n"
-                evidence_text += f"Citation: {snippet['citation']}\n"
-                evidence_text += f"URL: {snippet['url']}\n\n"
-            # Enhanced instructions for better source utilization
-            evidence_text += """CITATION INSTRUCTIONS:
-1. IMPORTANT: Provide a direct answer first before asking follow-up questions. Even with limited information, give your best assessment.
-2. You MUST cite 2-3 different sources in your response. Use no more than 3 sources and no fewer than 2 sources.
-3. When citing information from these articles, use the following formats:
-   • For PubMed articles: [PMID:123456] where 123456 is the actual PubMed ID
-   • For Europe PMC articles without PMID: [DOI:10.xxxx/yyyy] where 10.xxxx/yyyy is the DOI
-   Example: "Recent studies have shown improved outcomes with early intervention [PMID:34567890]."
-   Example: "Current guidelines recommend a multidisciplinary approach [DOI:10.1234/abcd]."
-4. Focus on specific details from the abstracts - extract actual findings, statistics, or recommendations.
-5. When multiple sources support a claim, cite all of them for stronger evidence.
-   Example: "This approach is supported by multiple studies [PMID:12345678][PMID:87654321]."
-6. Include full citations in your Sources section with clickable URLs.
-7. If the abstracts have conflicting information, acknowledge this and present both perspectives with citations.
-8. Use the most recent sources when available, especially for treatment recommendations.
-9. If full text is available (marked as "Open Access" or "Full Text Available"), prioritize information from those sources as they contain more complete data.
-10. Europe PMC sources often provide more complete full text access, so give them equal consideration to PubMed sources.
-11. After your direct answer, include specific follow-up questions in a clearly labeled "Follow-up Questions:" section.
-"""
             msgs.append({"role": "system", "content": evidence_text})
         else:
@@ -1413,75 +1365,7 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
     # Add instructions for structured output
     if use_rag:
-        output_instructions = """
-        Please structure your response clearly.
-        **Priority 1: Direct Answer First**
-        Begin by providing your best assessment based on the available information without using "Direct Answer:" as a heading. Just start your response directly with the answer. If the query lacks some details, offer your initial thoughts based on what is known, while acknowledging areas of uncertainty.
-        **Priority 2: Follow-up Questions**
-        After your direct answer, include a clearly labeled "Follow-up Questions:" section with specific questions that would help refine your assessment.
-        **Main Response Structure:**
-        1. A direct answer to the patient's concerns WITHOUT the heading "Direct Answer:".
-        2. If appropriate, a clear diagnosis or differential diagnosis with likelihood assessments.
-        3. Recommendations for a treatment plan or next steps.
-        4. IMPORTANT: You MUST cite between 2-3 different medical evidence sources using either:
-           • [PMID:123456] format for PubMed articles
-           • [DOI:10.xxxx/yyyy] format for Europe PMC articles without PMID
-           Use no more than 3 sources and no fewer than 2 sources.
-        **After your main response, ALWAYS include these sections:**
-        -   **Follow-up Questions**: Specific numbered questions starting from 1, not bullets.
-            Do NOT start the first question with asterisks (**). Format each question properly with just a number.
-        -   **Reasoning**: Provide a detailed, in-depth explanation of your clinical reasoning. Use bullet points for clarity. Aim for comprehensive insights that would be valuable to a healthcare professional.
-            Do NOT start the first point with asterisks (**). Format each bullet point properly.
-        -   **Sources**: A list of all references cited in your main response (2-3 sources), formatted as:
-             - PMID: 12345678 - Author et al. (Year). Title. Journal.
-               URL: https://pubmed.ncbi.nlm.nih.gov/12345678/
-             - DOI: 10.xxxx/yyyy - Author et al. (Year). Title. Journal.
-               URL: https://doi.org/10.xxxx/yyyy
-        **IMPORTANT FORMATTING NOTES:**
-        1. Do NOT include technical information like URLs, PMIDs or DOIs in the main answer - these belong in the Sources section only.
-        2. For follow-up questions, use numbered format (1. 2. 3.) not bullet points.
-        3. Number the follow-up questions starting from 1, not from any other number.
-        4. NEVER use markdown formatting like ** (asterisks) at the beginning of any points, questions, or lines.
-        5. Make sure all bullet points and numbered items are clean, with no markdown formatting.
-        IMPORTANT: Only cite sources that were provided in the evidence. Do not fabricate references, PMIDs, or DOIs.
-        """
-    else:
-        # Different instructions when RAG is disabled - no mention of sources or citations
-        output_instructions = """
-        Please structure your response clearly.
-        **Priority 1: Direct Answer First**
-        Begin by providing your best assessment based on the available information without using "Direct Answer:" as a heading. Just start your response directly with the answer. If the query lacks some details, offer your initial thoughts based on what is known, while acknowledging areas of uncertainty.
-        **Priority 2: Follow-up Questions**
-        After your direct answer, include a clearly labeled "Follow-up Questions:" section with specific questions that would help refine your assessment.
-        **Main Response Structure:**
-        1. A direct answer to the patient's concerns WITHOUT the heading "Direct Answer:".
-        2. If appropriate, a clear diagnosis or differential diagnosis.
-        3. Recommendations for a treatment plan or next steps.
-        **After your main response, ALWAYS include these sections:**
-        -   **Follow-up Questions**: Specific questions to gather additional information, numbered starting from 1 (not bullet points).
-            Do NOT start the first question with asterisks (**). Format each question properly with just a number.
-        -   **Reasoning**: Provide a detailed, in-depth explanation of your clinical reasoning. Use bullet points for clarity. Aim for comprehensive insights that would be valuable to a healthcare professional.
-            Do NOT start the first bullet point with asterisks (**). Format each point properly.
-        **IMPORTANT FORMATTING NOTES:**
-        1. For follow-up questions, use numbered format (1. 2. 3.) not bullet points.
-        2. Number the follow-up questions starting from 1, not from any other number.
-        3. NEVER use markdown formatting like ** (asterisks) at the beginning of any points, questions, or lines.
-        4. Make sure all bullet points and numbered items are clean, with no markdown formatting.
-        IMPORTANT: Since database search is disabled, do not include citations or sources in your response.
-        """
-    msgs.append({"role": "system", "content": output_instructions})
     msgs.append({"role": "user", "content": query})
     # Get response from doctor agent
@@ -1528,21 +1412,15 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
         questions = parsed_response.get("follow_up_questions", [])
         if questions:
             if isinstance(questions, list):
-                # Format as a numbered list but check if already numbered
-                formatted_questions = []
-                for i, q in enumerate(questions):
-                    if q:
-                        # Check if question already starts with a number
-                        if re.match(r'^\d+\.', q.strip()):
-                            formatted_questions.append(q)
-                        else:
-                            formatted_questions.append(f"{i+1}. {q}")
-                follow_up_questions = "\n".join(formatted_questions)
             else:
                 follow_up_questions = questions
-            # Debug: Print follow-up questions
-            print(f"Follow-up questions generated: {follow_up_questions}")
     else:
         # If RAG is disabled, just parse the response without source processing
         parsed_response = parse_doctor_response(response)
@@ -1569,26 +1447,12 @@ def orchestrator_chat(history, query, use_rag, is_follow_up=False):
         questions = parsed_response.get("follow_up_questions", [])
         if questions:
             if isinstance(questions, list):
-                # Format as a numbered list starting with 1, but check if already numbered
-                formatted_questions = []
-                for i, q in enumerate(questions):
-                    if q: # Ensure 'q' is not None or empty
-                        # Check if question already starts with a number
-                        if re.match(r'^\s*\d+\.\s*', q.strip()):
-                            formatted_questions.append(q)
-                        else:
-                            # Remove any leading bullet points before adding numbers
-                            q_cleaned = re.sub(r'^\s*[-•*]\s*', '', q.strip())
-                            formatted_questions.append(f"{i+1}. {q_cleaned}")
-                follow_up_questions = "\n".join(formatted_questions)
             else:
                 follow_up_questions = questions
-            # Debug: Print follow-up questions
-        print(f"Follow-up questions generated: {follow_up_questions}")
-    # Return four values: main response, explanation, follow-up questions, and evidence
-    return main_response, explanation, follow_up_questions, evidence_snippets
 # Enhanced interactive loop with better handling of consultations
 def run_consultation(use_rag=True):
@@ -1797,18 +1661,8 @@ def enhance_medical_query(original_query):
         str: An enhanced query optimized for medical search
     """
     try:
-        # System prompt for query enhancement
-        system_prompt = """You are a medical search query optimizer.
-        Your job is to take a user's medical question and rewrite it to be more effective for searching
-        medical databases like PubMed and Europe PMC.
-        Guidelines:
-        1. Extract key medical terms, conditions, symptoms, and treatments
-        2. Use proper medical terminology where possible
-        3. Structure the query for optimal search performance
-        4. Return ONLY the enhanced query without explanation
-        5. Keep the query concise but comprehensive
-        """
         # Call OpenAI to enhance the query
         enhanced_response = openai.ChatCompletion.create(
@@ -1828,5 +1682,4 @@ def enhance_medical_query(original_query):
     except Exception as e:
         print(f"Error enhancing query: {str(e)}")
         # Fall back to original query if there's an error
-        return original_query

 import urllib.parse
 from dotenv import load_dotenv
 import time
+from typing import List, Dict, Any, Tuple
+import streamlit as st
 # Load environment variables
 load_dotenv()
 # Set OpenAI API key
 openai.api_key = get_openai_api_key()
+# Prompt templates are imported from the prompts module
+from prompts import (
+    SYSTEM_PROMPT,
+    FOLLOW_UP_PROMPT,
+    RAG_OUTPUT_INSTRUCTIONS,
+    CITATION_INSTRUCTIONS,
+    QUERY_ENHANCEMENT_PROMPT
+)
 # Function to extract source IDs and replace them with actual links
 def extract_and_link_sources(text, evidence_snippets):
     except Exception:
         return []
+def fetch_from_pmc_api(query, max_results=3, api_key=None):
     """Fetch free full text articles from PubMed Central (PMC)"""
     results = []
     except Exception:
         return []
+def fetch_from_who_api(query, max_results=2):
     """Fetch information from WHO guidelines - using web scraping as alternative to API"""
     try:
         # WHO search URL (as they don't have a public API, we use web scraping)
     except Exception:
         return []
+def fetch_from_core_api(query, max_results=3, api_key=None):
     """Fetch open access research papers from CORE API"""
     results = []
         print(f"Error in Europe PMC search: {str(e)}")
         return []
+# Function to fetch medical evidence from multiple sources
+def fetch_medical_evidence(query, max_results=5):
     """
     Fetch medical evidence using a multi-source approach:
     1. Search with extracted medical terms in PubMed
     Args:
         query (str): The user's original query
+        max_results (int): Maximum number of results to return (default: 5)
     Returns:
         list: Combined and deduplicated results from all searches
         print(f"Searching PubMed with extracted terms: {terms_query}")
         # Search PubMed with extracted terms
+        terms_pubmed_results = enhanced_search_pubmed(terms_query, retmax=3, api_key=pubmed_api_key)
         # Search Europe PMC with extracted terms
         print(f"Searching Europe PMC with extracted terms")
+        terms_europepmc_results = search_europe_pmc(query, max_results=3,
                                                     use_extracted_terms=True,
                                                     extracted_terms=medical_terms)
     # Search with the full original query in both sources
     print(f"Searching PubMed with full query")
+    full_pubmed_results = enhanced_search_pubmed(query, retmax=3, api_key=pubmed_api_key)
     print(f"Searching Europe PMC with full query")
+    full_europepmc_results = search_europe_pmc(query, max_results=3)
     # Step 3: Combine results, ensuring no duplicates by PMID or DOI
     all_results = []
     if use_rag:
         # Only fetch and format evidence if RAG is enabled
         evidence_snippets = fetch_medical_evidence(query)
         # Format evidence for the model
         if evidence_snippets:
+            evidence_text = "Here are relevant medical evidence snippets to incorporate:\n\n"
+            for i, evidence in enumerate(evidence_snippets, 1):
+                title = evidence.get("title", "No title")
+                abstract = evidence.get("abstract", "No abstract")
+                source_id = evidence.get("source_id", "")
+                source_type = evidence.get("source_type", "Unknown Source")
+                is_open_access = evidence.get("is_open_access", False)
+                evidence_text += f"Evidence {i}:\n"
+                evidence_text += f"Title: {title}\n"
+                evidence_text += f"Source ID: {source_id}\n"
+                evidence_text += f"Source Type: {source_type}\n"
+                evidence_text += f"Open Access: {'🔓 Yes' if is_open_access else 'No'}\n"
+                evidence_text += f"Abstract: {abstract}\n\n"
+            # Add citation instructions
+            evidence_text += CITATION_INSTRUCTIONS
             msgs.append({"role": "system", "content": evidence_text})
         else:
     # Add instructions for structured output
     if use_rag:
+        msgs.append({"role": "system", "content": RAG_OUTPUT_INSTRUCTIONS})
     msgs.append({"role": "user", "content": query})
     # Get response from doctor agent
         questions = parsed_response.get("follow_up_questions", [])
         if questions:
             if isinstance(questions, list):
+                # Format questions with numbers
+                follow_up_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
             else:
                 follow_up_questions = questions
+        # Set evidence from source map
+        evidence = list(source_map.values()) if source_map else None
+        return main_response, explanation, follow_up_questions, evidence
     else:
         # If RAG is disabled, just parse the response without source processing
         parsed_response = parse_doctor_response(response)
         questions = parsed_response.get("follow_up_questions", [])
         if questions:
             if isinstance(questions, list):
+                # Format questions with numbers
+                follow_up_questions = "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])
             else:
                 follow_up_questions = questions
+        return main_response, explanation, follow_up_questions, None
 # Enhanced interactive loop with better handling of consultations
 def run_consultation(use_rag=True):
         str: An enhanced query optimized for medical search
     """
     try:
+        # Use imported prompt
+        system_prompt = QUERY_ENHANCEMENT_PROMPT
         # Call OpenAI to enhance the query
         enhanced_response = openai.ChatCompletion.create(
     except Exception as e:
         print(f"Error enhancing query: {str(e)}")
         # Fall back to original query if there's an error
+        return original_query