Zeggai Abdellah committed on
Commit
7f51074
·
1 Parent(s): fb64dbc

first commit

Browse files
Files changed (7) hide show
  1. .gitignore +1 -0
  2. Dockerfile +28 -0
  3. app.py +46 -0
  4. chunks.json +0 -0
  5. prepare_env.py +89 -0
  6. rag_pipeline.py +289 -0
  7. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use a Python 3.9 base image
FROM python:3.9-slim

# Set working directory
WORKDIR /code

# Copy requirements file
COPY ./requirements.txt /code/requirements.txt

# Install dependencies
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Create a non-root user for security
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH

# Set app directory
WORKDIR $HOME/app

# Copy all project files
COPY --chown=user . $HOME/app

# Expose port 7860 (Hugging Face default)
EXPOSE 7860

# Run the FastAPI app with uvicorn ("app:app" = the FastAPI instance named
# `app` inside app.py)
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from fastapi import FastAPI, Query
from prepare_env import prepare_environment_and_retriever
from rag_pipeline import full_rag_pipeline
from langchain_google_genai import GoogleGenerativeAI
import os
from dotenv import load_dotenv

# Load environment variables from .env file (notably GOOGLE_API_KEY,
# read later via os.getenv)
load_dotenv()


app = FastAPI()

# Prepare the environment and load the vector store once at import time.
# NOTE(review): this embeds every chunk in chunks.json, so app startup can
# be slow and requires the chunks file to be present.
expanding_retriever = prepare_environment_and_retriever()
16
+
17
+
18
@app.get("/ask")
def ask_question(question: str, with_citations: bool = Query(False, description="Include citations in the response")):
    """Answer a question through the full RAG pipeline.

    Args:
        question: The user's question.
        with_citations: When True, keep the numbered inline citations in the
            answer; when False (default), strip them out.

    Returns:
        dict: {"question": ..., "answer": <full_rag_pipeline result dict>}.
    """
    # BUG FIX: the flag used to be forwarded directly as clean_all_citations,
    # which inverted its documented meaning (with_citations=True *removed*
    # the citations). Negate it so the query parameter matches its description.
    response = full_rag_pipeline(question, expanding_retriever, clean_all_citations=not with_citations)
    return {"question": question, "answer": response}
22
@app.get("/generate_title")
def generate_title(first_question: str = Query(..., description="The first question to generate a title from")):
    """Derive a very short (3-5 word) conversation title from the first question."""
    prompt = f"""Analyze this question and generate a very short title (3-5 words max):
    1. If it's medical/vaccine-related: Create a professional clinical title
    2. If non-medical: Create a general topic title
    3. If unclear or greeting: Use "General Inquiry"

    Always return just the title text, nothing else.

    Question: {first_question}

    Title:"""

    # Same Gemini model as the rest of the app (see prepare_env.py).
    title_llm = GoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=os.getenv("GOOGLE_API_KEY"),
    )

    return {"title": title_llm.invoke(prompt).strip()}
43
+
44
if __name__ == "__main__":
    # Local development entry point; the Docker image starts uvicorn directly
    # (see the Dockerfile CMD) on the same host/port.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
chunks.json ADDED
The diff for this file is too large to render. See raw diff
 
prepare_env.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os
from dotenv import load_dotenv

# Load environment variables from .env file — done before the langchain
# imports below, presumably so import-time configuration can see them.
load_dotenv()
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import GoogleGenerativeAI
15
def prepare_environment_and_retriever(
    chunks_path="./chunks.json",
    model_name="intfloat/multilingual-e5-base",
    collection_name="Guide_2023_e5_multilingual",
    persist_directory="chroma_db_multilingual",
    k_vector=6,
    k_sparse=2,
    weights=None,
    llm_model_name="gemini-2.0-flash"
):
    """Build the full retrieval stack used by the app.

    Loads pre-chunked documents from ``chunks_path``, embeds them into a
    persisted Chroma vector store, and combines dense (vector) and sparse
    (BM25) retrieval in an ensemble wrapped by a Gemini-driven
    MultiQueryRetriever for query expansion.

    Args:
        chunks_path (str): Path to the chunks.json file.
        model_name (str): HuggingFace embedding model name.
        collection_name (str): Chroma collection name.
        persist_directory (str): Directory for the persisted Chroma DB.
        k_vector (int): Number of documents the vector retriever returns.
        k_sparse (int): Number of documents the BM25 retriever returns.
        weights (list[float] | None): Ensemble weights [vector, sparse];
            defaults to [0.5, 0.5].
        llm_model_name (str): Gemini model used for multi-query expansion.

    Returns:
        MultiQueryRetriever: The fully assembled retriever.
    """
    # Avoid the mutable-default-argument pitfall (a shared list default).
    if weights is None:
        weights = [0.5, 0.5]

    # Load the chunks.json
    with open(chunks_path, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

    documents = []

    for element in chunks_data:
        metadata = {
            "source": element["filename"],
            "filetype": element["filetype"],
            "element_id": element["element_id"],
        }

        # Tables carry their HTML rendering so the LLM can rebuild them
        # as Markdown later.
        if element.get("type") == "TableElement":
            metadata["table_text_as_html"] = element["table_text_as_html"]

        documents.append(Document(page_content=element["text"], metadata=metadata))

    # Create the embedding function
    embedding_function = HuggingFaceEmbeddings(
        model_name=model_name
    )

    # Create and persist the vector store
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_function,
        collection_name=collection_name,
        persist_directory=persist_directory
    )
    # vectorstore.persist()
    print("✅ Stored with multilingual embeddings.")

    # Dense retriever over the embeddings
    retriever_multilingual = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k_vector}
    )

    # Sparse (lexical) retriever
    bm25_retriever = BM25Retriever.from_documents(documents)
    bm25_retriever.k = k_sparse

    # Ensemble retriever (combining vector + sparse search)
    ensemble_retriever = EnsembleRetriever(
        retrievers=[retriever_multilingual, bm25_retriever],
        weights=weights
    )

    # Language model for multi-query expansion
    # (GoogleGenerativeAI instead of ChatGoogleGenerativeAI)
    llm = GoogleGenerativeAI(
        model=llm_model_name,
        google_api_key=os.getenv("GOOGLE_API_KEY")
    )

    expanding_retriever = MultiQueryRetriever.from_llm(
        retriever=ensemble_retriever,
        llm=llm
    )

    print("✅ Retrieval system ready (vector + sparse + ensemble + multi-query).")

    return expanding_retriever  # Return the final retriever
rag_pipeline.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import re
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.documents import Document
from langdetect import detect
import os
from dotenv import load_dotenv

# Load environment variables from .env file (GOOGLE_API_KEY is read by
# generate_rag_response at call time)
load_dotenv()
12
def generate_rag_response(query, retrieved_documents, model="gemini-2.0-flash"):
    """
    Perform Retrieval-Augmented Generation (RAG) using Google's Gemini.
    Args:
        query (str): The user's query.
        retrieved_documents (list of str): The documents retrieved from the retriever.
        model (str): The Gemini model to use.
    Returns:
        str: The generated response text.
    """
    # Each retrieved snippet already begins with its "[Source ID: ...]" tag
    # (built in retrieve_documents_and_prepare_inputs); join them into one
    # context block.
    information = "\n\n".join(retrieved_documents)

    # Instruction prompt: pins the answer language (en/ar/fr), the inline
    # [source-id] citation format, Markdown table/list rendering, a clinical
    # tone, and canned fallback answers for unanswerable/off-topic questions.
    prompt = f"""You are a helpful and knowledgeable AI-powered vaccine assistant designed to support doctors in clinical decision-making.
    You provide evidence-based guidance using only information from official vaccine medical documents.
    Answer the doctor's question accurately and concisely using only the provided information.

    IMPORTANT REQUIREMENTS:

    ### Language Settings
    1. DETECT THE LANGUAGE OF THE DOCTOR'S QUERY.
    2. YOU MUST RESPOND ONLY IN ONE OF THESE THREE LANGUAGES:
    - English (en): If the doctor's query is in English OR in any language not listed below
    - Arabic (ar): ONLY if the doctor's query is in Arabic
    - French (fr): ONLY if the doctor's query is in French
    3. DO NOT switch languages mid-response. Use ONLY ONE language throughout your entire answer.

    ### Citation and Sourcing
    1. For each fact in your response, include an inline citation in the format [Source ID] immediately following the information, e.g., [e795ebd28318886c0b1a5395ac30ad90].
    2. Do NOT use 'Source ID:' in the citation format; use only the source ID in square brackets.
    3. If a fact is supported by multiple sources, use the following format:
    - Use adjacent citations: [e795ebd28318886c0b1a5395ac30ad90][21a932b2340bb16707763f57f0ad2]
    4. Use ONLY the provided information and never include facts from your general knowledge.

    ### Content Formatting
    1. When rendering tables:
    - Convert HTML tables into clean Markdown format
    - Preserve all original headers and data rows exactly
    - Include the citation in the table caption, e.g., "Table: Vaccination Schedule [Source ID]"
    2. For lists, maintain the original bullet points/numbering and include citations.
    3. Present information concisely but ensure clinical accuracy is never compromised.

    ### Professional Tone
    1. Maintain a professional, clinical tone appropriate for physician communication.
    2. Prioritize clarity and precision in medical terminology.

    ### Response Handling
    1. If the question cannot be answered with the provided documents:
    - English: "I don't have sufficient information in the provided documents to answer this question completely. Please consult additional official vaccine resources or a specialist for guidance on this topic."
    - Arabic: "ليس لدي معلومات كافية في الوثائق المقدمة للإجابة على هذا السؤال بشكل كامل. يرجى استشارة مصادر لقاح رسمية إضافية أو متخصص للحصول على إرشادات حول هذا الموضوع."
    - French: "Je n'ai pas suffisamment d'informations dans les documents fournis pour répondre complètement à cette question. Veuillez consulter des ressources officielles sur les vaccins ou un spécialiste pour obtenir des conseils sur ce sujet."
    2. If the question is clearly unrelated to vaccines or medicine:
    - English: "I'm specialized in providing vaccine information for healthcare professionals. Could you please ask a question related to vaccines or immunization? I'd be happy to help with that."
    - Arabic: "أنا متخصص في تقديم معلومات اللقاحات للمهنيين الصحيين. هل يمكنك طرح سؤال يتعلق باللقاحات أو التطعيم؟ سأكون سعيدًا بمساعدتك في ذلك."
    - French: "Je suis spécialisé dans la fourniture d'informations sur les vaccins pour les professionnels de la santé. Pourriez-vous poser une question liée aux vaccins ou à l'immunisation ? Je serais heureux de vous aider avec ça."
    3. For simple greetings:
    - Respond with a simple formal greeting in the same language as the query.

    Question: {query}
    Information: {information}
    """

    # Initialize the LLM - using GoogleGenerativeAI instead of ChatGoogleGenerativeAI
    # NOTE(review): requires GOOGLE_API_KEY in the environment (loaded from
    # .env at module import); a missing key only fails here, at request time.
    llm = GoogleGenerativeAI(
        model=model,
        google_api_key=os.getenv("GOOGLE_API_KEY")
    )

    # Generate response using langchain
    response = llm.invoke(prompt)
    return response
82
+
83
+
84
def extract_source_ids(response_text):
    """
    Extract the unique source IDs cited inline in a generated response.

    Handles the citation formats the prompt allows:
    - Standard format: [Source ID]
    - Multiple sources in adjacent brackets: [Source ID1][Source ID2]
    - Multiple sources in one bracket: [Source ID1, Source ID2]

    Args:
        response_text (str): The generated response text with inline citations.

    Returns:
        list of str: Unique source IDs in order of first appearance.
    """
    # Normalize adjacent citations "[A] [B]" -> "[A][B]" -> "[A, B]" so a
    # single bracket pattern captures every group below.
    consolidated_text = re.sub(r'\]\s*\[', '][', response_text)
    consolidated_text = consolidated_text.replace('][', ', ')

    # Every bracketed group (single ID or comma-separated IDs).
    inline_citations = re.findall(r'\[([^\[\]]+)\]', consolidated_text)

    if not inline_citations:
        print("Warning: No source IDs found in the response text.")
        return []

    # Flatten comma-separated groups into individual IDs.
    all_ids = []
    for citation in inline_citations:
        all_ids.extend(part.strip() for part in citation.split(','))

    # Deduplicate while PRESERVING first-occurrence order: the previous
    # list(set(...)) made the ordering nondeterministic, which in turn made
    # the sequential citation numbers assigned downstream (in
    # format_response_with_sequential_citations) vary between runs.
    # Empty fragments (e.g. from "[a, ]") are dropped.
    source_ids = [sid for sid in dict.fromkeys(all_ids) if sid]

    if not source_ids:
        print("Warning: No valid source IDs found after filtering.")
        return []

    return source_ids
131
+
132
+
133
def format_response_with_sequential_citations(response_text, unique_ids, clean_all_citations=False):
    """
    Format the response text by either:
    - Replacing source IDs with sequential numbers (default)
    - Completely removing all citations (if clean_all_citations=True)

    Handles multiple citation formats:
    - Standard format: [Source ID]
    - Multiple sources in adjacent brackets: [Source ID1][Source ID2]
    - Multiple sources in one bracket: [Source ID1, Source ID2]

    Args:
        response_text (str): The generated response text with inline citations.
        unique_ids (list): Unique source IDs found in the response; their
            order defines the sequential numbering (first ID -> [1], ...).
        clean_all_citations (bool): If True, removes all citations completely.
            If False, formats them as numbers.

    Returns:
        str: The formatted response text.
    """
    # Nothing was cited: return the text untouched.
    if not unique_ids:
        return response_text

    if clean_all_citations:
        # Strip every bracketed citation entirely.
        cleaned = re.sub(r'\[[^\[\]]+?\]', '', response_text)
        # Collapse the doubled spaces left behind — but only within lines.
        # BUG FIX: the previous r'\s+' -> ' ' collapse also swallowed
        # newlines, flattening the Markdown tables and lists the prompt
        # explicitly asks the model to produce.
        cleaned = re.sub(r'[ \t]{2,}', ' ', cleaned)
        cleaned = re.sub(r'[ \t]+\n', '\n', cleaned)
        return cleaned.strip()

    # Map each source ID to its 1-based sequential number.
    id_to_number = {source_id: str(i + 1) for i, source_id in enumerate(unique_ids)}

    # Standardize adjacent citations "[A] [B]" -> "[A][B]" -> "[A, B]".
    formatted_response = re.sub(r'\]\s*\[', '][', response_text)
    formatted_response = formatted_response.replace('][', ', ')

    def replace_citation(match):
        """Rewrite one bracketed group as its sequential number(s)."""
        content = match.group(1)
        if ',' in content:
            numbers = [
                id_to_number[part.strip()]
                for part in content.split(',')
                if part.strip() in id_to_number
            ]
            if numbers:
                return f"[{', '.join(numbers)}]"
        elif content in id_to_number:
            return f"[{id_to_number[content]}]"
        # Unknown content (not a tracked ID): leave the bracket as-is.
        return match.group(0)

    formatted_response = re.sub(r'\[([^\[\]]+)\]', replace_citation, formatted_response)

    return formatted_response.strip()
194
+
195
def retrieve_documents_and_prepare_inputs(query, expanding_retriever, chunks_path="./chunks.json"):
    """
    Retrieve relevant documents and prepare them for the RAG generation.

    Args:
        query (str): The user's query.
        expanding_retriever: The retriever object (e.g., returned by prepare_environment_and_retriever).
        chunks_path (str): Path to the chunks.json file.

    Returns:
        tuple: (source_texts_for_rag, retrieved_elements_full)
    """
    # Get documents - query expansion happens automatically.
    # NOTE(review): get_relevant_documents() is deprecated in newer langchain
    # releases in favor of invoke() — confirm against the pinned version.
    retrieved_docs = expanding_retriever.get_relevant_documents(query)

    retrieved_chunk_ids = [doc.metadata["element_id"] for doc in retrieved_docs]

    # Re-load the full chunks file to recover the complete element payloads
    # (the retriever only returns text + metadata).
    with open(chunks_path, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

    source_retrieved_texts = []
    retrieved_elements_full = []

    for chu in chunks_data:
        if chu["element_id"] in retrieved_chunk_ids:
            if chu.get("type") == "TableElement":
                # NOTE(review): chu['elements']['element_id'] indexes
                # 'elements' with a string key, while the else-branch iterates
                # 'elements' as a list — the two branches assume different
                # chunks.json schemas. Verify against the actual file
                # (this may need to be chu['element_id']).
                text = (
                    f"[Source ID: {chu['elements']['element_id']}]\n"
                    f"CONTENT:\n{chu['text']}\n"
                    f"HTML:\n{chu['table_text_as_html']}\n\n"
                )
                source_retrieved_texts.append(text)
                # NOTE(review): table chunks are never appended to
                # retrieved_elements_full, so a cited table can never appear
                # in full_rag_pipeline's cited_elements output — confirm
                # whether this is intentional.
            else:
                for element in chu.get("elements", []):
                    text = (
                        f"[Source ID: {element['element_id']}]\n"
                        f"CONTENT:\n{element['text']}\n\n"
                    )
                    source_retrieved_texts.append(text)
                    retrieved_elements_full.append(element)

    return source_retrieved_texts, retrieved_elements_full
238
+
239
def full_rag_pipeline(query, expanding_retriever, chunks_path="./chunks.json", model="gemini-2.0-flash", clean_all_citations=False):
    """
    Full RAG pipeline from query to RAG response + extracted sources.

    Args:
        query (str): The user's query.
        expanding_retriever: The retriever object.
        chunks_path (str): Path to the chunks.json.
        model (str): Gemini model.
        clean_all_citations (bool): If True, strip the inline citations from
            the answer instead of renumbering them sequentially.

    Returns:
        dict: {
            "response": str,
            "cited_elements_json": str,
            "answer_language": str
        }
    """
    source_texts, retrieved_elements = retrieve_documents_and_prepare_inputs(query, expanding_retriever, chunks_path)

    # Step 1: generate the grounded answer.
    response_text = generate_rag_response(query, source_texts, model=model)

    # Step 2: extract cited source IDs BEFORE the citations are rewritten.
    unique_ids = extract_source_ids(response_text)

    # Step 2.1: renumber (or remove) the inline citations.
    response_text = format_response_with_sequential_citations(response_text, unique_ids, clean_all_citations=clean_all_citations)

    # Step 3: keep only the retrieved elements that were actually cited.
    cited_elements = [element for element in retrieved_elements if element["element_id"] in unique_ids]

    cited_elements_json = json.dumps(cited_elements, ensure_ascii=False, indent=2)

    # Language detection, constrained to the prompt's contract (en/ar/fr).
    # FIX: the old version used a bare `except:` (which hides real errors),
    # called detect(query) twice in its fallback (which could itself raise
    # and crash the endpoint), and could leak unsupported language codes
    # from the query-based fallback.
    answer_language = _detect_supported_language(response_text, query)

    return {
        "response": response_text,
        "cited_elements_json": cited_elements_json,
        "answer_language": answer_language
    }


def _detect_supported_language(response_text, query, supported=('en', 'ar', 'fr')):
    """Best-effort language detection limited to the supported set.

    Tries the answer first (its first few words, citations removed), then the
    query; returns 'en' when neither yields a supported language code or
    detection fails entirely.
    """
    for candidate in (response_text, query):
        try:
            snippet = " ".join(candidate.split()[:5])
            snippet = re.sub(r'\[.*?\]', '', snippet)  # drop citation brackets
            lang = detect(snippet)
        except Exception:
            continue  # langdetect raises on empty/non-linguistic input
        if lang in supported:
            return lang
    return 'en'
requirements.txt ADDED
Binary file (574 Bytes). View file