AgenticRAG_test

Sleeping

File size: 24,848 Bytes

# -*- coding: utf-8 -*-
"""
Environment preparation script for vaccine assistant
Creates vector stores and retrieval tools
"""

import os
import json
import re
import nest_asyncio
from typing import List
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import ChatGoogleGenerativeAI
from llama_index.core.tools import FunctionTool
from llama_index.core.schema import TextNode
from langchain.prompts import PromptTemplate
import logging

# Configure the root logger with the library defaults, then raise the
# multi-query retriever's logger to INFO.
# NOTE(review): presumably INFO is what makes the LLM-generated query variants
# show up in the logs -- confirm against the LangChain MultiQueryRetriever docs.
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

def extract_source_ids(response_text):
    """
    Extract the unique source IDs cited inline in a generated response.

    Supported citation formats:
    - Standard format: [Source ID]
    - Adjacent brackets: [Source ID1][Source ID2] (whitespace between them is allowed)
    - Multiple sources in one bracket: [Source ID1, Source ID2]

    IDs are returned verbatim (whitespace-stripped); no UUID or other format
    validation is applied, so any bracketed text is treated as a citation.

    Args:
        response_text (str): The generated response text with inline citations.
            ``None`` or an empty string is treated as "no citations".

    Returns:
        list of str: Unique, non-empty source IDs in order of first appearance.
        An empty list (plus a printed warning) when nothing is found.
    """
    if not response_text:
        print("Warning: No source IDs found in the response text.")
        return []

    # Fold adjacent citations ("[a][b]" or "[a] [b]") into one comma-separated
    # bracket so a single pattern below handles every supported format.
    consolidated_text = re.sub(r'\]\s*\[', ', ', response_text)

    inline_citations = re.findall(r'\[([^\[\]]+)\]', consolidated_text)
    if not inline_citations:
        print("Warning: No source IDs found in the response text.")
        return []

    # A dict keeps insertion order, giving de-duplication with a stable,
    # reproducible result (a plain set would shuffle the IDs between runs).
    ordered_ids = {}
    for citation in inline_citations:
        for raw_id in citation.split(','):
            source_id = raw_id.strip()
            if source_id:  # skip blanks produced by "[a, ]" or "[ ]"
                ordered_ids.setdefault(source_id, None)

    source_ids = list(ordered_ids)
    if not source_ids:
        print("Warning: No valid source IDs found after filtering.")
        return []

    return source_ids


def setup_models():
    """Build the embedding model and the chat LLM used by the retrieval tools.

    The Gemini API key is read from the ``GOOGLE_API_KEY`` environment
    variable; if the variable is unset, ``None`` is passed through unchanged.

    Returns:
        tuple: ``(embedding_function, llm)`` -- the HuggingFace embedding model
        and the Gemini chat model, in that order.
    """
    print("🔧 Setting up embedding model and LLM...")

    # Embeddings: multilingual E5 base model
    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
    print("✅ Embedding model initialized: intfloat/multilingual-e5-base")

    # Chat model: key is looked up in the environment at call time
    chat_model = ChatGoogleGenerativeAI(
        google_api_key=os.getenv('GOOGLE_API_KEY'),
        model="gemini-2.0-flash",
    )
    print("✅ LLM initialized: gemini-2.0-flash")

    return embedder, chat_model

def create_vectorstore_from_json(json_path: str, collection_name: str, embedding_function):
    """Load chunk records from a JSON file and index them in a Chroma collection.

    Every record must provide ``text``, ``filename``, ``filetype``,
    ``element_id`` and ``type``; records whose ``type`` is ``"TableElement"``
    must also provide ``table_text_as_html``, which is copied into the metadata.

    Args:
        json_path: Path of the UTF-8 JSON file holding the chunk records.
        collection_name: Name of the Chroma collection to populate.
        embedding_function: Embedding model handed to Chroma.

    Returns:
        tuple: ``(vectorstore, documents)`` -- the Chroma store and the list of
        ``Document`` objects that were indexed.
    """
    print(f"📚 Creating vector store from: {json_path}")

    with open(json_path, "r", encoding="utf-8") as handle:
        raw_chunks = json.load(handle)

    print(f"📊 Loaded {len(raw_chunks)} chunks from JSON")

    def _to_document(chunk):
        # Build one Document; table chunks additionally carry their HTML form.
        body = chunk["text"]
        meta = {
            # NOTE(review): the language tag is hard-coded for every input file,
            # whatever its actual language -- confirm this is intended.
            "language": "fra",
            "source": chunk["filename"],
            "filetype": chunk["filetype"],
            "element_id": chunk["element_id"],
        }
        if chunk["type"] == "TableElement":
            meta["table_text_as_html"] = chunk["table_text_as_html"]
        return Document(page_content=body, metadata=meta)

    documents = [_to_document(chunk) for chunk in raw_chunks]

    # All collections are persisted under the same on-disk directory.
    # NOTE(review): verify whether re-running against an existing directory
    # appends duplicate entries to the collection.
    store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_function,
        collection_name=collection_name,
        persist_directory="chroma_db_multilingual",
    )
    print(f"✅ Vector store created with collection: {collection_name}")
    return store, documents

def create_retriever(vectorstore, docs, llm, bm25_k=3,vector_k=6):
    """Build a multi-query retriever on top of a vector + BM25 ensemble.

    A similarity-search retriever over ``vectorstore`` and a BM25 retriever
    over ``docs`` are combined with equal weights; the ensemble is then wrapped
    in a ``MultiQueryRetriever`` that uses ``llm`` and a vaccine-specific
    prompt to generate alternative phrasings of the question.

    Args:
        vectorstore: The vector store for similarity search
        docs: Documents for BM25 retriever
        llm: Language model for multi-query generation
        bm25_k: Number of documents to retrieve with BM25
        vector_k: Number of documents to retrieve with vector search

    Returns:
        The MultiQueryRetriever wrapping the ensemble retriever.
    """
    print("🔍 Creating ensemble retriever...")

    # Dense side: similarity search over the vector store
    dense = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": vector_k},
    )
    print(f"✅ Vector retriever created (k={vector_k})")

    # Sparse side: BM25 over the same documents
    sparse = BM25Retriever.from_documents(docs)
    sparse.k = bm25_k
    print(f"✅ BM25 retriever created (k={bm25_k})")

    # Equal-weight fusion of both retrievers
    hybrid = EnsembleRetriever(retrievers=[dense, sparse], weights=[0.5, 0.5])
    print("✅ Ensemble retriever created (weights: 0.5, 0.5)")

    # Prompt used to rephrase the user question into several search queries
    multiquery_prompt = PromptTemplate(
        input_variables=["question"],
        template="""You are an AI assistant specialized in vaccine-related medical information retrieval.
    Your task is to generate multiple search queries based on the original question to find relevant information from official vaccine medical documents.

    IMPORTANT GUIDELINES:
    - Keep all vaccine-specific terminology and medical terms intact
    - Maintain the clinical and medical context
    - Focus on evidence-based vaccine information
    - Preserve any specific vaccine names, diseases, or medical conditions mentioned
    - Generate queries that would help retrieve information about vaccine schedules, dosing, contraindications, adverse events, and disease prevention

    Original question: {question}

    Generate 4 different search queries that rephrase the original question while maintaining vaccine terminology and medical accuracy. Each query should approach the topic from a slightly different angle to maximize retrieval from vaccine medical documents.

    Provide only the alternative questions, one per line."""
    )

    # Query expansion layered over the ensemble
    expanded = MultiQueryRetriever.from_llm(
        retriever=hybrid,
        llm=llm,
        prompt=multiquery_prompt,
    )
    print("✅ Multi-query expanding retriever created")

    return expanded

def convert_chromadb_to_llamaindex_nodes(chromadb_documents: List) -> "List[TextNode]":
    """Convert retrieved Document objects to LlamaIndex TextNode objects.

    Each input object is expected to expose ``page_content`` and a
    ``metadata`` dict. The node ID is ``"{source}_{element_id}"``, where a
    missing ``source`` falls back to ``"unknown"`` and a missing
    ``element_id`` falls back to ``"doc_{index}"``.

    Conversion is best-effort: a document that cannot be converted is skipped
    and reported with a printed warning instead of aborting the whole batch.

    Args:
        chromadb_documents: Documents to convert; ``None`` is treated as empty.

    Returns:
        The successfully converted nodes, in input order.
    """
    nodes = []
    for i, doc in enumerate(chromadb_documents or []):
        try:
            text = doc.page_content
            # Copy so the node never shares its metadata dict with the source document.
            metadata = doc.metadata.copy()
            element_id = metadata.get("element_id", f"doc_{i}")
            source = metadata.get("source", "unknown")

            nodes.append(
                TextNode(
                    text=text,
                    metadata=metadata,
                    id_=f"{source}_{element_id}",
                )
            )
        except Exception as e:
            # Deliberately broad: one malformed document must not discard the
            # rest, but the failure is surfaced rather than silently dropped.
            print(f"⚠️ Skipping document {i}: could not convert to TextNode ({type(e).__name__}: {e})")
            continue
    return nodes

def _table_source_id(chunk):
    """Return the citation ID for a table chunk, tolerating both layouts of ``elements``."""
    elements = chunk.get("elements")
    # Keep the nested lookup when ``elements`` is a mapping; otherwise (a list,
    # or absent) fall back to the chunk's own top-level ID instead of raising.
    if isinstance(elements, dict) and "element_id" in elements:
        return elements["element_id"]
    return chunk.get("element_id", "Unknown")


def _format_chunk_sections(chunk):
    """Render one chunk as a list of '[Source: <id>]'-prefixed text sections."""
    if chunk.get("type") == "TableElement":
        return [
            f"[Source: {_table_source_id(chunk)}]\n CONTENT: \n{chunk['text']}\n HTML: \n {chunk['table_text_as_html']}  \n\n"
        ]

    elements = chunk.get("elements")
    if elements is None:
        # No sub-elements recorded: cite the chunk itself.
        elements = [chunk]
    return [
        f"[Source: {element['element_id']}]\n CONTENT: \n{element['text']} \n\n"
        for element in elements
    ]


def section_tool_wrapper(retriever, section_path_chunks, query):
    """Run ``query`` through ``retriever`` and return the matching chunks as cited text.

    The retrieved documents are only used for their ``element_id`` values;
    the full chunk content is then re-read from the JSON file at
    ``section_path_chunks`` and every matching chunk is rendered as one or
    more ``[Source: <element_id>]`` sections joined by ``---`` separators.
    Chunks are emitted in file order.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``.
        section_path_chunks: Path of the JSON chunk file backing the retriever.
        query: The search query.

    Returns:
        str: The combined cited text, ``"No relevant documents found for the
        query."`` when nothing usable was retrieved, or an
        ``"Error retrieving documents: ..."`` message if anything fails (this
        function never raises; errors are reported back as text).
    """
    print(f"🔍 TOOL CALL: Searching for query: '{query[:100]}...' in {section_path_chunks}")

    try:
        # NOTE(review): newer LangChain releases appear to favour
        # ``retriever.invoke(query)`` -- confirm against the installed version.
        retrieved_docs = retriever.get_relevant_documents(query)
        print(f"📄 Retrieved {len(retrieved_docs)} documents")

        nodes_from_retrieved_docs = convert_chromadb_to_llamaindex_nodes(retrieved_docs)

        if not nodes_from_retrieved_docs:
            print("❌ No relevant documents found for the query")
            return "No relevant documents found for the query."

        # Documents without an element_id cannot be matched back to the file; skip them.
        chunk_ids = [
            doc.metadata.get('element_id')
            for doc in retrieved_docs
            if doc.metadata.get('element_id') is not None
        ]
        print(f"🆔 Found chunk IDs: {chunk_ids}")
        wanted_ids = set(chunk_ids)  # O(1) membership for the scan below

        with open(section_path_chunks, "r", encoding="utf-8") as f:
            chunks_data = json.load(f)

        chunks_unique = [
            chunk for chunk in chunks_data
            if chunk.get('element_id', 'Unknown') in wanted_ids
        ]
        print(f"✅ Matched {len(chunks_unique)} unique chunks")

        combined_text = []
        for chunk in chunks_unique:
            combined_text.extend(_format_chunk_sections(chunk))

        result = "\n---\n".join(combined_text)
        print(f"✅ TOOL RESPONSE: Generated response with {len(combined_text)} text sections")
        return result
    except Exception as e:
        # Tool boundary: report the failure to the calling agent as text.
        print(f"❌ TOOL ERROR: {e}")
        return f"Error retrieving documents: {str(e)}"

def create_section_tools(embedding_function, llm):
    """
    Create all section-specific retrieval tools with improved descriptions for accurate routing.

    Builds one retriever per guide section (ten JSON chunk files), one for the
    complete national guide and one for the WHO 'Immunization in Practice'
    guide, then wraps each in a function exposed as a ``FunctionTool``.

    A data file that is missing, or a retriever that fails to build, does not
    abort the setup: the retriever is left as ``None`` and the matching tool
    answers with an "not available" message when called.

    Args:
        embedding_function: Embedding model passed to ``create_vectorstore_from_json``.
        llm: Language model passed to ``create_retriever``.

    Returns:
        list: The twelve ``FunctionTool`` objects (general guide, WHO guide,
        then sections 1-10).
    """
    print("🛠️ Creating section-specific retrieval tools with enhanced descriptions...")

    # Define section paths - Fixed path structure
    # The spelled-out section number is reused as the Chroma collection suffix
    # and as the lookup key inside the tool functions below.
    section_paths = {
        'one': './data/section_one_chunks.json',
        'two': './data/section_two_chunks.json',
        'three': './data/section_three_chunks.json',
        'four': './data/section_four_chunks.json',
        'five': './data/section_five_chunks.json',
        'six': './data/section_six_chunks.json',
        'seven': './data/section_seven_chunks.json',
        'eight': './data/section_eight_chunks.json',
        'nine': './data/section_nine_chunks.json',
        'ten': './data/section_ten_chunks.json'
    }

    # Create retrievers for each section
    # Section retrievers use bm25_k=7 / vector_k=10, whereas the guide and WHO
    # retrievers further down rely on create_retriever's defaults.
    section_retrievers = {}
    for section, path in section_paths.items():
        try:
            if os.path.exists(path):
                print(f"📁 Creating retriever for section {section} from {path}")
                vstore, docs = create_vectorstore_from_json(path, f"Guide_2023_{section}", embedding_function)
                section_retrievers[section] = create_retriever(vstore, docs, llm, bm25_k=7, vector_k=10)
                print(f"✅ Successfully created retriever for section {section}")
            else:
                print(f"⚠️ Warning: File not found for section {section}: {path}")
                section_retrievers[section] = None
        except Exception as e:
            # Best-effort: a failing section is recorded as None, the rest still load.
            print(f"❌ Error creating retriever for section {section}: {e}")
            section_retrievers[section] = None

    # Create main guide retriever
    guide_path = './data/Guide-pratique-de-mise-en-oeuvre-du-calendrier-national-de-vaccination-2023.json'
    guide_retriever = None
    try:
        if os.path.exists(guide_path):
            print("📚 Creating main guide retriever...")
            guide_vstore, guide_docs = create_vectorstore_from_json(guide_path, "Guide_2023_multilingual", embedding_function)
            guide_retriever = create_retriever(guide_vstore, guide_docs, llm)
            print("✅ Successfully created main guide retriever")
        else:
            print(f"⚠️ Warning: Main guide file not found: {guide_path}")
    except Exception as e:
        # guide_retriever stays None; general_guide_tool reports it as unavailable.
        print(f"❌ Error creating main guide retriever: {e}")

    # WHO Immunization in Practice Tool
    immunization_path = './data/Immunization in Practice_WHO_eng_2015.json'
    immunization_retriever = None
    try:
        if os.path.exists(immunization_path):
            print("🌍 Creating immunization retriever...")
            immunization_vstore, immunization_docs = create_vectorstore_from_json(
                immunization_path,
                "Immunization_in_Practice_WHO_eng_2015",
                embedding_function
            )
            immunization_retriever = create_retriever(immunization_vstore, immunization_docs, llm)
            print("✅ Successfully created immunization retriever")
        else:
            print(f"⚠️ Warning: Immunization file not found: {immunization_path}")
    except Exception as e:
        # immunization_retriever stays None; who_immunization_tool reports it as unavailable.
        print(f"❌ Error creating immunization retriever: {e}")

    # --- Tool Definitions with Improved Descriptions ---
    # Each tool is a closure over the retrievers and paths built above. All of
    # them follow the same shape: log the call, return a fixed message if the
    # retriever is unavailable, otherwise delegate to section_tool_wrapper.
    # NOTE(review): FunctionTool.from_defaults presumably derives each tool's
    # description from the function's docstring, which would make the
    # docstrings below part of the agent's routing behaviour -- confirm before
    # rewording them.

    def general_guide_tool(query: str) -> str:
        """
        A general-purpose tool for the Algerian National Vaccination Guide.
        **Use this tool as a fallback** if no other specific tool seems appropriate, or for very broad, multi-topic questions
        (e.g., 'Summarize the Algerian vaccination policy and its safety measures').
        **Always prefer a more specific tool if the query matches its description** (e.g., use 'cold_chain_tool' for temperature questions).

        Args:
            query (str): A broad or ambiguous question about the Algerian National Vaccination Guide.

        Returns:
            str: Content retrieved from the entire guide.
        """
        print(f"🏥 GENERAL GUIDE TOOL CALLED (FALLBACK): {query[:50]}...")
        if not guide_retriever:
            return "Guide retriever not available - main guide file may be missing"
        return section_tool_wrapper(guide_retriever, guide_path, query)

    def who_immunization_tool(query: str) -> str:
        """
        Provides information from the WHO's 'Immunization in Practice' guide. Use this for questions about
        **global immunization standards**, international best practices, or for comparing Algerian policy to
        general WHO recommendations on topics like cold chain, safety, and disease control.

        Args:
            query (str): A question seeking global or general immunization practices.

        Returns:
            str: Content from the WHO Immunization in Practice guide.
        """
        print(f"🌍 WHO TOOL CALLED: {query[:50]}...")
        if not immunization_retriever:
            return "Immunization in Practice retriever not available - WHO guide file may be missing"
        return section_tool_wrapper(immunization_retriever, immunization_path, query)

    def program_overview_tool(query: str) -> str:
        """
        (Section 1) The primary tool for questions about the **history, objectives, and structure** of Algeria's
        national immunization program (PEV - Programme Élargi de Vaccination). Use this for topics like
        the program's rationale, key achievements, and the reasons for updates to the vaccination calendar.

        Args:
            query (str): A question about the foundation or evolution of the PEV.

        Returns:
            str: Response from Section 1.
        """
        print(f"📋 PROGRAM OVERVIEW (S1) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('one'):
            return "Section 1 retriever not available"
        return section_tool_wrapper(section_retrievers['one'], section_paths['one'], query)

    def disease_info_tool(query: str) -> str:
        """
        (Section 2) The definitive tool for information on **specific vaccine-preventable diseases**.
        Use this to find details on **symptoms, transmission methods, complications**, and prevention
        strategies for diseases like Diphtheria, Measles, Polio, Tetanus, etc.

        Args:
            query (str): A question about a disease covered by the national vaccination program.

        Returns:
            str: Disease-specific content from Section 2.
        """
        print(f"🦠 DISEASE INFO (S2) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('two'):
            return "Section 2 retriever not available"
        return section_tool_wrapper(section_retrievers['two'], section_paths['two'], query)

    def vaccine_properties_tool(query: str) -> str:
        """
        (Section 3) The specific tool for questions about the **vaccines themselves**: their types (e.g., BCG, ROR,
        DTCaVPI), composition, whether they are live or inactivated, and the correct **method of administration**
        (e.g., intradermal, intramuscular, oral).

        Args:
            query (str): A question about a vaccine's formulation or how it is administered.

        Returns:
            str: Vaccine-specific info from Section 3.
        """
        print(f"💉 VACCINE PROPERTIES (S3) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('three'):
            return "Section 3 retriever not available"
        return section_tool_wrapper(section_retrievers['three'], section_paths['three'], query)

    def catch_up_vaccination_tool(query: str) -> str:
        """
        (Section 4) Specialized tool for **missed or delayed vaccinations (rattrapage vaccinal)**.
        Use this for questions about creating a **catch-up schedule** for a child who is behind
        on their shots, based on their age and vaccination history.

        Args:
            query (str): A question about catch-up vaccination due to a delay or missed dose.

        Returns:
            str: Catch-up schedule guidance from Section 4.
        """
        print(f"🔄 CATCH-UP (S4) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('four'):
            return "Section 4 retriever not available"
        return section_tool_wrapper(section_retrievers['four'], section_paths['four'], query)

    def special_populations_tool(query: str) -> str:
        """
        (Section 5) The designated tool for vaccination guidelines concerning **special populations**.
        Use for questions about vaccinating preterm infants, allergic children, or patients with
        immunosuppression, chronic illnesses (cardiac, pulmonary), or other specific health conditions.

        Args:
            query (str): A question about tailored vaccination for a vulnerable or special group.

        Returns:
            str: Custom recommendations from Section 5.
        """
        print(f"👥 SPECIAL POPULATIONS (S5) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('five'):
            return "Section 5 retriever not available"
        return section_tool_wrapper(section_retrievers['five'], section_paths['five'], query)

    def cold_chain_tool(query: str) -> str:
        """
        (Section 6) The definitive tool for all questions about the **cold chain**, including vaccine **storage
        temperatures**, transport protocols, refrigerators, temperature monitoring (like PCV pastilles),
        and procedures for handling cold chain failures or power outages.

        Args:
            query (str): A logistics-related question about vaccine temperature management.

        Returns:
            str: Cold chain instructions from Section 6.
        """
        print(f"❄️ COLD CHAIN (S6) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('six'):
            return "Section 6 retriever not available"
        return section_tool_wrapper(section_retrievers['six'], section_paths['six'], query)

    def injection_safety_tool(query: str) -> str:
        """
        (Section 7) The primary tool for questions related to the **safe administration of injections**.
        Use for topics like sterile equipment, proper injection techniques, preventing needlestick injuries,
        and safe disposal of medical waste (DASRI).

        Args:
            query (str): A question about how to perform vaccine injections safely.

        Returns:
            str: Best practices from Section 7.
        """
        print(f"🛡️ INJECTION SAFETY (S7) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('seven'):
            return "Section 7 retriever not available"
        return section_tool_wrapper(section_retrievers['seven'], section_paths['seven'], query)

    def session_management_tool(query: str) -> str:
        """
        (Section 8) Use this tool for questions about the **operational conduct of a vaccination session**
        and **vaccinovigilance**. This includes preparing the session, material setup, registering vaccination
        acts, and monitoring/reporting adverse events post-vaccination (MPVI).

        Args:
            query (str): A question about running a vaccination session or post-vaccine monitoring.

        Returns:
            str: Workflow and safety monitoring details from Section 8.
        """
        print(f"📊 SESSION MGMT (S8) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('eight'):
            return "Section 8 retriever not available"
        return section_tool_wrapper(section_retrievers['eight'], section_paths['eight'], query)

    def planning_and_logistics_tool(query: str) -> str:
        """
        (Section 9) This tool is for **planning vaccination sessions and managing logistics**. Use it for
        questions about creating operational maps, estimating vaccine and supply needs, managing stock,
        and reducing vaccine wastage.

        Args:
            query (str): A question about organizing vaccination services or managing stock.

        Returns:
            str: Planning and stock guidance from Section 9.
        """
        print(f"📅 PLANNING & LOGISTICS (S9) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('nine'):
            return "Section 9 retriever not available"
        return section_tool_wrapper(section_retrievers['nine'], section_paths['nine'], query)

    def communication_tool(query: str) -> str:
        """
        (Section 10) The specific tool for **social mobilization and communication**. Use this for
        questions about communication strategies, addressing **vaccine hesitancy**, managing rumors,
        and community outreach to promote vaccination.

        Args:
            query (str): A question about public engagement or communication for vaccination.

        Returns:
            str: Public mobilization strategies from Section 10.
        """
        print(f"📢 COMMUNICATION (S10) TOOL CALLED: {query[:50]}...")
        if not section_retrievers.get('ten'):
            return "Section 10 retriever not available"
        return section_tool_wrapper(section_retrievers['ten'], section_paths['ten'], query)

    # Create FunctionTool objects with new, clearer names
    # Every tool is registered even when its retriever is None; availability is
    # checked inside the tool function at call time.
    tools = [
        FunctionTool.from_defaults(name="general_guide_tool", fn=general_guide_tool),
        FunctionTool.from_defaults(name="who_immunization_tool", fn=who_immunization_tool),
        # Section-specific tools
        FunctionTool.from_defaults(name="program_overview_tool", fn=program_overview_tool),
        FunctionTool.from_defaults(name="disease_info_tool", fn=disease_info_tool),
        FunctionTool.from_defaults(name="vaccine_properties_tool", fn=vaccine_properties_tool),
        FunctionTool.from_defaults(name="catch_up_vaccination_tool", fn=catch_up_vaccination_tool),
        FunctionTool.from_defaults(name="special_populations_tool", fn=special_populations_tool),
        FunctionTool.from_defaults(name="cold_chain_tool", fn=cold_chain_tool),
        FunctionTool.from_defaults(name="injection_safety_tool", fn=injection_safety_tool),
        FunctionTool.from_defaults(name="session_management_tool", fn=session_management_tool),
        FunctionTool.from_defaults(name="planning_and_logistics_tool", fn=planning_and_logistics_tool),
        FunctionTool.from_defaults(name="communication_tool", fn=communication_tool),
    ]

    print(f"✅ Created {len(tools)} tools with improved routing descriptions")
    return tools

def prepare_environment():
    """Set up the models, build every retrieval tool and hand both back.

    Returns:
        tuple: ``(tools, llm)`` -- the list of retrieval tools produced by
        ``create_section_tools`` and the chat model from ``setup_models``.
    """
    print("🚀 Starting environment preparation...")

    # Step 1: embedding model + chat LLM
    print("🔧 Setting up models...")
    models = setup_models()
    embedder, chat_llm = models

    # Step 2: retrieval tools built on top of those models
    print("🛠️ Creating section tools...")
    toolset = create_section_tools(embedder, chat_llm)

    print("✅ Environment prepared successfully!")
    tool_count = len(toolset)
    print(f"📋 Created {tool_count} tools")

    return toolset, chat_llm