Spaces:

KingArthur111
/

DocuMind

Sleeping

File size: 5,227 Bytes

b3f1583

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from typing import List

class RAGPipeline:
    def __init__(self, vector_store: Chroma, api_key: str):
        self.vector_store = vector_store
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash", 
            google_api_key=api_key,
            temperature=0.2,
        )
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 5}
        )
        
        # Define the prompt template for the LLM
        # This template instructs the model to answer based on the provided context
        # and to include source citations.
        template = """
        You are a helpful assistant. Use the following context to answer the question at the end.
        If you don't know the answer, just say that you don't know, don't try to make up an answer.

        Context:
        {context}

        Question:
        {question}

        Instructions:
        1. Provide a detailed and accurate answer based ONLY on the provided context.
        2. When referencing information, mention which source and page it comes from.
        3. If the context doesn't contain enough information, say so clearly.
        4. Keep your answer concise but comprehensive.

        Answer:
        """
        self.prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )

    def format_documents_with_citations(self, documents: List) -> str:
        """
        Formats the retrieved documents into a single string, including metadata for citations.
        """
        formatted_text = []
        for i, doc in enumerate(documents, 1):
            content = doc.page_content
            source = doc.metadata.get("source", "unknown")
            page = doc.metadata.get("page", "unknown")
            formatted_text.append(f"Source {i}:\nFile: {source}\nPage: {page}\nContent:\n{content}\n")
        return "\n---\n".join(formatted_text)

    def get_source_info_with_scores(self, documents: List) -> str:
        """
        Gets source information with confidence scores for the retrieved documents.
        """
        source_info = []
        for i, doc in enumerate(documents, 1):
            source = doc.metadata.get("source", "unknown")
            page = doc.metadata.get("page", "unknown")

            # Calculate confidence score based on multiple factors:
            # 1. Retrieval order (higher for top results)
            # 2. Content length (longer content might be more relevant)
            # 3. Position in document (earlier pages might be more important)
            base_score = 1.0 - (i - 1) * 0.15  # Order factor
            length_factor = min(1.0, len(doc.page_content) / 1000)  # Length factor
            page_factor = max(0.8, 1.0 - (page - 1) * 0.05) if isinstance(page, int) else 1.0

            confidence_score = base_score * length_factor * page_factor
            confidence_score = max(0.1, min(1.0, confidence_score))  # Clamp between 0.1 and 1.0
            confidence_percent = int(confidence_score * 100)

            # Determine confidence level
            if confidence_percent >= 90:
                level = "Very High"
            elif confidence_percent >= 75:
                level = "High"
            elif confidence_percent >= 60:
                level = "Medium"
            elif confidence_percent >= 40:
                level = "Low"
            else:
                level = "Very Low"

            source_info.append(f"• **Source {i}**: {source}")
            source_info.append(f"  - **Page**: {page}")
            source_info.append(f"  - **Confidence**: {confidence_percent}% ({level})")
            source_info.append(f"  - **Content Preview**: {doc.page_content[:200]}...")

        return "\n".join(source_info)

    def answer_question(self, question: str) -> str:
        """
        Executes the RAG pipeline: retrieves documents and generates a response.
        """
        # Step 1: Retrieve relevant documents with scores
        retrieved_docs = self.retriever.get_relevant_documents(question)

        if not retrieved_docs:
            return "I am sorry, I could not find any relevant information in the documents to answer your question."

        # Step 2: Format the retrieved documents for the prompt
        formatted_context = self.format_documents_with_citations(retrieved_docs)

        # Step 3: Create the final prompt
        final_prompt = self.prompt.format(context=formatted_context, question=question)

        # Step 4: Call the LLM to generate the answer
        response = self.llm.invoke(final_prompt).content

        # Step 5: Add source information and confidence scores to the response
        source_info = self.get_source_info_with_scores(retrieved_docs)

        # Combine the response with source information
        full_response = f"{response}\n\n**Sources and Context:**\n{source_info}"

        return full_response