# DocuMind — src/pipeline.py
# Author: MOHITRAJDEO12345
# Commit b3f1583: "Fresh start: Clean repository without binary files"
# (header reconstructed from GitHub page chrome so this file parses as Python)
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from typing import List
class RAGPipeline:
def __init__(self, vector_store: Chroma, api_key: str):
self.vector_store = vector_store
self.llm = ChatGoogleGenerativeAI(
model="gemini-2.0-flash",
google_api_key=api_key,
temperature=0.2,
)
self.retriever = self.vector_store.as_retriever(
search_type="similarity",
search_kwargs={"k": 5}
)
# Define the prompt template for the LLM
# This template instructs the model to answer based on the provided context
# and to include source citations.
template = """
You are a helpful assistant. Use the following context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context:
{context}
Question:
{question}
Instructions:
1. Provide a detailed and accurate answer based ONLY on the provided context.
2. When referencing information, mention which source and page it comes from.
3. If the context doesn't contain enough information, say so clearly.
4. Keep your answer concise but comprehensive.
Answer:
"""
self.prompt = PromptTemplate(
template=template,
input_variables=["context", "question"]
)
def format_documents_with_citations(self, documents: List) -> str:
"""
Formats the retrieved documents into a single string, including metadata for citations.
"""
formatted_text = []
for i, doc in enumerate(documents, 1):
content = doc.page_content
source = doc.metadata.get("source", "unknown")
page = doc.metadata.get("page", "unknown")
formatted_text.append(f"Source {i}:\nFile: {source}\nPage: {page}\nContent:\n{content}\n")
return "\n---\n".join(formatted_text)
def get_source_info_with_scores(self, documents: List) -> str:
"""
Gets source information with confidence scores for the retrieved documents.
"""
source_info = []
for i, doc in enumerate(documents, 1):
source = doc.metadata.get("source", "unknown")
page = doc.metadata.get("page", "unknown")
# Calculate confidence score based on multiple factors:
# 1. Retrieval order (higher for top results)
# 2. Content length (longer content might be more relevant)
# 3. Position in document (earlier pages might be more important)
base_score = 1.0 - (i - 1) * 0.15 # Order factor
length_factor = min(1.0, len(doc.page_content) / 1000) # Length factor
page_factor = max(0.8, 1.0 - (page - 1) * 0.05) if isinstance(page, int) else 1.0
confidence_score = base_score * length_factor * page_factor
confidence_score = max(0.1, min(1.0, confidence_score)) # Clamp between 0.1 and 1.0
confidence_percent = int(confidence_score * 100)
# Determine confidence level
if confidence_percent >= 90:
level = "Very High"
elif confidence_percent >= 75:
level = "High"
elif confidence_percent >= 60:
level = "Medium"
elif confidence_percent >= 40:
level = "Low"
else:
level = "Very Low"
source_info.append(f"• **Source {i}**: {source}")
source_info.append(f" - **Page**: {page}")
source_info.append(f" - **Confidence**: {confidence_percent}% ({level})")
source_info.append(f" - **Content Preview**: {doc.page_content[:200]}...")
return "\n".join(source_info)
def answer_question(self, question: str) -> str:
"""
Executes the RAG pipeline: retrieves documents and generates a response.
"""
# Step 1: Retrieve relevant documents with scores
retrieved_docs = self.retriever.get_relevant_documents(question)
if not retrieved_docs:
return "I am sorry, I could not find any relevant information in the documents to answer your question."
# Step 2: Format the retrieved documents for the prompt
formatted_context = self.format_documents_with_citations(retrieved_docs)
# Step 3: Create the final prompt
final_prompt = self.prompt.format(context=formatted_context, question=question)
# Step 4: Call the LLM to generate the answer
response = self.llm.invoke(final_prompt).content
# Step 5: Add source information and confidence scores to the response
source_info = self.get_source_info_with_scores(retrieved_docs)
# Combine the response with source information
full_response = f"{response}\n\n**Sources and Context:**\n{source_info}"
return full_response