# DocuMind — src/pipeline.py
# Author: MOHITRAJDEO12345
# Commit b3f1583: "Fresh start: Clean repository without binary files"
# (header reconstructed from GitHub page chrome so this file parses as Python)
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from typing import List
class RAGPipeline:
def __init__(self, vector_store: Chroma, api_key: str):
self.vector_store = vector_store
self.llm = ChatGoogleGenerativeAI(
model="gemini-2.0-flash",
google_api_key=api_key,
temperature=0.2,
)
self.retriever = self.vector_store.as_retriever(
search_type="similarity",
search_kwargs={"k": 5}
)
# Define the prompt template for the LLM
# This template instructs the model to answer based on the provided context
# and to include source citations.
template = """
You are a helpful assistant. Use the following context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context:
{context}
Question:
{question}
Instructions:
1. Provide a detailed and accurate answer based ONLY on the provided context.
2. When referencing information, mention which source and page it comes from.
3. If the context doesn't contain enough information, say so clearly.
4. Keep your answer concise but comprehensive.
Answer:
"""
self.prompt = PromptTemplate(
template=template,
input_variables=["context", "question"]
)
def format_documents_with_citations(self, documents: List) -> str:
"""
Formats the retrieved documents into a single string, including metadata for citations.
"""
formatted_text = []
for i, doc in enumerate(documents, 1):
content = doc.page_content
source = doc.metadata.get("source", "unknown")
page = doc.metadata.get("page", "unknown")
formatted_text.append(f"Source {i}:\nFile: {source}\nPage: {page}\nContent:\n{content}\n")
return "\n---\n".join(formatted_text)
def get_source_info_with_scores(self, documents: List) -> str:
"""
Gets source information with confidence scores for the retrieved documents.
"""
source_info = []
for i, doc in enumerate(documents, 1):
source = doc.metadata.get("source", "unknown")
page = doc.metadata.get("page", "unknown")
# Calculate confidence score based on multiple factors:
# 1. Retrieval order (higher for top results)
# 2. Content length (longer content might be more relevant)
# 3. Position in document (earlier pages might be more important)
base_score = 1.0 - (i - 1) * 0.15 # Order factor
length_factor = min(1.0, len(doc.page_content) / 1000) # Length factor
page_factor = max(0.8, 1.0 - (page - 1) * 0.05) if isinstance(page, int) else 1.0
confidence_score = base_score * length_factor * page_factor
confidence_score = max(0.1, min(1.0, confidence_score)) # Clamp between 0.1 and 1.0
confidence_percent = int(confidence_score * 100)
# Determine confidence level
if confidence_percent >= 90:
level = "Very High"
elif confidence_percent >= 75:
level = "High"
elif confidence_percent >= 60:
level = "Medium"
elif confidence_percent >= 40:
level = "Low"
else:
level = "Very Low"
source_info.append(f"• **Source {i}**: {source}")
source_info.append(f" - **Page**: {page}")
source_info.append(f" - **Confidence**: {confidence_percent}% ({level})")
source_info.append(f" - **Content Preview**: {doc.page_content[:200]}...")
return "\n".join(source_info)
def answer_question(self, question: str) -> str:
"""
Executes the RAG pipeline: retrieves documents and generates a response.
"""
# Step 1: Retrieve relevant documents with scores
retrieved_docs = self.retriever.get_relevant_documents(question)
if not retrieved_docs:
return "I am sorry, I could not find any relevant information in the documents to answer your question."
# Step 2: Format the retrieved documents for the prompt
formatted_context = self.format_documents_with_citations(retrieved_docs)
# Step 3: Create the final prompt
final_prompt = self.prompt.format(context=formatted_context, question=question)
# Step 4: Call the LLM to generate the answer
response = self.llm.invoke(final_prompt).content
# Step 5: Add source information and confidence scores to the response
source_info = self.get_source_info_with_scores(retrieved_docs)
# Combine the response with source information
full_response = f"{response}\n\n**Sources and Context:**\n{source_info}"
return full_response