# rag-pdf-chatbot / app.py
# (Hugging Face Space page artifacts, commented out so the file parses:)
# Ansnaeem's picture
# Update app.py
# a3382b8 verified
import gradio as gr
import PyPDF2
import io
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple, Dict
import os
from groq import Groq
import json
# Initialize Groq client (will use API key from environment variable)
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # No key: keep the app importable; generate_answer() checks `client`
    # and reports the missing key to the user at question time.
    print("Warning: GROQ_API_KEY not found in environment variables. Please set it to use the chatbot.")
    client = None
else:
    client = Groq(api_key=groq_api_key)

# Initialize sentence transformer model, used for both document chunks
# and queries (embeddings must come from the same model to be comparable).
print("Loading sentence transformer model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded!")

# Global variables to store documents and embeddings.
# Rebuilt by process_pdfs(), read by retrieve_relevant_chunks(),
# cleared by clear_all(). Single-process, in-memory state only.
documents_store = []   # chunk texts (list of str)
embeddings_store = []  # chunk embeddings (array after process_pdfs)
metadata_store = []  # Store filename and page number for each chunk
def extract_text_from_pdf(pdf_file) -> List[Tuple[str, str, int]]:
    """
    Extract text from a PDF file, one entry per non-empty page.

    Args:
        pdf_file: A file path or binary file-like object readable by
            PyPDF2.PdfReader. If it has a ``.name`` attribute, that is
            used as the filename in the returned metadata.

    Returns:
        List of (page_text, filename, page_number) tuples with 1-based
        page numbers; an empty list if extraction fails entirely.
    """
    text_chunks = []
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        filename = pdf_file.name if hasattr(pdf_file, 'name') else "uploaded_file.pdf"
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            # BUG FIX: extract_text() may return None (e.g. image-only
            # pages); normalize to "" so .strip() below cannot raise.
            text = page.extract_text() or ""
            if text.strip():  # Only add non-empty pages
                text_chunks.append((text, filename, page_num))
        return text_chunks
    except Exception as e:
        # Best-effort: a corrupt or unreadable PDF yields an empty result
        # instead of crashing the whole upload.
        print(f"Error extracting text from PDF: {e}")
        return []
def split_into_chunks(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping word-based chunks.

    Args:
        text: Source text; tokenized by whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty for blank input).

    Raises:
        ValueError: If ``overlap >= chunk_size`` — the window could never
            advance (previously this surfaced as a cryptic
            "range() arg 3 must not be zero" or a silent empty result).
    """
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")
    words = text.split()
    chunks = []
    step = chunk_size - overlap  # loop-invariant stride, hoisted
    for i in range(0, len(words), step):
        chunk = ' '.join(words[i:i + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
    return chunks
def process_pdfs(pdf_files) -> Tuple[str, str]:
    """
    Process uploaded PDF files and build the in-memory embedding index.

    Resets the module-level stores, extracts per-page text from each
    uploaded PDF, splits it into overlapping chunks, and encodes every
    chunk with the sentence-transformer model in one batched call.

    Args:
        pdf_files: Sequence of uploaded file objects, or None.

    Returns:
        (status_message, preview_text) strings for the UI.
    """
    global documents_store, embeddings_store, metadata_store
    if pdf_files is None or len(pdf_files) == 0:
        return "No files uploaded.", ""
    documents_store = []
    embeddings_store = []
    metadata_store = []
    all_text_chunks = []
    preview_text = "=== PDF PREVIEW ===\n\n"
    for pdf_file in pdf_files:
        extracted_chunks = extract_text_from_pdf(pdf_file)
        if not extracted_chunks:
            continue
        filename = extracted_chunks[0][1]
        # BUG FIX: this previously printed the literal "(unknown)" instead
        # of the actual filename (leaving `filename` unused).
        preview_text += f"πŸ“„ File: {filename}\n"
        preview_text += f" Pages: {len(extracted_chunks)}\n"
        # First-page preview, truncated to 500 characters
        first_page_text = extracted_chunks[0][0][:500]
        preview_text += f" Preview (Page 1): {first_page_text}...\n\n"
        # Split each page into smaller chunks, keeping per-page metadata
        for page_text, file_name, page_num in extracted_chunks:
            for chunk in split_into_chunks(page_text):
                all_text_chunks.append((chunk, file_name, page_num))
    if not all_text_chunks:
        return "No text could be extracted from the PDFs.", preview_text
    # Create embeddings
    print(f"Creating embeddings for {len(all_text_chunks)} chunks...")
    texts = [chunk[0] for chunk in all_text_chunks]
    embeddings = embedding_model.encode(texts, show_progress_bar=True)
    documents_store = texts
    embeddings_store = embeddings
    metadata_store = [(chunk[1], chunk[2]) for chunk in all_text_chunks]
    # Generate summary
    total_chunks = len(all_text_chunks)
    unique_files = len(set(chunk[1] for chunk in all_text_chunks))
    preview_text += f"\n=== SUMMARY ===\n"
    preview_text += f"Total documents processed: {unique_files}\n"
    preview_text += f"Total text chunks created: {total_chunks}\n"
    preview_text += f"Ready for questions!\n"
    return f"βœ… Successfully processed {unique_files} PDF file(s) with {total_chunks} chunks!", preview_text
def retrieve_relevant_chunks(query: str, top_k: int = 3) -> List[Tuple[str, str, int, float]]:
    """
    Find the top_k indexed chunks most similar to the query.

    Uses cosine similarity between the query embedding and every stored
    chunk embedding.

    Args:
        query: Natural-language question.
        top_k: Maximum number of chunks to return.

    Returns:
        List of (chunk_text, filename, page_num, similarity_score),
        best match first; empty when no documents are indexed.
    """
    if not documents_store:
        return []
    # Embed the query with the same model used for the documents
    query_vec = embedding_model.encode([query])[0]
    # Cosine similarity of the query against every stored embedding
    doc_norms = np.linalg.norm(embeddings_store, axis=1)
    scores = np.dot(embeddings_store, query_vec) / (doc_norms * np.linalg.norm(query_vec))
    # Indices of the top_k highest scores, best first
    best_indices = np.argsort(scores)[::-1][:top_k]
    return [
        (documents_store[idx], metadata_store[idx][0], metadata_store[idx][1], float(scores[idx]))
        for idx in best_indices
    ]
def convert_history_to_gradio_format(history):
    """Convert chat history into the Gradio 6 messages format.

    Each (user_msg, assistant_msg) tuple becomes a pair of role/content
    dicts; entries already in dict form pass through unchanged.
    """
    if not history:
        return []
    converted = []
    for entry in history:
        if isinstance(entry, dict):
            # Already a role/content message
            converted.append(entry)
        elif isinstance(entry, tuple) and len(entry) == 2:
            user_msg, assistant_msg = entry
            converted.append({"role": "user", "content": user_msg})
            converted.append({"role": "assistant", "content": assistant_msg})
    return converted
def convert_history_from_gradio_format(history):
    """Convert Gradio 6 messages-format history to (user, assistant) tuples.

    A user message immediately followed by an assistant message becomes
    one tuple; tuple entries pass through as-is; unpaired dict messages
    are dropped.
    """
    if not history:
        return []
    pairs = []
    idx = 0
    total = len(history)
    while idx < total:
        entry = history[idx]
        if (
            isinstance(entry, dict)
            and entry.get("role") == "user"
            and idx + 1 < total
            and history[idx + 1].get("role") == "assistant"
        ):
            pairs.append((entry["content"], history[idx + 1]["content"]))
            idx += 2
        else:
            if isinstance(entry, tuple):
                pairs.append(entry)
            idx += 1
    return pairs
def generate_answer(question: str, history: List) -> Tuple[str, List]:
    """
    Answer a question with the Groq LLM using retrieved PDF context (RAG).

    Args:
        question: The user's question.
        history: Chat history in Gradio 6 messages format (legacy tuple
            entries are tolerated).

    Returns:
        ("", updated_history): the empty string clears the input textbox;
        updated_history is in Gradio 6 messages format. Failures are
        reported as assistant messages rather than raised.
    """
    # Convert history from Gradio 6 format to internal tuple format
    internal_history = convert_history_from_gradio_format(history) if history else []
    if client is None:
        error_msg = "Error: GROQ_API_KEY not configured. Please set it as an environment variable or in Hugging Face Space secrets."
        internal_history.append((question, error_msg))
        return "", convert_history_to_gradio_format(internal_history)
    if len(documents_store) == 0:
        error_msg = "Please upload PDF files first!"
        internal_history.append((question, error_msg))
        return "", convert_history_to_gradio_format(internal_history)
    if not question.strip():
        return "", convert_history_to_gradio_format(internal_history)
    # Retrieve relevant chunks
    relevant_chunks = retrieve_relevant_chunks(question, top_k=3)
    if not relevant_chunks:
        error_msg = "No relevant context found in the documents."
        internal_history.append((question, error_msg))
        return "", convert_history_to_gradio_format(internal_history)
    # Build context with source references.
    # BUG FIX: both strings below previously printed the literal
    # "(unknown)" instead of the chunk's source filename.
    context_parts = []
    sources = []
    for i, (chunk, filename, page_num, score) in enumerate(relevant_chunks, 1):
        context_parts.append(f"[Source {i} - {filename}, Page {page_num}]\n{chunk}")
        sources.append(f"Source {i}: {filename}, Page {page_num} (similarity: {score:.3f})")
    context = "\n\n".join(context_parts)
    # Create prompt for Groq
    prompt = f"""You are a helpful assistant that answers questions based on the provided context from PDF documents.
Context from documents:
{context}
Question: {question}
Please provide a clear and accurate answer based on the context above. If the context doesn't contain enough information to answer the question, say so. At the end of your answer, mention the source references.
Answer:"""
    try:
        # Call Groq API
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model="llama-3.1-8b-instant",
            temperature=0.7,
            max_tokens=1024
        )
        answer = chat_completion.choices[0].message.content
        # Append sources to answer
        answer += "\n\nπŸ“š Sources:\n" + "\n".join(sources)
        # Update history
        internal_history.append((question, answer))
        return "", convert_history_to_gradio_format(internal_history)
    except Exception as e:
        # Report API failures in-chat instead of raising into Gradio
        error_msg = f"Error generating answer: {str(e)}"
        internal_history.append((question, error_msg))
        return "", convert_history_to_gradio_format(internal_history)
def clear_all():
    """Reset the in-memory document index and clear the UI outputs.

    Returns empty values for the status box, preview box, and chatbot.
    """
    global documents_store, embeddings_store, metadata_store
    documents_store, embeddings_store, metadata_store = [], [], []
    return "", "", []
# Create Gradio interface: left column uploads/previews PDFs, right column
# hosts the chat; handlers below wire the buttons to the functions above.
with gr.Blocks() as demo:
    gr.Markdown("""
# πŸ“š RAG-Based Chatbot with PDF Support
Upload multiple PDF files, and ask questions based on their content!
**Features:**
- πŸ“„ Upload multiple PDF files
- πŸ‘οΈ Preview PDF content before asking questions
- πŸ” Semantic search using sentence transformers
- πŸ’¬ Chat with your documents using Groq LLM
- πŸ“– Source references with page numbers
""")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“€ Upload PDFs")
            pdf_upload = gr.File(
                file_count="multiple",
                file_types=[".pdf"],
                label="Upload PDF Files"
            )
            process_btn = gr.Button("Process PDFs", variant="primary")
            status = gr.Textbox(label="Status", interactive=False)
            gr.Markdown("### πŸ‘οΈ PDF Preview & Summary")
            preview = gr.Textbox(
                label="Preview",
                lines=15,
                interactive=False,
                placeholder="PDF preview and summary will appear here after processing..."
            )
        with gr.Column(scale=1):
            gr.Markdown("### πŸ’¬ Chat with Your Documents")
            # NOTE(review): generate_answer() feeds this component
            # messages-format dicts, but no type="messages" is passed here —
            # confirm the installed Gradio version defaults to that format.
            chatbot = gr.Chatbot(
                label="Chat",
                height=400
            )
            question_input = gr.Textbox(
                label="Ask a question",
                placeholder="Type your question here...",
                lines=2
            )
            with gr.Row():
                submit_btn = gr.Button("Submit", variant="primary")
                clear_btn = gr.Button("Clear Chat")
                clear_all_btn = gr.Button("Clear All", variant="stop")
    # Event handlers
    # Build/rebuild the in-memory index from the uploaded files
    process_btn.click(
        fn=process_pdfs,
        inputs=[pdf_upload],
        outputs=[status, preview]
    )
    # Submit via button or Enter in the textbox; generate_answer returns
    # ("", history) so the input box is cleared on each turn
    submit_btn.click(
        fn=generate_answer,
        inputs=[question_input, chatbot],
        outputs=[question_input, chatbot]
    )
    question_input.submit(
        fn=generate_answer,
        inputs=[question_input, chatbot],
        outputs=[question_input, chatbot]
    )
    # "Clear Chat" wipes only the transcript and input box
    clear_btn.click(
        fn=lambda: ("", []),
        outputs=[question_input, chatbot]
    )
    # "Clear All" also drops the document index (see clear_all)
    clear_all_btn.click(
        fn=clear_all,
        outputs=[status, preview, chatbot]
    )
if __name__ == "__main__":
    # BUG FIX: `theme` is not a parameter of Blocks.launch() — it belongs to
    # the gr.Blocks(...) constructor — and passing it here raises TypeError
    # on current Gradio releases, so it is dropped from the launch call.
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (container/Spaces)
        server_port=7860        # Hugging Face Spaces' expected port
    )