# Source: Hugging Face Space upload by mahmoudalrefaey (commit 2594ae1, verified)
"""
PDFPal - A lightweight, chat-based RAG application
Built with free, local models and deployable via Gradio
"""
import os
import tempfile
import gradio as gr
import time
from typing import List, Dict, Any
from pathlib import Path
# Import our custom modules
from modules.pdf_processor import PDFProcessor
from modules.embedding_manager import EmbeddingManager
from modules.llm_manager import LLMManager
from modules.rag_pipeline import RAGPipeline
from modules.chat_manager import ChatManager
from config import Config
class PDFPalApp:
    """Main PDFPal application using Gradio.

    Wires the PDF-processing, embedding, LLM and chat-history components into
    a single Gradio Blocks UI.  Heavy objects (the LLM manager and the RAG
    pipeline) are created lazily, when the user selects a model or processes
    documents, so startup stays fast.
    """

    def __init__(self):
        """Initialize application state and build the Gradio interface."""
        self.chat_manager = ChatManager()
        self.rag_pipeline = None      # built in _process_documents() once docs exist
        self.uploaded_files = []      # names of PDFs processed in the last batch
        self.current_model = Config.DEFAULT_LLM_MODEL
        # Core processing components
        self.pdf_processor = PDFProcessor()
        self.embedding_manager = EmbeddingManager()
        self.llm_manager = None       # created on model change / document processing
        # Build the UI last so the event handlers can see the attributes above.
        self.interface = self._create_interface()

    def _create_interface(self):
        """Create and return the Gradio Blocks interface.

        Returns:
            gr.Blocks: the assembled (not yet launched) interface.
        """
        # Custom CSS for better styling
        css = """
        .gradio-container {
            max-width: 1200px !important;
            margin: auto !important;
        }
        .chat-container {
            height: 600px;
            overflow-y: auto;
            border: 1px solid #e0e0e0;
            border-radius: 8px;
            padding: 20px;
            background: #fafafa;
        }
        .file-upload {
            border: 2px dashed #007bff;
            border-radius: 8px;
            padding: 20px;
            text-align: center;
            background: #f8f9fa;
        }
        """
        with gr.Blocks(css=css, title="PDFPal - AI Chatbot", theme=gr.themes.Soft()) as interface:
            # Header
            gr.Markdown("""
            # 📚 PDFPal - AI Chatbot
            **Chat with your PDF documents using local AI models!**
            Upload one or more PDF files and start asking questions in natural language.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    # Sidebar for configuration
                    gr.Markdown("### ⚙️ Configuration")
                    # Model selection
                    model_dropdown = gr.Dropdown(
                        choices=Config.get_model_names(),
                        value=Config.get_recommended_model(),
                        label="🤖 Language Model",
                        info="Choose a lightweight local model"
                    )
                    # Advanced settings
                    with gr.Accordion("🔧 Advanced Settings", open=False):
                        chunk_size = gr.Slider(
                            minimum=500, maximum=2000, value=800, step=100,
                            label="Chunk Size", info="Size of text chunks (smaller = faster)"
                        )
                        chunk_overlap = gr.Slider(
                            minimum=50, maximum=500, value=100, step=50,
                            label="Chunk Overlap", info="Overlap between chunks"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000, value=300, step=50,
                            label="Max Response Tokens", info="Maximum response length (smaller = faster)"
                        )
                        temperature = gr.Slider(
                            minimum=0.0, maximum=1.0, value=0.7, step=0.1,
                            label="Temperature", info="Creativity level"
                        )
                    # File upload section
                    gr.Markdown("### 📁 Upload Documents")
                    file_upload = gr.File(
                        file_count="multiple",
                        file_types=[".pdf"],
                        label="Choose PDF files"
                    )
                    process_btn = gr.Button("🔄 Process Documents", variant="primary")
                    process_status = gr.Textbox(label="Status", interactive=False)
                    # Model info
                    model_info = gr.JSON(label="Model Information", visible=False)
                with gr.Column(scale=2):
                    # Chat interface
                    gr.Markdown("### 💬 Chat Interface")
                    # Chat history display
                    chat_history = gr.Chatbot(
                        label="Conversation",
                        height=500,
                        show_label=False,
                        container=True,
                        bubble_full_width=False
                    )
                    # Chat input
                    with gr.Row():
                        chat_input = gr.Textbox(
                            placeholder="Ask a question about your documents...",
                            label="Your Question",
                            scale=4
                        )
                        send_btn = gr.Button("Send", variant="primary", scale=1)
                    # Clear chat button
                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
                    # Export options
                    with gr.Row():
                        export_json_btn = gr.Button("📄 Export JSON")
                        export_txt_btn = gr.Button("📝 Export Text")
                    # Statistics
                    stats_display = gr.JSON(label="Chat Statistics", visible=False)

            # Event handlers
            model_dropdown.change(
                fn=self._change_model,
                inputs=[model_dropdown],
                outputs=[model_info, process_status]
            )
            process_btn.click(
                fn=self._process_documents,
                inputs=[file_upload, chunk_size, chunk_overlap, model_dropdown],
                outputs=[process_status, model_info]
            )
            # Both the Send button and pressing Enter submit the same handler.
            send_btn.click(
                fn=self._send_message,
                inputs=[chat_input, max_tokens, temperature],
                outputs=[chat_history, chat_input, stats_display],
                show_progress=True
            )
            chat_input.submit(
                fn=self._send_message,
                inputs=[chat_input, max_tokens, temperature],
                outputs=[chat_history, chat_input, stats_display],
                show_progress=True
            )
            clear_btn.click(
                fn=self._clear_chat,
                outputs=[chat_history, stats_display]
            )
            # NOTE(review): these gr.File() outputs are created inline and never
            # rendered in the layout — confirm the target Gradio version actually
            # displays the exported file to the user.
            export_json_btn.click(
                fn=self._export_conversation_json,
                outputs=[gr.File()]
            )
            export_txt_btn.click(
                fn=self._export_conversation_text,
                outputs=[gr.File()]
            )
        return interface

    def _change_model(self, model_name):
        """Switch the active language model.

        Args:
            model_name: model identifier chosen in the dropdown.

        Returns:
            tuple: (model-info dict, status message string).
        """
        try:
            self.current_model = model_name
            self.llm_manager = LLMManager(model_name=model_name)
            model_info = self.llm_manager.get_model_info()
            return model_info, f"✅ Model changed to {model_name}"
        except Exception as e:
            # Surface the error in the status box rather than crashing the UI.
            return {}, f"❌ Error changing model: {str(e)}"

    def _read_uploaded_file(self, file):
        """Normalize a Gradio upload into (content_bytes, file_name).

        Gradio versions differ in what they hand back: a file-like object,
        a temp-file path string, or something bytes-coercible.
        """
        if hasattr(file, 'read'):
            # File-like object
            name = getattr(file, 'name', f'file_{len(self.uploaded_files)}.pdf')
            return file.read(), name
        if isinstance(file, str):
            # File path string
            with open(file, 'rb') as f:
                return f.read(), os.path.basename(file)
        # Last resort: coerce the object to bytes.
        content = bytes(file) if hasattr(file, '__bytes__') else str(file).encode()
        return content, f'file_{len(self.uploaded_files)}.pdf'

    def _process_documents(self, files, chunk_size, chunk_overlap, model_name):
        """Process uploaded PDFs into a knowledge base and build the RAG pipeline.

        Args:
            files: list of Gradio upload objects (or None).
            chunk_size: text-chunk size for the PDF processor.
            chunk_overlap: overlap between consecutive chunks.
            model_name: LLM to initialize for answering.

        Returns:
            tuple: (status message string, model-info dict).
        """
        if not files:
            return "⚠️ Please upload PDF files first", {}
        try:
            # Rebuild the processor so the current slider settings take effect.
            self.pdf_processor = PDFProcessor(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            # Initialize LLM manager
            self.llm_manager = LLMManager(model_name=model_name)
            # Process all files
            all_chunks = []
            self.uploaded_files = []
            for file in files:
                file_content, file_name = self._read_uploaded_file(file)
                # The PDF processor works on paths, so stage the bytes in a
                # temporary file and always remove it afterwards.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                    tmp_file.write(file_content)
                    tmp_path = tmp_file.name
                try:
                    chunks = self.pdf_processor.process_pdf(tmp_path)
                    all_chunks.extend(chunks)
                    self.uploaded_files.append(file_name)
                finally:
                    # Clean up temporary file
                    os.unlink(tmp_path)
            if all_chunks:
                # Create knowledge base
                knowledge_base = self.embedding_manager.create_knowledge_base(all_chunks)
                # Initialize RAG pipeline
                self.rag_pipeline = RAGPipeline(
                    knowledge_base=knowledge_base,
                    llm_manager=self.llm_manager
                )
                model_info = self.llm_manager.get_model_info()
                status = f"✅ Processed {len(all_chunks)} text chunks from {len(files)} file(s)"
                return status, model_info
            else:
                return "❌ No text could be extracted from the uploaded files", {}
        except Exception as e:
            return f"❌ Error processing files: {str(e)}", {}

    def _send_message(self, message, max_tokens, temperature):
        """Handle one chat turn: record the user message, answer via RAG.

        Args:
            message: user question text.
            max_tokens: response-length cap passed to the pipeline.
            temperature: sampling temperature passed to the pipeline.

        Returns:
            tuple: (chat history for gr.Chatbot, cleared input box, stats dict).
        """
        start_time = time.time()
        # Ignore blank submissions without touching the history.
        if not message.strip():
            return self.chat_manager.get_gradio_chat_history(), "", {}
        # No documents processed yet: answer with a hint instead of failing.
        if not self.rag_pipeline:
            self.chat_manager.add_message("user", message)
            error_msg = "⚠️ Please upload and process documents first!"
            self.chat_manager.add_message("assistant", error_msg)
            return self.chat_manager.get_gradio_chat_history(), "", self.chat_manager.get_statistics()
        try:
            # Add user message
            self.chat_manager.add_message("user", message)
            # Get AI response with timing
            response_start = time.time()
            response = self.rag_pipeline.get_response(
                message,
                max_tokens=max_tokens,
                temperature=temperature
            )
            response_time = time.time() - response_start
            # Add AI response
            self.chat_manager.add_message("assistant", response)
            # Attach timing information to the statistics panel.
            total_time = time.time() - start_time
            stats = self.chat_manager.get_statistics()
            stats.update({
                "response_time_seconds": round(response_time, 2),
                "total_time_seconds": round(total_time, 2),
                "performance_note": f"Response generated in {round(response_time, 2)}s"
            })
            return self.chat_manager.get_gradio_chat_history(), "", stats
        except Exception as e:
            # Show the failure inline in the conversation.
            error_msg = f"❌ Error: {str(e)}"
            self.chat_manager.add_message("assistant", error_msg)
            return self.chat_manager.get_gradio_chat_history(), "", self.chat_manager.get_statistics()

    def _clear_chat(self):
        """Clear chat history and reset the statistics panel."""
        self.chat_manager.clear_history()
        return [], {}

    def _export_conversation_json(self):
        """Export the conversation as JSON; return the file path or None."""
        try:
            # Close the temp-file handle before the chat manager writes to the
            # path: leaving it open leaked the handle and blocks the write on
            # platforms that lock open files (e.g. Windows).
            with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp_file:
                path = tmp_file.name
            self.chat_manager.save_conversation(path)
            return path
        except Exception:
            # Best-effort export: Gradio treats None as "no file produced".
            return None

    def _export_conversation_text(self):
        """Export the conversation as plain text; return the file path or None."""
        try:
            # Same handle-lifetime fix as the JSON export above.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as tmp_file:
                path = tmp_file.name
            self.chat_manager.export_conversation_text(path)
            return path
        except Exception:
            return None

    def launch(self, **kwargs):
        """Launch the Gradio interface, forwarding kwargs to Blocks.launch()."""
        return self.interface.launch(**kwargs)
def main():
    """Entry point: build the app and serve it on all interfaces, port 7860."""
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    PDFPalApp().launch(**launch_options)


if __name__ == "__main__":
    main()