Spaces:

mahmoudalrefaey
/

PDFPal-PDF-chatbot

Sleeping

App Files Files Community

PDFPal-PDF-chatbot / modules /pdf_processor.py

mahmoudalrefaey

Upload 6 files

b35e487 verified 7 months ago

raw

history blame contribute delete

4.85 kB

	"""
	PDF Processor Module
	Handles PDF text extraction and chunking for RAG pipeline
	"""

	import logging
	from typing import List, Optional
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.schema import Document

	class PDFProcessor:
	    """Extract text from PDF files and split it into chunks for a RAG pipeline.

	    Text is read page by page with PyPDF2's ``PdfReader`` and split with
	    LangChain's ``RecursiveCharacterTextSplitter``.
	    """

	    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
	        """
	        Initialize PDF processor

	        Args:
	            chunk_size: Size of text chunks, measured with ``len``
	            chunk_overlap: Overlap between chunks, measured with ``len``
	        """
	        self.chunk_size = chunk_size
	        self.chunk_overlap = chunk_overlap
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap,
	            length_function=len,
	            # Tried in order: paragraphs, lines, words, then single characters.
	            separators=["\n\n", "\n", " ", ""],
	        )

	        # Configure logging. basicConfig does nothing when the root logger
	        # already has handlers, so repeated instantiation is harmless.
	        logging.basicConfig(level=logging.INFO)
	        self.logger = logging.getLogger(__name__)

	    @staticmethod
	    def _format_page(page_number: int, page_text: Optional[str]) -> str:
	        """Return one page's text under a page marker, or "" for a blank page.

	        Args:
	            page_number: 1-based page number shown in the marker
	            page_text: Text extracted from the page (may be None or empty)

	        Returns:
	            "\\n--- Page N ---\\n<text>\\n", or "" when the page has no
	            non-whitespace text
	        """
	        # Whitespace-only pages are skipped: emitting just a marker for them
	        # would make a text-free PDF look non-empty to process_pdf.
	        if not page_text or not page_text.strip():
	            return ""
	        return f"\n--- Page {page_number} ---\n{page_text}\n"

	    def extract_text_from_pdf(self, pdf_path: str) -> str:
	        """
	        Extract text from PDF file

	        Pages whose extraction raises are logged as warnings and skipped;
	        pages without any non-whitespace text are skipped silently.

	        Args:
	            pdf_path: Path to PDF file

	        Returns:
	            Extracted text as string, each page preceded by a
	            "--- Page N ---" marker; "" when no page yields text

	        Raises:
	            Exception: Any error raised while opening or reading the file is
	                logged and re-raised unchanged
	        """
	        try:
	            self.logger.info("Extracting text from: %s", pdf_path)

	            sections = []
	            with open(pdf_path, 'rb') as file:
	                pdf_reader = PdfReader(file)

	                for page_num, page in enumerate(pdf_reader.pages, start=1):
	                    try:
	                        page_text = page.extract_text()
	                    except Exception as e:
	                        # Best effort: one unreadable page must not abort
	                        # the whole document.
	                        self.logger.warning(
	                            "Error extracting text from page %s: %s", page_num, e
	                        )
	                        continue

	                    section = self._format_page(page_num, page_text)
	                    if section:
	                        sections.append(section)

	            # Join once instead of growing a string inside the loop.
	            text = "".join(sections)
	            self.logger.info("Extracted %s characters from PDF", len(text))
	            return text

	        except Exception as e:
	            self.logger.error("Error reading PDF file %s: %s", pdf_path, e)
	            raise

	    def split_text_into_chunks(self, text: str) -> "List[Document]":
	        """
	        Split text into chunks using LangChain text splitter

	        Args:
	            text: Text to split

	        Returns:
	            List of Document objects, each carrying metadata {"source": "pdf"}

	        Raises:
	            Exception: Any error raised by the splitter is logged and
	                re-raised unchanged
	        """
	        try:
	            self.logger.info("Splitting text into chunks")

	            # Wrap the text in a single Document so the splitter copies its
	            # metadata onto every chunk.
	            documents = [Document(page_content=text, metadata={"source": "pdf"})]

	            # Split into chunks
	            chunks = self.text_splitter.split_documents(documents)

	            self.logger.info("Created %s text chunks", len(chunks))
	            return chunks

	        except Exception as e:
	            self.logger.error("Error splitting text: %s", e)
	            raise

	    def process_pdf(self, pdf_path: str) -> "List[Document]":
	        """
	        Complete PDF processing pipeline

	        Extracts the text, splits it into chunks, and sets two metadata keys
	        on each chunk: "source" (the PDF path) and "chunk_size" (the length
	        of that chunk's text).

	        Args:
	            pdf_path: Path to PDF file

	        Returns:
	            List of Document chunks; empty list when no text was extracted

	        Raises:
	            Exception: Any error from extraction or splitting is logged and
	                re-raised unchanged
	        """
	        try:
	            # Extract text
	            text = self.extract_text_from_pdf(pdf_path)

	            if not text.strip():
	                self.logger.warning("No text extracted from PDF")
	                return []

	            # Split into chunks
	            chunks = self.split_text_into_chunks(text)

	            # Add metadata
	            for chunk in chunks:
	                chunk.metadata["source"] = pdf_path
	                chunk.metadata["chunk_size"] = len(chunk.page_content)

	            return chunks

	        except Exception as e:
	            self.logger.error("Error processing PDF %s: %s", pdf_path, e)
	            raise

	    def get_chunk_stats(self, chunks: "List[Document]") -> dict:
	        """
	        Get statistics about the chunks

	        Args:
	            chunks: List of Document chunks

	        Returns:
	            Dictionary with keys "total_chunks", "avg_chunk_size" (rounded
	            to 2 decimals) and "total_characters"; all zero for empty input
	        """
	        if not chunks:
	            return {"total_chunks": 0, "avg_chunk_size": 0, "total_characters": 0}

	        total_chars = sum(len(chunk.page_content) for chunk in chunks)
	        avg_size = total_chars / len(chunks)

	        return {
	            "total_chunks": len(chunks),
	            "avg_chunk_size": round(avg_size, 2),
	            "total_characters": total_chars,
	        }