""" PDF Processor Module Handles PDF text extraction and chunking for RAG pipeline """ import logging from typing import List, Optional from PyPDF2 import PdfReader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.schema import Document class PDFProcessor: """Handles PDF processing, text extraction, and chunking""" def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200): """ Initialize PDF processor Args: chunk_size: Size of text chunks chunk_overlap: Overlap between chunks """ self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, separators=["\n\n", "\n", " ", ""] ) # Configure logging logging.basicConfig(level=logging.INFO) self.logger = logging.getLogger(__name__) def extract_text_from_pdf(self, pdf_path: str) -> str: """ Extract text from PDF file Args: pdf_path: Path to PDF file Returns: Extracted text as string """ try: self.logger.info(f"Extracting text from: {pdf_path}") with open(pdf_path, 'rb') as file: pdf_reader = PdfReader(file) text = "" for page_num, page in enumerate(pdf_reader.pages): try: page_text = page.extract_text() if page_text: text += f"\n--- Page {page_num + 1} ---\n{page_text}\n" except Exception as e: self.logger.warning(f"Error extracting text from page {page_num + 1}: {e}") continue self.logger.info(f"Extracted {len(text)} characters from PDF") return text except Exception as e: self.logger.error(f"Error reading PDF file {pdf_path}: {e}") raise def split_text_into_chunks(self, text: str) -> List[Document]: """ Split text into chunks using LangChain text splitter Args: text: Text to split Returns: List of Document objects """ try: self.logger.info("Splitting text into chunks") # Create a single document first documents = [Document(page_content=text, metadata={"source": "pdf"})] # Split into chunks chunks = self.text_splitter.split_documents(documents) self.logger.info(f"Created {len(chunks)} text chunks") return chunks except Exception as e: self.logger.error(f"Error splitting text: {e}") raise def process_pdf(self, pdf_path: str) -> List[Document]: """ Complete PDF processing pipeline Args: pdf_path: Path to PDF file Returns: List of Document chunks """ try: # Extract text text = self.extract_text_from_pdf(pdf_path) if not text.strip(): self.logger.warning("No text extracted from PDF") return [] # Split into chunks chunks = self.split_text_into_chunks(text) # Add metadata for chunk in chunks: chunk.metadata["source"] = pdf_path chunk.metadata["chunk_size"] = len(chunk.page_content) return chunks except Exception as e: self.logger.error(f"Error processing PDF {pdf_path}: {e}") raise def get_chunk_stats(self, chunks: List[Document]) -> dict: """ Get statistics about the chunks Args: chunks: List of Document chunks Returns: Dictionary with chunk statistics """ if not chunks: return {"total_chunks": 0, "avg_chunk_size": 0, "total_characters": 0} total_chars = sum(len(chunk.page_content) for chunk in chunks) avg_size = total_chars / len(chunks) return { "total_chunks": len(chunks), "avg_chunk_size": round(avg_size, 2), "total_characters": total_chars }