"""
PDF Processor Module
Handles PDF text extraction and chunking for RAG pipeline
"""
import logging
from typing import List, Optional
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
class PDFProcessor:
    """Handles PDF processing, text extraction, and chunking."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize PDF processor.

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        # NOTE(review): basicConfig mutates the root logger -- a side effect
        # library code normally avoids; kept here for backward compatibility.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a PDF file, page by page.

        Each page's text is prefixed with a "--- Page N ---" marker.
        Pages that fail to extract are skipped with a warning.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Extracted text as a single string (empty if no page yielded text)

        Raises:
            Exception: re-raised after logging if the file cannot be
                opened or parsed as a PDF.
        """
        try:
            self.logger.info("Extracting text from: %s", pdf_path)
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                # Accumulate per-page fragments and join once at the end --
                # avoids the quadratic cost of repeated `text +=` in a loop.
                parts: List[str] = []
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text:
                            parts.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
                    except Exception as e:
                        # Best-effort: a single bad page must not abort the
                        # whole document.
                        self.logger.warning(
                            "Error extracting text from page %d: %s", page_num + 1, e
                        )
                        continue
                text = "".join(parts)
                self.logger.info("Extracted %d characters from PDF", len(text))
                return text
        except Exception as e:
            self.logger.error("Error reading PDF file %s: %s", pdf_path, e)
            raise

    def split_text_into_chunks(self, text: str) -> List[Document]:
        """
        Split text into chunks using the LangChain text splitter.

        Args:
            text: Text to split

        Returns:
            List of Document objects, each carrying a chunk of the input

        Raises:
            Exception: re-raised after logging if the splitter fails.
        """
        try:
            self.logger.info("Splitting text into chunks")
            # Wrap the raw text in a single Document so the splitter can
            # propagate metadata onto every chunk it produces.
            documents = [Document(page_content=text, metadata={"source": "pdf"})]
            chunks = self.text_splitter.split_documents(documents)
            self.logger.info("Created %d text chunks", len(chunks))
            return chunks
        except Exception as e:
            self.logger.error("Error splitting text: %s", e)
            raise

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """
        Complete PDF processing pipeline: extract, split, annotate.

        Args:
            pdf_path: Path to PDF file

        Returns:
            List of Document chunks with `source` and `chunk_size` metadata;
            empty list if the PDF contained no extractable text.

        Raises:
            Exception: re-raised after logging on any extraction/split error.
        """
        try:
            text = self.extract_text_from_pdf(pdf_path)
            if not text.strip():
                self.logger.warning("No text extracted from PDF")
                return []
            chunks = self.split_text_into_chunks(text)
            # Overwrite the generic "pdf" source with the real path and
            # record each chunk's size for downstream statistics.
            for chunk in chunks:
                chunk.metadata["source"] = pdf_path
                chunk.metadata["chunk_size"] = len(chunk.page_content)
            return chunks
        except Exception as e:
            self.logger.error("Error processing PDF %s: %s", pdf_path, e)
            raise

    def get_chunk_stats(self, chunks: List[Document]) -> dict:
        """
        Get statistics about the chunks.

        Args:
            chunks: List of Document chunks

        Returns:
            Dictionary with keys `total_chunks`, `avg_chunk_size`
            (rounded to 2 decimals), and `total_characters`.
        """
        if not chunks:
            return {"total_chunks": 0, "avg_chunk_size": 0, "total_characters": 0}
        total_chars = sum(len(chunk.page_content) for chunk in chunks)
        avg_size = total_chars / len(chunks)
        return {
            "total_chunks": len(chunks),
            "avg_chunk_size": round(avg_size, 2),
            "total_characters": total_chars
        }