"""

PDF Processor Module

Handles PDF text extraction and chunking for RAG pipeline

"""

import logging
from typing import List, Optional
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

class PDFProcessor:
    """Handles PDF processing, text extraction, and chunking"""
    
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """

        Initialize PDF processor

        

        Args:

            chunk_size: Size of text chunks

            chunk_overlap: Overlap between chunks

        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        
        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
    
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """

        Extract text from PDF file

        

        Args:

            pdf_path: Path to PDF file

            

        Returns:

            Extracted text as string

        """
        try:
            self.logger.info(f"Extracting text from: {pdf_path}")
            
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                text = ""
                
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text:
                            text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                    except Exception as e:
                        self.logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
                        continue
                
                self.logger.info(f"Extracted {len(text)} characters from PDF")
                return text
                
        except Exception as e:
            self.logger.error(f"Error reading PDF file {pdf_path}: {e}")
            raise
    
    def split_text_into_chunks(self, text: str) -> List[Document]:
        """

        Split text into chunks using LangChain text splitter

        

        Args:

            text: Text to split

            

        Returns:

            List of Document objects

        """
        try:
            self.logger.info("Splitting text into chunks")
            
            # Create a single document first
            documents = [Document(page_content=text, metadata={"source": "pdf"})]
            
            # Split into chunks
            chunks = self.text_splitter.split_documents(documents)
            
            self.logger.info(f"Created {len(chunks)} text chunks")
            return chunks
            
        except Exception as e:
            self.logger.error(f"Error splitting text: {e}")
            raise
    
    def process_pdf(self, pdf_path: str) -> List[Document]:
        """

        Complete PDF processing pipeline

        

        Args:

            pdf_path: Path to PDF file

            

        Returns:

            List of Document chunks

        """
        try:
            # Extract text
            text = self.extract_text_from_pdf(pdf_path)
            
            if not text.strip():
                self.logger.warning("No text extracted from PDF")
                return []
            
            # Split into chunks
            chunks = self.split_text_into_chunks(text)
            
            # Add metadata
            for chunk in chunks:
                chunk.metadata["source"] = pdf_path
                chunk.metadata["chunk_size"] = len(chunk.page_content)
            
            return chunks
            
        except Exception as e:
            self.logger.error(f"Error processing PDF {pdf_path}: {e}")
            raise
    
    def get_chunk_stats(self, chunks: List[Document]) -> dict:
        """

        Get statistics about the chunks

        

        Args:

            chunks: List of Document chunks

            

        Returns:

            Dictionary with chunk statistics

        """
        if not chunks:
            return {"total_chunks": 0, "avg_chunk_size": 0, "total_characters": 0}
        
        total_chars = sum(len(chunk.page_content) for chunk in chunks)
        avg_size = total_chars / len(chunks)
        
        return {
            "total_chunks": len(chunks),
            "avg_chunk_size": round(avg_size, 2),
            "total_characters": total_chars
        }