File size: 7,337 Bytes
97052b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import os
from typing import List, Dict
from langchain_community.document_loaders import PyPDFLoader
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_classic.schema import Document
from config import Config
import re

class PDFProcessor:
    """Handles PDF loading, parsing, and chunking for insurance documents."""

    def __init__(self):
        # Build the splitter once from the shared chunking configuration so
        # every processed document is chunked with identical settings.
        self.chunking_config = Config.get_chunking_config()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunking_config["chunk_size"],
            chunk_overlap=self.chunking_config["chunk_overlap"],
            separators=self.chunking_config["separators"],
            length_function=len,
        )

    def load_pdf(self, file_path: str) -> List[Document]:
        """Load a PDF file and extract its text, one Document per page.

        Args:
            file_path: Path to the PDF file.

        Returns:
            List of Document objects, each tagged in its metadata with the
            source filename and total page count for provenance tracking.

        Raises:
            Exception: Re-raises whatever PyPDFLoader raises, after logging.
        """
        try:
            loader = PyPDFLoader(file_path)
            documents = loader.load()

            # Tag every page so downstream chunks remain traceable to a file.
            filename = os.path.basename(file_path)
            for doc in documents:
                doc.metadata["source_file"] = filename
                doc.metadata["total_pages"] = len(documents)

            # BUG FIX: previously printed the literal text "(unknown)" instead
            # of the filename computed above.
            print(f"Loaded {len(documents)} pages from {filename}")
            return documents

        except Exception as e:
            print(f"Error loading PDF {file_path}: {str(e)}")
            raise

    def extract_metadata(self, documents: List[Document]) -> Dict:
        """Extract useful metadata from insurance documents.

        Args:
            documents: List of Document objects (may be empty).

        Returns:
            Dictionary with total page count, source filename, and the
            heuristically identified document type.
        """
        # Guard: an empty page list would make documents[0] raise IndexError.
        if not documents:
            return {
                "total_pages": 0,
                "source_file": "unknown",
                "document_type": "general_insurance",
            }

        return {
            "total_pages": len(documents),
            "source_file": documents[0].metadata.get("source_file", "unknown"),
            # BUG FIX: previously called self._identify_document_type, which
            # does not exist — the method is named identify_document_type.
            "document_type": self.identify_document_type(documents),
        }

    def identify_document_type(self, documents: List[Document]) -> str:
        """Heuristically identify the type of insurance document.

        Args:
            documents: List of Document objects.

        Returns:
            One of: "policy_document", "proposal_form", "claim_form",
            "endorsement", "addon_coverage", or "general_insurance".
        """
        # Only the first few pages are needed to classify the document.
        sample_text = " ".join(doc.page_content for doc in documents[:3]).lower()

        # Keyword checks are ordered from most to least specific; the broad
        # "claim" check must come after the more specific phrases.
        if "policy schedule" in sample_text or "policy document" in sample_text:
            return "policy_document"
        elif "proposal form" in sample_text:
            return "proposal_form"
        elif "claim" in sample_text:
            return "claim_form"
        elif "endorsement" in sample_text:
            return "endorsement"
        elif "add-on" in sample_text or "rider" in sample_text:
            return "addon_coverage"
        else:
            return "general_insurance"

    def clean_text(self, text: str) -> str:
        """Clean and normalize raw text extracted from a PDF page.

        Args:
            text: Raw text from the PDF.

        Returns:
            Text with page-number artifacts removed and whitespace collapsed
            to single spaces.
        """
        # BUG FIX: standalone page-number lines must be removed BEFORE
        # whitespace is collapsed — previously the MULTILINE pattern ran after
        # all newlines were gone, so it could never match a page-number line.
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

        # Strip "Page X of Y" and "Page X/Y" header/footer markers.
        text = re.sub(r'\bPage\s+\d+\s+of\s+\d+\b', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\bPage\s+\d+/\d+\b', '', text, flags=re.IGNORECASE)

        # Collapse all whitespace runs (including the gaps left by the
        # removals above) into single spaces; also trims the ends.
        return " ".join(text.split())

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks optimized for RAG retrieval.

        Args:
            documents: List of Document objects.

        Returns:
            List of chunked Document objects, each annotated with a chunk id,
            its size, and a heuristic "section_type" content hint.
        """
        # Normalize page text in place before splitting.
        for doc in documents:
            doc.page_content = self.clean_text(doc.page_content)

        chunks = self.text_splitter.split_documents(documents)

        # Ordered first-match-wins rules for tagging important sections.
        section_rules = [
            ("exclusions", ("exclusion", "not covered", "does not cover")),
            ("coverage", ("coverage", "covered", "insured")),
            ("pricing", ("premium", "cost", "price")),
            ("addons", ("add-on", "rider", "optional")),
            ("claims", ("claim", "settlement")),
        ]

        for i, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = i
            chunk.metadata["chunk_size"] = len(chunk.page_content)

            content_lower = chunk.page_content.lower()
            for section, keywords in section_rules:
                if any(keyword in content_lower for keyword in keywords):
                    chunk.metadata["section_type"] = section
                    break
            else:
                chunk.metadata["section_type"] = "general"

        print(f"Created {len(chunks)} chunks from {len(documents)} pages")
        return chunks

    def process_pdf(self, file_path: str) -> tuple[List[Document], Dict]:
        """Complete pipeline: load, extract metadata, and chunk one PDF.

        Args:
            file_path: Path to the PDF file.

        Returns:
            Tuple of (chunks, metadata).
        """
        documents = self.load_pdf(file_path)
        metadata = self.extract_metadata(documents)
        chunks = self.chunk_documents(documents)
        return chunks, metadata

    def process_multiple_pdfs(self, file_paths: List[str]) -> tuple[List[Document], List[Dict]]:
        """Process multiple PDF files, skipping any that fail to load.

        Args:
            file_paths: List of paths to PDF files.

        Returns:
            Tuple of (all_chunks, all_metadata) aggregated across the files
            that processed successfully.
        """
        all_chunks = []
        all_metadata = []

        for file_path in file_paths:
            try:
                chunks, metadata = self.process_pdf(file_path)
                all_chunks.extend(chunks)
                all_metadata.append(metadata)
            except Exception as e:
                # Best-effort batch: report the failure and continue.
                print(f"✗ Failed to process {file_path}: {str(e)}")
                continue

        # BUG FIX: previously claimed len(file_paths) were processed even
        # when some failed; report successes out of the total instead.
        print(f"\n Processed {len(all_metadata)}/{len(file_paths)} PDFs")
        print(f"Total chunks created: {len(all_chunks)}")

        return all_chunks, all_metadata