Spaces:
Sleeping
Sleeping
File size: 7,337 Bytes
97052b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
import os
from typing import List, Dict
from langchain_community.document_loaders import PyPDFLoader
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_classic.schema import Document
from config import Config
import re
class PDFProcessor:
    """Handles PDF loading, parsing, and chunking for insurance documents.

    The typical entry points are ``process_pdf`` (one file) and
    ``process_multiple_pdfs`` (many files). Both return the chunked
    documents together with a small metadata dictionary per file.
    """

    # Page-marker patterns removed by ``clean_text``. Compiled once at class
    # creation so they are not rebuilt for every page.
    _PAGE_X_OF_Y = re.compile(r'\bPage\s+\d+\s+of\s+\d+\b', re.IGNORECASE)
    _PAGE_X_SLASH_Y = re.compile(r'\bPage\s+\d+/\d+\b', re.IGNORECASE)
    # A line consisting solely of digits (optionally padded with blanks).
    _BARE_NUMBER_LINE = re.compile(r'^[ \t]*\d+[ \t]*$', re.MULTILINE)

    # Ordered (label, keywords) tables: the FIRST entry with a matching
    # keyword wins, so the order below is significant.
    _DOCUMENT_TYPE_KEYWORDS = (
        ("policy_document", ("policy schedule", "policy document")),
        ("proposal_form", ("proposal form",)),
        ("claim_form", ("claim",)),
        ("endorsement", ("endorsement",)),
        ("addon_coverage", ("add-on", "rider")),
    )
    _SECTION_KEYWORDS = (
        ("exclusions", ("exclusion", "not covered", "does not cover")),
        ("coverage", ("coverage", "covered", "insured")),
        ("pricing", ("premium", "cost", "price")),
        ("addons", ("add-on", "rider", "optional")),
        ("claims", ("claim", "settlement")),
    )

    def __init__(self):
        """Build the text splitter from ``Config.get_chunking_config()``.

        The config mapping must provide the keys ``chunk_size``,
        ``chunk_overlap`` and ``separators``.
        """
        self.chunking_config = Config.get_chunking_config()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunking_config["chunk_size"],
            chunk_overlap=self.chunking_config["chunk_overlap"],
            separators=self.chunking_config["separators"],
            length_function=len,
        )

    def load_pdf(self, file_path: str) -> List["Document"]:
        """
        Load PDF file and extract text

        Args:
            file_path: Path to the PDF file

        Returns:
            List of Document objects with page content and metadata. Each
            document's metadata gains ``source_file`` (base name of
            ``file_path``) and ``total_pages`` (number of loaded documents).

        Raises:
            Exception: Any error raised while loading is printed and then
                re-raised unchanged.
        """
        try:
            loader = PyPDFLoader(file_path)
            documents = loader.load()

            # Add source filename and page count to every document's metadata
            filename = os.path.basename(file_path)
            total_pages = len(documents)
            for doc in documents:
                doc.metadata["source_file"] = filename
                doc.metadata["total_pages"] = total_pages

            print(f"Loaded {total_pages} pages from {filename}")
            return documents
        except Exception as e:
            print(f"Error loading PDF {file_path}: {str(e)}")
            raise

    def extract_metadata(self, documents: List["Document"]) -> Dict:
        """
        Extract useful metadata from insurance documents

        Args:
            documents: List of Document objects (may be empty)

        Returns:
            Dictionary with the keys ``total_pages``, ``source_file`` and
            ``document_type``. For an empty list ``source_file`` is
            ``"unknown"`` and ``document_type`` is ``"general_insurance"``.
        """
        # Guard against an empty list instead of failing on documents[0].
        if documents:
            source_file = documents[0].metadata.get("source_file", "unknown")
        else:
            source_file = "unknown"

        return {
            "total_pages": len(documents),
            "source_file": source_file,
            "document_type": self.identify_document_type(documents),
        }

    def identify_document_type(self, documents: List["Document"]) -> str:
        """
        Attempt to identify the type of insurance document

        The lower-cased text of the first three documents is searched for
        keywords; the first matching entry of ``_DOCUMENT_TYPE_KEYWORDS``
        decides the result.

        Args:
            documents: List of Document objects

        Returns:
            One of ``"policy_document"``, ``"proposal_form"``,
            ``"claim_form"``, ``"endorsement"``, ``"addon_coverage"`` or the
            fallback ``"general_insurance"``.
        """
        # Combine first few pages to identify document type
        sample_text = " ".join(doc.page_content for doc in documents[:3]).lower()

        for document_type, keywords in self._DOCUMENT_TYPE_KEYWORDS:
            if any(keyword in sample_text for keyword in keywords):
                return document_type
        return "general_insurance"

    # Alias kept so code that uses the underscore-prefixed name keeps working.
    _identify_document_type = identify_document_type

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text from PDF

        Removes "Page X of Y" / "Page X/Y" markers and lines that contain
        only digits, then collapses every run of whitespace to one space.

        Args:
            text: Raw text from PDF

        Returns:
            Cleaned text
        """
        # The line-based pattern must run while the newlines are still
        # present; once whitespace is collapsed there are no lines to match.
        text = self._PAGE_X_OF_Y.sub('', text)
        text = self._PAGE_X_SLASH_Y.sub('', text)
        text = self._BARE_NUMBER_LINE.sub('', text)

        # Collapse whitespace last so gaps left by the removals disappear too.
        # NOTE(review): this also removes all newlines, so newline-based
        # separators in the chunking config cannot match cleaned text -
        # confirm that is intended.
        return " ".join(text.split())

    def _classify_section(self, content_lower: str) -> str:
        """Return the section label for already lower-cased chunk text."""
        for section_type, keywords in self._SECTION_KEYWORDS:
            if any(keyword in content_lower for keyword in keywords):
                return section_type
        return "general"

    def chunk_documents(self, documents: List["Document"]) -> List["Document"]:
        """
        Split documents into chunks optimized for RAG retrieval

        Note that the ``page_content`` of the passed-in documents is replaced
        in place with its cleaned form before splitting.

        Args:
            documents: List of Document objects

        Returns:
            List of chunked Document objects whose metadata additionally
            carries ``chunk_id`` (position in the returned list),
            ``chunk_size`` (character count) and ``section_type``.
        """
        # Clean text in all documents
        for doc in documents:
            doc.page_content = self.clean_text(doc.page_content)

        # Split documents into chunks
        chunks = self.text_splitter.split_documents(documents)

        # Enhance metadata for each chunk
        for chunk_id, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = chunk_id
            chunk.metadata["chunk_size"] = len(chunk.page_content)
            # Add a context hint based on keywords found in the content
            chunk.metadata["section_type"] = self._classify_section(
                chunk.page_content.lower()
            )

        print(f"Created {len(chunks)} chunks from {len(documents)} pages")
        return chunks

    def process_pdf(self, file_path: str) -> tuple[List["Document"], Dict]:
        """
        Complete pipeline: Load, extract metadata, and chunk a PDF

        Args:
            file_path: Path to the PDF file

        Returns:
            Tuple of (chunks, metadata)

        Raises:
            Exception: Propagated from ``load_pdf`` when loading fails.
        """
        documents = self.load_pdf(file_path)
        # Metadata is extracted before chunking, i.e. from the uncleaned text.
        metadata = self.extract_metadata(documents)
        chunks = self.chunk_documents(documents)
        return chunks, metadata

    def process_multiple_pdfs(self, file_paths: List[str]) -> tuple[List["Document"], List[Dict]]:
        """
        Process multiple PDF files

        A file that fails to process is reported and skipped; the remaining
        files are still processed.

        Args:
            file_paths: List of paths to PDF files

        Returns:
            Tuple of (all_chunks, all_metadata) covering only the files that
            were processed successfully.
        """
        all_chunks = []
        all_metadata = []

        for file_path in file_paths:
            try:
                chunks, metadata = self.process_pdf(file_path)
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"✗ Failed to process {file_path}: {str(e)}")
                continue
            all_chunks.extend(chunks)
            all_metadata.append(metadata)

        # Report successes, not merely the number of paths that were passed in.
        print(f"\n Processed {len(all_metadata)} of {len(file_paths)} PDFs")
        print(f"Total chunks created: {len(all_chunks)}")
        return all_chunks, all_metadata
|