Spaces:
Sleeping
Sleeping
| import os | |
| from typing import List, Dict | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_classic.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_classic.schema import Document | |
| from config import Config | |
| import re | |
class PDFProcessor:
    """Handles PDF loading, parsing, and chunking for insurance documents."""

    def __init__(self):
        # Pull chunk size/overlap/separators from the central project config
        # so tuning lives in one place.
        self.chunking_config = Config.get_chunking_config()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunking_config["chunk_size"],
            chunk_overlap=self.chunking_config["chunk_overlap"],
            separators=self.chunking_config["separators"],
            length_function=len,
        )

    def load_pdf(self, file_path: str) -> List[Document]:
        """
        Load a PDF file and extract its text, one Document per page.

        Args:
            file_path: Path to the PDF file.

        Returns:
            List of Document objects with page content and metadata
            (``source_file`` and ``total_pages`` are added to every page).

        Raises:
            Exception: whatever PyPDFLoader raises for a missing/corrupt
                file is logged and re-raised.
        """
        try:
            loader = PyPDFLoader(file_path)
            documents = loader.load()
            # Tag each page so provenance survives chunking/retrieval.
            filename = os.path.basename(file_path)
            for doc in documents:
                doc.metadata["source_file"] = filename
                doc.metadata["total_pages"] = len(documents)
            # Fix: previously printed the literal text "(unknown)" instead
            # of the actual filename.
            print(f"Loaded {len(documents)} pages from {filename}")
            return documents
        except Exception as e:
            print(f"Error loading PDF {file_path}: {str(e)}")
            raise

    def extract_metadata(self, documents: List[Document]) -> Dict:
        """
        Extract useful metadata from insurance documents.

        Args:
            documents: List of Document objects.

        Returns:
            Dictionary with ``total_pages``, ``source_file`` and
            ``document_type`` keys.
        """
        # Robustness fix: an empty page list previously raised IndexError
        # on documents[0].
        if not documents:
            return {
                "total_pages": 0,
                "source_file": "unknown",
                "document_type": "general_insurance",
            }
        return {
            "total_pages": len(documents),
            "source_file": documents[0].metadata.get("source_file", "unknown"),
            # Fix: this called self._identify_document_type, which does not
            # exist (the method is defined WITHOUT the leading underscore),
            # so every call raised AttributeError.
            "document_type": self.identify_document_type(documents),
        }

    def identify_document_type(self, documents: List[Document]) -> str:
        """
        Attempt to identify the type of insurance document.

        Args:
            documents: List of Document objects.

        Returns:
            String label for the document type (e.g. ``"policy_document"``).
        """
        # Sample only the first few pages; keyword order matters — the more
        # specific phrases are checked before generic ones like "claim".
        sample_text = " ".join(doc.page_content for doc in documents[:3]).lower()
        if "policy schedule" in sample_text or "policy document" in sample_text:
            return "policy_document"
        elif "proposal form" in sample_text:
            return "proposal_form"
        elif "claim" in sample_text:
            return "claim_form"
        elif "endorsement" in sample_text:
            return "endorsement"
        elif "add-on" in sample_text or "rider" in sample_text:
            return "addon_coverage"
        return "general_insurance"

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text extracted from a PDF.

        Args:
            text: Raw text from the PDF.

        Returns:
            Text with whitespace collapsed and page-number artifacts removed.
        """
        # Collapse all runs of whitespace (including newlines) to one space.
        text = " ".join(text.split())
        # Strip "Page X of Y" / "Page X/Y" footers left behind by extraction.
        text = re.sub(r'\bPage\s+\d+\s+of\s+\d+\b', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\bPage\s+\d+/\d+\b', '', text, flags=re.IGNORECASE)
        # Drop lines that consist of a bare page number only.
        text = re.sub(r'^\d+$', '', text, flags=re.MULTILINE)
        return text.strip()

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks optimized for RAG retrieval.

        Note: the ``page_content`` of the input documents is cleaned
        in place before splitting.

        Args:
            documents: List of Document objects.

        Returns:
            List of chunked Document objects with ``chunk_id``,
            ``chunk_size`` and ``section_type`` metadata added.
        """
        for doc in documents:
            doc.page_content = self.clean_text(doc.page_content)
        chunks = self.text_splitter.split_documents(documents)
        for i, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = i
            chunk.metadata["chunk_size"] = len(chunk.page_content)
            # Cheap keyword heuristic hinting the retriever at section type;
            # checked most-specific first ("exclusion" before "coverage").
            content_lower = chunk.page_content.lower()
            if any(k in content_lower for k in ("exclusion", "not covered", "does not cover")):
                chunk.metadata["section_type"] = "exclusions"
            elif any(k in content_lower for k in ("coverage", "covered", "insured")):
                chunk.metadata["section_type"] = "coverage"
            elif any(k in content_lower for k in ("premium", "cost", "price")):
                chunk.metadata["section_type"] = "pricing"
            elif any(k in content_lower for k in ("add-on", "rider", "optional")):
                chunk.metadata["section_type"] = "addons"
            elif any(k in content_lower for k in ("claim", "settlement")):
                chunk.metadata["section_type"] = "claims"
            else:
                chunk.metadata["section_type"] = "general"
        print(f"Created {len(chunks)} chunks from {len(documents)} pages")
        return chunks

    def process_pdf(self, file_path: str) -> tuple[List[Document], Dict]:
        """
        Complete pipeline: load, extract metadata, and chunk a PDF.

        Args:
            file_path: Path to the PDF file.

        Returns:
            Tuple of (chunks, metadata).
        """
        documents = self.load_pdf(file_path)
        metadata = self.extract_metadata(documents)
        chunks = self.chunk_documents(documents)
        return chunks, metadata

    def process_multiple_pdfs(self, file_paths: List[str]) -> tuple[List[Document], List[Dict]]:
        """
        Process multiple PDF files, skipping any that fail to load.

        Args:
            file_paths: List of paths to PDF files.

        Returns:
            Tuple of (all_chunks, all_metadata) covering only the files
            that processed successfully.
        """
        all_chunks: List[Document] = []
        all_metadata: List[Dict] = []
        for file_path in file_paths:
            try:
                chunks, metadata = self.process_pdf(file_path)
                all_chunks.extend(chunks)
                all_metadata.append(metadata)
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                print(f"✗ Failed to process {file_path}: {str(e)}")
                continue
        # Fix: report the number of PDFs that actually succeeded, not the
        # number attempted (the old message overcounted on failures).
        print(f"\n Processed {len(all_metadata)} PDFs")
        print(f"Total chunks created: {len(all_chunks)}")
        return all_chunks, all_metadata