setu / module_a /extractors.py
khagu's picture
chore: finally untrack large database files
3998131
"""
PDF text extraction module
Handles extraction from legal PDF documents
"""
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import pdfplumber
    PDFPLUMBER_AVAILABLE = True
except ImportError:
    PDFPLUMBER_AVAILABLE = False

try:
    from PyPDF2 import PdfReader
    PYPDF2_AVAILABLE = True
except ImportError:
    PYPDF2_AVAILABLE = False

from .config import PDF_EXTRACTION_METHOD, PDF_FALLBACK_METHOD
# Module-level logger, named after this module so records can be filtered per module.
logger = logging.getLogger(__name__)
class PDFExtractor:
    """Extracts text from PDF files with multiple extraction methods.

    The backend ('pdfplumber' or 'pypdf2') is chosen at construction time.
    If the requested backend is not installed, the other one is substituted.
    At extraction time, a failure in the active backend triggers one retry
    with the other backend when that backend is installed.
    """

    def __init__(self, method: str = PDF_EXTRACTION_METHOD):
        """
        Initialize PDF extractor

        Args:
            method: Extraction method ('pdfplumber' or 'pypdf2')

        Raises:
            ImportError: If ``method`` is 'pdfplumber' or 'pypdf2' and
                neither pdfplumber nor PyPDF2 can be imported.
        """
        self.method = method
        self._validate_dependencies()

    def _validate_dependencies(self):
        """Check that the requested library is available, switching if not.

        If the requested backend is missing but the other one is installed,
        ``self.method`` is rewritten to the installed backend and a warning
        is logged. Any other ``method`` value is left untouched.

        Raises:
            ImportError: If the requested backend is missing and the other
                backend is missing too.
        """
        no_backend_msg = (
            "No PDF extraction library available. Install pdfplumber or PyPDF2"
        )

        if self.method == "pdfplumber" and not PDFPLUMBER_AVAILABLE:
            logger.warning("pdfplumber not available, falling back to PyPDF2")
            self.method = "pypdf2"
            if not PYPDF2_AVAILABLE:
                raise ImportError(no_backend_msg)
        elif self.method == "pypdf2" and not PYPDF2_AVAILABLE:
            # Mirror of the branch above: only give up when *both* libraries
            # are missing, otherwise the error message would be untrue.
            if not PDFPLUMBER_AVAILABLE:
                raise ImportError(no_backend_msg)
            logger.warning("PyPDF2 not available, falling back to pdfplumber")
            self.method = "pdfplumber"

    def extract_from_file(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """
        Extract text from PDF file

        Args:
            pdf_path: Path to PDF file (a ``str`` is accepted and converted)

        Returns:
            List of dicts with 'page_number' and 'text' keys. Pages for
            which no text could be extracted are omitted.

        Raises:
            Exception: Whatever the active backend raised, when no other
                backend is installed to retry with; or whatever the fallback
                backend raised if the retry fails as well.
        """
        pdf_path = Path(pdf_path)
        logger.info(
            "Extracting text from %s using %s", pdf_path.name, self.method
        )
        try:
            if self.method == "pdfplumber":
                return self._extract_with_pdfplumber(pdf_path)
            # Any method other than 'pdfplumber' is routed to PyPDF2.
            return self._extract_with_pypdf2(pdf_path)
        except Exception as e:
            logger.error("Extraction failed with %s: %s", self.method, e)
            # Try fallback method
            if self.method == "pdfplumber" and PYPDF2_AVAILABLE:
                logger.info("Trying fallback method: PyPDF2")
                return self._extract_with_pypdf2(pdf_path)
            if self.method == "pypdf2" and PDFPLUMBER_AVAILABLE:
                logger.info("Trying fallback method: pdfplumber")
                return self._extract_with_pdfplumber(pdf_path)
            # No alternative backend: propagate the original failure.
            raise

    @staticmethod
    def _collect_pages(pages, source_name: str) -> List[Dict[str, Any]]:
        """Build the per-page result list shared by both backends.

        Args:
            pages: Iterable of page objects exposing ``extract_text()``.
                Must be consumed while the underlying document is open.
            source_name: File name used in the summary log message.

        Returns:
            One ``{'page_number', 'text'}`` dict per page that yielded text;
            page numbers are 1-based positions in ``pages``.
        """
        pages_data = []
        for page_num, page in enumerate(pages, start=1):
            text = page.extract_text()
            if text:
                pages_data.append({
                    'page_number': page_num,
                    'text': text,
                })
            else:
                # Falsy result (None or empty string): skip the page but
                # leave a trace so gaps in page numbers can be explained.
                logger.warning("No text extracted from page %s", page_num)
        logger.info("Extracted %s pages from %s", len(pages_data), source_name)
        return pages_data

    def _extract_with_pdfplumber(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using pdfplumber (better for complex layouts)"""
        with pdfplumber.open(pdf_path) as pdf:
            return self._collect_pages(pdf.pages, Path(pdf_path).name)

    def _extract_with_pypdf2(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using PyPDF2 (fallback method)"""
        # The reader pulls from the file handle, so pages are collected
        # before the handle is closed.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            return self._collect_pages(pdf_reader.pages, Path(pdf_path).name)

    def extract_from_directory(self, directory: Path) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract text from all PDFs in a directory

        Only files directly inside ``directory`` matching ``*.pdf`` are
        processed (no recursion). A file whose extraction fails is logged
        and mapped to an empty list so one bad file does not abort the batch.

        Args:
            directory: Path to directory containing PDFs (a ``str`` is
                accepted and converted)

        Returns:
            Dict mapping filename to list of page data
        """
        directory = Path(directory)
        results = {}
        # Sorted so the processing (and resulting dict) order is deterministic;
        # glob() itself makes no ordering promise.
        pdf_files = sorted(directory.glob("*.pdf"))
        logger.info("Found %s PDF files in %s", len(pdf_files), directory)
        for pdf_file in pdf_files:
            try:
                results[pdf_file.name] = self.extract_from_file(pdf_file)
            except Exception as e:
                logger.error("Failed to extract %s: %s", pdf_file.name, e)
                results[pdf_file.name] = []
        return results