"""
PDF text extraction module.

Handles extraction from legal PDF documents.
"""

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import pdfplumber
    PDFPLUMBER_AVAILABLE = True
except ImportError:
    PDFPLUMBER_AVAILABLE = False

try:
    from PyPDF2 import PdfReader
    PYPDF2_AVAILABLE = True
except ImportError:
    PYPDF2_AVAILABLE = False

from .config import PDF_EXTRACTION_METHOD, PDF_FALLBACK_METHOD

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class PDFExtractor:
    """Extracts text from PDF files with multiple extraction methods.

    The backend is selected at construction time. If the requested backend
    is not installed, the other supported backend is used instead. If an
    extraction raises at run time, the other installed backend is tried once
    before the error is propagated.
    """

    def __init__(self, method: str = PDF_EXTRACTION_METHOD):
        """
        Initialize PDF extractor

        Args:
            method: Extraction method ('pdfplumber' or 'pypdf2'). Any other
                value is sent down the PyPDF2 code path by
                ``extract_from_file`` and is not checked here.

        Raises:
            ImportError: If the requested backend is not installed and the
                other backend is not installed either.
        """
        self.method = method
        self._validate_dependencies()

    def _validate_dependencies(self):
        """Check that the selected backend is importable, switching if not.

        ``self.method`` is rewritten to the backend that will actually be
        used. The switch works in both directions so that it matches the
        run-time fallback performed by ``extract_from_file``.

        Raises:
            ImportError: If neither pdfplumber nor PyPDF2 is available for
                the requested method.
        """
        if self.method == "pdfplumber" and not PDFPLUMBER_AVAILABLE:
            logger.warning("pdfplumber not available, falling back to PyPDF2")
            self.method = "pypdf2"
            if not PYPDF2_AVAILABLE:
                raise ImportError(
                    "No PDF extraction library available. "
                    "Install pdfplumber or PyPDF2"
                )
        elif self.method == "pypdf2" and not PYPDF2_AVAILABLE:
            # Only give up when pdfplumber is missing as well; otherwise the
            # error message would be wrong and a usable backend ignored.
            if not PDFPLUMBER_AVAILABLE:
                raise ImportError(
                    "No PDF extraction library available. "
                    "Install pdfplumber or PyPDF2"
                )
            logger.warning("PyPDF2 not available, falling back to pdfplumber")
            self.method = "pdfplumber"

    def extract_from_file(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """
        Extract text from PDF file

        Args:
            pdf_path: Path to PDF file (a plain string is accepted too)

        Returns:
            List of dicts with 'page_number' and 'text' keys. Pages that
            yield no text are omitted, so page numbers may have gaps.

        Raises:
            Exception: Whatever the primary backend raised, when no other
                backend is installed to retry with; or whatever the
                fallback backend raises if it fails as well.
        """
        # Accept str as well as Path; `.name` below needs a Path object.
        pdf_path = Path(pdf_path)
        logger.info(f"Extracting text from {pdf_path.name} using {self.method}")

        try:
            if self.method == "pdfplumber":
                return self._extract_with_pdfplumber(pdf_path)
            # Every method other than 'pdfplumber' uses the PyPDF2 path.
            return self._extract_with_pypdf2(pdf_path)
        except Exception as e:
            logger.error(f"Extraction failed with {self.method}: {e}")

            if self.method == "pdfplumber" and PYPDF2_AVAILABLE:
                logger.info("Trying fallback method: PyPDF2")
                return self._extract_with_pypdf2(pdf_path)
            if self.method == "pypdf2" and PDFPLUMBER_AVAILABLE:
                logger.info("Trying fallback method: pdfplumber")
                return self._extract_with_pdfplumber(pdf_path)
            # No alternative backend: re-raise the original error unchanged.
            raise

    @staticmethod
    def _collect_pages(pages, source_name: str) -> List[Dict[str, Any]]:
        """Build the per-page result list shared by both backends.

        Args:
            pages: Iterable of page objects exposing ``extract_text()``.
            source_name: File name used in the summary log message.

        Returns:
            One dict per page that produced text, with 1-based
            'page_number' and the extracted 'text'.
        """
        pages_data = []
        for page_num, page in enumerate(pages, start=1):
            text = page.extract_text()
            if text:
                pages_data.append({
                    'page_number': page_num,
                    'text': text,
                })
            else:
                # None or empty string: skip the page but leave a trace.
                logger.warning(f"No text extracted from page {page_num}")

        logger.info(f"Extracted {len(pages_data)} pages from {source_name}")
        return pages_data

    def _extract_with_pdfplumber(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using pdfplumber (better for complex layouts)"""
        with pdfplumber.open(pdf_path) as pdf:
            return self._collect_pages(pdf.pages, pdf_path.name)

    def _extract_with_pypdf2(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract using PyPDF2 (fallback method)"""
        # Pages are read while the file handle is still open.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            return self._collect_pages(pdf_reader.pages, pdf_path.name)

    def extract_from_directory(self, directory: Path) -> Dict[str, List[Dict[str, Any]]]:
        """
        Extract text from all PDFs in a directory

        Only files directly inside ``directory`` that match ``*.pdf`` are
        processed, in sorted order. A file that fails to extract is logged
        and mapped to an empty list instead of aborting the whole run.

        Args:
            directory: Path to directory containing PDFs (a plain string is
                accepted too)

        Returns:
            Dict mapping filename to list of page data
        """
        directory = Path(directory)
        results = {}
        # Sorted so that processing order and result order are deterministic.
        pdf_files = sorted(directory.glob("*.pdf"))

        logger.info(f"Found {len(pdf_files)} PDF files in {directory}")

        for pdf_file in pdf_files:
            try:
                results[pdf_file.name] = self.extract_from_file(pdf_file)
            except Exception as e:
                # Best-effort batch: record the failure and keep going.
                logger.error(f"Failed to extract {pdf_file.name}: {e}")
                results[pdf_file.name] = []

        return results