""" PDF Processor - Convert PDFs to images and extract text. This module works INDEPENDENTLY of embedding and vector storage. Use it if you just need PDF → images conversion. Features: - Batch processing to save memory - Text extraction with surrogate character handling - Configurable DPI and quality settings """ import gc import logging import re from pathlib import Path from typing import Any, Dict, Generator, List, Optional, Tuple from PIL import Image logger = logging.getLogger(__name__) class PDFProcessor: """ Process PDFs into images and text for visual retrieval. Works independently - no embedding or storage dependencies. Args: dpi: DPI for image conversion (higher = better quality) output_format: Image format (RGB, L, etc.) page_batch_size: Pages per batch for memory efficiency Example: >>> processor = PDFProcessor(dpi=140) >>> >>> # Convert single PDF >>> images, texts = processor.process_pdf(Path("report.pdf")) >>> >>> # Stream large PDFs >>> for images, texts in processor.stream_pdf(Path("large.pdf"), batch_size=10): ... # Process each batch ... pass """ def __init__( self, dpi: int = 140, output_format: str = "RGB", page_batch_size: int = 50, ): self.dpi = dpi self.output_format = output_format self.page_batch_size = page_batch_size # PDF deps are optional: we only require them when calling PDF-specific methods. # This keeps the class usable for helper utilities like `resize_for_colpali()` # even in minimal installs. self._pdf_deps_available = True try: import pdf2image # noqa: F401 import pypdf # noqa: F401 except Exception: self._pdf_deps_available = False def _require_pdf_deps(self) -> None: if not self._pdf_deps_available: raise ImportError( "PDF processing requires `pdf2image` and `pypdf`.\n" 'Install with: pip install "visual-rag-toolkit[pdf]"' ) def process_pdf( self, pdf_path: Path, dpi: Optional[int] = None, ) -> Tuple[List[Image.Image], List[str]]: """ Convert PDF to images and extract text. Args: pdf_path: Path to PDF file dpi: Override default DPI Returns: Tuple of (list of images, list of page texts) """ self._require_pdf_deps() from pdf2image import convert_from_path from pypdf import PdfReader dpi = dpi or self.dpi pdf_path = Path(pdf_path) logger.info(f"📄 Processing PDF: {pdf_path.name}") # Extract text reader = PdfReader(str(pdf_path)) total_pages = len(reader.pages) page_texts = [] for page in reader.pages: text = page.extract_text() or "" # Handle surrogate characters text = self._sanitize_text(text) page_texts.append(text) # Convert to images in batches all_images = [] for start_page in range(1, total_pages + 1, self.page_batch_size): end_page = min(start_page + self.page_batch_size - 1, total_pages) batch_images = convert_from_path( str(pdf_path), dpi=dpi, fmt=self.output_format.lower(), first_page=start_page, last_page=end_page, ) all_images.extend(batch_images) del batch_images gc.collect() assert len(all_images) == len( page_texts ), f"Mismatch: {len(all_images)} images vs {len(page_texts)} texts" logger.info(f"✅ Processed {len(all_images)} pages") return all_images, page_texts def stream_pdf( self, pdf_path: Path, batch_size: int = 10, dpi: Optional[int] = None, ) -> Generator[Tuple[List[Image.Image], List[str], int], None, None]: """ Stream PDF processing for large files. Yields batches of (images, texts, start_page) without loading entire PDF into memory. Args: pdf_path: Path to PDF file batch_size: Pages per batch dpi: Override default DPI Yields: Tuple of (batch_images, batch_texts, start_page_number) """ self._require_pdf_deps() from pdf2image import convert_from_path from pypdf import PdfReader dpi = dpi or self.dpi pdf_path = Path(pdf_path) reader = PdfReader(str(pdf_path)) total_pages = len(reader.pages) logger.info(f"📄 Streaming PDF: {pdf_path.name} ({total_pages} pages)") for start_idx in range(0, total_pages, batch_size): end_idx = min(start_idx + batch_size, total_pages) # Extract text for batch batch_texts = [] for page_idx in range(start_idx, end_idx): text = reader.pages[page_idx].extract_text() or "" text = self._sanitize_text(text) batch_texts.append(text) # Convert images for batch batch_images = convert_from_path( str(pdf_path), dpi=dpi, fmt=self.output_format.lower(), first_page=start_idx + 1, # 1-indexed last_page=end_idx, ) yield batch_images, batch_texts, start_idx + 1 del batch_images gc.collect() def get_page_count(self, pdf_path: Path) -> int: """Get number of pages in PDF without loading images.""" self._require_pdf_deps() from pypdf import PdfReader reader = PdfReader(str(pdf_path)) return len(reader.pages) def resize_for_colpali( self, image: Image.Image, max_edge: int = 2048, tile_size: int = 512, ) -> Tuple[Image.Image, int, int]: """ Resize image following ColPali/Idefics3 processor logic. Resizes to fit within tile grid without black padding. Args: image: PIL Image max_edge: Maximum edge length tile_size: Size of each tile Returns: Tuple of (resized_image, tile_rows, tile_cols) """ # Ensure consistent mode for downstream processors (and predictable tests) if image.mode != "RGB": image = image.convert("RGB") w, h = image.size # Step 1: Resize so longest edge = max_edge if w > h: new_w = max_edge new_h = int(h * (max_edge / w)) else: new_h = max_edge new_w = int(w * (max_edge / h)) # Step 2: Calculate tile grid tile_cols = (new_w + tile_size - 1) // tile_size tile_rows = (new_h + tile_size - 1) // tile_size # Step 3: Calculate exact dimensions for tiles final_w = tile_cols * tile_size final_h = tile_rows * tile_size # Step 4: Scale to fit within tile grid scale_w = final_w / w scale_h = final_h / h scale = min(scale_w, scale_h) scaled_w = int(w * scale) scaled_h = int(h * scale) resized = image.resize((scaled_w, scaled_h), Image.LANCZOS) # Center on white canvas if needed if scaled_w != final_w or scaled_h != final_h: canvas = Image.new("RGB", (final_w, final_h), (255, 255, 255)) offset_x = (final_w - scaled_w) // 2 offset_y = (final_h - scaled_h) // 2 canvas.paste(resized, (offset_x, offset_y)) resized = canvas return resized, tile_rows, tile_cols def _sanitize_text(self, text: str) -> str: """Remove invalid Unicode characters (surrogates) from text.""" if not text: return "" # Remove surrogate characters (U+D800-U+DFFF) return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore") def extract_metadata_from_filename( self, filename: str, mapping: Optional[Dict[str, Dict[str, Any]]] = None, ) -> Dict[str, Any]: """ Extract metadata from PDF filename. Uses mapping if provided, otherwise falls back to pattern matching. Args: filename: PDF filename (with or without .pdf extension) mapping: Optional mapping dict {filename: metadata} Returns: Metadata dict with year, source, district, etc. """ # Remove extension stem = Path(filename).stem stem_lower = stem.lower().strip() # Try mapping first if mapping: if stem_lower in mapping: return mapping[stem_lower].copy() # Try without .pdf stem_no_ext = stem_lower.replace(".pdf", "") if stem_no_ext in mapping: return mapping[stem_no_ext].copy() # Fallback: pattern matching metadata = {"filename": filename} # Extract year year_match = re.search(r"(20\d{2})", stem) if year_match: metadata["year"] = int(year_match.group(1)) # Detect source type if "consolidated" in stem_lower or ("annual" in stem_lower and "oag" in stem_lower): metadata["source"] = "Consolidated" elif "dlg" in stem_lower or "district local government" in stem_lower: metadata["source"] = "Local Government" # Try to extract district name district_match = re.search(r"([a-z]+)\s+(?:dlg|district local government)", stem_lower) if district_match: metadata["district"] = district_match.group(1).title() elif "hospital" in stem_lower or "referral" in stem_lower: metadata["source"] = "Hospital" elif "ministry" in stem_lower: metadata["source"] = "Ministry" elif "project" in stem_lower: metadata["source"] = "Project" else: metadata["source"] = "Unknown" return metadata