Spaces:

Yeroyan
/

visual-rag-toolkit

Sleeping

App Files Files Community

visual-rag-toolkit / visual_rag /indexing /pdf_processor.py

Yeroyan

sync v0.1.3

9513cca verified 9 days ago

raw

history blame contribute delete

10.2 kB

	"""
	PDF Processor - Convert PDFs to images and extract text.

	This module works INDEPENDENTLY of embedding and vector storage.
	Use it if you just need PDF → images conversion.

	Features:
	- Batch processing to save memory
	- Text extraction with surrogate character handling
	- Configurable DPI and quality settings
	"""

	import gc
	import logging
	import re
	from pathlib import Path
	from typing import Any, Dict, Generator, List, Optional, Tuple

	from PIL import Image

	logger = logging.getLogger(__name__)


	class PDFProcessor:
	    """
	    Process PDFs into images and text for visual retrieval.

	    Works independently - no embedding or storage dependencies. The PDF
	    backends (`pdf2image`, `pypdf`) are optional: they are only required by
	    the PDF-specific methods, so helpers such as `resize_for_colpali()` and
	    `extract_metadata_from_filename()` stay usable in minimal installs.

	    Args:
	        dpi: DPI for image conversion (higher = better quality)
	        output_format: Image format (RGB, L, etc.).
	            NOTE(review): the lowercased value is forwarded to pdf2image as
	            `fmt`; confirm that a mode-style name is what that parameter expects.
	        page_batch_size: Pages per batch for memory efficiency

	    Example:
	        >>> processor = PDFProcessor(dpi=140)
	        >>>
	        >>> # Convert single PDF
	        >>> images, texts = processor.process_pdf(Path("report.pdf"))
	        >>>
	        >>> # Stream large PDFs (each batch also carries its 1-indexed start page)
	        >>> for images, texts, start_page in processor.stream_pdf(Path("large.pdf"), batch_size=10):
	        ...     # Process each batch
	        ...     pass
	    """

	    # First "20xx" run found in the filename stem is taken as the year.
	    _YEAR_PATTERN = re.compile(r"(20\d{2})")
	    # Word immediately preceding "dlg" / "district local government".
	    # The "|" must be unescaped: an escaped pipe matches a literal "|" and
	    # silently disables the alternation.
	    _DISTRICT_PATTERN = re.compile(r"([a-z]+)\s+(?:dlg|district local government)")

	    def __init__(
	        self,
	        dpi: int = 140,
	        output_format: str = "RGB",
	        page_batch_size: int = 50,
	    ):
	        """Store conversion settings and probe for the optional PDF backends."""
	        self.dpi = dpi
	        self.output_format = output_format
	        self.page_batch_size = page_batch_size

	        # PDF deps are optional: we only require them when calling PDF-specific methods.
	        # This keeps the class usable for helper utilities like `resize_for_colpali()`
	        # even in minimal installs.
	        self._pdf_deps_available = True
	        try:
	            import pdf2image  # noqa: F401
	            import pypdf  # noqa: F401
	        except Exception:
	            # Deliberately broad: any failure while importing the backends just
	            # marks PDF support as unavailable instead of breaking construction.
	            self._pdf_deps_available = False

	    def _require_pdf_deps(self) -> None:
	        """Raise ImportError with install instructions if the PDF backends are missing."""
	        if not self._pdf_deps_available:
	            raise ImportError(
	                "PDF processing requires `pdf2image` and `pypdf`.\n"
	                'Install with: pip install "visual-rag-toolkit[pdf]"'
	            )

	    def _page_text(self, page: Any) -> str:
	        """Return the sanitized text of one reader page ("" when nothing is extracted)."""
	        return self._sanitize_text(page.extract_text() or "")

	    def process_pdf(
	        self,
	        pdf_path: Path,
	        dpi: Optional[int] = None,
	    ) -> Tuple[List[Image.Image], List[str]]:
	        """
	        Convert PDF to images and extract text.

	        Text is extracted for every page first; pages are then rendered in
	        chunks of `self.page_batch_size` and accumulated into one list.

	        Args:
	            pdf_path: Path to PDF file
	            dpi: Override default DPI

	        Returns:
	            Tuple of (list of images, list of page texts)

	        Raises:
	            ImportError: If `pdf2image` / `pypdf` are not installed.
	            ValueError: If `self.page_batch_size` is smaller than 1.
	            RuntimeError: If the number of rendered images differs from the
	                number of extracted page texts.
	        """
	        self._require_pdf_deps()
	        from pdf2image import convert_from_path
	        from pypdf import PdfReader

	        if self.page_batch_size < 1:
	            raise ValueError(f"page_batch_size must be >= 1, got {self.page_batch_size}")

	        dpi = dpi or self.dpi
	        pdf_path = Path(pdf_path)

	        logger.info(f"📄 Processing PDF: {pdf_path.name}")

	        # Extract text (surrogate characters are stripped per page)
	        reader = PdfReader(str(pdf_path))
	        total_pages = len(reader.pages)
	        page_texts = [self._page_text(page) for page in reader.pages]

	        # Convert to images in batches; pdf2image page numbers are 1-indexed
	        all_images: List[Image.Image] = []
	        for start_page in range(1, total_pages + 1, self.page_batch_size):
	            end_page = min(start_page + self.page_batch_size - 1, total_pages)

	            batch_images = convert_from_path(
	                str(pdf_path),
	                dpi=dpi,
	                fmt=self.output_format.lower(),
	                first_page=start_page,
	                last_page=end_page,
	            )

	            all_images.extend(batch_images)

	            del batch_images
	            gc.collect()

	        # Explicit raise instead of `assert`, which is stripped under `python -O`.
	        if len(all_images) != len(page_texts):
	            raise RuntimeError(
	                f"Mismatch: {len(all_images)} images vs {len(page_texts)} texts"
	            )

	        logger.info(f"✅ Processed {len(all_images)} pages")
	        return all_images, page_texts

	    def stream_pdf(
	        self,
	        pdf_path: Path,
	        batch_size: int = 10,
	        dpi: Optional[int] = None,
	    ) -> Generator[Tuple[List[Image.Image], List[str], int], None, None]:
	        """
	        Stream PDF processing for large files.

	        Yields batches of (images, texts, start_page) so that only one batch
	        of rendered pages is held at a time. Because this is a generator,
	        the dependency and argument checks run when iteration starts.

	        Args:
	            pdf_path: Path to PDF file
	            batch_size: Pages per batch
	            dpi: Override default DPI

	        Yields:
	            Tuple of (batch_images, batch_texts, start_page_number), where
	            start_page_number is 1-indexed.

	        Raises:
	            ImportError: If `pdf2image` / `pypdf` are not installed.
	            ValueError: If `batch_size` is smaller than 1.
	        """
	        self._require_pdf_deps()
	        from pdf2image import convert_from_path
	        from pypdf import PdfReader

	        # A zero step makes range() fail with an opaque message and a negative
	        # one silently yields nothing, so reject both up front.
	        if batch_size < 1:
	            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

	        dpi = dpi or self.dpi
	        pdf_path = Path(pdf_path)

	        reader = PdfReader(str(pdf_path))
	        total_pages = len(reader.pages)

	        logger.info(f"📄 Streaming PDF: {pdf_path.name} ({total_pages} pages)")

	        for start_idx in range(0, total_pages, batch_size):
	            end_idx = min(start_idx + batch_size, total_pages)

	            # Extract text for batch (reader pages are 0-indexed)
	            batch_texts = [
	                self._page_text(reader.pages[page_idx])
	                for page_idx in range(start_idx, end_idx)
	            ]

	            # Convert images for batch
	            batch_images = convert_from_path(
	                str(pdf_path),
	                dpi=dpi,
	                fmt=self.output_format.lower(),
	                first_page=start_idx + 1,  # 1-indexed
	                last_page=end_idx,
	            )

	            yield batch_images, batch_texts, start_idx + 1

	            # Drop our reference once the consumer resumes us, before the next batch
	            del batch_images
	            gc.collect()

	    def get_page_count(self, pdf_path: Path) -> int:
	        """Get number of pages in PDF without loading images."""
	        self._require_pdf_deps()
	        from pypdf import PdfReader

	        reader = PdfReader(str(pdf_path))
	        return len(reader.pages)

	    def resize_for_colpali(
	        self,
	        image: Image.Image,
	        max_edge: int = 2048,
	        tile_size: int = 512,
	    ) -> Tuple[Image.Image, int, int]:
	        """
	        Resize image following ColPali/Idefics3 processor logic.

	        The longest edge is mapped to `max_edge` to choose a tile grid, the
	        image is scaled to fit inside that grid, and any remaining area is
	        filled by centering the result on a white canvas.

	        Args:
	            image: PIL Image
	            max_edge: Maximum edge length
	            tile_size: Size of each tile

	        Returns:
	            Tuple of (resized_image, tile_rows, tile_cols). The returned image
	            measures (tile_cols * tile_size) x (tile_rows * tile_size) pixels.

	        Raises:
	            ValueError: If the image has a zero-sized edge, or if `max_edge`
	                or `tile_size` is not positive.
	        """
	        if max_edge < 1 or tile_size < 1:
	            raise ValueError(
	                f"max_edge and tile_size must be >= 1, got {max_edge} and {tile_size}"
	            )

	        # Ensure consistent mode for downstream processors (and predictable tests)
	        if image.mode != "RGB":
	            image = image.convert("RGB")

	        w, h = image.size
	        if w < 1 or h < 1:
	            # Would otherwise surface as a bare ZeroDivisionError below.
	            raise ValueError(f"Cannot resize an empty image of size {w}x{h}")

	        # Step 1: Resize so longest edge = max_edge
	        if w > h:
	            new_w = max_edge
	            new_h = int(h * (max_edge / w))
	        else:
	            new_h = max_edge
	            new_w = int(w * (max_edge / h))

	        # Step 2: Calculate tile grid (ceil division). Clamp to one tile so that
	        # extreme aspect ratios, where the short edge rounds down to 0, still
	        # produce a valid grid.
	        tile_cols = max(1, (new_w + tile_size - 1) // tile_size)
	        tile_rows = max(1, (new_h + tile_size - 1) // tile_size)

	        # Step 3: Calculate exact dimensions for tiles
	        final_w = tile_cols * tile_size
	        final_h = tile_rows * tile_size

	        # Step 4: Scale to fit within tile grid, preserving aspect ratio
	        scale = min(final_w / w, final_h / h)

	        # Keep at least one pixel per edge; a zero-sized target cannot be resized.
	        scaled_w = max(1, int(w * scale))
	        scaled_h = max(1, int(h * scale))

	        resized = image.resize((scaled_w, scaled_h), Image.LANCZOS)

	        # Center on white canvas if needed
	        if scaled_w != final_w or scaled_h != final_h:
	            canvas = Image.new("RGB", (final_w, final_h), (255, 255, 255))
	            offset_x = (final_w - scaled_w) // 2
	            offset_y = (final_h - scaled_h) // 2
	            canvas.paste(resized, (offset_x, offset_y))
	            resized = canvas

	        return resized, tile_rows, tile_cols

	    def _sanitize_text(self, text: str) -> str:
	        """Remove invalid Unicode characters (surrogates) from text."""
	        if not text:
	            return ""

	        # Remove surrogate characters (U+D800-U+DFFF): "surrogatepass" lets them
	        # through the encoder, then the strict UTF-8 decoder drops their bytes.
	        return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")

	    def extract_metadata_from_filename(
	        self,
	        filename: str,
	        mapping: Optional[Dict[str, Dict[str, Any]]] = None,
	    ) -> Dict[str, Any]:
	        """
	        Extract metadata from PDF filename.

	        Uses mapping if provided, otherwise falls back to pattern matching.

	        Args:
	            filename: PDF filename (with or without .pdf extension)
	            mapping: Optional mapping dict {filename: metadata}; it is looked
	                up by the lowercased, stripped filename stem.

	        Returns:
	            A copy of the mapped metadata when the stem is found in `mapping`.
	            Otherwise a dict with "filename" and "source", plus "year" when a
	            20xx number is present and "district" when a name precedes
	            "dlg" / "district local government".
	        """
	        # Remove extension
	        stem = Path(filename).stem
	        stem_lower = stem.lower().strip()

	        # Try mapping first
	        if mapping:
	            if stem_lower in mapping:
	                return mapping[stem_lower].copy()

	            # Try without .pdf (covers names such as "report.pdf.pdf")
	            stem_no_ext = stem_lower.replace(".pdf", "")
	            if stem_no_ext in mapping:
	                return mapping[stem_no_ext].copy()

	        # Fallback: pattern matching
	        metadata: Dict[str, Any] = {"filename": filename}

	        # Extract year
	        year_match = self._YEAR_PATTERN.search(stem)
	        if year_match:
	            metadata["year"] = int(year_match.group(1))

	        # Detect source type; branch order matters, the first match wins
	        if "consolidated" in stem_lower or ("annual" in stem_lower and "oag" in stem_lower):
	            metadata["source"] = "Consolidated"
	        elif "dlg" in stem_lower or "district local government" in stem_lower:
	            metadata["source"] = "Local Government"
	            # Try to extract district name
	            district_match = self._DISTRICT_PATTERN.search(stem_lower)
	            if district_match:
	                metadata["district"] = district_match.group(1).title()
	        elif "hospital" in stem_lower or "referral" in stem_lower:
	            metadata["source"] = "Hospital"
	        elif "ministry" in stem_lower:
	            metadata["source"] = "Ministry"
	        elif "project" in stem_lower:
	            metadata["source"] = "Project"
	        else:
	            metadata["source"] = "Unknown"

	        return metadata