"""PDF helpers: rasterize pages to PIL images and extract native text blocks."""
from __future__ import annotations

from pathlib import Path
from typing import List

import fitz  # PyMuPDF
from PIL import Image

from .schema import OCRBlock

# 72 dpi is the PDF default; the zoom factor is expressed relative to it.
_PDF_BASE_DPI = 72.0

# Value of the ``block_type`` field that PyMuPDF documents for text blocks
# in ``get_text("blocks")`` tuples (1 marks an image block).
_TEXT_BLOCK_TYPE = 0


def pdf_to_images(pdf_path: Path, dpi: int = 200) -> List[Image.Image]:
    """
    Render each PDF page to a PIL Image (RGB).

    Args:
        pdf_path: path to a .pdf file
        dpi: target DPI for rasterization (higher = sharper but slower)

    Returns:
        List of PIL Images, one per page, in document page order.

    Raises:
        ValueError: if ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    zoom = dpi / _PDF_BASE_DPI
    mat = fitz.Matrix(zoom, zoom)

    images: List[Image.Image] = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Request RGB without alpha explicitly so the sample buffer is
            # guaranteed to match the "RGB" mode passed to Pillow below.
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB, alpha=False)
            # Pillow documents ``size`` as a 2-tuple (width, height).
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return images


def pdf_native_blocks(pdf_path: Path) -> list[OCRBlock]:
    """
    Extract the text blocks embedded in a PDF as ``OCRBlock`` records.

    Each non-empty text block becomes one ``OCRBlock`` with a 1-based page
    number, its bounding box truncated to integers, the stripped block text,
    and a confidence of 1.0. Blocks whose text is empty or whitespace-only
    are skipped, as are blocks flagged as images.

    Args:
        pdf_path: path to a .pdf file

    Returns:
        List of OCRBlock records in page order, then in the order PyMuPDF
        reports the blocks of each page.
    """
    blocks: list[OCRBlock] = []
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            # Each item: (x0, y0, x1, y1, text, block_no, block_type)
            for x0, y0, x1, y1, txt, *extra in page.get_text("blocks"):
                # Image blocks carry a placeholder description in the text
                # slot rather than real page text; do not report them as
                # native text. Tolerate shorter tuples by only checking the
                # type when the field is present.
                if len(extra) >= 2 and extra[1] != _TEXT_BLOCK_TYPE:
                    continue

                text = txt.strip() if txt else ""
                if not text:
                    continue

                blocks.append(
                    OCRBlock(
                        page=page_no,
                        bbox=(int(x0), int(y0), int(x1), int(y1)),
                        text=text,
                        confidence=1.0,
                    )
                )
    return blocks