| from __future__ import annotations |
| from pathlib import Path |
| from typing import List |
|
|
| import fitz |
| from PIL import Image |
|
|
|
|
def pdf_to_images(pdf_path: Path, dpi: int = 200) -> List[Image.Image]:
    """
    Render each PDF page to a PIL Image (RGB).

    Args:
        pdf_path: path to a .pdf file
        dpi: target DPI for rasterization (higher = sharper but slower);
            must be greater than zero

    Returns:
        List of PIL Images, one per page, in document page order.

    Raises:
        ValueError: if ``dpi`` is not positive.
    """
    # A zero or negative DPI would give a degenerate / mirroring matrix and
    # fail (or misrender) deep inside the rasterizer; reject it up front.
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # The scale factor is expressed relative to a 72-units-per-inch baseline.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    images: List[Image.Image] = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Pin 3-channel RGB without alpha so the sample buffer layout
            # always matches the "RGB" mode handed to Image.frombytes.
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images
|
|
| from .schema import OCRBlock |
|
|
def pdf_native_blocks(pdf_path: Path) -> list[OCRBlock]:
    """
    Extract the embedded (native) text blocks of a PDF as OCRBlock records.

    Args:
        pdf_path: path to a .pdf file

    Returns:
        One OCRBlock per non-empty text block, in the order PyMuPDF yields
        them. ``page`` is 1-based, ``text`` is whitespace-stripped,
        ``confidence`` is fixed at 1.0, and ``bbox`` holds the block
        coordinates exactly as returned by PyMuPDF (no DPI scaling is
        applied here), truncated to ints.
    """
    blocks: list[OCRBlock] = []
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            for raw in page.get_text("blocks"):
                x0, y0, x1, y1, txt, *extra = raw

                # Trailing fields are (block_no, block_type). A non-zero
                # block_type marks an image block, whose "text" is only an
                # image-metadata placeholder and must not be emitted as text.
                if len(extra) >= 2 and extra[1] != 0:
                    continue

                cleaned = txt.strip() if txt else ""
                if not cleaned:
                    continue

                blocks.append(
                    OCRBlock(
                        page=page_no,
                        bbox=(int(x0), int(y0), int(x1), int(y1)),
                        text=cleaned,
                        confidence=1.0,
                    )
                )
    return blocks