"""PDF -> list of A4-sized RGB page images using PyMuPDF (fitz). "Native split" handling: if a page is already ~A4 portrait we render it as-is. If a page is much larger / a different ratio (e.g. an A3 spread, a long scan), we slice it into A4-height bands so downstream tearing always works on A4 tiles. """ from __future__ import annotations import numpy as np from .config import A4_ASPECT, MAX_PAGES_PER_PDF, a4_pixels def _page_pixmap(page, dpi: int) -> np.ndarray: import fitz # PyMuPDF zoom = dpi / 72.0 pm = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False) arr = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.h, pm.w, pm.n) if pm.n == 4: # RGBA -> RGB arr = arr[:, :, :3] elif pm.n == 1: # gray -> RGB arr = np.repeat(arr, 3, axis=2) return np.ascontiguousarray(arr) def _fit_to_a4(img: np.ndarray, dpi: int) -> list[np.ndarray]: """Return one or more A4-portrait tiles covering `img`. Tall pages are sliced into A4-height bands (each band padded to A4 width). """ target_w, target_h = a4_pixels(dpi) H, W = img.shape[:2] aspect = H / W # Close enough to A4 portrait: letterbox-resize the whole page onto A4. if abs(aspect - A4_ASPECT) < 0.12: return [_letterbox(img, target_w, target_h)] # Otherwise scale to A4 width, then slice the (now-tall) image into bands. scale = target_w / W new_h = max(1, int(round(H * scale))) resized = _resize(img, target_w, new_h) tiles = [] for top in range(0, new_h, target_h): band = resized[top:top + target_h] if band.shape[0] < target_h: band = _letterbox(band, target_w, target_h) tiles.append(band) return tiles def _resize(img: np.ndarray, w: int, h: int) -> np.ndarray: from PIL import Image return np.asarray(Image.fromarray(img).resize((w, h), Image.LANCZOS)) def _letterbox(img: np.ndarray, w: int, h: int) -> np.ndarray: """Resize preserving aspect, pad with white onto a w*h canvas.""" H, W = img.shape[:2] scale = min(w / W, h / H) nw, nh = max(1, int(W * scale)), max(1, int(H * scale)) resized = _resize(img, nw, nh) canvas = np.full((h, w, 3), 255, dtype=np.uint8) oy, ox = (h - nh) // 2, (w - nw) // 2 canvas[oy:oy + nh, ox:ox + nw] = resized return canvas def load_pdf_pages(path: str, dpi: int) -> list[np.ndarray]: """Render `path` into a list of A4 RGB uint8 page images.""" import fitz doc = fitz.open(path) try: out: list[np.ndarray] = [] for page in doc: raw = _page_pixmap(page, dpi) out.extend(_fit_to_a4(raw, dpi)) if len(out) >= MAX_PAGES_PER_PDF: return out[:MAX_PAGES_PER_PDF] return out finally: doc.close()