"""PDF page rendering (PyMuPDF/fitz) — upfront bulk rendering at ingest time.""" from __future__ import annotations from pathlib import Path import fitz # PyMuPDF from config import PDF_RENDER_DPI def get_page_count(pdf_path: str) -> int: """Return the number of pages in a PDF without rendering anything.""" doc = fitz.open(pdf_path) count = len(doc) doc.close() return count def render_pages(pdf_path: str, output_dir: str, dpi: int = PDF_RENDER_DPI) -> int: """Render every PDF page as a PNG image. This is the primary rendering method, called once during PDF ingestion to pre-render all pages at the configured DPI. """ out = Path(output_dir) out.mkdir(parents=True, exist_ok=True) doc = fitz.open(pdf_path) num_pages = len(doc) zoom = dpi / 72.0 matrix = fitz.Matrix(zoom, zoom) for page_num in range(num_pages): page = doc.load_page(page_num) pix = page.get_pixmap(matrix=matrix) img_bytes = pix.tobytes("png") img_path = out / f"page_{page_num}.png" with open(img_path, "wb") as f: f.write(img_bytes) doc.close() return num_pages def render_single_page( pdf_path: str, page_num: int, output_dir: str, dpi: int = PDF_RENDER_DPI, ) -> None: """Render a single PDF page as a PNG and save to disk.""" out = Path(output_dir) out.mkdir(parents=True, exist_ok=True) doc = fitz.open(pdf_path) zoom = dpi / 72.0 page = doc.load_page(page_num) pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom)) img_path = out / f"page_{page_num}.png" with open(img_path, "wb") as f: f.write(pix.tobytes("png")) doc.close() def extract_page_range_bytes(pdf_path: str, start: int, end: int) -> bytes: """Extract a range of pages from a PDF and return as in-memory PDF bytes. Args: pdf_path: Path to the source PDF. start: First page index (0-indexed, inclusive). end: Last page index (0-indexed, inclusive). Returns: Raw bytes of a new PDF containing only the specified pages. """ src = fitz.open(pdf_path) dst = fitz.open() # new empty PDF dst.insert_pdf(src, from_page=start, to_page=end) pdf_bytes = dst.tobytes() dst.close() src.close() return pdf_bytes def get_page_image_bytes( page_image_dir: str, page_num: int, ) -> bytes: """Load a pre-rendered page image from disk. Pages are expected to already exist from the upfront bulk render performed during PDF ingestion. """ path = Path(page_image_dir) / f"page_{page_num}.png" return path.read_bytes()