| | """PDF page rendering (PyMuPDF/fitz) — upfront bulk rendering at ingest time."""
|
| | from __future__ import annotations
|
| |
|
| | from pathlib import Path
|
| |
|
| | import fitz
|
| |
|
| | from config import PDF_RENDER_DPI
|
| |
|
| |
|
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Args:
        pdf_path: Path to the PDF file to inspect.

    Returns:
        The page count reported by the opened document.
    """
    # The context manager closes the document even if reading it raises,
    # so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        return len(doc)
|
| |
|
| |
|
def render_pages(pdf_path: str, output_dir: str, dpi: int = PDF_RENDER_DPI) -> int:
    """Render every PDF page as a PNG image.

    This is the primary rendering method, called once during PDF ingestion
    to pre-render all pages at the configured DPI.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory that receives the images; it is created
            (including parents) when it does not exist yet.
        dpi: Target rendering resolution. Defaults to ``PDF_RENDER_DPI``.

    Returns:
        The number of pages in the document. Each page is written to
        ``output_dir`` as ``page_<index>.png`` with a 0-based index.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Scale factor relative to the 72-units-per-inch base resolution.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    # The context manager guarantees the document is closed even when
    # rendering or writing one of the pages fails midway.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)
        for page_num in range(num_pages):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=matrix)
            img_path = out / f"page_{page_num}.png"
            img_path.write_bytes(pix.tobytes("png"))

    return num_pages
|
| |
|
| |
|
def render_single_page(
    pdf_path: str,
    page_num: int,
    output_dir: str,
    dpi: int = PDF_RENDER_DPI,
) -> None:
    """Render a single PDF page as a PNG and save to disk.

    Args:
        pdf_path: Path to the source PDF.
        page_num: Index of the page to render; it is also used in the
            output file name ``page_<page_num>.png``.
        output_dir: Directory that receives the image; it is created
            (including parents) when it does not exist yet.
        dpi: Target rendering resolution. Defaults to ``PDF_RENDER_DPI``.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Scale factor relative to the 72-units-per-inch base resolution.
    zoom = dpi / 72.0
    img_path = out / f"page_{page_num}.png"

    # The context manager closes the document even if loading or rendering
    # the page raises (for example on an invalid page index).
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        img_path.write_bytes(pix.tobytes("png"))
|
| |
|
| |
|
def extract_page_range_bytes(pdf_path: str, start: int, end: int) -> bytes:
    """Extract a range of pages from a PDF and return as in-memory PDF bytes.

    Args:
        pdf_path: Path to the source PDF.
        start: First page index (0-indexed, inclusive).
        end: Last page index (0-indexed, inclusive).

    Returns:
        Raw bytes of a new PDF containing only the specified pages.
    """
    # Both documents are managed by one ``with`` statement so that each is
    # closed even if copying or serialising fails. On exit the new document
    # is closed first, then the source, matching the original close order.
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        dst.insert_pdf(src, from_page=start, to_page=end)
        return dst.tobytes()
|
| |
|
| |
|
def get_page_image_bytes(
    page_image_dir: str,
    page_num: int,
) -> bytes:
    """Read a previously rendered page image and return its raw bytes.

    The image is looked up as ``page_<page_num>.png`` inside
    ``page_image_dir``; it is expected to exist already from the upfront
    bulk render performed during PDF ingestion.

    Args:
        page_image_dir: Directory holding the pre-rendered page images.
        page_num: Page index used in the image file name.

    Returns:
        The full binary content of the image file.
    """
    image_file = Path(page_image_dir).joinpath(f"page_{page_num}.png")
    with image_file.open("rb") as handle:
        return handle.read()
|
| |
|