Updated_code_complaince / tools /pdf_processor.py
Ryan2219's picture
Upload 70 files
e1ced8e verified
"""PDF page rendering (PyMuPDF/fitz) — upfront bulk rendering at ingest time."""
from __future__ import annotations
from pathlib import Path
import fitz # PyMuPDF
from config import PDF_RENDER_DPI
def get_page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF without rendering anything.

    Args:
        pdf_path: Path to the PDF file to inspect.

    Returns:
        The page count reported by the opened document.
    """
    # The context manager closes the document even if reading the page
    # count raises, so the handle is never leaked.
    with fitz.open(pdf_path) as doc:
        return len(doc)
def render_pages(pdf_path: str, output_dir: str, dpi: int = PDF_RENDER_DPI) -> int:
    """Render every PDF page as a PNG image.

    This is the primary rendering method, called once during PDF ingestion
    to pre-render all pages at the configured DPI.

    Args:
        pdf_path: Path to the source PDF.
        output_dir: Directory that receives the images; it is created
            (including parents) if it does not exist.
        dpi: Target resolution; converted to a zoom factor of ``dpi / 72``.

    Returns:
        The number of pages in the document (one PNG is written per page,
        named ``page_<index>.png`` with a 0-based index).
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Loop-invariant: the same scaling matrix is reused for every page.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    # The context manager guarantees the document is closed even when a
    # page fails to render or an image cannot be written mid-loop.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)
        for page_num in range(num_pages):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=matrix)
            (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))

    return num_pages
def render_single_page(
    pdf_path: str,
    page_num: int,
    output_dir: str,
    dpi: int = PDF_RENDER_DPI,
) -> None:
    """Render a single PDF page as a PNG and save to disk.

    Args:
        pdf_path: Path to the source PDF.
        page_num: Index of the page to render, passed straight to
            ``load_page`` and used in the output file name.
        output_dir: Directory that receives the image; it is created
            (including parents) if it does not exist.
        dpi: Target resolution; converted to a zoom factor of ``dpi / 72``.

    The image is written to ``<output_dir>/page_<page_num>.png``.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    # The context manager closes the document even if the page cannot be
    # loaded (e.g. bad index), rendered, or written to disk.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=matrix)
        (out / f"page_{page_num}.png").write_bytes(pix.tobytes("png"))
def extract_page_range_bytes(pdf_path: str, start: int, end: int) -> bytes:
    """Extract a range of pages from a PDF and return as in-memory PDF bytes.

    Args:
        pdf_path: Path to the source PDF.
        start: First page index (0-indexed, inclusive).
        end: Last page index (0-indexed, inclusive).

    Returns:
        Raw bytes of a new PDF containing only the specified pages.
    """
    # ``fitz.open()`` with no arguments creates a new empty PDF to copy into.
    # Both handles are released on any exit path; on leaving the ``with``
    # the destination is closed before the source, as in the original order.
    with fitz.open(pdf_path) as src, fitz.open() as dst:
        dst.insert_pdf(src, from_page=start, to_page=end)
        return dst.tobytes()
def get_page_image_bytes(
    page_image_dir: str,
    page_num: int,
) -> bytes:
    """Read the stored PNG for one page and return its raw bytes.

    The image is looked up as ``page_<page_num>.png`` inside
    ``page_image_dir``; it is expected to have been produced already by the
    upfront bulk render performed during PDF ingestion.

    Args:
        page_image_dir: Directory holding the pre-rendered page images.
        page_num: Page index used in the image file name.

    Returns:
        The complete contents of the image file.
    """
    image_file = Path(page_image_dir, f"page_{page_num}.png")
    with image_file.open("rb") as handle:
        return handle.read()