Spaces:

NYSERDA-CRE-Working-Group
/

Updated_code_complaince

Running

App Files Files Community

Updated_code_complaince / tools /pdf_processor.py

Ryan2219

Upload 70 files

e1ced8e verified 16 days ago

raw

history blame contribute delete

2.75 kB

	"""PDF page rendering (PyMuPDF/fitz) — upfront bulk rendering at ingest time."""
	from __future__ import annotations

	from pathlib import Path

	import fitz # PyMuPDF

	from config import PDF_RENDER_DPI


	def get_page_count(pdf_path: str) -> int:
	    """Return the number of pages in a PDF without rendering anything.

	    Args:
	        pdf_path: Path to the PDF file to inspect.

	    Returns:
	        The page count reported by the opened document.
	    """
	    # The context manager closes the document even if reading the
	    # page count raises, so the file handle is never leaked.
	    with fitz.open(pdf_path) as doc:
	        return len(doc)


	def render_pages(pdf_path: str, output_dir: str, dpi: int = PDF_RENDER_DPI) -> int:
	    """Render every PDF page as a PNG image.

	    This is the primary rendering method, called once during PDF ingestion
	    to pre-render all pages at the configured DPI.

	    Args:
	        pdf_path: Path to the source PDF.
	        output_dir: Directory that receives the images; created (with
	            parents) if it does not exist yet.
	        dpi: Target resolution of the rendered images.

	    Returns:
	        The number of pages in the document, i.e. the number of
	        ``page_<index>.png`` files written (indices are 0-based).
	    """
	    out = Path(output_dir)
	    out.mkdir(parents=True, exist_ok=True)

	    # PDF user space is defined at 72 units per inch, so dpi / 72 is the
	    # scale factor needed to reach the requested resolution.
	    zoom = dpi / 72.0
	    matrix = fitz.Matrix(zoom, zoom)

	    # The context manager guarantees the document is closed even when a
	    # page fails to render or an image cannot be written to disk.
	    with fitz.open(pdf_path) as doc:
	        num_pages = len(doc)
	        for page_num in range(num_pages):
	            page = doc.load_page(page_num)
	            pix = page.get_pixmap(matrix=matrix)
	            img_path = out / f"page_{page_num}.png"
	            img_path.write_bytes(pix.tobytes("png"))

	    return num_pages


	def render_single_page(
	    pdf_path: str,
	    page_num: int,
	    output_dir: str,
	    dpi: int = PDF_RENDER_DPI,
	) -> None:
	    """Render a single PDF page as a PNG and save to disk.

	    Args:
	        pdf_path: Path to the source PDF.
	        page_num: Index of the page to render; it is also used in the
	            output file name ``page_<page_num>.png``.
	        output_dir: Directory that receives the image; created (with
	            parents) if it does not exist yet.
	        dpi: Target resolution of the rendered image.
	    """
	    out = Path(output_dir)
	    out.mkdir(parents=True, exist_ok=True)

	    # PDF user space is defined at 72 units per inch, so dpi / 72 is the
	    # scale factor needed to reach the requested resolution.
	    zoom = dpi / 72.0
	    img_path = out / f"page_{page_num}.png"

	    # The context manager closes the document even if the page cannot be
	    # loaded (e.g. an invalid page_num) or the image cannot be written.
	    with fitz.open(pdf_path) as doc:
	        page = doc.load_page(page_num)
	        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
	        img_path.write_bytes(pix.tobytes("png"))

	def extract_page_range_bytes(pdf_path: str, start: int, end: int) -> bytes:
	    """Extract a range of pages from a PDF and return as in-memory PDF bytes.

	    Args:
	        pdf_path: Path to the source PDF.
	        start: First page index (0-indexed, inclusive).
	        end: Last page index (0-indexed, inclusive).

	    Returns:
	        Raw bytes of a new PDF containing only the specified pages.
	    """
	    # Both documents are managed by one ``with`` statement so they are
	    # released even if the copy or serialization step raises. They are
	    # closed in reverse order: the new PDF first, then the source.
	    with fitz.open(pdf_path) as src, fitz.open() as dst:  # dst: new empty PDF
	        dst.insert_pdf(src, from_page=start, to_page=end)
	        return dst.tobytes()


	def get_page_image_bytes(
	    page_image_dir: str,
	    page_num: int,
	) -> bytes:
	    """Read the PNG bytes of an already-rendered page from disk.

	    The image is looked up as ``page_<page_num>.png`` inside
	    ``page_image_dir``; nothing is rendered here, so the file must
	    already exist (a missing file raises ``FileNotFoundError``).
	    """
	    image_file = Path(page_image_dir).joinpath(f"page_{page_num}.png")
	    with image_file.open("rb") as handle:
	        return handle.read()