Spaces:
Sleeping
Sleeping
File size: 1,650 Bytes
d56c6ae | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import os
from typing import Optional
import fitz # PyMuPDF
from src.config import PDF_IMAGE_DPI, PDF_IMAGE_BASE_DIR
# --------------------------------------------------
# PDF TO IMAGE CONVERSION
# --------------------------------------------------
def pdf_to_images(
    pdf_path: str,
    base_dir: Optional[str] = None
) -> str:
    """
    Convert a multi-page PDF into individual PNG images.

    Each page is rendered at ``PDF_IMAGE_DPI`` and written as
    ``page_<n>.png`` (``n`` starting at 1) inside a directory named
    after the PDF file, located under the base output directory.

    Parameters
    ----------
    pdf_path : str
        Path to the input PDF file.
    base_dir : str, optional
        Base directory where page images will be stored.
        Falls back to the configured PDF_IMAGE_BASE_DIR when omitted
        or empty.

    Returns
    -------
    str
        Name of the PDF file (without extension), used
        as the output folder name.

    Notes
    -----
    The output folder is derived from the file's base name only, so
    two PDFs with the same file name in different directories map to
    the same output folder.
    """
    # Resolve base output directory
    output_base: str = base_dir or PDF_IMAGE_BASE_DIR

    # Extract PDF name (without extension)
    pdf_name: str = os.path.splitext(os.path.basename(pdf_path))[0]

    # Create output directory for this PDF
    output_dir: str = os.path.join(output_base, pdf_name)
    os.makedirs(output_dir, exist_ok=True)

    # Open the document in a context manager so it is closed even when
    # rendering or saving a page raises; previously a failure mid-loop
    # skipped the explicit close() and leaked the handle.
    with fitz.open(pdf_path) as document:
        # Render each page as a PNG image at the configured DPI
        for page_index, page in enumerate(document, start=1):
            pixmap = page.get_pixmap(dpi=PDF_IMAGE_DPI)
            pixmap.save(
                os.path.join(output_dir, f"page_{page_index}.png")
            )

    return pdf_name
|