"""
PDF to Image Conversion using PyMuPDF (fitz)

Converts all pages of a PDF to PIL Images.
"""

import io
from pathlib import Path
from typing import List, Tuple

import fitz  # PyMuPDF
from PIL import Image


def pdf_to_images(pdf_path: str, dpi: int = 150) -> List[Image.Image]:
    """
    Convert all pages of a PDF to PIL Images.

    Each page is rendered to a pixmap, encoded as PNG in memory, decoded with
    PIL and converted to RGB mode.

    Args:
        pdf_path: Path to the PDF file
        dpi: Resolution for rendering (default 150, balance of quality/speed)

    Returns:
        List of PIL Images, one per page. If anything fails while opening or
        rendering, the error is printed and an empty list is returned; pages
        rendered before the failure are discarded.
    """
    images = []

    try:
        # The context manager closes the document even when a page fails to
        # render, so the file handle is not leaked on the error path.
        with fitz.open(pdf_path) as doc:
            # 72 is the default PDF DPI, so dpi / 72 is the scale factor.
            # The matrix is identical for every page, so build it once.
            zoom = dpi / 72
            matrix = fitz.Matrix(zoom, zoom)

            for page in doc:
                # Create pixmap at specified DPI
                pix = page.get_pixmap(matrix=matrix)

                # Convert to PIL Image via an in-memory PNG
                img_data = pix.tobytes("png")
                img = Image.open(io.BytesIO(img_data))
                images.append(img.convert("RGB"))
    except Exception as e:
        # Best-effort API: report the problem and signal failure with [].
        print(f"Error converting {pdf_path}: {e}")
        return []

    return images


def get_pdf_page_count(pdf_path: str) -> int:
    """
    Get the number of pages in a PDF.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        The page count, or 0 if the file cannot be opened or read.
    """
    try:
        with fitz.open(pdf_path) as doc:
            return len(doc)
    except Exception:
        # Deliberately best-effort, but do not swallow KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` would.
        return 0


def collect_pdfs(folder_path: str, recursive: bool = True) -> List[Path]:
    """
    Collect all PDF files from a folder.

    Matches files against the pattern ``*.pdf``.

    Args:
        folder_path: Path to folder containing PDFs
        recursive: Whether to search subfolders

    Returns:
        List of Path objects for each PDF
    """
    folder = Path(folder_path)
    matches = folder.rglob("*.pdf") if recursive else folder.glob("*.pdf")
    return list(matches)


if __name__ == "__main__":
    # Quick test
    import sys

    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
        print(f"Converting: {pdf_path}")
        images = pdf_to_images(pdf_path)
        print(f"Extracted {len(images)} pages")
        if images:
            print(f"First page size: {images[0].size}")
    else:
        print("Usage: python pdf_to_image.py PDF_PATH")