Spaces:
Runtime error
Runtime error
| """ | |
| PDF to Image Conversion using PyMuPDF (fitz) | |
| Converts all pages of a PDF to PIL Images. | |
| """ | |
| import fitz # PyMuPDF | |
| from PIL import Image | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| import io | |
def pdf_to_images(pdf_path: str, dpi: int = 150) -> List[Image.Image]:
    """
    Convert all pages of a PDF to PIL Images.

    Args:
        pdf_path: Path to the PDF file
        dpi: Resolution for rendering (default 150, balance of quality/speed)

    Returns:
        List of RGB PIL Images, one per page, in page order. If the document
        cannot be opened or any page fails to render, the error is printed
        and an empty list is returned (no partial results).
    """
    images: List[Image.Image] = []
    try:
        # The context manager closes the document even if rendering a page
        # raises, so the file handle is not leaked on the error path.
        with fitz.open(pdf_path) as doc:
            # PDF user space is 72 units per inch; scaling by dpi / 72
            # renders at the requested resolution. The matrix is the same
            # for every page, so build it once.
            zoom = dpi / 72
            matrix = fitz.Matrix(zoom, zoom)

            for page in doc:
                pix = page.get_pixmap(matrix=matrix)
                # Round-trip through PNG bytes to hand the pixels to PIL.
                img_data = pix.tobytes("png")
                img = Image.open(io.BytesIO(img_data))
                # convert() loads the pixel data and returns an image that
                # is independent of the temporary PNG buffer.
                images.append(img.convert("RGB"))
    except Exception as e:
        # Best-effort API: report the failure and return no images rather
        # than propagating the error to the caller.
        print(f"Error converting {pdf_path}: {e}")
        return []
    return images
def get_pdf_page_count(pdf_path: str) -> int:
    """
    Get the number of pages in a PDF.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        The page count, or 0 if the file cannot be opened or read.
    """
    try:
        # The context manager closes the document on every exit path.
        with fitz.open(pdf_path) as doc:
            return len(doc)
    except Exception:
        # Best-effort: any failure to open/read counts as "no pages".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0
def collect_pdfs(folder_path: str, recursive: bool = True) -> List[Path]:
    """
    Collect all PDF files from a folder.

    The extension is matched case-insensitively, so ``report.PDF`` is found
    on case-sensitive filesystems too. Only regular files are returned; a
    directory whose name happens to end in ``.pdf`` is skipped.

    Args:
        folder_path: Path to folder containing PDFs
        recursive: Whether to search subfolders

    Returns:
        Sorted list of Path objects, one per PDF file found
    """
    folder = Path(folder_path)
    candidates = folder.rglob("*") if recursive else folder.glob("*")
    # Sort so the result does not depend on filesystem enumeration order.
    return sorted(
        entry
        for entry in candidates
        if entry.name.lower().endswith(".pdf") and entry.is_file()
    )
if __name__ == "__main__":
    # Manual smoke test: convert the PDF named on the command line and
    # report how many pages were rendered.
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python pdf_to_image.py <path_to_pdf>")
    else:
        pdf_path = cli_args[0]
        print(f"Converting: {pdf_path}")
        images = pdf_to_images(pdf_path)
        print(f"Extracted {len(images)} pages")
        if images:
            print(f"First page size: {images[0].size}")