# Doc-Classifier / pdf_to_image.py
# Uploaded by Qasim-Dost ("Upload 5 files", commit ce9f3ac, verified)
"""
PDF to Image Conversion using PyMuPDF (fitz)
Converts all pages of a PDF to PIL Images.
"""
import fitz # PyMuPDF
from PIL import Image
from pathlib import Path
from typing import List, Tuple
import io
def pdf_to_images(pdf_path: str, dpi: int = 150) -> List[Image.Image]:
    """
    Convert all pages of a PDF to PIL Images.

    Args:
        pdf_path: Path to the PDF file
        dpi: Resolution for rendering (default 150, balance of quality/speed)

    Returns:
        List of RGB PIL Images, one per page, in page order. If the PDF
        cannot be opened or any page fails to render, the error is printed
        and an empty list is returned (pages rendered before the failure
        are discarded).
    """
    images = []
    try:
        # PDF coordinates are 72 units per inch, so dpi / 72 is the scale
        # factor. It is identical for every page, so build the matrix once.
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        # The context manager closes the document even if a page raises
        # mid-render; a trailing doc.close() would be skipped in that case.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # alpha=False yields a 3-channel RGB pixmap, so the raw
                # samples map directly onto a PIL "RGB" image without an
                # intermediate PNG encode/decode step.
                pix = page.get_pixmap(matrix=matrix, alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with
        # an empty list instead of propagating to the caller.
        print(f"Error converting {pdf_path}: {e}")
        return []
    return images
def get_pdf_page_count(pdf_path: str) -> int:
    """
    Get the number of pages in a PDF.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        The page count, or 0 if the file cannot be opened or read.
    """
    try:
        # The context manager guarantees the document handle is released,
        # including when len() itself raises.
        with fitz.open(pdf_path) as doc:
            return len(doc)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0
def collect_pdfs(folder_path: str, recursive: bool = True) -> List[Path]:
    """
    Collect all PDF files from a folder.

    The extension match is case-insensitive (".pdf", ".PDF", ...), so the
    result is the same on case-sensitive and case-insensitive filesystems.
    Only regular files are returned; a directory whose name happens to end
    in ".pdf" is skipped. The result is sorted so repeated runs see the
    files in a stable order.

    Args:
        folder_path: Path to folder containing PDFs
        recursive: Whether to search subfolders

    Returns:
        Sorted list of Path objects for each PDF (empty if the folder does
        not exist or contains no PDFs)
    """
    folder = Path(folder_path)
    candidates = folder.rglob("*") if recursive else folder.glob("*")
    return sorted(
        entry
        for entry in candidates
        if entry.name.lower().endswith(".pdf") and entry.is_file()
    )
if __name__ == "__main__":
    # Manual smoke test: convert the PDF named on the command line and
    # report how many pages were rendered.
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        # No path supplied: show how to invoke the script.
        print("Usage: python pdf_to_image.py <path_to_pdf>")
    else:
        target = cli_args[0]
        print(f"Converting: {target}")
        pages = pdf_to_images(target)
        print(f"Extracted {len(pages)} pages")
        if pages:
            first_page = pages[0]
            print(f"First page size: {first_page.size}")