# DocAI / src/pdf_io.py
# Pengyuan Li
# Add ZeroGPU support for DocAI demo on HuggingFace Spaces
# c37e95b
"""
PDF I/O utilities - Load and render PDFs
Placeholder for Task 2
"""
from typing import List, Tuple
from PIL import Image
import io
def load_pdf_pages(pdf_bytes: bytes, dpi: int = 150, max_pages: int = 10) -> List[Image.Image]:
    """
    Load a PDF from memory and render its leading pages to PIL Images.

    Args:
        pdf_bytes: PDF file as bytes
        dpi: Resolution for rendering (default 150)
        max_pages: Maximum pages to render (default 10)

    Returns:
        List of RGB PIL Images, one per rendered page, in page order.
        If PyMuPDF is not installed, a warning is printed and up to three
        grey 800x1000 placeholder images (never more than ``max_pages``)
        are returned instead.

    Raises:
        Any exception PyMuPDF raises while opening or rendering the data
        propagates to the caller; only a missing PyMuPDF install is
        handled here.
    """
    # Keep the try body to the import alone so that only a missing
    # dependency triggers the placeholder path.
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Stub: if fitz not available
        print("⚠️ PyMuPDF not available, returning placeholder")
        placeholder_count = max(0, min(3, max_pages))
        # Build a separate image per slot: `[img] * n` would alias one
        # mutable Image, so editing one "page" would change them all.
        return [
            Image.new("RGB", (800, 1000), color=(200, 200, 200))
            for _ in range(placeholder_count)
        ]

    # PDF page units are 1/72 inch, so dpi / 72 is the zoom factor.
    # The matrix is the same for every page, so build it once.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    pages = []
    # The context manager closes the document even if a page fails to render.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for i in range(min(len(doc), max_pages)):
            # alpha=False keeps the samples at 3 bytes per pixel, which is
            # what the "RGB" mode passed to frombytes() expects.
            pix = doc[i].get_pixmap(matrix=matrix, alpha=False)
            pages.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
    return pages
def get_page_count(pdf_bytes: bytes) -> int:
    """
    Get total page count of PDF.

    Args:
        pdf_bytes: PDF file as bytes

    Returns:
        Number of pages in the document, or 0 if the count cannot be
        determined (PyMuPDF is not installed, or the data cannot be
        opened as a PDF).
    """
    try:
        import fitz  # PyMuPDF

        # The context manager releases the document handle on every exit
        # path, including an error raised while reading the page count.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return len(doc)
    except Exception:
        # Deliberate best-effort: callers treat 0 as "unknown / unreadable",
        # so a missing dependency or a bad file must not raise from here.
        return 0