pdf-trainer-api / backend /worker /pdf_render.py
Avinash
integrate real backend api
4a5269c
raw
history blame contribute delete
996 Bytes
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import List
import fitz # PyMuPDF
from PIL import Image
@dataclass
class RenderedImage:
    """Record describing one PDF page that was written out as an image file."""

    # Location of the image file on disk.
    path: Path
    # Zero-based index of the source page within the PDF.
    page_index: int
def render_pdf_to_pngs(pdf_path: Path, out_dir: Path, pages: int = 2, dpi: int = 200) -> List[RenderedImage]:
    """Render the leading pages of a PDF to RGB PNG files.

    Args:
        pdf_path: PDF file to open; its stem prefixes every output file name.
        out_dir: Destination directory, created (with parents) if missing.
        pages: Maximum number of pages to render, counted from the first page.
            Clamped to the document's page count; values <= 0 render nothing
            (the document is still opened).
        dpi: Target rendering resolution; must be positive.

    Returns:
        One ``RenderedImage`` per rendered page, in page order. ``page_index``
        is 0-based, while the output file names (``<stem>_p<N>.png``) use
        1-based page numbers.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    # Reject a degenerate scale up front, before touching the filesystem;
    # a zero/negative zoom would otherwise fail obscurely inside the renderer.
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")

    out_dir.mkdir(parents=True, exist_ok=True)

    # Zoom factor 1.0 corresponds to 72 dpi, so scale relative to that.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    rendered: List[RenderedImage] = []
    doc = fitz.open(pdf_path)
    try:
        n = min(pages, doc.page_count)
        for i in range(n):
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat, alpha=False)

            img_path = out_dir / f"{pdf_path.stem}_p{i+1}.png"
            pix.save(str(img_path))

            # Normalize to RGB with PIL (avoids weird modes). convert() loads
            # the pixel data, so the source handle can be closed before the
            # same path is overwritten.
            with Image.open(img_path) as im:
                rgb = im.convert("RGB")
            rgb.save(img_path)

            rendered.append(RenderedImage(path=img_path, page_index=i))
    finally:
        # Always release the document, even if rendering or saving failed.
        doc.close()

    return rendered