# File: OCR/ocrpkg/pdf.py
# Provenance (from the hosting page): uploader avatar "Eyob-Sol",
# commit 9a5a8ff ("Upload 15 files", verified), file size 1.41 kB.
from __future__ import annotations
from pathlib import Path
from typing import List
import fitz # PyMuPDF
from PIL import Image
def pdf_to_images(pdf_path: Path, dpi: int = 200) -> List[Image.Image]:
    """
    Render each PDF page to a PIL Image (RGB).

    Args:
        pdf_path: path to a .pdf file
        dpi: target DPI for rasterization (higher = sharper but slower);
            must be greater than zero

    Returns:
        List of PIL Images, one per page, in document page order.

    Raises:
        ValueError: if ``dpi`` is zero or negative.
    """
    # Fail early with a clear message: a zero/negative DPI would otherwise
    # produce a degenerate or mirrored render matrix further down.
    if dpi <= 0:
        raise ValueError(f"dpi must be a positive number, got {dpi!r}")

    zoom = dpi / 72.0  # 72 dpi is the PDF default
    mat = fitz.Matrix(zoom, zoom)

    images: List[Image.Image] = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Ask explicitly for an opaque RGB pixmap so the raw sample
            # buffer layout matches the "RGB" mode handed to PIL below.
            pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(img)
    return images
from .schema import OCRBlock
def pdf_native_blocks(pdf_path: Path) -> list[OCRBlock]:
    """
    Extract the embedded (native) text of a PDF as a list of OCRBlock items.

    Every non-empty text block reported by PyMuPDF becomes one OCRBlock with
    a 1-based page number, an integer bounding box, the stripped block text
    and a fixed confidence of 1.0.

    Args:
        pdf_path: path to a .pdf file

    Returns:
        List of OCRBlock objects in the order PyMuPDF yields them, page by
        page. Blocks whose text is empty or whitespace-only are omitted, as
        are non-text (image) blocks.
    """
    blocks: list[OCRBlock] = []
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            # Each entry is (x0, y0, x1, y1, text, block_no, block_type).
            for x0, y0, x1, y1, txt, *rest in page.get_text("blocks"):
                # block_type != 0 marks a non-text (image) block whose "text"
                # is only a metadata placeholder, not real page content.
                if len(rest) >= 2 and rest[1] != 0:
                    continue
                text = txt.strip() if txt else ""
                if not text:
                    continue
                blocks.append(
                    OCRBlock(
                        page=page_no,
                        # int() truncates the float coordinates toward zero.
                        bbox=(int(x0), int(y0), int(x1), int(y1)),
                        text=text,
                        confidence=1.0,
                    )
                )
    return blocks