Spaces:

Eyob-Sol
/

OCR

Sleeping

OCR / ocrpkg /pdf.py

Upload 15 files

9a5a8ff verified 10 months ago

1.41 kB

	from __future__ import annotations
	from pathlib import Path
	from typing import List

	import fitz # PyMuPDF
	from PIL import Image


	def pdf_to_images(pdf_path: Path, dpi: int = 200) -> List[Image.Image]:
	    """
	    Render each PDF page to a PIL Image (RGB).

	    Args:
	        pdf_path: path to a .pdf file
	        dpi: target DPI for rasterization (higher = sharper but slower);
	            must be greater than zero

	    Returns:
	        List of PIL Images, one per page, in document page order.

	    Raises:
	        ValueError: if ``dpi`` is zero or negative.
	    """
	    # Reject a non-positive DPI before touching the file: it would produce a
	    # zero/negative zoom matrix and fail later with an unrelated-looking error.
	    if dpi <= 0:
	        raise ValueError(f"dpi must be positive, got {dpi!r}")

	    zoom = dpi / 72.0  # 72 dpi is the PDF default
	    mat = fitz.Matrix(zoom, zoom)

	    images: List[Image.Image] = []
	    with fitz.open(pdf_path) as doc:
	        for page in doc:
	            # alpha=False keeps the sample buffer at 3 bytes per pixel, which
	            # is the layout Image.frombytes expects for "RGB" mode.
	            pix = page.get_pixmap(matrix=mat, alpha=False)
	            images.append(
	                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	            )
	    return images

	from .schema import OCRBlock

	def pdf_native_blocks(pdf_path: Path) -> list[OCRBlock]:
	    """
	    Collect the text blocks embedded in a PDF as OCRBlock records.

	    No rasterization or recognition is performed; the text is read from the
	    document via PyMuPDF's ``page.get_text("blocks")``.

	    Args:
	        pdf_path: path to a .pdf file

	    Returns:
	        One OCRBlock per non-empty text block, in the order PyMuPDF reports
	        them. ``page`` is 1-based, ``bbox`` holds the block rectangle
	        truncated to ints, ``text`` is whitespace-stripped and ``confidence``
	        is fixed at 1.0.
	    """
	    blocks: list[OCRBlock] = []
	    with fitz.open(pdf_path) as doc:
	        for page_no, page in enumerate(doc, start=1):
	            # Each item is (x0, y0, x1, y1, text, block_no, block_type).
	            for x0, y0, x1, y1, txt, *rest in page.get_text("blocks"):
	                # block_type 0 is text; image blocks carry a placeholder
	                # string ("<image: ...>") that must not be reported as text.
	                if len(rest) > 1 and rest[1] != 0:
	                    continue
	                text = txt.strip() if txt else ""
	                if not text:
	                    continue
	                blocks.append(
	                    OCRBlock(
	                        page=page_no,
	                        bbox=(int(x0), int(y0), int(x1), int(y1)),
	                        text=text,
	                        confidence=1.0,
	                    )
	                )
	    return blocks