import os

import fitz  # PyMuPDF
import pytesseract
import cv2
import numpy as np

# Use bundled Hebrew data.
# NOTE: this is a relative path, so it is resolved against the process's
# current working directory at the time Tesseract is invoked.
os.environ["TESSDATA_PREFIX"] = "./tessdata"

# Fractions of the page kept for OCR: rows from 12% to 88% of the height and
# columns from 5% to 95% of the width. Presumably this trims headers, footers
# and side margins -- confirm against the documents being processed.
_CROP_TOP = 0.12
_CROP_BOTTOM = 0.88
_CROP_LEFT = 0.05
_CROP_RIGHT = 0.95


def extract_pages(pdf_path, *, dpi=300, lang="heb"):
    """Render each page of a PDF to an image and OCR it with Tesseract.

    Every page is rasterised, cropped to its central region, converted to
    grayscale, binarised with Otsu's threshold and passed to Tesseract.

    Args:
        pdf_path: Path of the PDF file, passed directly to ``fitz.open``.
        dpi: Rendering resolution handed to ``page.get_pixmap``.
        lang: Tesseract language code used for recognition.

    Returns:
        A list with one dict per page, in document order. Each dict has the
        keys ``"page"`` (1-based page number) and ``"text"`` (the string
        returned by ``pytesseract.image_to_string``).
    """
    results = []

    # The context manager closes the document even if rendering or OCR
    # raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=dpi)

            # View the raw sample bytes as a (height, width, channels) array.
            img = np.frombuffer(pix.samples, dtype=np.uint8)
            img = img.reshape(pix.height, pix.width, pix.n)

            # PyMuPDF stores samples in R, G, B(, A) order, so the RGB*
            # conversion codes must be used; the BGR* codes would swap the
            # red and blue luminance weights in the grayscale image.
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

            h, w, _ = img.shape
            crop = img[
                int(h * _CROP_TOP):int(h * _CROP_BOTTOM),
                int(w * _CROP_LEFT):int(w * _CROP_RIGHT),
            ]

            gray = cv2.cvtColor(crop, cv2.COLOR_RGB2GRAY)
            # With THRESH_OTSU the threshold argument (0) is ignored and the
            # level is chosen automatically; [1] selects the binarised image.
            binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]

            # "--psm 6": treat the image as a single uniform block of text.
            text = pytesseract.image_to_string(
                binary,
                lang=lang,
                config="--psm 6",
            )

            results.append({
                "page": page_number,
                "text": text,
            })

    return results