Spaces:
Sleeping
Sleeping
| import os | |
| import fitz # PyMuPDF | |
| import pytesseract | |
| import cv2 | |
| import numpy as np | |
# Point Tesseract at the traineddata directory shipped with the app (it holds
# the Hebrew model). The path is relative to the current working directory.
os.environ.update(TESSDATA_PREFIX="./tessdata")
def extract_pages(pdf_path):
    """Rasterize each page of a PDF and OCR it as Hebrew text.

    Every page is rendered at 300 DPI, cropped to its central region,
    binarized with Otsu's threshold and passed to Tesseract (``lang="heb"``,
    ``--psm 6``).

    Args:
        pdf_path: Path of the PDF file, as accepted by ``fitz.open``.

    Returns:
        A list with one dict per page, in document order:
        ``{"page": <1-based page number>, "text": <OCR output string>}``.
    """
    results = []
    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)

            # View the raw sample bytes as a (height, width, channels) array.
            img = np.frombuffer(pix.samples, dtype=np.uint8)
            img = img.reshape(pix.height, pix.width, pix.n)

            # Keep the central region only: drop 12% at top and bottom and 5%
            # at left and right (presumably headers, footers and margins).
            h, w = img.shape[:2]
            crop = img[
                int(h * 0.12):int(h * 0.88),
                int(w * 0.05):int(w * 0.95),
            ]

            # PyMuPDF stores samples in R, G, B(, A) order, so the RGB*
            # conversion codes are the ones that apply the right luminance
            # weights to each channel. Alpha, if present, is ignored.
            if pix.n == 4:
                gray = cv2.cvtColor(crop, cv2.COLOR_RGBA2GRAY)
            elif pix.n == 1:
                # Already single-channel; just drop the trailing axis.
                gray = np.ascontiguousarray(crop[:, :, 0])
            else:
                gray = cv2.cvtColor(crop, cv2.COLOR_RGB2GRAY)

            # Otsu picks the binarization threshold automatically per page.
            gray = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]

            # --psm 6: treat the image as a single uniform block of text.
            text = pytesseract.image_to_string(
                gray,
                lang="heb",
                config="--psm 6",
            )

            results.append({
                "page": page_number,
                "text": text,
            })
    return results