Spaces:
Sleeping
Sleeping
File size: 1,149 Bytes
0d2d3d5 4371fe4 0d2d3d5 4371fe4 0d2d3d5 4371fe4 0d2d3d5 4371fe4 0d2d3d5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | import os
import fitz # PyMuPDF
import pytesseract
import cv2
import numpy as np
# Tell Tesseract to load its language data from the "tessdata" directory
# bundled with the app. The path is relative, so it is resolved against the
# process's current working directory.
os.environ.update({"TESSDATA_PREFIX": "./tessdata"})
def extract_pages(pdf_path):
    """
    Run Hebrew OCR over every page of a PDF (PDF → image → text).

    Each page is rendered at 300 DPI, cropped to its central region,
    converted to grayscale, binarized with Otsu's threshold and passed to
    Tesseract using the "heb" language model.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order, of the form
        ``{"page": <1-based page number>, "text": <OCR output>}``.
    """
    results = []

    # The context manager guarantees the document is closed, including when
    # rendering or OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)

            # View the raw sample buffer as a (height, width, channels) array.
            img = np.frombuffer(pix.samples, dtype=np.uint8)
            img = img.reshape(pix.height, pix.width, pix.n)

            # PyMuPDF stores samples in R, G, B[, A] order, so the RGB*
            # conversion codes are the ones that match the buffer layout.
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

            # Keep the central area only: drop 12% at the top and bottom and
            # 5% at the left and right (presumably headers, footers and
            # margins; confirm against the source documents).
            h, w, _ = img.shape
            crop = img[
                int(h * 0.12):int(h * 0.88),
                int(w * 0.05):int(w * 0.95),
            ]

            gray = cv2.cvtColor(crop, cv2.COLOR_RGB2GRAY)
            # With THRESH_OTSU the threshold argument (0) is ignored and the
            # cut-off is computed from the image; [1] selects the binarized
            # image from the (threshold, image) result.
            gray = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]

            # --psm 6: treat the image as a single uniform block of text.
            text = pytesseract.image_to_string(
                gray,
                lang="heb",
                config="--psm 6",
            )

            results.append({
                "page": page_number,
                "text": text,
            })

    return results
|