# mortgage_OCR / ocr.py
# mlbench123 - Update ocr.py - 4371fe4 (verified)
import os
import fitz # PyMuPDF
import pytesseract
import cv2
import numpy as np
# Direct Tesseract to the bundled Hebrew language data. The path is relative,
# so it resolves against the process's current working directory.
os.environ.update({"TESSDATA_PREFIX": "./tessdata"})
def extract_pages(pdf_path, *, dpi=300, lang="heb"):
    """Rasterize every page of a PDF and OCR it with Tesseract.

    Each page is rendered to a bitmap, cropped to drop the outer margins,
    converted to grayscale, binarized with Otsu's threshold, and passed to
    Tesseract.

    Args:
        pdf_path: Path of the PDF to process (anything ``fitz.open`` accepts).
        dpi: Rendering resolution for each page. Defaults to 300.
        lang: Tesseract language code. Defaults to ``"heb"`` (Hebrew).

    Returns:
        A list with one dict per page, in document order. Each dict has the
        keys ``"page"`` (1-based page number) and ``"text"`` (the OCR output
        string). An empty document yields an empty list.
    """
    results = []

    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=dpi)

            # View the raw sample buffer as a (height, width, channels) array.
            img = np.frombuffer(pix.samples, dtype=np.uint8)
            img = img.reshape(pix.height, pix.width, pix.n)

            # PyMuPDF stores samples in RGB(A) order, so use the RGB-flavoured
            # conversion codes; the BGR ones would swap the red/blue weights
            # in the grayscale conversion below.
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

            # Keep the central region only: trim 12% from the top and bottom
            # and 5% from the left and right edges.
            h, w = img.shape[:2]
            crop = img[
                int(h * 0.12):int(h * 0.88),
                int(w * 0.05):int(w * 0.95),
            ]

            gray = cv2.cvtColor(crop, cv2.COLOR_RGB2GRAY)
            # With THRESH_OTSU the threshold argument (0) is ignored and the
            # level is chosen automatically; [1] selects the binarized image.
            binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]

            # --psm 6: treat the image as a single uniform block of text.
            text = pytesseract.image_to_string(
                binary,
                lang=lang,
                config="--psm 6",
            )

            results.append({"page": page_number, "text": text})

    return results