File size: 1,149 Bytes
0d2d3d5
 
 
 
 
4371fe4
 
0d2d3d5
 
 
 
 
4371fe4
0d2d3d5
 
 
 
 
 
 
 
4371fe4
0d2d3d5
 
 
4371fe4
0d2d3d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import fitz  # PyMuPDF
import pytesseract
import cv2
import numpy as np

# Tesseract locates its language data through TESSDATA_PREFIX; point it at
# the ./tessdata directory (a relative path, so it is resolved against the
# process's current working directory).
os.environ.update({"TESSDATA_PREFIX": "./tessdata"})


def extract_pages(pdf_path):
    """
    Run Hebrew OCR over every page of a PDF.

    Each page is rendered to a raster image at 300 DPI, cropped to its
    central region (12%-88% of the height, 5%-95% of the width),
    converted to grayscale, binarized with Otsu's threshold and passed
    to Tesseract with ``lang="heb"`` and ``--psm 6``.

    Args:
        pdf_path: Location of the PDF; passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order, of the form
        ``{"page": <1-based page number>, "text": <OCR output string>}``.
    """
    results = []

    # The context manager closes the document (and its file handle) even
    # when rendering or OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)

            # Wrap the raw sample buffer as a (height, width, channels) array.
            img = np.frombuffer(pix.samples, dtype=np.uint8)
            img = img.reshape(pix.height, pix.width, pix.n)

            # PyMuPDF delivers samples in RGB(A) order, not OpenCV's usual
            # BGR, so the RGB* conversion codes are the ones that apply the
            # correct per-channel luminance weights below.
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

            h, w = img.shape[:2]

            # Keep only the central region of the page; the outer margins
            # (12% top/bottom, 5% left/right) are excluded from OCR.
            crop = img[
                int(h * 0.12):int(h * 0.88),
                int(w * 0.05):int(w * 0.95)
            ]

            gray = cv2.cvtColor(crop, cv2.COLOR_RGB2GRAY)
            # Otsu picks the threshold itself, so the 0 passed here is ignored;
            # [1] selects the binarized image from the (threshold, image) pair.
            gray = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )[1]

            text = pytesseract.image_to_string(
                gray,
                lang="heb",
                config="--psm 6"
            )

            results.append({
                "page": page_number,
                "text": text
            })

    return results