File size: 3,921 Bytes
8850f3e
 
 
 
 
 
 
7880958
8850f3e
 
 
7880958
 
 
 
8850f3e
7880958
8850f3e
11b7950
 
8850f3e
7880958
 
8850f3e
7880958
 
8850f3e
7880958
 
8850f3e
 
7880958
 
8850f3e
7880958
 
 
 
 
8850f3e
7880958
 
 
8850f3e
7880958
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8850f3e
 
 
 
 
7880958
 
11b7950
8850f3e
 
 
 
 
 
 
7880958
8850f3e
 
 
 
7880958
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8850f3e
 
 
7880958
8850f3e
 
11b7950
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
import tempfile  # Added for safe path handling

# Module-level logger shared by the functions in this file. The name is a
# fixed string rather than __name__.
logger = logging.getLogger("ocr_preprocessor")

# Debug toggle: when True, preprocess_image() writes each processed page as a
# PNG into the system temp directory so the image fed to OCR can be inspected.
DEBUG_SAVE_IMAGES = True

def preprocess_image(image: Image.Image, page_num: int) -> Image.Image:
    """
    Prepare a page image for OCR and return the binarized result.

    Steps, in the order they are applied:
    1. Grayscale conversion
    2. Normalization (min-max contrast stretching to 0-255)
    3. Denoising (3x3 Gaussian blur)
    4. Thresholding (adaptive Gaussian binarization)
    5. Deskewing (rotation correction, skipped when the skew is <= 0.5 degrees)

    When DEBUG_SAVE_IMAGES is true the result is also written to the system
    temp directory as ``debug_page_<page_num>_processed.png``; the temp
    directory is used to avoid permission errors in HF Spaces. A failure to
    save the debug image is logged and otherwise ignored.

    Args:
        image: Source page as a PIL image. Non-RGB modes are converted to RGB
            before processing.
        page_num: Page number, used only to name the debug image.

    Returns:
        The processed (binarized, possibly rotated) page as a PIL image.
    """
    # 1. Convert to grayscale. COLOR_RGB2GRAY needs a multi-channel input, so
    #    single-channel / palette images are brought to RGB first instead of
    #    making cvtColor raise.
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
    gray = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2GRAY)

    # 2. Normalization: stretch the intensity range to the full 0-255 span.
    gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX)

    # 3. Denoising (3x3 kernel)
    denoised = cv2.GaussianBlur(gray, (3, 3), 0)

    # 4. Adaptive thresholding (block size 11, constant 2)
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # 5. Deskewing. The skew is estimated from the minimum-area rectangle
    #    around all dark (text) pixels, i.e. the non-zero pixels of the
    #    inverted binary image. Coordinates are in np.where order (row, col).
    text_mask = cv2.bitwise_not(binary)
    text_coords = np.column_stack(np.where(text_mask > 0))

    if text_coords.size > 0:
        rect_angle = cv2.minAreaRect(text_coords)[-1]
        # A rectangle's orientation is only defined modulo 90 degrees, and
        # OpenCV releases differ in which representative they report
        # ([-90, 0) vs (0, 90]). Folding the angle into [-45, 45) gives the
        # same correction on both, and matches the original branching for
        # angles in [-90, 0).
        skew = ((rect_angle + 45.0) % 90.0) - 45.0
        angle = -skew

        # Skip the resampling step for negligible skew.
        if abs(angle) > 0.5:
            (h, w) = binary.shape[:2]
            center = (w // 2, h // 2)
            rotation = cv2.getRotationMatrix2D(center, angle, 1.0)
            binary = cv2.warpAffine(
                binary, rotation, (w, h),
                flags=cv2.INTER_CUBIC,
                borderMode=cv2.BORDER_REPLICATE
            )

    # --- DEBUG SAVING LOGIC ---
    if DEBUG_SAVE_IMAGES:
        try:
            # Use the system temp directory (/tmp in Linux/HF Spaces)
            debug_path = os.path.join(
                tempfile.gettempdir(),
                f"debug_page_{page_num}_processed.png",
            )
            cv2.imwrite(debug_path, binary)
            logger.info(f"Debug image saved to: {debug_path}")
        except Exception as e:
            # Debug output is best-effort; never fail the OCR run because of it.
            logger.warning(f"Could not save debug image: {e}")
    # --------------------------

    return Image.fromarray(binary)

def extract_text_with_preprocessing(file_path: str) -> str:
    """
    Run OCR on a PDF or image file and return the recognized text.

    Pipeline: PDF -> 300 DPI images -> preprocessing -> Tesseract.
    A path ending in ``.pdf`` (case-insensitive) is rendered to one image per
    page at 300 DPI; any other path is opened as a single image. Each page is
    preprocessed with preprocess_image() and passed to Tesseract using
    ``--oem 3 --psm 4``. If preprocessing/OCR raises, or yields fewer than 10
    non-whitespace-trimmed characters, the page is OCR'd again from the raw
    image and that result is used instead.

    Args:
        file_path: Path to the PDF or image file.

    Returns:
        The text of all pages, each preceded by a ``--- Page N ---`` header,
        with leading/trailing whitespace stripped. Returns ``""`` if the path
        does not exist, and ``"Error processing file: <message>"`` if the
        pipeline fails (the error is also logged).
    """
    if not os.path.exists(file_path):
        return ""

    # Same Tesseract options for every page, so build them once.
    custom_config = r'--oem 3 --psm 4'
    page_chunks = []
    try:
        if file_path.lower().endswith('.pdf'):
            # Convert PDF to images at 300 DPI
            images = convert_from_path(file_path, dpi=300)
        else:
            # Read the pixel data while the file is open, then let the
            # context manager release the file handle; the loaded image
            # stays usable afterwards.
            with Image.open(file_path) as opened:
                opened.load()
            images = [opened]

        for page_num, raw_img in enumerate(images, start=1):
            # Try robust preprocessing first.
            try:
                processed_img = preprocess_image(raw_img, page_num)
                page_text = pytesseract.image_to_string(processed_img, config=custom_config)
            except Exception as e:
                logger.warning(f"Preprocessing failed: {e}")
                page_text = ""

            # Fall back to the raw image if preprocessing failed or produced
            # (almost) no text.
            if len(page_text.strip()) < 10:
                logger.warning(f"Page {page_num}: Low confidence. Retrying with raw image.")
                page_text = pytesseract.image_to_string(raw_img, config=custom_config)

            page_chunks.append(f"--- Page {page_num} ---\n{page_text}\n")

    except Exception as e:
        logger.error(f"OCR Pipeline Error: {e}")
        return f"Error processing file: {str(e)}"

    return "".join(page_chunks).strip()