File size: 2,708 Bytes
f9a156f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import cv2
import numpy as np

def preprocess_image(image_path_or_array, target_size=(1024, 32)):
    """
    Preprocess an image for handwritten text recognition.

    Steps:
      1. Load as grayscale (from a file path or an in-memory array).
      2. Enhance local contrast with CLAHE.
      3. Resize preserving aspect ratio, then pad onto a white canvas
         (left-aligned horizontally, centered vertically).

    Args:
        image_path_or_array: Path to an image file, or a numpy array
            (grayscale, BGR, or BGRA).
        target_size: Output size as (width, height). Default (1024, 32).

    Returns:
        uint8 numpy array of shape (height, width), white background.
        Pixel values are NOT inverted or binarized, to match the
        training-data convention.

    Raises:
        FileNotFoundError: If a path is given and the image cannot be read.
    """
    if isinstance(image_path_or_array, str):
        img = cv2.imread(image_path_or_array, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise FileNotFoundError(f"Could not read image at {image_path_or_array}")
    else:
        if len(image_path_or_array.shape) == 3:
            # Handle both BGR (3-channel) and BGRA (4-channel) arrays;
            # cv2.COLOR_BGR2GRAY raises on a 4-channel input.
            if image_path_or_array.shape[2] == 4:
                img = cv2.cvtColor(image_path_or_array, cv2.COLOR_BGRA2GRAY)
            else:
                img = cv2.cvtColor(image_path_or_array, cv2.COLOR_BGR2GRAY)
        else:
            img = image_path_or_array.copy()

    # Contrast Limited Adaptive Histogram Equalization: good for smooth
    # grayscale scans. NOTE(review): may amplify noise on images that were
    # already aggressively thresholded/binarized upstream.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img = clahe.apply(img)

    # Scale so the image fits inside target_size without distortion.
    h, w = img.shape
    target_w, target_h = target_size
    ratio = min(target_w / w, target_h / h)
    new_w = int(w * ratio)
    new_h = int(h * ratio)

    # Extreme aspect ratios can round a dimension down to 0; return a
    # blank white canvas instead of crashing inside cv2.resize.
    if new_w == 0 or new_h == 0:
        return np.ones((target_h, target_w), dtype=np.uint8) * 255

    img_resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # White canvas; paste centered vertically but LEFT-aligned horizontally
    # (left alignment is usually better for CTC-style sequence models).
    target_img = np.ones((target_h, target_w), dtype=np.uint8) * 255
    pad_y = (target_h - new_h) // 2
    target_img[pad_y:pad_y + new_h, 0:new_w] = img_resized

    return target_img

def deskew(img):
    """
    Remove horizontal shear (slant) from an image using central moments.

    The slant is estimated as mu11 / mu02 and undone with an affine shear
    transform whose translation keeps the content vertically anchored.
    When the image has essentially no vertical variance, an unchanged
    copy is returned.
    """
    moments = cv2.moments(img)
    mu02 = moments['mu02']
    if abs(mu02) < 1e-2:
        # Not enough vertical spread to estimate a slant reliably.
        return img.copy()

    slant = moments['mu11'] / mu02
    rows, cols = img.shape[:2]
    shear = np.float32([[1, slant, -0.5 * rows * slant],
                        [0, 1, 0]])
    return cv2.warpAffine(
        img,
        shear,
        (cols, rows),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )

def _demo():
    """Smoke-test entry point: confirm the module loads and is importable."""
    print("Preprocessing module ready.")


if __name__ == "__main__":
    _demo()