import cv2
import numpy as np


def preprocess_image(image_path_or_array, target_size=(1024, 32)):
    """Prepare an image for handwritten text recognition.

    Steps:
      1. Load the image as grayscale (or convert a given array to grayscale).
      2. Enhance local contrast with CLAHE.
      3. Resize to fit inside ``target_size`` while keeping the aspect ratio.
      4. Paste the result onto a white canvas, left-aligned and vertically
         centred.

    Args:
        image_path_or_array: Either a path string readable by ``cv2.imread``
            or an image array. A 2-D array is used as grayscale; a 3-D array
            with one channel is squeezed; any other 3-D array is converted
            with ``cv2.COLOR_BGR2GRAY``.
        target_size: ``(width, height)`` of the output canvas.

    Returns:
        A ``uint8`` array of shape ``(target_height, target_width)`` with a
        white (255) background. The image is NOT inverted. A blank white
        canvas is returned when the input is empty or when the scaled image
        would have a zero width or height.

    Raises:
        FileNotFoundError: If a path was given and the image could not be
            read.
    """
    if isinstance(image_path_or_array, str):
        img = cv2.imread(image_path_or_array, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise FileNotFoundError(f"Could not read image at {image_path_or_array}")
    elif len(image_path_or_array.shape) == 3:
        if image_path_or_array.shape[2] == 1:
            # (H, W, 1) is already grayscale; BGR2GRAY would reject it.
            img = image_path_or_array[:, :, 0].copy()
        else:
            img = cv2.cvtColor(image_path_or_array, cv2.COLOR_BGR2GRAY)
    else:
        img = image_path_or_array.copy()

    target_w, target_h = target_size
    h, w = img.shape

    # An empty image cannot be equalized or scaled (it would divide by zero
    # below), so treat it like the degenerate-resize case: blank canvas.
    if h == 0 or w == 0:
        return np.full((target_h, target_w), 255, dtype=np.uint8)

    # Enhance contrast (CLAHE - Contrast Limited Adaptive Histogram
    # Equalization). This works well for smooth grayscale images, but it can
    # amplify noise and is not useful on input that is already aggressively
    # thresholded/binarized.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img = clahe.apply(img)

    # Scale by the tighter of the two ratios so the image fits in both
    # dimensions without distortion.
    ratio = min(target_w / w, target_h / h)
    new_w = int(w * ratio)
    new_h = int(h * ratio)

    # Extreme aspect ratios can truncate one side to 0 pixels.
    if new_w == 0 or new_h == 0:
        return np.full((target_h, target_w), 255, dtype=np.uint8)

    img_resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # White canvas of the requested size.
    target_img = np.full((target_h, target_w), 255, dtype=np.uint8)

    # Centre vertically but align LEFT horizontally (left alignment is
    # usually better for sequence models such as CTC).
    pad_y = (target_h - new_h) // 2
    pad_x = 0

    target_img[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = img_resized

    # Returned without inverting, to match training behavior (white
    # background).
    return target_img


def deskew(img):
    """Deskew an image with a horizontal shear estimated from image moments.

    The shear factor is ``mu11 / mu02`` (second-order central moments). Rows
    are shifted horizontally about the vertical midline so that the estimated
    slant is cancelled.

    Args:
        img: Single-channel image array.

    Returns:
        A new array of the same size as ``img``. If ``|mu02|`` is below
        ``1e-2`` the slant cannot be estimated and a copy of the input is
        returned.
    """
    # NOTE(review): cv2.moments weights pixels by intensity, so the slant
    # estimate follows the bright pixels. For dark ink on a white background
    # the caller presumably needs to invert the image first - confirm.
    m = cv2.moments(img)
    if abs(m['mu02']) < 1e-2:
        return img.copy()

    skew = m['mu11'] / m['mu02']
    # This matrix maps destination coordinates back to source coordinates
    # (x_src = x_dst + skew * (y - h / 2)), so it must be applied with
    # WARP_INVERSE_MAP. Without that flag OpenCV treats it as the forward
    # transform and the slant is increased instead of removed.
    M = np.float32([[1, skew, -0.5 * img.shape[0] * skew], [0, 1, 0]])
    img_deskewed = cv2.warpAffine(
        img,
        M,
        (img.shape[1], img.shape[0]),
        flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return img_deskewed


if __name__ == "__main__":
    # Simple test
    print("Preprocessing module ready.")