import cv2
import numpy as np
def preprocess_image(image_path_or_array, target_size=(1024, 32)):
    """
    Preprocess an image for handwritten text recognition.

    Steps:
    1. Load the image as single-channel grayscale.
    2. Enhance local contrast with CLAHE.
    3. Resize to fit inside ``target_size`` while keeping the aspect ratio.
    4. Paste the result onto a white canvas, left-aligned and vertically
       centered.

    Args:
        image_path_or_array: Either a file path (``str`` or ``os.PathLike``)
            or an image array. Arrays may be 2-D grayscale, ``(H, W, 1)``,
            or multi-channel in BGR order. NOTE(review): the data is
            presumably 8-bit; other dtypes are handed to OpenCV unchanged --
            confirm against callers.
        target_size: ``(width, height)`` of the output canvas in pixels.

    Returns:
        A ``uint8`` array of shape ``(height, width)`` with a white (255)
        background. The image is not inverted. A completely white canvas is
        returned when the input is empty or when scaling would shrink one
        side below one pixel.

    Raises:
        FileNotFoundError: If a path is given and OpenCV cannot read it.
    """
    import os  # local import so the block does not depend on file-level imports

    target_w, target_h = target_size

    if isinstance(image_path_or_array, (str, os.PathLike)):
        img = cv2.imread(os.fspath(image_path_or_array), cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise FileNotFoundError(f"Could not read image at {image_path_or_array}")
    else:
        source = np.asarray(image_path_or_array)
        if source.size == 0:
            # Nothing to draw; also avoids a division by zero further down.
            return np.full((target_h, target_w), 255, dtype=np.uint8)
        if source.ndim == 3:
            if source.shape[2] == 1:
                # Already grayscale with a trailing channel axis; cvtColor
                # would reject a single-channel input for BGR2GRAY.
                img = source[:, :, 0].copy()
            else:
                img = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)
        else:
            img = source.copy()

    if img.size == 0:
        return np.full((target_h, target_w), 255, dtype=np.uint8)

    # Enhance contrast (CLAHE - Contrast Limited Adaptive Histogram
    # Equalization). This works well on smooth grayscale images but may
    # amplify noise on inputs that are already aggressively binarized.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img = clahe.apply(img)

    # Scale by the tighter of the two ratios so the image fits in both axes.
    h, w = img.shape
    ratio = min(target_w / w, target_h / h)
    new_w = int(w * ratio)
    new_h = int(h * ratio)

    # A side that rounds down to zero cannot be resized; return a blank canvas.
    if new_w == 0 or new_h == 0:
        return np.full((target_h, target_w), 255, dtype=np.uint8)

    img_resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # White canvas of the requested size.
    target_img = np.full((target_h, target_w), 255, dtype=np.uint8)

    # Center vertically but align LEFT horizontally
    # (left alignment is usually better for sequence models like CTC).
    pad_y = (target_h - new_h) // 2
    pad_x = 0
    target_img[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = img_resized

    # Returned without inversion to match training behavior (white background).
    return target_img
def deskew(img):
    """
    Deskew an image using its image moments.

    The slant is estimated as ``mu11 / mu02`` and removed with a horizontal
    shear pivoting on the vertical midpoint of the image (``0.5 * height``).

    Args:
        img: Image array passed to ``cv2.moments``. NOTE(review): moments are
            intensity-weighted, so the estimate presumably expects the ink to
            be brighter than the background -- confirm against callers.

    Returns:
        A new array with the same width and height as ``img``. An unmodified
        copy is returned when the image is empty or when ``mu02`` is too
        close to zero for the slant to be estimated.
    """
    if img.size == 0:
        # No pixels to measure or warp.
        return img.copy()

    m = cv2.moments(img)
    if abs(m['mu02']) < 1e-2:
        return img.copy()

    skew = m['mu11'] / m['mu02']
    height, width = img.shape[:2]
    M = np.float32([[1, skew, -0.5 * height * skew], [0, 1, 0]])

    # M describes where each OUTPUT pixel samples from in the input
    # (x_src = x_dst + skew * (y - height / 2)), i.e. it is a dst -> src map.
    # WARP_INVERSE_MAP tells warpAffine to use it that way; without the flag
    # the shear is applied in the opposite direction and the slant grows.
    img_deskewed = cv2.warpAffine(
        img,
        M,
        (width, height),
        flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return img_deskewed
if __name__ == "__main__":
    # Running this file directly only confirms that the module loads.
    ready_message = "Preprocessing module ready."
    print(ready_message)
|