""" OCR Image Preprocessing Module (Step 1 of pipeline). OpenCV-only preprocessing for scanned documents and handwritten notes. No deep learning. Windows-compatible. Pipeline: load -> grayscale -> denoise -> adaptive threshold -> deskew. """ from __future__ import annotations import argparse import sys from pathlib import Path import cv2 import numpy as np # --------------------------------------------------------------------------- # Constants (tunable for low-quality scans and handwritten text) # --------------------------------------------------------------------------- BLUR_KERNEL_SIZE = 3 # Median blur kernel; odd, 3 or 5 for noise removal ADAPTIVE_BLOCK_SIZE = 15 # Block size for adaptive threshold (odd, ~11–31) ADAPTIVE_C = 8 # Constant subtracted from mean in adaptive threshold MIN_CONTOUR_AREA = 500 # Min contour area to consider for deskew (filter noise) MAX_CONTOUR_AREA_RATIO = 0.5 # Max contour area as ratio of image (filter full-page) ANGLE_QUANTILE = 0.5 # Median angle for deskew (0.5 = median) def load_image(image_path: str | Path) -> np.ndarray: """ Load an image from disk. Supports common formats (PNG, JPG, TIFF, etc.). Args: image_path: Path to the image file. Returns: BGR image as numpy array. Raises: FileNotFoundError: If the file does not exist. ValueError: If the image could not be loaded (e.g. corrupt or unsupported). """ path = Path(image_path) if not path.is_file(): raise FileNotFoundError(f"Image file not found: {path}") img = cv2.imread(str(path)) if img is None: raise ValueError(f"Could not load image (unsupported or corrupt): {path}") return img def to_grayscale(image: np.ndarray) -> np.ndarray: """ Convert BGR image to grayscale. Single channel improves thresholding and deskew. Args: image: BGR or grayscale image. Returns: Grayscale image (uint8). """ if len(image.shape) == 2: return image.copy() return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) def denoise(image: np.ndarray, kernel_size: int = BLUR_KERNEL_SIZE) -> np.ndarray: """ Reduce noise using median blur. Preserves edges better than Gaussian for text and handwritten strokes; helps with low-quality scans. Args: image: Grayscale image. kernel_size: Odd kernel size (3 or 5 typical). Returns: Denoised grayscale image. """ if kernel_size % 2 == 0: kernel_size += 1 return cv2.medianBlur(image, kernel_size) def adaptive_threshold( image: np.ndarray, block_size: int = ADAPTIVE_BLOCK_SIZE, c: int = ADAPTIVE_C, ) -> np.ndarray: """ Binarize image with adaptive thresholding for uneven lighting (e.g. scans with shadows or non-uniform illumination). Each pixel is compared to a local mean. Args: image: Grayscale image. block_size: Size of neighbourhood (must be odd). c: Constant subtracted from the mean. Returns: Binary image (0 or 255); text usually white on black for many OCR APIs. """ if block_size % 2 == 0: block_size += 1 binary = cv2.adaptiveThreshold( image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, c, ) return binary def _get_skew_angle_from_binary( binary: np.ndarray, min_area: int = MIN_CONTOUR_AREA, max_area_ratio: float = MAX_CONTOUR_AREA_RATIO, quantile: float = ANGLE_QUANTILE, ) -> float: """ Estimate skew angle (degrees) from binary image using contours. Uses minAreaRect of text-like contours; returns median angle so a few outliers (noise, graphics) do not dominate. Args: binary: Binary image (0 and 255). min_area: Ignore contours smaller than this. max_area_ratio: Ignore contours larger than this fraction of image area. quantile: Which quantile of angles to use (0.5 = median). Returns: Estimated skew angle in degrees (positive = CCW tilt of text lines). """ h, w = binary.shape total_area = h * w # Find contours (external only to get text block outlines) contours, _ = cv2.findContours( binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) angles = [] for cnt in contours: area = cv2.contourArea(cnt) if area < min_area or area > max_area_ratio * total_area: continue rect = cv2.minAreaRect(cnt) # rect[2] is angle in degrees in [-90, 0); we use it for skew angle = rect[2] # Normalize to small tilt: prefer angle in [-45, 45] if angle < -45: angle += 90 angles.append(angle) if not angles: return 0.0 return float(np.quantile(angles, quantile)) def deskew( image: np.ndarray, binary: np.ndarray, skew_angle: float | None = None, ) -> tuple[np.ndarray, np.ndarray, float]: """ Rotate image and binary to correct skew. If skew_angle is not provided, it is estimated from the binary image. Args: image: Grayscale image to deskew. binary: Binary image used for angle estimation (and to deskew). skew_angle: Override estimated angle (degrees); if None, estimate from binary. Returns: (deskewed_grayscale, deskewed_binary, angle_used) """ if skew_angle is None: skew_angle = _get_skew_angle_from_binary(binary) # Only rotate if angle is meaningful (avoid jitter on already straight docs) if abs(skew_angle) < 0.2: return image.copy(), binary.copy(), skew_angle h, w = image.shape[:2] center = (w / 2, h / 2) M = cv2.getRotationMatrix2D(center, -skew_angle, 1.0) # Expand canvas so rotated image is not cropped cos = np.abs(M[0, 0]) sin = np.abs(M[0, 1]) nw = int(h * sin + w * cos) nh = int(h * cos + w * sin) M[0, 2] += (nw / 2) - center[0] M[1, 2] += (nh / 2) - center[1] gray_deskewed = cv2.warpAffine( image, M, (nw, nh), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE, ) binary_deskewed = cv2.warpAffine( binary, M, (nw, nh), flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=0, ) return gray_deskewed, binary_deskewed, skew_angle def preprocess_image(image_path: str | Path) -> tuple[np.ndarray, np.ndarray]: """ Full preprocessing pipeline: load -> grayscale -> denoise -> adaptive threshold -> deskew. Suitable for scanned documents and handwritten notes. Args: image_path: Path to the input image. Returns: (binary_image, deskewed_grayscale_image) - binary_image: Final black & white image (deskewed), for OCR input. - deskewed_grayscale: Deskewed grayscale, for debugging/visualization. Raises: FileNotFoundError: If image_path does not exist. ValueError: If image cannot be loaded. """ # 1. Load bgr = load_image(image_path) # 2. Grayscale gray = to_grayscale(bgr) # 3. Denoise (median blur) denoised = denoise(gray) # 4. Adaptive threshold -> binary binary = adaptive_threshold(denoised) # 5. Deskew (estimate angle from binary, then rotate both binary and grayscale) gray_deskewed, binary_deskewed, _ = deskew(denoised, binary) # Return final binary and deskewed grayscale (for debugging) return binary_deskewed, gray_deskewed # --------------------------------------------------------------------------- # Main: test block with display and save # --------------------------------------------------------------------------- def _run_demo(image_path: str | Path, output_path: str | Path | None) -> None: """ Load a sample image, run preprocessing, show intermediate results in OpenCV windows, and save the final preprocessed image. """ path = Path(image_path) if not path.is_file(): print(f"Error: Sample image not found: {path}", file=sys.stderr) print("Usage: python preprocess.py [output_path]", file=sys.stderr) sys.exit(1) # Load and pipeline (we need intermediates for display) bgr = load_image(path) gray = to_grayscale(bgr) denoised = denoise(gray) binary = adaptive_threshold(denoised) gray_deskewed, binary_deskewed, angle = deskew(denoised, binary) # Default output path if not provided if output_path is None: output_path = path.parent / f"{path.stem}_preprocessed.png" out = Path(output_path) # Save final binary (main output) and optionally deskewed grayscale cv2.imwrite(str(out), binary_deskewed) cv2.imwrite(str(out.parent / f"{out.stem}_gray.png"), gray_deskewed) print(f"Saved: {out}") print(f"Saved (grayscale): {out.parent / (out.stem + '_gray.png')}") print(f"Deskew angle (degrees): {angle:.2f}") # Display intermediate results (resize if very large so they fit on screen) max_display = 800 def _resize_for_display(img: np.ndarray) -> np.ndarray: h, w = img.shape[:2] if max(h, w) <= max_display: return img scale = max_display / max(h, w) return cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA) cv2.imshow("1_original", _resize_for_display(bgr)) cv2.imshow("2_grayscale", _resize_for_display(gray)) cv2.imshow("3_thresholded", _resize_for_display(binary)) cv2.imshow("4_deskewed_binary", _resize_for_display(binary_deskewed)) cv2.imshow("5_deskewed_grayscale", _resize_for_display(gray_deskewed)) print("Close any OpenCV window or press a key to exit.") cv2.waitKey(0) cv2.destroyAllWindows() if __name__ == "__main__": parser = argparse.ArgumentParser( description="Preprocess a scanned/handwritten image for OCR (display + save).", ) parser.add_argument( "image_path", type=Path, help="Path to input image (e.g. scanned document or handwritten note).", ) parser.add_argument( "output_path", type=Path, nargs="?", default=None, help="Path for saved preprocessed image (default: _preprocessed.png).", ) args = parser.parse_args() _run_demo(args.image_path, args.output_path)