# bilingual-ocr-api / preprocess.py
# Zarm33na's picture
# Initial deployment: bilingual OCR API (Urdu + English)
# 04f9475
"""
OCR Image Preprocessing Module (Step 1 of pipeline).
OpenCV-only preprocessing for scanned documents and handwritten notes.
No deep learning. Windows-compatible.
Pipeline: load -> grayscale -> denoise -> adaptive threshold -> deskew.
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
import cv2
import numpy as np
# ---------------------------------------------------------------------------
# Constants (tunable for low-quality scans and handwritten text)
# Each value below is used as the default of a keyword parameter of one of
# the functions in this module, so it can also be overridden per call.
# ---------------------------------------------------------------------------
BLUR_KERNEL_SIZE = 3  # Default median-blur kernel for denoise(); odd, 3 or 5 for noise removal
ADAPTIVE_BLOCK_SIZE = 15  # Default neighbourhood size for adaptive_threshold() (odd, ~11–31)
ADAPTIVE_C = 8  # Default constant subtracted from the local weighted mean in adaptive_threshold()
MIN_CONTOUR_AREA = 500  # Skew estimation skips contours with a smaller area (filters noise)
MAX_CONTOUR_AREA_RATIO = 0.5  # Skew estimation skips contours above this fraction of the image area (filters full-page)
ANGLE_QUANTILE = 0.5  # Quantile of the per-contour angles returned as the skew estimate (0.5 = median)
def load_image(image_path: str | Path) -> np.ndarray:
    """
    Load an image from disk. Supports common formats (PNG, JPG, TIFF, etc.).

    ``cv2.imread`` is tried first. If it returns ``None`` (which also happens
    when OpenCV itself cannot open the path, e.g. non-ASCII file names on
    Windows), the file is read through Python and decoded from memory with
    ``cv2.imdecode`` before giving up.

    Args:
        image_path: Path to the image file.

    Returns:
        BGR image as numpy array.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the image could not be loaded (e.g. corrupt or unsupported).
    """
    path = Path(image_path)
    if not path.is_file():
        raise FileNotFoundError(f"Image file not found: {path}")

    img = cv2.imread(str(path))
    if img is None:
        # Fallback: let Python do the file I/O (it handles Unicode paths on
        # every platform) and hand OpenCV only the encoded bytes.
        try:
            raw = np.frombuffer(path.read_bytes(), dtype=np.uint8)
        except OSError:
            raw = np.empty(0, dtype=np.uint8)
        # imdecode rejects an empty buffer, so only call it when there is data.
        if raw.size:
            try:
                img = cv2.imdecode(raw, cv2.IMREAD_COLOR)
            except cv2.error:
                img = None

    if img is None:
        raise ValueError(f"Could not load image (unsupported or corrupt): {path}")
    return img
def to_grayscale(image: np.ndarray) -> np.ndarray:
    """
    Convert an image to single-channel grayscale. A single channel is what
    the thresholding and deskew steps of this module operate on.

    Handled layouts:
      - 2-D array: already grayscale, returned as a copy.
      - (H, W, 1): the single channel is extracted and returned as a 2-D copy.
      - (H, W, 4): treated as BGRA and converted with COLOR_BGRA2GRAY.
      - anything else: passed to cv2.cvtColor with COLOR_BGR2GRAY (the
        expected case is a 3-channel BGR image; other shapes make OpenCV
        raise, exactly as before).

    Args:
        image: BGR, BGRA or grayscale image.

    Returns:
        Grayscale image as a new 2-D array (input is never modified). The
        dtype follows the input; uint8 is assumed by the rest of the pipeline.
    """
    if image.ndim == 2:
        return image.copy()

    if image.ndim == 3:
        channels = image.shape[2]
        if channels == 1:
            # Nothing to convert; just drop the trailing axis.
            return image[:, :, 0].copy()
        if channels == 4:
            return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)

    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
def denoise(image: np.ndarray, kernel_size: int = BLUR_KERNEL_SIZE) -> np.ndarray:
    """
    Apply a median blur to suppress noise in a scan.

    A median filter is used rather than a Gaussian one because it keeps the
    edges of printed text and handwritten strokes sharper on low-quality scans.

    Args:
        image: Grayscale image.
        kernel_size: Aperture of the median filter (3 or 5 typical). An even
            value is replaced by the next odd value.

    Returns:
        Denoised grayscale image.
    """
    # The median filter needs an odd aperture: keep odd sizes, bump even ones.
    aperture = kernel_size if kernel_size % 2 else kernel_size + 1
    return cv2.medianBlur(image, aperture)
def adaptive_threshold(
    image: np.ndarray,
    block_size: int = ADAPTIVE_BLOCK_SIZE,
    c: int = ADAPTIVE_C,
) -> np.ndarray:
    """
    Binarize image with adaptive thresholding for uneven lighting (e.g. scans
    with shadows or non-uniform illumination). Each pixel is compared to a
    Gaussian-weighted mean of its neighbourhood minus ``c``.

    Args:
        image: Grayscale image.
        block_size: Size of the neighbourhood. OpenCV requires an odd value
            greater than 1: an even value is bumped to the next odd value and
            anything below 3 is raised to 3.
        c: Constant subtracted from the weighted mean.

    Returns:
        Binary image (0 or 255). THRESH_BINARY is used, so pixels brighter
        than their local threshold become 255 and darker ones 0: dark ink on
        light paper comes out as black text on a white background.
    """
    if block_size % 2 == 0:
        block_size += 1
    if block_size < 3:
        # Smallest neighbourhood cv2.adaptiveThreshold accepts.
        block_size = 3

    return cv2.adaptiveThreshold(
        image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        c,
    )
def _get_skew_angle_from_binary(
    binary: np.ndarray,
    min_area: int = MIN_CONTOUR_AREA,
    max_area_ratio: float = MAX_CONTOUR_AREA_RATIO,
    quantile: float = ANGLE_QUANTILE,
) -> float:
    """
    Estimate skew angle (degrees) from binary image using contours.

    Takes the minAreaRect of every contour that passes the area filter, folds
    its angle into [-45, 45] and returns the requested quantile of those
    angles (the median by default) so that a few outliers (noise, graphics)
    do not dominate.

    The contour search traces white (non-zero) regions. If more than half of
    the pixels are white, the image is assumed to have a white background and
    is inverted for the search, so that the text rather than the page is
    traced. The input array itself is not inverted.

    Args:
        binary: Single-channel binary image (0 and 255), either polarity.
        min_area: Ignore contours smaller than this.
        max_area_ratio: Ignore contours larger than this fraction of image area.
        quantile: Which quantile of angles to use (0.5 = median).

    Returns:
        Estimated skew angle in degrees, in [-45, 45]; 0.0 when no contour
        passes the area filter. The sign follows cv2.minAreaRect.
    """
    h, w = binary.shape
    total_area = h * w

    # Make sure the text, not the page background, is the white foreground.
    if np.count_nonzero(binary) * 2 > binary.size:
        foreground = cv2.bitwise_not(binary)
    else:
        foreground = binary

    # External contours only (outlines of text blobs). Indexing with [-2]
    # picks the contour list on both OpenCV 3.x, which returns
    # (image, contours, hierarchy), and 4.x, which returns (contours, hierarchy).
    contours = cv2.findContours(
        foreground,
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE,
    )[-2]

    max_area = max_area_ratio * total_area
    angles = []
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            continue
        angle = cv2.minAreaRect(cnt)[2]
        # A rectangle's orientation repeats every 90 degrees, so fold the
        # angle into [-45, 45]. Older OpenCV reports angles in [-90, 0),
        # OpenCV >= 4.5.1 reports (0, 90]; both ranges are handled here.
        if angle < -45:
            angle += 90
        elif angle > 45:
            angle -= 90
        angles.append(angle)

    if not angles:
        return 0.0
    return float(np.quantile(angles, quantile))
def deskew(
    image: np.ndarray,
    binary: np.ndarray,
    skew_angle: float | None = None,
) -> tuple[np.ndarray, np.ndarray, float]:
    """
    Rotate image and binary to correct skew. If skew_angle is not provided,
    it is estimated from the binary image.

    Both outputs are warped with the same matrix onto a canvas enlarged to
    hold the rotated page. The area uncovered by the rotation is filled by
    replicating edge pixels in the grayscale output, and with the binary
    image's majority value (0 or 255) in the binary output, so the new
    corners match the page background.

    Args:
        image: Grayscale image to deskew.
        binary: Binary image used for angle estimation (and to deskew).
        skew_angle: Override estimated angle (degrees); if None, estimate from binary.

    Returns:
        (deskewed_grayscale, deskewed_binary, angle_used). When the absolute
        angle is below 0.2 degrees no rotation is done and copies of the
        inputs are returned.
    """
    if skew_angle is None:
        skew_angle = _get_skew_angle_from_binary(binary)

    # Only rotate if angle is meaningful (avoid jitter on already straight docs)
    if abs(skew_angle) < 0.2:
        return image.copy(), binary.copy(), skew_angle

    h, w = image.shape[:2]
    center = (w / 2, h / 2)
    M = cv2.getRotationMatrix2D(center, -skew_angle, 1.0)

    # Expand canvas so rotated image is not cropped, then shift the
    # transform so the page stays centred on the larger canvas.
    cos = np.abs(M[0, 0])
    sin = np.abs(M[0, 1])
    nw = int(h * sin + w * cos)
    nh = int(h * cos + w * sin)
    M[0, 2] += (nw / 2) - center[0]
    M[1, 2] += (nh / 2) - center[1]

    gray_deskewed = cv2.warpAffine(
        image, M, (nw, nh),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

    # Fill the exposed corners with the background value: a mostly-white
    # binary has a white background, so padding it with black would add
    # black wedges to the page.
    background = 255 if np.count_nonzero(binary) * 2 > binary.size else 0
    binary_deskewed = cv2.warpAffine(
        binary, M, (nw, nh),
        flags=cv2.INTER_NEAREST,  # nearest-neighbour keeps the output strictly 0/255
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=background,
    )
    return gray_deskewed, binary_deskewed, skew_angle
def preprocess_image(image_path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    """
    Run the complete preprocessing chain on one image file:
    load -> grayscale -> denoise -> adaptive threshold -> deskew.
    Intended for scanned documents and handwritten notes.

    Args:
        image_path: Path to the input image.

    Returns:
        (binary_image, deskewed_grayscale_image)
        - binary_image: Deskewed black & white image, the OCR input.
        - deskewed_grayscale_image: Deskewed (denoised) grayscale image, kept
          for debugging/visualization.

    Raises:
        FileNotFoundError: If image_path does not exist.
        ValueError: If image cannot be loaded.
    """
    # Steps 1-3: read the file, drop colour, remove scan noise.
    cleaned = denoise(to_grayscale(load_image(image_path)))

    # Step 4: binarize the cleaned grayscale image.
    black_white = adaptive_threshold(cleaned)

    # Step 5: the angle is estimated from the binary image and the same
    # rotation is applied to both images; the angle itself is not needed here.
    straight_gray, straight_binary, _angle = deskew(cleaned, black_white)

    return straight_binary, straight_gray
# ---------------------------------------------------------------------------
# Main: test block with display and save
# ---------------------------------------------------------------------------
def _run_demo(image_path: str | Path, output_path: str | Path | None) -> None:
    """
    Load a sample image, run preprocessing, save the final preprocessed
    images, and show intermediate results in OpenCV windows.

    Two files are written: the deskewed binary image at ``output_path`` and
    the deskewed grayscale image next to it as ``<output_stem>_gray.png``.

    Args:
        image_path: Path to the input image.
        output_path: Where to save the binary result; if None,
            ``<input_stem>_preprocessed.png`` next to the input is used.

    Exits the process with status 1 (after printing to stderr) when the
    input is missing or unreadable, or when an output file cannot be written.
    If the preview windows cannot be opened (e.g. a headless OpenCV build),
    a warning is printed and the function returns normally, since the
    outputs are already saved at that point.
    """
    path = Path(image_path)
    if not path.is_file():
        print(f"Error: Sample image not found: {path}", file=sys.stderr)
        print("Usage: python preprocess.py <path_to_image> [output_path]", file=sys.stderr)
        sys.exit(1)

    # Load and pipeline (we need intermediates for display)
    try:
        bgr = load_image(path)
    except ValueError as exc:
        # Corrupt/unsupported file: report it like the missing-file case
        # instead of dumping a traceback.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    gray = to_grayscale(bgr)
    denoised = denoise(gray)
    binary = adaptive_threshold(denoised)
    gray_deskewed, binary_deskewed, angle = deskew(denoised, binary)

    # Default output path if not provided
    if output_path is None:
        output_path = path.parent / f"{path.stem}_preprocessed.png"
    out = Path(output_path)
    gray_out = out.parent / f"{out.stem}_gray.png"

    # Save final binary (main output) and the deskewed grayscale.
    # cv2.imwrite signals failure through its return value, so check it
    # before claiming success.
    for target, result in ((out, binary_deskewed), (gray_out, gray_deskewed)):
        if not cv2.imwrite(str(target), result):
            print(f"Error: Could not write image: {target}", file=sys.stderr)
            sys.exit(1)
    print(f"Saved: {out}")
    print(f"Saved (grayscale): {gray_out}")
    print(f"Deskew angle (degrees): {angle:.2f}")

    # Display intermediate results (resize if very large so they fit on screen)
    max_display = 800

    def _resize_for_display(img: np.ndarray) -> np.ndarray:
        """Shrink img so its longer side is at most max_display pixels."""
        h, w = img.shape[:2]
        if max(h, w) <= max_display:
            return img
        scale = max_display / max(h, w)
        return cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA)

    previews = (
        ("1_original", bgr),
        ("2_grayscale", gray),
        ("3_thresholded", binary),
        ("4_deskewed_binary", binary_deskewed),
        ("5_deskewed_grayscale", gray_deskewed),
    )
    try:
        for title, img in previews:
            cv2.imshow(title, _resize_for_display(img))
        print("Close any OpenCV window or press a key to exit.")
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    except cv2.error as exc:
        # GUI functions are unavailable in headless OpenCV builds.
        print(f"Warning: could not display preview windows: {exc}", file=sys.stderr)
if __name__ == "__main__":
    # Command-line entry point: one required positional argument (the input
    # image) and one optional positional argument (where to save the result).
    cli = argparse.ArgumentParser(
        description="Preprocess a scanned/handwritten image for OCR (display + save).",
    )
    cli.add_argument(
        "image_path",
        type=Path,
        help="Path to input image (e.g. scanned document or handwritten note).",
    )
    # nargs="?" makes the output path optional; None lets the demo pick the default.
    cli.add_argument(
        "output_path",
        type=Path,
        nargs="?",
        default=None,
        help="Path for saved preprocessed image (default: <input_stem>_preprocessed.png).",
    )
    options = cli.parse_args()
    _run_demo(options.image_path, options.output_path)