# Spaces: Zarm33na / bilingual-ocr-api (status: Sleeping)
# File size: 10,398 Bytes
# Revision: 04f9475

"""
OCR Image Preprocessing Module (Step 1 of pipeline).

OpenCV-only preprocessing for scanned documents and handwritten notes.
No deep learning. Windows-compatible.

Pipeline: load -> grayscale -> denoise -> adaptive threshold -> deskew.
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

import cv2
import numpy as np


# ---------------------------------------------------------------------------
# Tunable constants (chosen for low-quality scans and handwritten text)
# ---------------------------------------------------------------------------

# Median-blur kernel size used for noise removal; must be odd (3 or 5).
BLUR_KERNEL_SIZE = 3

# Neighbourhood size for the adaptive threshold; must be odd (roughly 11-31).
ADAPTIVE_BLOCK_SIZE = 15

# Constant subtracted from the local mean by the adaptive threshold.
ADAPTIVE_C = 8

# Contours with a smaller area are treated as noise during skew estimation.
MIN_CONTOUR_AREA = 500

# Contours covering more than this fraction of the image (e.g. the whole
# page) are ignored during skew estimation.
MAX_CONTOUR_AREA_RATIO = 0.5

# Quantile of the per-contour angles used as the skew estimate (0.5 = median).
ANGLE_QUANTILE = 0.5


def load_image(image_path: str | Path) -> np.ndarray:
    """
    Load an image from disk. Supports common formats (PNG, JPG, TIFF, etc.).

    The file bytes are read with NumPy and decoded with ``cv2.imdecode``
    instead of calling ``cv2.imread`` on the path: on Windows ``cv2.imread``
    cannot open paths containing non-ASCII characters and silently returns
    None, which would be misreported here as a corrupt image.

    Args:
        image_path: Path to the image file.

    Returns:
        BGR image as numpy array.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the image could not be loaded (e.g. corrupt or unsupported).
    """
    path = Path(image_path)
    if not path.is_file():
        raise FileNotFoundError(f"Image file not found: {path}")

    try:
        raw = np.fromfile(str(path), dtype=np.uint8)
    except OSError as err:
        # Keep the documented contract: anything unreadable is a ValueError.
        raise ValueError(
            f"Could not load image (unsupported or corrupt): {path}"
        ) from err

    # cv2.imdecode asserts on an empty buffer instead of returning None, so
    # an empty file is treated as undecodable without calling it.
    img = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
    if img is None:
        raise ValueError(f"Could not load image (unsupported or corrupt): {path}")
    return img


def to_grayscale(image: np.ndarray) -> np.ndarray:
    """
    Convert BGR image to grayscale. Single channel improves thresholding and deskew.

    Already-grayscale input is accepted both as a 2-D array and as a 3-D
    array with a single channel axis, shape (H, W, 1); in both cases a 2-D
    copy is returned so the caller's array is never aliased.

    Args:
        image: BGR or grayscale image.

    Returns:
        Grayscale image as a 2-D array with the input's dtype.
    """
    if image.ndim == 2:
        return image.copy()
    if image.ndim == 3 and image.shape[2] == 1:
        # A single-channel image with an explicit channel axis is not valid
        # input for COLOR_BGR2GRAY; just drop the axis.
        return image[:, :, 0].copy()
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


def denoise(image: np.ndarray, kernel_size: int = BLUR_KERNEL_SIZE) -> np.ndarray:
    """
    Apply a median blur to suppress noise in a scan.

    A median filter keeps text and handwriting strokes sharper than a
    Gaussian blur would, which helps on low-quality scans.

    Args:
        image: Grayscale image.
        kernel_size: Blur aperture; an even value is bumped up to the next
            odd number (3 or 5 typical).

    Returns:
        Denoised grayscale image.
    """
    # The median blur needs an odd aperture, so round even sizes up by one.
    odd_kernel = kernel_size if kernel_size % 2 else kernel_size + 1
    return cv2.medianBlur(image, odd_kernel)


def adaptive_threshold(
    image: np.ndarray,
    block_size: int = ADAPTIVE_BLOCK_SIZE,
    c: int = ADAPTIVE_C,
) -> np.ndarray:
    """
    Binarize a grayscale image with a locally adaptive threshold, which copes
    with uneven lighting (shadows, non-uniform illumination in scans).

    Each pixel is compared against the Gaussian-weighted mean of its
    neighbourhood minus ``c``. With ``THRESH_BINARY`` pixels above that local
    threshold become 255 and the rest 0, so dark ink on light paper comes out
    as black text on a white background.

    Args:
        image: Grayscale image.
        block_size: Size of neighbourhood; an even value is bumped up to the
            next odd number.
        c: Constant subtracted from the weighted mean.

    Returns:
        Binary image containing only 0 and 255.
    """
    # The neighbourhood must have a centre pixel, hence an odd size.
    neighbourhood = block_size + 1 if block_size % 2 == 0 else block_size
    return cv2.adaptiveThreshold(
        image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        neighbourhood,
        c,
    )


def _get_skew_angle_from_binary(
    binary: np.ndarray,
    min_area: int = MIN_CONTOUR_AREA,
    max_area_ratio: float = MAX_CONTOUR_AREA_RATIO,
    quantile: float = ANGLE_QUANTILE,
) -> float:
    """
    Estimate skew angle (degrees) from binary image using contours.
    Uses minAreaRect of text-like contours; returns a quantile (median by
    default) of the angles so a few outliers (noise, graphics) do not dominate.

    Args:
        binary: Binary image (0 and 255), either polarity.
        min_area: Ignore contours smaller than this.
        max_area_ratio: Ignore contours larger than this fraction of image area.
        quantile: Which quantile of angles to use (0.5 = median).

    Returns:
        Estimated skew angle in degrees, normalized to [-45, 45]; 0.0 when no
        contour passes the area filters.
    """
    h, w = binary.shape[:2]
    max_area = max_area_ratio * h * w

    # findContours treats non-zero pixels as foreground. A thresholded scan of
    # dark ink on light paper is mostly white, so its only external contour
    # would be the page background (which the area filter then discards).
    # Invert such images so the ink becomes the foreground.
    foreground = binary
    if np.count_nonzero(binary) * 2 > binary.size:
        foreground = cv2.bitwise_not(binary)

    # External contours only, to get text block outlines. Indexing with [-2]
    # works for both the 2-tuple (OpenCV 4.x) and 3-tuple (OpenCV 3.x) return.
    contours = cv2.findContours(
        foreground,
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE,
    )[-2]

    angles = []
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            continue
        angle = cv2.minAreaRect(cnt)[2]
        # The angle range of minAreaRect depends on the OpenCV version:
        # older releases report [-90, 0), newer ones (0, 90]. Fold both into
        # [-45, 45] so an upright rectangle always yields ~0 instead of ~90.
        if angle < -45:
            angle += 90
        elif angle > 45:
            angle -= 90
        angles.append(angle)

    if not angles:
        return 0.0
    return float(np.quantile(angles, quantile))


def deskew(
    image: np.ndarray,
    binary: np.ndarray,
    skew_angle: float | None = None,
) -> tuple[np.ndarray, np.ndarray, float]:
    """
    Rotate image and binary to correct skew. If skew_angle is not provided,
    it is estimated from the binary image.

    The output canvas is enlarged so the rotated content is not cropped. The
    new border area of the grayscale image replicates its edge pixels; the
    border of the binary image is filled with the binary's dominant value
    (255 for a mostly white page, 0 otherwise) so that rotation does not add
    solid wedges of the opposite colour in the corners.

    Args:
        image: Grayscale image to deskew.
        binary: Binary image used for angle estimation (and to deskew).
        skew_angle: Override estimated angle (degrees); if None, estimate from binary.

    Returns:
        (deskewed_grayscale, deskewed_binary, angle_used)
    """
    if skew_angle is None:
        skew_angle = _get_skew_angle_from_binary(binary)
    # Only rotate if angle is meaningful (avoid jitter on already straight docs)
    if abs(skew_angle) < 0.2:
        return image.copy(), binary.copy(), skew_angle

    h, w = image.shape[:2]
    center = (w / 2, h / 2)
    M = cv2.getRotationMatrix2D(center, -skew_angle, 1.0)

    # Expand canvas so rotated image is not cropped, then shift the transform
    # so the original centre lands on the centre of the new canvas.
    cos = np.abs(M[0, 0])
    sin = np.abs(M[0, 1])
    nw = int(h * sin + w * cos)
    nh = int(h * cos + w * sin)
    M[0, 2] += (nw / 2) - center[0]
    M[1, 2] += (nh / 2) - center[1]

    gray_deskewed = cv2.warpAffine(
        image, M, (nw, nh),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

    # Pad the binary with its background colour: the majority value is taken
    # as background, since text covers only a small part of a document.
    background = 255 if np.count_nonzero(binary) * 2 > binary.size else 0
    binary_deskewed = cv2.warpAffine(
        binary, M, (nw, nh),
        flags=cv2.INTER_NEAREST,  # nearest keeps the image strictly 0/255
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=background,
    )
    return gray_deskewed, binary_deskewed, skew_angle


def preprocess_image(image_path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    """
    Run the complete preprocessing chain on one image file:
    load -> grayscale -> denoise -> adaptive threshold -> deskew.

    Args:
        image_path: Path to the input image.

    Returns:
        (binary_image, deskewed_grayscale_image)
        - binary_image: Deskewed black & white image, intended as OCR input.
        - deskewed_grayscale_image: Deskewed (and denoised) grayscale image,
          useful for debugging/visualization.

    Raises:
        FileNotFoundError: If image_path does not exist.
        ValueError: If image cannot be loaded.
    """
    # Steps 1-3: read the file, drop colour, suppress noise.
    denoised = denoise(to_grayscale(load_image(image_path)))

    # Step 4: binarize the cleaned-up grayscale image.
    binary = adaptive_threshold(denoised)

    # Step 5: the angle is estimated from the binary image and the same
    # rotation is applied to both images; the angle itself is not returned.
    gray_deskewed, binary_deskewed, _angle = deskew(denoised, binary)

    return binary_deskewed, gray_deskewed


# ---------------------------------------------------------------------------
# Main: test block with display and save
# ---------------------------------------------------------------------------
def _run_demo(image_path: str | Path, output_path: str | Path | None) -> None:
    """
    Load a sample image, run preprocessing, show intermediate results in
    OpenCV windows, and save the final preprocessed image.
    """
    path = Path(image_path)
    if not path.is_file():
        print(f"Error: Sample image not found: {path}", file=sys.stderr)
        print("Usage: python preprocess.py <path_to_image> [output_path]", file=sys.stderr)
        sys.exit(1)

    # Load and pipeline (we need intermediates for display)
    bgr = load_image(path)
    gray = to_grayscale(bgr)
    denoised = denoise(gray)
    binary = adaptive_threshold(denoised)
    gray_deskewed, binary_deskewed, angle = deskew(denoised, binary)

    # Default output path if not provided
    if output_path is None:
        output_path = path.parent / f"{path.stem}_preprocessed.png"
    out = Path(output_path)

    # Save final binary (main output) and optionally deskewed grayscale
    cv2.imwrite(str(out), binary_deskewed)
    cv2.imwrite(str(out.parent / f"{out.stem}_gray.png"), gray_deskewed)
    print(f"Saved: {out}")
    print(f"Saved (grayscale): {out.parent / (out.stem + '_gray.png')}")
    print(f"Deskew angle (degrees): {angle:.2f}")

    # Display intermediate results (resize if very large so they fit on screen)
    max_display = 800
    def _resize_for_display(img: np.ndarray) -> np.ndarray:
        h, w = img.shape[:2]
        if max(h, w) <= max_display:
            return img
        scale = max_display / max(h, w)
        return cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA)

    cv2.imshow("1_original", _resize_for_display(bgr))
    cv2.imshow("2_grayscale", _resize_for_display(gray))
    cv2.imshow("3_thresholded", _resize_for_display(binary))
    cv2.imshow("4_deskewed_binary", _resize_for_display(binary_deskewed))
    cv2.imshow("5_deskewed_grayscale", _resize_for_display(gray_deskewed))
    print("Close any OpenCV window or press a key to exit.")
    cv2.waitKey(0)
    cv2.destroyAllWindows()


if __name__ == "__main__":
    # Command-line entry point: one required input path plus an optional
    # output path, both converted to Path objects by argparse.
    cli = argparse.ArgumentParser(
        description="Preprocess a scanned/handwritten image for OCR (display + save).",
    )
    cli.add_argument(
        "image_path",
        type=Path,
        help="Path to input image (e.g. scanned document or handwritten note).",
    )
    # nargs="?" makes the second positional optional; None lets the demo
    # derive its default output name from the input file.
    cli.add_argument(
        "output_path",
        nargs="?",
        type=Path,
        default=None,
        help="Path for saved preprocessed image (default: <input_stem>_preprocessed.png).",
    )
    parsed = cli.parse_args()
    _run_demo(parsed.image_path, parsed.output_path)