File size: 4,379 Bytes
b0bec61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
utils.py — Image preprocessing utilities for the Bill/Invoice Scanner.

Responsibilities:
- preprocess_image(): denoise, deskew, and threshold a bill image for OCR
- pil_to_cv2(): convert a PIL Image to a BGR numpy array for OpenCV/PaddleOCR

These functions never modify their arguments; preprocess_image() reads the
image file from disk but performs no other I/O.
"""

from pathlib import Path
import numpy as np
import cv2
from PIL import Image


def pil_to_cv2(pil_image: Image.Image) -> np.ndarray:
    """
    Return the pixels of a PIL image as an OpenCV-style BGR array.

    PIL stores colour images as RGB while OpenCV (and the OCR stage this
    module feeds) works with BGR. Handing over RGB data unchanged swaps
    the red and blue channels, so the order is reversed here.

    Args:
        pil_image: The PIL image to convert. It is first normalised with
            ``convert("RGB")``, so other modes are accepted as well.

    Returns:
        A uint8 numpy array with three channels in BGR order.
    """
    # Normalise the mode to RGB, then materialise the pixels as uint8.
    rgb_pixels = np.array(pil_image.convert("RGB"), dtype=np.uint8)
    # Swap the channel order RGB -> BGR for OpenCV consumers.
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)


def _deskew(gray: np.ndarray) -> np.ndarray:
    """
    Detect and correct the skew angle of a grayscale image.

    The foreground (dark ink) is isolated with an inverted Otsu threshold
    and the minimum-area bounding rectangle of all foreground pixels is
    used to estimate the dominant angle.

    A rectangle's orientation is only defined modulo 90 degrees, and the
    range in which cv2.minAreaRect reports it differs between OpenCV
    releases ([-90, 0) in older ones, (0, 90] in newer ones). The raw
    angle is therefore folded into [-45, 45) so that an upright page
    always yields a value near 0 instead of near +/-90.

    Rotation is skipped when the folded angle is smaller than 1 degree:
    such images are already straight and resampling them would only blur
    the text.

    Args:
        gray: A 2D uint8 numpy array (grayscale image).

    Returns:
        The deskewed grayscale image as a uint8 numpy array, or the input
        array itself when no foreground is found or no rotation is needed.
    """
    # Inverted Otsu threshold: ink becomes non-zero, paper becomes zero.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    rows, cols = np.nonzero(thresh)
    if rows.size == 0:
        # No content found — return original unchanged
        return gray

    # OpenCV expects points as (x, y) = (col, row). Feeding (row, col)
    # mirrors the geometry and flips the sign of the measured angle.
    points = np.column_stack((cols, rows)).astype(np.float32)
    angle = cv2.minAreaRect(points)[-1]

    # Fold the 90°-periodic rectangle angle into [-45, 45), independent of
    # which range this OpenCV version reports (e.g. -80° → 10°, 90° → 0°).
    angle = (angle + 45.0) % 90.0 - 45.0

    # Skip rotation for near-zero angles: nothing to correct.
    if abs(angle) < 1.0:
        return gray

    (h, w) = gray.shape
    center = (w // 2, h // 2)
    # With (x, y) points the folded angle is the clockwise tilt of the
    # content; getRotationMatrix2D treats positive angles as
    # counter-clockwise, so rotating by `angle` cancels the tilt.
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
    deskewed = cv2.warpAffine(
        gray,
        rotation_matrix,
        (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,  # extend edge pixels instead of adding black corners
    )
    return deskewed


def preprocess_image(image_path: str | Path) -> np.ndarray:
    """
    Read a bill image from disk and clean it up for OCR.

    Stages, in order:
        1. Decode the file with OpenCV and reduce it to grayscale.
        2. Apply non-local-means denoising.
        3. Straighten the page with ``_deskew``.
        4. Binarise with a Gaussian adaptive threshold.
        5. Expand the single-channel result back to three BGR channels.

    Args:
        image_path: Location of the image file (str or pathlib.Path).

    Returns:
        A uint8 numpy array in BGR layout containing the binarised image.

    Raises:
        FileNotFoundError: If the image path does not exist.
        ValueError: If the file cannot be decoded as an image.
    """
    path = Path(image_path)
    if not path.exists():
        raise FileNotFoundError(f"Image not found: {path}")

    # cv2.imread signals failure by returning None rather than raising.
    color = cv2.imread(str(path))
    if color is None:
        raise ValueError(f"Could not decode image: {path}")

    # Grayscale -> denoise in one step; the colour data is not needed again.
    smoothed = cv2.fastNlMeansDenoising(
        cv2.cvtColor(color, cv2.COLOR_BGR2GRAY),
        h=10,
        templateWindowSize=7,
        searchWindowSize=21,
    )

    straightened = _deskew(smoothed)

    # Local (per-neighbourhood) threshold: output is strictly 0 or 255.
    black_and_white = cv2.adaptiveThreshold(
        straightened,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=31,
        C=15,
    )

    # Replicate the single channel into B, G and R for downstream consumers.
    return cv2.cvtColor(black_and_white, cv2.COLOR_GRAY2BGR)