DIVYANSHI SINGH
Root project layout configured for deployment
b0bec61
"""
utils.py β€” Image preprocessing utilities for the Bill/Invoice Scanner.
Responsibilities:
- preprocess_image(): denoise, deskew, and threshold a bill image for OCR
- pil_to_cv2(): convert a PIL Image to a BGR numpy array for OpenCV/PaddleOCR
These are pure functions with no side effects.
"""
from pathlib import Path
import numpy as np
import cv2
from PIL import Image
def pil_to_cv2(pil_image: Image.Image) -> np.ndarray:
"""
Convert a PIL Image to a cv2-compatible BGR numpy array.
PaddleOCR expects BGR format (OpenCV convention). PIL images are
RGB by default β€” passing RGB to PaddleOCR inverts colors and
degrades OCR quality significantly. This function corrects that.
Args:
pil_image: A PIL Image object in any mode (RGB, RGBA, L, etc.)
Returns:
A numpy array of dtype uint8 in BGR channel order.
"""
# Ensure we are working in RGB first (handles RGBA, L, P, etc.)
pil_rgb = pil_image.convert("RGB")
# Convert to numpy array (H, W, 3) in RGB
rgb_array = np.array(pil_rgb, dtype=np.uint8)
# Flip RGB β†’ BGR (OpenCV/PaddleOCR format)
bgr_array = cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)
return bgr_array
def _deskew(gray: np.ndarray) -> np.ndarray:
"""
Detect and correct the skew angle of a grayscale image.
Uses contour analysis via minAreaRect to find the dominant angle.
Guards against the -45Β° quadrant-ambiguity by skipping rotation
when the absolute angle is less than 1 degree (straight images do
not need correction and would be mis-rotated otherwise).
Args:
gray: A 2D uint8 numpy array (grayscale image).
Returns:
The deskewed grayscale image as a uint8 numpy array.
"""
# Threshold to binary for contour detection
_, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
coords = np.column_stack(np.where(thresh > 0))
if coords.shape[0] == 0:
# No content found β€” return original unchanged
return gray
angle = cv2.minAreaRect(coords)[-1]
# Resolve quadrant ambiguity: minAreaRect returns angles in [-90, 0)
if angle < -45:
angle = 90 + angle # e.g. -80Β° β†’ 10Β°
# Failure-mode fix: skip rotation for near-zero angles
if abs(angle) < 1.0:
return gray
(h, w) = gray.shape
center = (w // 2, h // 2)
rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
deskewed = cv2.warpAffine(
gray,
rotation_matrix,
(w, h),
flags=cv2.INTER_CUBIC,
borderMode=cv2.BORDER_REPLICATE,
)
return deskewed
def preprocess_image(image_path: str | Path) -> np.ndarray:
"""
Load and preprocess a bill image for OCR.
Pipeline:
1. Load and convert to grayscale
2. Denoise (remove camera grain and paper texture)
3. Deskew (correct slight rotation from camera angle)
4. Adaptive threshold (handle uneven lighting / shadows)
5. Convert result to BGR (PaddleOCR expected format)
Args:
image_path: Path to the image file (str or pathlib.Path).
Returns:
A preprocessed numpy array of dtype uint8 in BGR format,
ready to be passed directly to PaddleOCR.
Raises:
FileNotFoundError: If the image path does not exist.
ValueError: If the file cannot be decoded as an image.
"""
path = Path(image_path)
if not path.exists():
raise FileNotFoundError(f"Image not found: {path}")
# Step 1 β€” Load as BGR using OpenCV (already BGR, no conversion needed)
bgr = cv2.imread(str(path))
if bgr is None:
raise ValueError(f"Could not decode image: {path}")
gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
# Step 2 β€” Denoise: remove grain while preserving text edges
denoised = cv2.fastNlMeansDenoising(gray, h=10, templateWindowSize=7, searchWindowSize=21)
# Step 3 β€” Deskew
deskewed = _deskew(denoised)
# Step 4 β€” Adaptive threshold: pure black/white; robust to uneven lighting
binary = cv2.adaptiveThreshold(
deskewed,
255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY,
blockSize=31,
C=15,
)
# Step 5 β€” Convert grayscale binary back to BGR for PaddleOCR
bgr_output = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
return bgr_output