"""Image preprocessing for OCR accuracy. Applies grayscale, deskew, thresholding, and noise reduction to scanned page images before feeding them to Tesseract. """ from __future__ import annotations import logging import cv2 import numpy as np from PIL import Image logger = logging.getLogger(__name__) def pil_to_cv(image: Image.Image) -> np.ndarray: """Convert PIL Image to OpenCV BGR array.""" rgb = np.array(image.convert("RGB")) return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR) def cv_to_pil(img: np.ndarray) -> Image.Image: """Convert OpenCV BGR array to PIL Image.""" rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) return Image.fromarray(rgb) def to_grayscale(img: np.ndarray) -> np.ndarray: """Convert to grayscale if not already.""" if len(img.shape) == 3: return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) return img def deskew(gray: np.ndarray, max_angle: float = 10.0) -> np.ndarray: """Correct slight rotation using projection profile. Only corrects rotations up to `max_angle` degrees to avoid false positives on non-skewed images. """ coords = np.column_stack(np.where(gray < 128)) if coords.shape[0] < 100: return gray # not enough ink to detect angle angle = cv2.minAreaRect(coords)[-1] # OpenCV returns angles in [-90, 0); normalize if angle < -45: angle = 90 + angle elif angle > 45: angle = angle - 90 if abs(angle) > max_angle or abs(angle) < 0.2: return gray # skip if angle too large or negligible h, w = gray.shape[:2] center = (w // 2, h // 2) mat = cv2.getRotationMatrix2D(center, angle, 1.0) rotated = cv2.warpAffine( gray, mat, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE, ) logger.debug("Deskewed by %.2f°", angle) return rotated def adaptive_threshold(gray: np.ndarray) -> np.ndarray: """Apply adaptive Gaussian thresholding for mixed lighting.""" return cv2.adaptiveThreshold( gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, blockSize=31, C=11, ) def denoise(gray: np.ndarray) -> np.ndarray: """Light denoising that preserves text edges.""" return cv2.fastNlMeansDenoising(gray, h=10) def preprocess_for_ocr(image: Image.Image) -> Image.Image: """Full preprocessing pipeline: grayscale → deskew → denoise → threshold. Returns a cleaned PIL Image ready for pytesseract. """ img = pil_to_cv(image) gray = to_grayscale(img) gray = deskew(gray) gray = denoise(gray) gray = adaptive_threshold(gray) return Image.fromarray(gray) def is_mostly_blank(image: Image.Image, threshold: float = 0.98) -> bool: """Return True if the image is almost entirely white (blank page).""" gray = np.array(image.convert("L")) white_ratio = np.sum(gray > 240) / gray.size return white_ratio > threshold