| """Image preprocessing for OCR accuracy.
|
|
|
| Applies grayscale, deskew, thresholding, and noise reduction to
|
| scanned page images before feeding them to Tesseract.
|
| """
|
|
|
| from __future__ import annotations
|
|
|
| import logging
|
|
|
| import cv2
|
| import numpy as np
|
| from PIL import Image
|
|
|
| logger = logging.getLogger(__name__)
|
|
|
|
|
def pil_to_cv(image: Image.Image) -> np.ndarray:
    """Return *image* as an array in OpenCV's BGR channel order.

    The image is converted to RGB mode, copied into a NumPy array, and
    then has its channels reordered from RGB to BGR.
    """
    rgb_pixels = np.array(image.convert("RGB"))
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    return bgr_pixels
|
|
|
|
|
def cv_to_pil(img: np.ndarray) -> Image.Image:
    """Convert an OpenCV array to a PIL Image.

    Args:
        img: Either a multi-channel array in BGR order or a
            two-dimensional single-channel (grayscale/binary) array.

    Returns:
        A PIL Image. Multi-channel input has its channels reordered
        from BGR to RGB; two-dimensional input is wrapped as-is.
    """
    # A 2-D array has no colour channels to swap; asking OpenCV for a
    # BGR->RGB conversion on it raises, so hand it to PIL directly.
    if img.ndim == 2:
        return Image.fromarray(img)

    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb)
|
|
|
|
|
def to_grayscale(img: np.ndarray) -> np.ndarray:
    """Return a grayscale version of *img*.

    A three-dimensional array is treated as BGR and converted with
    OpenCV. Any other array is handed back untouched (the very same
    object, not a copy).
    """
    has_channel_axis = img.ndim == 3
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if has_channel_axis else img
|
|
|
|
|
def deskew(gray: np.ndarray, max_angle: float = 10.0) -> np.ndarray:
    """Correct slight rotation using the minimum-area rectangle of dark pixels.

    The skew angle is taken from the rotated bounding rectangle that
    encloses every pixel darker than 128. Only rotations up to
    `max_angle` degrees are corrected, to avoid false positives on
    non-skewed images; angles below 0.2 degrees are treated as noise.

    Args:
        gray: Two-dimensional grayscale image.
        max_angle: Largest absolute skew, in degrees, that is corrected.

    Returns:
        The rotated image, or `gray` itself (same object) when there are
        fewer than 100 dark pixels or the detected angle falls outside
        the correctable range.
    """
    dark = np.nonzero(gray < 128)
    if dark[0].size < 100:
        return gray

    # np.nonzero returns indices in axis order (row, col), but OpenCV
    # reads each point as (x, y) = (col, row). Passing them unswapped
    # mirrors the point cloud about the diagonal, which negates the
    # measured angle and makes the rotation below increase the skew.
    points = np.column_stack((dark[1], dark[0])).astype(np.int32)
    angle = cv2.minAreaRect(points)[-1]

    # minAreaRect reports the angle of one rectangle edge; fold it into
    # [-45, 45]. Both branches are kept because negative and positive
    # angle ranges are each handled here.
    if angle < -45:
        angle = 90 + angle
    elif angle > 45:
        angle = angle - 90

    if abs(angle) > max_angle or abs(angle) < 0.2:
        return gray

    h, w = gray.shape[:2]
    center = (w // 2, h // 2)
    mat = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        gray, mat, (w, h),
        flags=cv2.INTER_CUBIC,
        # Replicate edge pixels so the exposed corners are filled from
        # the image border instead of being left black.
        borderMode=cv2.BORDER_REPLICATE,
    )
    logger.debug("Deskewed by %.2f°", angle)
    return rotated
|
|
|
|
|
def adaptive_threshold(
    gray: np.ndarray,
    *,
    block_size: int = 31,
    c: float = 11,
) -> np.ndarray:
    """Apply adaptive Gaussian thresholding for mixed lighting.

    Args:
        gray: Grayscale image to binarise.
        block_size: Side length, in pixels, of the neighbourhood used to
            compute each local threshold. Must be odd and at least 3.
        c: Constant subtracted from the Gaussian-weighted local mean.

    Returns:
        Binary image whose pixels are either 0 or 255.

    Raises:
        ValueError: If `block_size` is even or smaller than 3.
    """
    # OpenCV rejects even or too-small block sizes; fail early with a
    # message that names the offending argument.
    if block_size < 3 or block_size % 2 == 0:
        raise ValueError(
            f"block_size must be an odd integer >= 3, got {block_size!r}"
        )

    return cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=block_size,
        C=c,
    )
|
|
|
|
|
def denoise(gray: np.ndarray) -> np.ndarray:
    """Denoise *gray* with OpenCV's non-local-means filter.

    The filter strength is fixed at ``h=10``; the intent is light
    smoothing that keeps text edges intact.
    """
    filter_strength = 10
    cleaned = cv2.fastNlMeansDenoising(gray, h=filter_strength)
    return cleaned
|
|
|
|
|
def preprocess_for_ocr(image: Image.Image) -> Image.Image:
    """Run the whole cleanup pipeline on a page image.

    The image is converted to an OpenCV array and then passed, in
    order, through grayscale conversion, deskewing, denoising and
    adaptive thresholding.

    Returns a cleaned PIL Image ready for pytesseract.
    """
    page = pil_to_cv(image)
    # Each stage takes an array and returns an array, so they chain.
    for stage in (to_grayscale, deskew, denoise, adaptive_threshold):
        page = stage(page)
    return Image.fromarray(page)
|
|
|
|
|
def is_mostly_blank(image: Image.Image, threshold: float = 0.98) -> bool:
    """Return True if the image is almost entirely white (blank page).

    Args:
        image: Page image; it is converted to 8-bit grayscale first.
        threshold: Fraction of pixels that must be brighter than 240
            for the page to count as blank (strictly greater than).

    Returns:
        A plain Python ``bool``. An image with zero pixels is reported
        as not blank.
    """
    gray = np.array(image.convert("L"))

    # Guard the division below: with no pixels there is no ratio to
    # compute, and the answer stays False without a NumPy warning.
    if gray.size == 0:
        return False

    white_ratio = np.count_nonzero(gray > 240) / gray.size
    # Wrap in bool() so callers get the annotated type rather than a
    # numpy.bool_, which fails `is True` checks and JSON serialisation.
    return bool(white_ratio > threshold)
|
|
|