# handbook-ocr-engine/app/services/preprocessing.py
# Author: internationalscholarsprogram
# Initial deploy: ISP Handbook OCR Engine
# Commit b12284c (verified)
"""Image preprocessing for OCR accuracy.
Applies grayscale, deskew, thresholding, and noise reduction to
scanned page images before feeding them to Tesseract.
"""
from __future__ import annotations
import logging
import cv2
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
def pil_to_cv(image: Image.Image) -> np.ndarray:
    """Return *image* as an OpenCV-style array with channels in BGR order.

    The image is converted to RGB mode first, so the channel swap always
    operates on three colour channels.
    """
    return cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
def cv_to_pil(img: np.ndarray) -> Image.Image:
    """Build a PIL Image from an OpenCV array whose channels are in BGR order."""
    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
def to_grayscale(img: np.ndarray) -> np.ndarray:
    """Collapse a three-dimensional BGR array to a single gray channel.

    Arrays that are not three-dimensional are handed back untouched (the
    very same object, not a copy).
    """
    if img.ndim != 3:
        return img
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
def deskew(gray: np.ndarray, max_angle: float = 10.0) -> np.ndarray:
    """Correct slight page rotation using the ink's minimum-area rectangle.

    Pixels darker than 128 are treated as ink. The tilt of the smallest
    rotated rectangle enclosing them is taken as the skew, and the image is
    rotated about its centre by that angle to level it. Only rotations up to
    `max_angle` degrees are corrected, to avoid false positives on
    non-skewed images.

    Args:
        gray: Single-channel (2-D) image; dark content on a light background
            is assumed, since ink is detected as ``gray < 128``.
        max_angle: Largest correction, in degrees, that will be applied.

    Returns:
        The rotated image (same width and height as the input), or ``gray``
        itself when fewer than 100 ink pixels are found, or when the
        estimated angle exceeds ``max_angle`` or is smaller than 0.2 degrees.
    """
    ink = np.where(gray < 128)
    if ink[0].size < 100:
        return gray  # not enough ink to detect angle

    # np.where yields (row, col) index arrays, but cv2.minAreaRect expects
    # (x, y) points. Passing (row, col) mirrors the point cloud across the
    # diagonal, which negates the measured angle and makes the rotation below
    # worsen the skew instead of removing it. Reverse the order to get (x, y).
    points = np.column_stack(ink[::-1])
    angle = cv2.minAreaRect(points)[-1]

    # The rectangle's angle is only defined modulo 90 degrees: older OpenCV
    # releases report it in [-90, 0), newer ones in (0, 90]. Fold either
    # convention into [-45, 45] so it reads as a signed skew.
    if angle < -45:
        angle = 90 + angle
    elif angle > 45:
        angle = angle - 90

    if abs(angle) > max_angle or abs(angle) < 0.2:
        return gray  # skip if angle too large or negligible

    h, w = gray.shape[:2]
    center = (w // 2, h // 2)
    mat = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(
        gray,
        mat,
        (w, h),
        flags=cv2.INTER_CUBIC,
        # Replicate edge pixels so the corners exposed by the rotation do not
        # become black wedges that later stages would treat as ink.
        borderMode=cv2.BORDER_REPLICATE,
    )
    logger.debug("Deskewed by %.2f°", angle)
    return rotated
def adaptive_threshold(
    gray: np.ndarray,
    *,
    block_size: int = 31,
    c: float = 11,
) -> np.ndarray:
    """Apply adaptive Gaussian thresholding for mixed lighting.

    Args:
        gray: Image handed to ``cv2.adaptiveThreshold`` as the source.
        block_size: Side length, in pixels, of the neighbourhood used to
            compute each local threshold. Must be odd and greater than 1.
        c: Constant subtracted from the Gaussian-weighted neighbourhood mean.

    Returns:
        The thresholded image produced with ``cv2.THRESH_BINARY`` and a
        maximum value of 255.

    Raises:
        ValueError: If ``block_size`` is even or not greater than 1.
    """
    # Validate up front so a bad setting fails with a readable message rather
    # than an assertion error from inside OpenCV.
    if block_size <= 1 or block_size % 2 == 0:
        raise ValueError(
            f"block_size must be an odd integer greater than 1, got {block_size!r}"
        )
    return cv2.adaptiveThreshold(
        gray,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=block_size,
        C=c,
    )
def denoise(gray: np.ndarray) -> np.ndarray:
    """Smooth scanner noise with OpenCV's non-local-means filter.

    The filter strength is fixed at ``h=10``, intended as a light pass that
    keeps text edges intact.
    """
    cleaned = cv2.fastNlMeansDenoising(gray, h=10)
    return cleaned
def preprocess_for_ocr(image: Image.Image) -> Image.Image:
    """Run the whole cleanup chain on one scanned page.

    The page is converted to a grayscale OpenCV array and then passed, in
    order, through deskewing, denoising and adaptive thresholding. The
    result is wrapped back into a PIL Image ready for pytesseract.
    """
    page = to_grayscale(pil_to_cv(image))
    # Order matters: each stage consumes the previous stage's output.
    for stage in (deskew, denoise, adaptive_threshold):
        page = stage(page)
    return Image.fromarray(page)
def is_mostly_blank(image: Image.Image, threshold: float = 0.98) -> bool:
    """Return True if the image is almost entirely white (blank page).

    Args:
        image: Page to inspect; it is converted to 8-bit grayscale ("L").
        threshold: Fraction of pixels that must be brighter than 240 for the
            page to count as blank. The comparison is strict, so a ratio
            exactly equal to ``threshold`` is not blank.

    Returns:
        A builtin ``bool``. An image with no pixels at all is reported as
        blank, since there is nothing on it to recognise.
    """
    gray = np.array(image.convert("L"))
    if gray.size == 0:
        # Avoid a 0/0 division (NaN plus a RuntimeWarning) on empty images.
        return True
    white_ratio = np.sum(gray > 240) / gray.size
    # The comparison yields numpy.bool_; convert so callers get a real bool
    # (identity checks and JSON serialisation both need one).
    return bool(white_ratio > threshold)