# mvm2-math-verification / image_enhancing.py
# Author: Antigravity Agent
# Commit: feat: Add Gradio app.py and consolidated MVM2 core modules for HF Space deployment
# Revision: bb6d5ae
import cv2
import numpy as np
from pathlib import Path
from typing import Dict, Any, Tuple, Union
import logging
from PIL import Image
logger = logging.getLogger(__name__)
class ImageEnhancer:
    """
    Applies the handwritten-math-optimized preprocessing stack from the MVM² architecture.

    Pipeline:
      1. Robust loading from multiple input types (path / bytes / numpy / PIL).
      2. Convert to 8-bit grayscale and measure initial contrast.
      3. Apply Gaussian blur (stabilizes stroke noise for handwriting).
      4. Apply CLAHE to locally boost contrast on notebook paper.
      5. Optionally apply adaptive binarization if the page is low contrast.

    All tuning constants are constructor parameters; their defaults reproduce
    the values the pipeline was originally hard-coded with.
    """

    def __init__(
        self,
        sigma: float = 1.2,
        *,
        clahe_clip_limit: float = 2.0,
        clahe_tile_grid: Tuple[int, int] = (8, 8),
        contrast_threshold: float = 60.0,
        threshold_block_size: int = 11,
        threshold_c: float = 2.0,
    ):
        """
        Args:
            sigma: Gaussian standard deviation used for the blur step; the
                default is tuned for typical notebook handwriting.
            clahe_clip_limit: ``clipLimit`` passed to ``cv2.createCLAHE``.
            clahe_tile_grid: ``tileGridSize`` passed to ``cv2.createCLAHE``.
            contrast_threshold: Binarization is skipped when the initial
                contrast is strictly greater than this value.
            threshold_block_size: Neighbourhood size for
                ``cv2.adaptiveThreshold``; must be odd and at least 3.
            threshold_c: Constant subtracted by ``cv2.adaptiveThreshold``.

        Raises:
            ValueError: If ``sigma`` is not positive or
                ``threshold_block_size`` is not an odd integer >= 3.
        """
        # The blur is called with ksize=(0, 0), so the kernel is derived from
        # sigma; a non-positive sigma would only fail later inside OpenCV.
        if not sigma > 0:
            raise ValueError(f"sigma must be positive, got {sigma}")
        if threshold_block_size < 3 or threshold_block_size % 2 == 0:
            raise ValueError(
                f"threshold_block_size must be an odd integer >= 3, got {threshold_block_size}"
            )
        self.sigma = sigma
        self.clahe_clip_limit = clahe_clip_limit
        self.clahe_tile_grid = tuple(clahe_tile_grid)
        self.contrast_threshold = contrast_threshold
        self.threshold_block_size = threshold_block_size
        self.threshold_c = threshold_c

    def calculate_contrast(self, gray_img: np.ndarray) -> float:
        """
        Simple contrast proxy: standard deviation of grayscale intensities.

        Returns 0.0 for ``None`` or an empty array.
        """
        if gray_img is None or gray_img.size == 0:
            return 0.0
        return float(gray_img.std())

    @staticmethod
    def _load(image_source: Any) -> np.ndarray:
        """Turn any supported input into a non-empty 2-D or 3-D numpy image."""
        if isinstance(image_source, (str, Path)):
            img = cv2.imread(str(image_source))
            if img is None:
                raise ValueError(f"Could not load image at {image_source}")
            return img
        if isinstance(image_source, (bytes, bytearray)):
            # Reject an empty buffer up front so the failure is the same
            # ValueError as any other undecodable payload.
            if len(image_source) == 0:
                raise ValueError("Could not decode image from bytes")
            img = cv2.imdecode(np.frombuffer(image_source, np.uint8), cv2.IMREAD_COLOR)
            if img is None:
                raise ValueError("Could not decode image from bytes")
            return img
        if isinstance(image_source, np.ndarray):
            if image_source.size == 0 or image_source.ndim not in (2, 3):
                raise ValueError(
                    f"Unsupported image array shape: {image_source.shape}"
                )
            return image_source
        if isinstance(image_source, Image.Image):
            # Gradio hands us a PIL Image when type="pil"; convert to OpenCV BGR.
            return cv2.cvtColor(np.array(image_source.convert("RGB")), cv2.COLOR_RGB2BGR)
        raise ValueError(f"Unsupported image source type: {type(image_source)}")

    @staticmethod
    def _to_uint8(img: np.ndarray) -> np.ndarray:
        """Coerce an image to uint8, which the CLAHE and threshold steps are fed."""
        if img.dtype == np.uint8:
            return img
        if img.dtype == np.bool_:
            return img.astype(np.uint8) * 255
        if img.dtype == np.uint16:
            # Keep the high byte: maps the full 16-bit range onto 0..255.
            return (img >> 8).astype(np.uint8)
        if np.issubdtype(img.dtype, np.floating):
            data = np.nan_to_num(
                img.astype(np.float64), nan=0.0, posinf=255.0, neginf=0.0
            )
            # Heuristic: a float image whose maximum is <= 1.0 is treated as
            # being on a [0, 1] scale; anything else as already 0..255.
            if data.max() <= 1.0:
                data = data * 255.0
            return np.clip(np.rint(data), 0, 255).astype(np.uint8)
        if np.issubdtype(img.dtype, np.integer):
            return np.clip(img, 0, 255).astype(np.uint8)
        raise ValueError(f"Unsupported image dtype: {img.dtype}")

    @staticmethod
    def _to_gray(img: np.ndarray) -> np.ndarray:
        """Reduce a 1-, 3- or 4-channel (or already 2-D) uint8 image to one channel."""
        if img.ndim == 2:
            return img
        channels = img.shape[2]
        if channels == 1:
            return img[:, :, 0]
        if channels == 3:
            return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        if channels == 4:
            return cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)
        raise ValueError(f"Unsupported image array shape: {img.shape}")

    def enhance(
        self,
        # Quoted so the annotation is not evaluated when the class is defined.
        image_source: "Union[str, Path, bytes, bytearray, np.ndarray, Image.Image]",
        skip_binarization: bool = False,
    ) -> Tuple[np.ndarray, Dict[str, Any]]:
        """
        Core handwritten-math enhancement routine (CLAHE + Gaussian blur + optional binarization).

        Supports:
          - str / Path: filesystem path to an image.
          - bytes / bytearray: raw encoded image bytes.
          - np.ndarray: BGR / BGRA / grayscale OpenCV image (2-D or 3-D).
            Non-uint8 arrays are converted to uint8 first.
          - PIL.Image.Image: Gradio / HF directly supplies PIL objects.

        Args:
            image_source: The image to enhance, in any of the forms above.
            skip_binarization: When True, never apply adaptive thresholding.

        Returns:
            ``(final_img, metadata)`` where ``final_img`` is a single-channel
            uint8 image and ``metadata`` holds the input resolution plus the
            initial/final contrast, the sigma used and whether binarization
            was applied.

        Raises:
            ValueError: If the source type is unsupported, or the image cannot
                be loaded, decoded, or has an unsupported shape or dtype.
        """
        img = self._load(image_source)
        height, width = img.shape[:2]

        # Always work in 8-bit grayscale for the enhancer. The contiguous copy
        # is a no-op for arrays that are already contiguous.
        gray = np.ascontiguousarray(self._to_gray(self._to_uint8(img)))
        initial_contrast = self.calculate_contrast(gray)

        # Gaussian blur; ksize=(0, 0) lets OpenCV derive the kernel from sigma.
        blurred = cv2.GaussianBlur(gray, (0, 0), sigmaX=self.sigma, sigmaY=self.sigma)

        # CLAHE (Contrast Limited Adaptive Histogram Equalization)
        clahe = cv2.createCLAHE(
            clipLimit=self.clahe_clip_limit, tileGridSize=self.clahe_tile_grid
        )
        clahe_img = clahe.apply(blurred)

        # Adaptive binarization only when the notebook page is low-contrast.
        bin_applied = not (
            skip_binarization or initial_contrast > self.contrast_threshold
        )
        if bin_applied:
            final_img = cv2.adaptiveThreshold(
                clahe_img,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                self.threshold_block_size,
                self.threshold_c,
            )
        else:
            final_img = clahe_img

        final_contrast = self.calculate_contrast(final_img)
        metadata = {
            "resolution": {"width": width, "height": height},
            "metrics": {
                "initial_contrast": round(initial_contrast, 2),
                "final_contrast": round(final_contrast, 2),
                "blur_sigma_used": self.sigma,
                "binarization_applied": bin_applied,
            },
        }
        return final_img, metadata