Spaces:

xeeshan404
/

agentic-image2word

Sleeping

App Files Files Community

agentic-image2word / core /preprocessor.py

xeeshan404

Initial deployment: Agentic Image2Word Converter

5f3b8de verified 28 days ago

raw

history blame contribute delete

10.9 kB

	"""
	Image Preprocessor — Adaptive image analysis and preprocessing for OCR.

	Analyzes image properties (resolution, contrast, noise, skew) and applies
	optimal preprocessing pipeline. Part of the agentic system's perception layer.
	"""

	import cv2
	import numpy as np
	from PIL import Image
	from dataclasses import dataclass, field
	from typing import Optional, Tuple
	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	@dataclass
	class ImageProperties:
	    """Measurements collected while analysing an input image.

	    Every field has a default, so an empty instance can be created first and
	    then filled in attribute by attribute. Score fields use a 0-1 scale.
	    """

	    # --- Geometry and metadata ---
	    width: int = 0
	    height: int = 0
	    resolution_dpi: int = 72

	    # --- Tonal statistics, each on a 0-1 scale ---
	    contrast_score: float = 0.0
	    brightness_score: float = 0.0

	    # Noise on a 0-1 scale; a higher value means a noisier image.
	    noise_level: float = 0.0

	    # Skew angle in degrees.
	    skew_angle: float = 0.0

	    # --- Pixel-format flags ---
	    is_grayscale: bool = False
	    is_binary: bool = False

	    # Sharpness on a 0-1 scale.
	    sharpness_score: float = 0.0

	    # One of "excellent", "good", "fair", "poor"; defaults to "unknown".
	    quality_rating: str = "unknown"


	@dataclass
	class PreprocessingConfig:
	    """Switches and parameters for each step of the preprocessing pipeline.

	    The defaults describe a generic pipeline; the values are intended to be
	    adjusted according to the results of image analysis.
	    """

	    # --- Grayscale conversion ---
	    apply_grayscale: bool = True

	    # --- CLAHE contrast enhancement ---
	    apply_clahe: bool = True
	    clahe_clip_limit: float = 2.0
	    clahe_grid_size: Tuple[int, int] = (8, 8)

	    # --- Denoising ---
	    apply_denoise: bool = True
	    denoise_strength: int = 10

	    # --- Binarization ---
	    apply_binarize: bool = True
	    # Accepted values: "otsu", "adaptive", "none".
	    binarize_method: str = "otsu"

	    # --- Optional geometric / detail steps (off by default) ---
	    apply_deskew: bool = False
	    apply_sharpen: bool = False
	    apply_resize: bool = False

	    # DPI that upscaling aims for.
	    target_dpi: int = 300


	def analyze_image(image_path: str) -> ImageProperties:
	    """
	    Analyze an image and extract the properties the agent uses to make
	    preprocessing decisions.

	    Args:
	        image_path: Path to the input image.

	    Returns:
	        ImageProperties with analysis results.

	    Raises:
	        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
	    """
	    img = cv2.imread(image_path)
	    if img is None:
	        raise FileNotFoundError(f"Cannot open image: {image_path}")

	    props = ImageProperties()
	    props.height, props.width = img.shape[:2]

	    # cv2.imread() with its default flag always yields a 3-channel BGR array,
	    # so the channel count alone cannot reveal a grayscale source. For
	    # 3-channel data the image counts as grayscale when all channels match.
	    if img.ndim == 2:
	        gray = img
	        props.is_grayscale = True
	    elif img.shape[2] == 1:
	        gray = img[:, :, 0]
	        props.is_grayscale = True
	    else:
	        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	        blue, green, red = img[:, :, 0], img[:, :, 1], img[:, :, 2]
	        props.is_grayscale = bool(
	            np.array_equal(blue, green) and np.array_equal(green, red)
	        )

	    # Convert once; the float copy is shared by the statistics below.
	    gray_f = gray.astype(np.float64)

	    # Contrast score (standard deviation of pixel values, normalized, capped at 1)
	    props.contrast_score = float(min(np.std(gray_f) / 80.0, 1.0))

	    # Brightness score (mean pixel value, normalized)
	    props.brightness_score = float(np.mean(gray_f) / 255.0)

	    # Sharpness score (variance of the Laplacian, normalized, capped at 1)
	    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
	    props.sharpness_score = float(min(laplacian_var / 500.0, 1.0))

	    # A high Laplacian response can also mean noise, so noise is estimated
	    # separately as the mean absolute difference from a Gaussian-blurred copy.
	    blur = cv2.GaussianBlur(gray, (5, 5), 0)
	    noise_estimate = np.mean(np.abs(gray_f - blur.astype(np.float64)))
	    props.noise_level = float(min(noise_estimate / 30.0, 1.0))

	    # Treat the image as already binary when it has at most 10 distinct levels
	    props.is_binary = len(np.unique(gray)) <= 10

	    # Skew detection via Hough Line Transform
	    props.skew_angle = _detect_skew(gray)

	    # DPI from image metadata; stay at 72 when it is missing, unreadable or
	    # not a positive number.
	    props.resolution_dpi = 72
	    try:
	        # The context manager closes the file handle Pillow opens.
	        with Image.open(image_path) as pil_img:
	            dpi_info = pil_img.info.get('dpi', (72, 72))
	        dpi = int(dpi_info[0]) if isinstance(dpi_info, (tuple, list)) else int(dpi_info)
	        if dpi > 0:
	            props.resolution_dpi = dpi
	    except Exception as exc:
	        # Best effort only: DPI metadata is optional, so never fail the analysis.
	        logger.debug("Could not read DPI metadata for %s: %s", image_path, exc)

	    # Overall quality rating
	    props.quality_rating = _rate_quality(props)

	    logger.info(
	        "Image analysis: %sx%s, contrast=%.2f, noise=%.2f, quality=%s",
	        props.width, props.height,
	        props.contrast_score, props.noise_level, props.quality_rating,
	    )

	    return props


	def _detect_skew(gray: np.ndarray) -> float:
	    """Estimate text skew from near-horizontal Hough line segments.

	    Returns the median angle, in degrees, of the detected segments that lie
	    within 15 degrees of horizontal. Returns 0.0 when no such segment is found
	    or when detection raises (the failure is logged as a warning).
	    """
	    try:
	        edge_map = cv2.Canny(gray, 50, 150, apertureSize=3)
	        # A segment must span at least a quarter of the image width.
	        quarter_width = gray.shape[1] // 4
	        segments = cv2.HoughLinesP(
	            edge_map, 1, np.pi / 180, 100,
	            minLineLength=quarter_width,
	            maxLineGap=10,
	        )

	        if segments is not None and len(segments) > 0:
	            slopes = (
	                np.degrees(np.arctan2(y_end - y_start, x_end - x_start))
	                for x_start, y_start, x_end, y_end in (seg[0] for seg in segments)
	            )
	            # Keep only the segments that are close to horizontal.
	            near_horizontal = [deg for deg in slopes if abs(deg) < 15]
	            if near_horizontal:
	                return float(np.median(near_horizontal))
	    except Exception as e:
	        logger.warning(f"Skew detection failed: {e}")

	    return 0.0


	def _rate_quality(props: ImageProperties) -> str:
	    """Grade an image for OCR as "excellent", "good", "fair" or "poor".

	    Resolution, contrast, noise and sharpness each contribute 1 to 3 points;
	    the label is chosen from the point total.
	    """

	    def _tier(top: bool, middle: bool) -> int:
	        # 3 points for the best band, 2 for the middle band, otherwise 1.
	        if top:
	            return 3
	        return 2 if middle else 1

	    dpi = props.resolution_dpi
	    contrast = props.contrast_score
	    noise = props.noise_level
	    sharpness = props.sharpness_score

	    total = (
	        _tier(dpi >= 300, dpi >= 150)               # resolution
	        + _tier(contrast > 0.6, contrast > 0.3)     # contrast
	        + _tier(noise < 0.2, noise < 0.5)           # noise (lower is better)
	        + _tier(sharpness > 0.4, sharpness > 0.15)  # sharpness
	    )

	    # Minimum totals, checked from the best label downwards.
	    for floor, label in ((10, "excellent"), (7, "good"), (5, "fair")):
	        if total >= floor:
	            return label
	    return "poor"


	def determine_preprocessing(props: ImageProperties) -> PreprocessingConfig:
	    """
	    Agent decision function: determine optimal preprocessing based on
	    image properties. This is the intelligence layer for preprocessing.

	    Starts from the default PreprocessingConfig and adjusts it rule by rule.
	    Each rule that fires is logged at INFO level.

	    Args:
	        props: Image properties from analysis.

	    Returns:
	        PreprocessingConfig with recommended preprocessing steps.
	    """
	    config = PreprocessingConfig()

	    if props.is_binary:
	        # Already binary: skip CLAHE and binarization. The contrast rules below
	        # are bypassed on purpose so that a sparse binary page (low standard
	        # deviation) cannot switch CLAHE back on.
	        config.apply_binarize = False
	        config.apply_clahe = False
	        logger.info("Image already binary — skipping CLAHE and binarization")
	    elif props.contrast_score < 0.3:
	        # Low contrast → stronger CLAHE
	        config.apply_clahe = True
	        config.clahe_clip_limit = 3.0
	        logger.info("Low contrast detected — increasing CLAHE clip limit")
	    elif props.contrast_score > 0.7:
	        config.apply_clahe = False
	        logger.info("High contrast — CLAHE not needed")

	    # High noise → stronger denoising; very low noise → no denoising
	    if props.noise_level > 0.5:
	        config.apply_denoise = True
	        config.denoise_strength = 15
	        logger.info("High noise — increasing denoise strength")
	    elif props.noise_level < 0.15:
	        config.apply_denoise = False
	        logger.info("Low noise — denoising not needed")

	    # Skewed by more than half a degree → deskew
	    if abs(props.skew_angle) > 0.5:
	        config.apply_deskew = True
	        logger.info("Skew detected (%.1f°) — enabling deskew", props.skew_angle)

	    # Low sharpness → sharpen
	    if props.sharpness_score < 0.15:
	        config.apply_sharpen = True
	        logger.info("Low sharpness — enabling sharpening")

	    # Low resolution → upscale, but only when the longest side is under 1500 px
	    if props.resolution_dpi < 150 and max(props.width, props.height) < 1500:
	        config.apply_resize = True
	        logger.info("Low resolution — enabling upscaling")

	    # Very dark or very bright overall → adaptive binarization
	    if props.brightness_score < 0.3 or props.brightness_score > 0.7:
	        config.binarize_method = "adaptive"
	        logger.info("Uneven brightness — using adaptive binarization")

	    return config


	def preprocess_image(image_path: str, config: Optional[PreprocessingConfig] = None) -> np.ndarray:
	    """
	    Preprocess an image for OCR based on the given configuration.

	    Steps run in a fixed order: grayscale, resize, deskew, CLAHE, denoise,
	    sharpen, binarize. Each step after grayscale is gated by its config flag.

	    Args:
	        image_path: Path to the input image.
	        config: Preprocessing configuration. If None, auto-determine.

	    Returns:
	        Preprocessed single-channel image as numpy array.

	    Raises:
	        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
	    """
	    img = cv2.imread(image_path)
	    if img is None:
	        raise FileNotFoundError(f"Cannot open image: {image_path}")

	    if config is None:
	        config = determine_preprocessing(analyze_image(image_path))

	    # Step 1: Grayscale. The image is always reduced to a single channel,
	    # whatever config.apply_grayscale says, because every later step here
	    # works on single-channel data. A 3-D single-channel array is sliced
	    # rather than passed to cvtColor, which expects 3 channels for BGR2GRAY.
	    if img.ndim == 2:
	        gray = img
	    elif img.shape[2] == 1:
	        gray = img[:, :, 0]
	    else:
	        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	    # Step 2: Resize/upscale
	    if config.apply_resize:
	        # Scale relative to a 72 DPI baseline, capped at 3x.
	        scale = min(config.target_dpi / 72.0, 3.0)
	        new_w = int(gray.shape[1] * scale)
	        new_h = int(gray.shape[0] * scale)
	        gray = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
	        logger.info("Resized to %sx%s", new_w, new_h)

	    # Step 3: Deskew
	    if config.apply_deskew:
	        gray = _deskew(gray)

	    # Step 4: CLAHE contrast enhancement
	    if config.apply_clahe:
	        clahe = cv2.createCLAHE(
	            clipLimit=config.clahe_clip_limit,
	            tileGridSize=config.clahe_grid_size,
	        )
	        gray = clahe.apply(gray)

	    # Step 5: Denoise
	    if config.apply_denoise:
	        gray = cv2.fastNlMeansDenoising(gray, h=config.denoise_strength)

	    # Step 6: Sharpen with a 3x3 kernel whose weights sum to 1
	    if config.apply_sharpen:
	        kernel = np.array(
	            [[-1, -1, -1],
	             [-1, 9, -1],
	             [-1, -1, -1]],
	            dtype=np.float32,
	        )
	        gray = cv2.filter2D(gray, -1, kernel)

	    # Step 7: Binarize
	    if config.apply_binarize:
	        method = config.binarize_method
	        if method == "otsu":
	            _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
	        elif method == "adaptive":
	            gray = cv2.adaptiveThreshold(
	                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	                cv2.THRESH_BINARY, 11, 2,
	            )
	        elif method != "none":
	            # Report a misconfigured method instead of skipping it silently.
	            logger.warning("Unknown binarize_method %r — skipping binarization", method)

	    return gray


	def _deskew(image: np.ndarray) -> np.ndarray:
	    """Correct text skew in a single-channel image.

	    The skew is estimated from the minimum-area rectangle around the text
	    pixels, and the image is rotated about its centre to cancel it. The input
	    is returned unchanged when fewer than 100 text pixels are found, when the
	    estimated skew is below 0.5 degrees, or when any step raises (the failure
	    is logged as a warning).

	    Args:
	        image: Single-channel image. Assumed to be 8-bit, which Otsu
	            thresholding requires; other inputs end up on the failure path.

	    Returns:
	        The rotated image (same size as the input) or the input itself.
	    """
	    try:
	        # Separate text from background first. Testing `image > 0` directly
	        # selects almost every pixel of a grayscale or white-background page,
	        # which makes the bounding rectangle the whole frame.
	        _, binary = cv2.threshold(
	            image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
	        )
	        text_mask = binary > 0
	        # Text is expected to be the minority class; flip the mask when the
	        # darker class dominates (light text on a dark background).
	        if np.count_nonzero(text_mask) * 2 > text_mask.size:
	            text_mask = ~text_mask

	        rows, cols = np.nonzero(text_mask)
	        if rows.size < 100:
	            return image

	        # minAreaRect expects (x, y) points, i.e. (column, row).
	        points = np.column_stack((cols, rows)).astype(np.float32)
	        rect_angle = cv2.minAreaRect(points)[-1]

	        # The reported angle range differs between OpenCV versions
	        # ([-90, 0) before 4.5.1, (0, 90] after). A rectangle's orientation is
	        # only defined modulo 90°, so fold it into [-45, 45).
	        angle = ((rect_angle + 45.0) % 90.0) - 45.0

	        if abs(angle) < 0.5:
	            return image

	        h, w = image.shape[:2]
	        center = (w // 2, h // 2)
	        # The rectangle angle is clockwise-positive, while
	        # getRotationMatrix2D treats positive angles as counter-clockwise, so
	        # rotating by `angle` cancels the measured skew.
	        M = cv2.getRotationMatrix2D(center, angle, 1.0)
	        rotated = cv2.warpAffine(
	            image, M, (w, h),
	            flags=cv2.INTER_CUBIC,
	            borderMode=cv2.BORDER_REPLICATE,
	        )
	        logger.info("Deskewed by %.2f°", angle)
	        return rotated
	    except Exception as e:
	        # Deskewing is best effort; fall back to the unrotated image.
	        logger.warning(f"Deskew failed: {e}")
	        return image