Spaces:

hashirama7x
/

multimodal-rag

Build error

multimodal-rag / src /preprocessing /cv_pipeline.py

itachi

Initial deployment

a809248 2 months ago

17.2 kB

	"""
	Computer Vision Pipeline for Document Image Processing.
	Handles image preprocessing, enhancement, and OCR extraction.
	"""

	import re
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Dict, List, Optional, Tuple, Union

	import cv2
	import numpy as np
	from PIL import Image

	from ..utils import get_logger, LoggerMixin

	logger = get_logger(__name__)


	@dataclass
	class OCRResult:
	    """
	    Result of running OCR on one image or page.

	    Attributes:
	        text: Extracted text.
	        confidence: Confidence value reported for the extraction.
	        bounding_boxes: Per-word box dictionaries (empty list by default).
	        page_number: Page index of the source, defaulting to 1.
	        metadata: Free-form extra information (empty dict by default).
	    """

	    text: str
	    confidence: float
	    bounding_boxes: List[Dict] = field(default_factory=list)
	    page_number: int = 1
	    metadata: Dict = field(default_factory=dict)

	    def to_dict(self) -> Dict:
	        """Return the fields as a plain dictionary (values are not copied)."""
	        exported = (
	            "text",
	            "confidence",
	            "bounding_boxes",
	            "page_number",
	            "metadata",
	        )
	        return {name: getattr(self, name) for name in exported}


	@dataclass
	class ImageQuality:
	    """
	    Quality metrics computed for a single image.

	    Attributes:
	        blur_score: Sharpness metric.
	        contrast_score: Contrast metric.
	        brightness_score: Brightness metric.
	        noise_level: Estimated noise metric.
	        is_acceptable: Overall verdict on whether the image is usable.
	    """

	    blur_score: float
	    contrast_score: float
	    brightness_score: float
	    noise_level: float
	    is_acceptable: bool

	    def to_dict(self) -> Dict:
	        """Return the metrics as a plain dictionary keyed by field name."""
	        exported = (
	            "blur_score",
	            "contrast_score",
	            "brightness_score",
	            "noise_level",
	            "is_acceptable",
	        )
	        return {name: getattr(self, name) for name in exported}


	class ImagePreprocessor(LoggerMixin):
	    """
	    Image preprocessing for OCR optimization.

	    Applies the following steps, in this order, to improve OCR accuracy:
	    - Grayscale conversion
	    - Contrast enhancement (CLAHE)
	    - Noise reduction (optional)
	    - Deskewing (optional)
	    - Adaptive thresholding (optional)
	    """

	    def __init__(
	        self,
	        target_dpi: int = 300,
	        denoise_strength: int = 10,
	        adaptive_threshold_block_size: int = 11,
	        adaptive_threshold_c: int = 2
	    ):
	        """
	        Store the preprocessing parameters.

	        Args:
	            target_dpi: Stored on the instance; no method of this class
	                reads it.
	            denoise_strength: Passed as ``h`` to ``cv2.fastNlMeansDenoising``.
	            adaptive_threshold_block_size: Neighbourhood size for
	                ``cv2.adaptiveThreshold``. Coerced to an odd value >= 3
	                at the point of use.
	            adaptive_threshold_c: Constant ``C`` for ``cv2.adaptiveThreshold``.
	        """
	        self.target_dpi = target_dpi
	        self.denoise_strength = denoise_strength
	        self.adaptive_threshold_block_size = adaptive_threshold_block_size
	        self.adaptive_threshold_c = adaptive_threshold_c

	    @staticmethod
	    def _to_grayscale(image: np.ndarray) -> np.ndarray:
	        """
	        Return a single-channel version of ``image``.

	        Three-dimensional arrays are treated as BGR (3 channels), BGRA
	        (4 channels) or an already single-channel image (1 channel); any
	        other dimensionality is returned as a copy.
	        """
	        if image.ndim != 3:
	            return image.copy()

	        channels = image.shape[2]
	        if channels == 1:
	            # cvtColor rejects (H, W, 1) input; just drop the channel axis.
	            return image[:, :, 0].copy()
	        if channels == 4:
	            return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
	        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

	    def preprocess(
	        self,
	        image: np.ndarray,
	        apply_deskew: bool = True,
	        apply_denoise: bool = True,
	        apply_threshold: bool = True
	    ) -> np.ndarray:
	        """
	        Apply full preprocessing pipeline to an image.

	        Args:
	            image: Input image (BGR, BGRA or grayscale)
	            apply_deskew: Whether to apply deskewing
	            apply_denoise: Whether to apply noise reduction
	            apply_threshold: Whether to apply adaptive thresholding

	        Returns:
	            Preprocessed single-channel image
	        """
	        self.logger.debug("Starting image preprocessing pipeline")

	        gray = self._to_grayscale(image)

	        # CLAHE is always applied; the remaining steps are opt-out.
	        gray = self._apply_clahe(gray)

	        if apply_denoise:
	            gray = self._denoise(gray)

	        if apply_deskew:
	            gray = self._deskew(gray)

	        if apply_threshold:
	            gray = self._adaptive_threshold(gray)

	        self.logger.debug("Preprocessing pipeline completed")
	        return gray

	    def _apply_clahe(self, image: np.ndarray) -> np.ndarray:
	        """Apply Contrast Limited Adaptive Histogram Equalization."""
	        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
	        return clahe.apply(image)

	    def _denoise(self, image: np.ndarray) -> np.ndarray:
	        """Apply non-local means denoising with ``self.denoise_strength``."""
	        return cv2.fastNlMeansDenoising(
	            image,
	            None,
	            h=self.denoise_strength,
	            templateWindowSize=7,
	            searchWindowSize=21
	        )

	    def _deskew(self, image: np.ndarray) -> np.ndarray:
	        """
	        Correct image skew using the probabilistic Hough transform.

	        Detects line segments, takes the median angle of those within
	        45 degrees of horizontal, and rotates the image by that angle.
	        The input is returned unchanged when no usable segment is found
	        or when the median angle is below 0.5 degrees.
	        """
	        edges = cv2.Canny(image, 50, 150, apertureSize=3)

	        lines = cv2.HoughLinesP(
	            edges,
	            1,
	            np.pi / 180,
	            threshold=100,
	            minLineLength=100,
	            maxLineGap=10
	        )

	        if lines is None or len(lines) == 0:
	            return image

	        # Each detected segment is (x1, y1, x2, y2); compute all angles at once.
	        segments = np.asarray(lines).reshape(-1, 4)
	        dx = segments[:, 2] - segments[:, 0]
	        dy = segments[:, 3] - segments[:, 1]
	        all_angles = np.arctan2(dy, dx) * 180 / np.pi

	        # Keep only near-horizontal segments; steep ones are not text baselines.
	        angles = all_angles[np.abs(all_angles) < 45]
	        if angles.size == 0:
	            return image

	        # The median is robust against a few stray segments.
	        median_angle = float(np.median(angles))

	        if abs(median_angle) < 0.5:  # Skip if angle is negligible
	            return image

	        (h, w) = image.shape[:2]
	        center = (w // 2, h // 2)
	        rotation_matrix = cv2.getRotationMatrix2D(center, median_angle, 1.0)
	        rotated = cv2.warpAffine(
	            image,
	            rotation_matrix,
	            (w, h),
	            flags=cv2.INTER_CUBIC,
	            borderMode=cv2.BORDER_REPLICATE
	        )

	        self.logger.debug(f"Deskewed image by {median_angle:.2f} degrees")
	        return rotated

	    def _adaptive_threshold(self, image: np.ndarray) -> np.ndarray:
	        """Apply adaptive Gaussian thresholding, producing a 0/255 image."""
	        # cv2.adaptiveThreshold only accepts an odd block size greater
	        # than 1, so round even values up and enforce a minimum of 3.
	        block_size = max(3, int(self.adaptive_threshold_block_size) | 1)
	        return cv2.adaptiveThreshold(
	            image,
	            255,
	            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	            cv2.THRESH_BINARY,
	            block_size,
	            self.adaptive_threshold_c
	        )

	    def assess_quality(self, image: np.ndarray) -> ImageQuality:
	        """
	        Assess image quality for OCR suitability.

	        All metrics are returned as built-in ``float``/``bool`` values so
	        that ``ImageQuality.to_dict()`` can be serialised directly.

	        Args:
	            image: Input image (BGR, BGRA or grayscale)

	        Returns:
	            ImageQuality metrics, rounded to three decimals
	        """
	        gray = self._to_grayscale(image)

	        # Blur detection: variance of the Laplacian, capped at 1.0.
	        laplacian_var = float(cv2.Laplacian(gray, cv2.CV_64F).var())
	        blur_score = min(laplacian_var / 500, 1.0)

	        # Contrast: standard deviation of intensities scaled by 128.
	        contrast_score = float(gray.std()) / 128

	        # Brightness: 1.0 at mid-grey, falling linearly to 0.0 at black/white.
	        brightness = float(gray.mean()) / 255
	        brightness_score = 1 - abs(brightness - 0.5) * 2

	        noise_level = self._estimate_noise(gray)

	        # The noise estimate is reported but does not affect the verdict.
	        is_acceptable = bool(
	            blur_score > 0.1 and
	            contrast_score > 0.2 and
	            brightness_score > 0.3
	        )

	        return ImageQuality(
	            blur_score=round(blur_score, 3),
	            contrast_score=round(contrast_score, 3),
	            brightness_score=round(brightness_score, 3),
	            noise_level=round(noise_level, 3),
	            is_acceptable=is_acceptable
	        )

	    def _estimate_noise(self, image: np.ndarray) -> float:
	        """
	        Estimate noise from the Laplacian response.

	        Uses the median absolute Laplacian value divided by 0.6745,
	        then scales by 1/50 and caps the result at 1.0.
	        """
	        laplacian = cv2.Laplacian(image, cv2.CV_64F)
	        sigma = float(np.median(np.abs(laplacian))) / 0.6745
	        return min(sigma / 50, 1.0)


	class OCREngine(LoggerMixin):
	    """
	    OCR engine using Tesseract for text extraction.

	    Provides text extraction with confidence scoring,
	    bounding box detection, and multi-language support.
	    """

	    def __init__(
	        self,
	        lang: str = "eng",
	        config: str = "--oem 3 --psm 3",
	        min_confidence: float = 0.0
	    ):
	        """
	        Initialize OCR engine.

	        Args:
	            lang: Tesseract language code
	            config: Tesseract configuration string
	            min_confidence: Words whose reported confidence is not strictly
	                greater than this value are discarded
	        """
	        self.lang = lang
	        self.config = config
	        self.min_confidence = min_confidence

	        # Import lazily so the module can still be imported (with OCR
	        # disabled) when pytesseract is not installed.
	        try:
	            import pytesseract
	            self.pytesseract = pytesseract
	            self.logger.info("Tesseract OCR initialized successfully")
	        except ImportError:
	            self.pytesseract = None
	            self.logger.warning(
	                "pytesseract not installed. OCR functionality will be limited."
	            )

	    def extract_text(
	        self,
	        image: np.ndarray,
	        with_confidence: bool = True
	    ) -> OCRResult:
	        """
	        Extract text from an image.

	        Args:
	            image: Preprocessed image; a NumPy array is converted to a PIL
	                image, anything else is handed to pytesseract as-is
	            with_confidence: When True, use word-level data to compute a
	                mean confidence and bounding boxes; when False, only the
	                text is extracted and confidence is reported as 0.0

	        Returns:
	            OCRResult with cleaned text, mean confidence and bounding boxes.
	            An empty result is returned when pytesseract is unavailable.
	        """
	        if self.pytesseract is None:
	            self.logger.error("pytesseract not available")
	            return OCRResult(text="", confidence=0.0)

	        self.logger.debug("Starting OCR extraction")

	        if isinstance(image, np.ndarray):
	            pil_image = Image.fromarray(image)
	        else:
	            pil_image = image

	        bounding_boxes = []
	        avg_confidence = 0.0

	        if with_confidence:
	            data = self.pytesseract.image_to_data(
	                pil_image,
	                lang=self.lang,
	                config=self.config,
	                output_type=self.pytesseract.Output.DICT
	            )

	            text_parts = []
	            confidences = []

	            for i, word in enumerate(data['text']):
	                conf = float(data['conf'][i])

	                # Drop blank entries and anything at or below the threshold.
	                if conf > self.min_confidence and word.strip():
	                    text_parts.append(word)
	                    confidences.append(conf)

	                    bounding_boxes.append({
	                        'text': word,
	                        'confidence': conf,
	                        'left': data['left'][i],
	                        'top': data['top'][i],
	                        'width': data['width'][i],
	                        'height': data['height'][i],
	                        'block_num': data['block_num'][i],
	                        'line_num': data['line_num'][i]
	                    })

	            text = ' '.join(text_parts)
	            if confidences:
	                # Convert the NumPy scalar to a built-in float.
	                avg_confidence = float(np.mean(confidences))
	        else:
	            text = self.pytesseract.image_to_string(
	                pil_image,
	                lang=self.lang,
	                config=self.config
	            )

	        text = self._clean_text(text)

	        self.logger.debug(f"OCR extracted {len(text)} characters with {avg_confidence:.1f}% confidence")

	        return OCRResult(
	            text=text,
	            confidence=round(avg_confidence, 2),
	            bounding_boxes=bounding_boxes
	        )

	    def _clean_text(self, text: str) -> str:
	        """
	        Clean and normalize extracted text.

	        Collapses every run of whitespace (including newlines) into a
	        single space, removes non-printable characters, and strips the
	        ends. Printable non-ASCII characters are kept so that text in
	        languages other than English survives.
	        """
	        text = re.sub(r'\s+', ' ', text)
	        # str.isprintable() keeps letters and symbols from any script while
	        # rejecting control and format characters.
	        text = ''.join(ch for ch in text if ch.isprintable())
	        return text.strip()


	class CVPipeline(LoggerMixin):
	    """
	    Complete Computer Vision pipeline for document processing.

	    Combines image preprocessing and OCR into a unified pipeline
	    for processing scanned documents and images.
	    """

	    def __init__(
	        self,
	        preprocessor: Optional[ImagePreprocessor] = None,
	        ocr_engine: Optional[OCREngine] = None
	    ):
	        """
	        Args:
	            preprocessor: Preprocessor to use; a default
	                ``ImagePreprocessor`` is created when omitted
	            ocr_engine: OCR engine to use; a default ``OCREngine`` is
	                created when omitted
	        """
	        self.preprocessor = preprocessor or ImagePreprocessor()
	        self.ocr_engine = ocr_engine or OCREngine()

	    def _ocr_array(
	        self,
	        image: np.ndarray,
	        source: Union[str, Path],
	        preprocess: bool
	    ) -> OCRResult:
	        """
	        Run quality assessment, optional preprocessing and OCR on one image.

	        Args:
	            image: BGR image array
	            source: Path recorded under ``metadata['source_file']``
	            preprocess: Whether to run the preprocessor; otherwise the
	                image is only converted to grayscale

	        Returns:
	            OCRResult with ``source_file`` and ``quality`` metadata attached
	        """
	        quality = self.preprocessor.assess_quality(image)
	        self.logger.debug(f"Image quality: {quality.to_dict()}")

	        if preprocess:
	            processed = self.preprocessor.preprocess(image)
	        else:
	            processed = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

	        result = self.ocr_engine.extract_text(processed)
	        result.metadata['source_file'] = str(source)
	        result.metadata['quality'] = quality.to_dict()
	        return result

	    def process_image(
	        self,
	        image_path: Union[str, Path],
	        preprocess: bool = True
	    ) -> OCRResult:
	        """
	        Process a single image file through the CV pipeline.

	        Args:
	            image_path: Path to the image file
	            preprocess: Whether to apply preprocessing

	        Returns:
	            OCRResult with extracted text. If the file cannot be loaded,
	            an empty result (no text, confidence 0.0) is returned.
	        """
	        image_path = Path(image_path)
	        self.logger.info(f"Processing image: {image_path.name}")

	        # cv2.imread signals failure by returning None rather than raising.
	        image = cv2.imread(str(image_path))
	        if image is None:
	            self.logger.error(f"Failed to load image: {image_path}")
	            return OCRResult(text="", confidence=0.0)

	        return self._ocr_array(image, image_path, preprocess)

	    def process_images(
	        self,
	        image_paths: List[Union[str, Path]],
	        preprocess: bool = True
	    ) -> List[OCRResult]:
	        """
	        Process multiple images through the CV pipeline.

	        Args:
	            image_paths: List of image file paths
	            preprocess: Whether to apply preprocessing

	        Returns:
	            List of OCRResults in input order, with ``page_number`` set to
	            the 1-based position of each path
	        """
	        results = []
	        for page_number, path in enumerate(image_paths, 1):
	            result = self.process_image(path, preprocess)
	            result.page_number = page_number
	            results.append(result)
	        return results

	    def process_pdf_images(
	        self,
	        pdf_path: Union[str, Path],
	        dpi: int = 300,
	        preprocess: bool = True
	    ) -> List[OCRResult]:
	        """
	        Convert PDF pages to images and process through OCR.

	        Args:
	            pdf_path: Path to PDF file
	            dpi: DPI for PDF to image conversion
	            preprocess: Whether to apply preprocessing

	        Returns:
	            List of OCRResults, one per page. An empty list is returned
	            when pdf2image is not installed or when conversion or OCR of
	            any page raises.
	        """
	        pdf_path = Path(pdf_path)
	        self.logger.info(f"Processing PDF through OCR: {pdf_path.name}")

	        # Only the import is guarded here, so that an ImportError raised
	        # later during processing is not misreported as a missing package.
	        try:
	            from pdf2image import convert_from_path
	        except ImportError:
	            self.logger.error("pdf2image not installed")
	            return []

	        try:
	            images = convert_from_path(str(pdf_path), dpi=dpi)
	            self.logger.debug(f"Converted PDF to {len(images)} images")

	            results = []
	            for page_number, pil_image in enumerate(images, 1):
	                # The page array is converted from RGB to the BGR order
	                # used by the rest of the pipeline.
	                image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

	                result = self._ocr_array(image, pdf_path, preprocess)
	                result.page_number = page_number
	                results.append(result)

	            return results
	        except Exception as e:
	            # Best-effort boundary: report the failure and return nothing.
	            self.logger.error(f"Error processing PDF: {e}")
	            return []


	if __name__ == "__main__":
	    # Minimal command-line smoke test for the CV pipeline.
	    import argparse

	    parser = argparse.ArgumentParser(description="CV Pipeline Test")
	    parser.add_argument("--test", action="store_true", help="Run test mode")
	    parser.add_argument("--image", type=str, help="Image path to process")
	    args = parser.parse_args()

	    if not (args.test or args.image):
	        # Nothing was requested; show usage instead of exiting silently.
	        parser.print_help()
	    else:
	        # Build the pipeline once and share it between both modes.
	        pipeline = CVPipeline()

	        if args.test:
	            # Printed only after construction has actually succeeded.
	            print("CV Pipeline initialized successfully!")
	            print(f"Preprocessor: {pipeline.preprocessor}")
	            print(f"OCR Engine: {pipeline.ocr_engine}")

	        if args.image:
	            result = pipeline.process_image(args.image)
	            preview_limit = 500
	            preview = result.text[:preview_limit]
	            # Add the ellipsis only when the text was really truncated.
	            suffix = "..." if len(result.text) > preview_limit else ""
	            print(f"\nExtracted Text:\n{preview}{suffix}")
	            print(f"\nConfidence: {result.confidence}%")