Spaces:

AlBaraa63
/

text_detection

Sleeping

File size: 5,515 Bytes

d5841ad

"""
Preprocessing functions to improve OCR accuracy
Includes various image enhancement techniques
"""
import cv2
import numpy as np


def convert_to_grayscale(img):
    """
    Convert image to grayscale

    Args:
        img: Image as a NumPy array. 2-D arrays are treated as already
            grayscale. 3-D arrays are expected to be (height, width,
            channels) in OpenCV's BGR channel order.

    Returns:
        A 2-D single-channel image. Input that is not 3-D is returned
        unchanged (the same object, not a copy).
    """
    if img.ndim != 3:
        return img

    if img.shape[2] == 1:
        # (H, W, 1) is already grayscale; BGR2GRAY rejects a 1-channel
        # source, so just drop the trailing singleton axis.
        return img[:, :, 0]

    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)


def apply_thresholding(img, method='otsu'):
    """
    Binarize an image with the selected thresholding method

    The input is converted to grayscale before thresholding.

    Methods:
        - 'otsu': Otsu's automatic thresholding
        - 'adaptive': Gaussian adaptive thresholding (block size 11, C=2)
        - 'binary': Fixed threshold at 127

    Any other method value returns the grayscale image without
    thresholding.
    """
    grayscale = convert_to_grayscale(img)

    if method == 'otsu':
        # The 0 threshold is a placeholder; the OTSU flag selects the level
        otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
        return cv2.threshold(grayscale, 0, 255, otsu_flags)[1]

    if method == 'adaptive':
        # Per-neighbourhood threshold - copes with uneven lighting
        return cv2.adaptiveThreshold(
            grayscale, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 11, 2
        )

    if method == 'binary':
        # Fixed global cut-off at mid-gray
        return cv2.threshold(grayscale, 127, 255, cv2.THRESH_BINARY)[1]

    # Unknown method: fall back to the un-thresholded grayscale image
    return grayscale


def remove_noise(img, method='median'):
    """
    Smooth an image to suppress noise

    Methods:
        - 'median': 3x3 median blur (good for salt-and-pepper noise)
        - 'gaussian': 5x5 Gaussian blur (general smoothing)
        - 'bilateral': Bilateral filter (smooths while preserving edges)

    Any other method value returns the input image untouched.
    """
    # Start from the pass-through result and replace it if a filter matches
    result = img

    if method == 'median':
        result = cv2.medianBlur(img, 3)
    elif method == 'gaussian':
        result = cv2.GaussianBlur(img, (5, 5), 0)
    elif method == 'bilateral':
        result = cv2.bilateralFilter(img, 9, 75, 75)

    return result


def dilate_text(img, kernel_size=(1, 1)):
    """
    Apply one pass of morphological dilation with an all-ones kernel

    Intended to make text strokes thicker.

    NOTE(review): a (1, 1) kernel presumably leaves the image unchanged,
    so callers likely need to pass a larger kernel_size to see an effect.
    """
    structuring_element = np.ones(kernel_size, dtype=np.uint8)
    dilated = cv2.dilate(img, structuring_element, iterations=1)
    return dilated


def erode_text(img, kernel_size=(1, 1)):
    """
    Apply one pass of morphological erosion with an all-ones kernel

    Intended to make text strokes thinner.

    NOTE(review): a (1, 1) kernel presumably leaves the image unchanged,
    so callers likely need to pass a larger kernel_size to see an effect.
    """
    structuring_element = np.ones(kernel_size, dtype=np.uint8)
    eroded = cv2.erode(img, structuring_element, iterations=1)
    return eroded


def invert_image(img):
    """
    Return the bitwise complement of the image

    Useful when the text is white on a black background.
    """
    inverted = cv2.bitwise_not(img)
    return inverted


def enhance_contrast(img):
    """
    Boost local contrast with CLAHE

    The image is converted to grayscale first, then equalized with a
    clip limit of 2.0 on an 8x8 tile grid.
    """
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(convert_to_grayscale(img))


def resize_image(img, scale=2.0):
    """
    Resize image for better OCR
    Larger images often work better with Tesseract

    Args:
        img: Image array whose first two dimensions are (height, width).
        scale: Positive multiplier applied to both width and height.

    Returns:
        The image resized with bicubic interpolation.

    Raises:
        ValueError: If scale is not greater than zero.
    """
    if scale <= 0:
        raise ValueError("scale must be positive, got {!r}".format(scale))

    height, width = img.shape[:2]
    # int() truncates, so a small image or small scale could yield 0;
    # keep every target dimension at least one pixel.
    new_width = max(1, int(width * scale))
    new_height = max(1, int(height * scale))
    return cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)


def add_border(img, border_size=10, color=255):
    """
    Add a constant-colour border (white by default) around image

    Args:
        img: Image array, single- or multi-channel.
        border_size: Border width in pixels, applied to all four sides.
        color: Border value. A plain number is applied to every channel;
            any other value (e.g. a per-channel tuple) is passed to
            OpenCV unchanged.

    Returns:
        The image padded by border_size pixels on each side.
    """
    fill = color
    if isinstance(color, (int, float)) and img.ndim == 3:
        # OpenCV treats a bare number as (value, 0, 0, 0), which would give
        # a blue border on a BGR image instead of white. Repeat it for each
        # channel so the default really is white.
        fill = (color,) * img.shape[2]

    return cv2.copyMakeBorder(
        img, border_size, border_size, border_size, border_size,
        cv2.BORDER_CONSTANT, value=fill
    )


def preprocess_pipeline(img, config='default'):
    """
    Complete preprocessing pipeline

    Configs:
        - 'default': Grayscale, median denoise, Otsu threshold, 10px border
        - 'aggressive': Grayscale, CLAHE contrast, bilateral denoise,
          adaptive threshold, 2x2 dilation, 15px border
        - 'light': Grayscale and Otsu threshold only
        - 'upscale': 3x upscale, grayscale, median denoise, Otsu threshold,
          20px border

    Any other config value (including 'custom') applies no preprocessing
    and returns the input image unchanged.
    """
    if config == 'default':
        # Standard pipeline
        processed = convert_to_grayscale(img)
        processed = remove_noise(processed, 'median')
        processed = apply_thresholding(processed, 'otsu')
        processed = add_border(processed, 10)

    elif config == 'aggressive':
        # Aggressive preprocessing
        processed = convert_to_grayscale(img)
        processed = enhance_contrast(processed)
        processed = remove_noise(processed, 'bilateral')
        processed = apply_thresholding(processed, 'adaptive')
        processed = dilate_text(processed, (2, 2))
        processed = add_border(processed, 15)

    elif config == 'light':
        # Light preprocessing
        processed = convert_to_grayscale(img)
        processed = apply_thresholding(processed, 'otsu')

    elif config == 'upscale':
        # Upscale and process
        processed = resize_image(img, scale=3.0)
        processed = convert_to_grayscale(processed)
        processed = remove_noise(processed, 'median')
        processed = apply_thresholding(processed, 'otsu')
        processed = add_border(processed, 20)

    else:
        # Unrecognised config: no preprocessing, hand the input back as-is
        processed = img

    return processed


def preprocess_for_ocr(img, show_steps=False):
    """
    Run the OCR-oriented preprocessing chain on an image

    Stages, in order: grayscale, 2.5x upscale, bilateral denoise,
    Otsu threshold, 20px border.

    Returns the final image ready for Tesseract. When show_steps is true,
    returns a (final_image, steps) tuple instead, where steps maps a
    stage label to a copy of that stage's output.
    """
    snapshots = {}

    def _record(label, stage_output):
        # Keep a copy of the intermediate result only when asked to
        if show_steps:
            snapshots[label] = stage_output.copy()
        return stage_output

    # Stage 1: grayscale
    gray = _record('1_grayscale', convert_to_grayscale(img))

    # Stage 2: upscale (Tesseract works better with larger images)
    enlarged = _record('2_upscaled', resize_image(gray, scale=2.5))

    # Stage 3: denoise
    smoothed = _record('3_denoised', remove_noise(enlarged, 'bilateral'))

    # Stage 4: binarize
    binary = _record('4_threshold', apply_thresholding(smoothed, 'otsu'))

    # Stage 5: pad with a border
    final = _record('5_bordered', add_border(binary, 20))

    return (final, snapshots) if show_steps else final