Spaces:

Solomon7890
/

Proverbs_Law_MainPage_Official

Build error

File size: 5,622 Bytes

d28e16a

"""

OCR Utilities for document processing

"""

import cv2
import numpy as np
from PIL import Image

class OCRProcessor:
    """Run Tesseract OCR on images, with extra helpers for handwriting.

    ``pytesseract`` is imported lazily in ``__init__``. When it is missing the
    instance is still usable, but every text-extraction method returns an
    "OCR not available" placeholder instead of raising.

    All public methods are best-effort: failures are reported through the
    return value (an error string, or an ``'error'`` / ``'text'`` entry in the
    result dict) rather than propagated as exceptions.
    """

    # Heuristic cut-offs used by detect_handwriting().
    EDGE_RATIO_THRESHOLD = 0.05
    MIN_CONTOUR_COUNT = 20
    # Default for extract_text_with_confidence(): words scoring at or below
    # this value are discarded.
    MIN_WORD_CONFIDENCE = 30
    # Tesseract flags passed by extract_from_handwriting().
    HANDWRITING_CONFIG = r'--oem 3 --psm 6'

    def __init__(self):
        """Bind ``pytesseract`` if it is installed, otherwise warn and disable OCR."""
        try:
            import pytesseract
            self.pytesseract = pytesseract
        except ImportError:
            self.pytesseract = None
            print("Warning: pytesseract not available")

    @staticmethod
    def _to_grayscale(image):
        """Return *image* (PIL image or numpy array) as a grayscale array.

        Three-dimensional input is treated as RGB(A) and converted with
        OpenCV; anything else is returned as produced by ``np.array``.
        """
        # PIL modes other than L / RGB / RGBA (palette, bilevel, gray+alpha,
        # CMYK, ...) do not map onto the layouts handled below: a palette
        # image would be read as raw palette indices and a bilevel one as a
        # bool array.  Normalise those through PIL first.
        mode = getattr(image, 'mode', None)
        if mode is not None and mode not in ('L', 'RGB', 'RGBA'):
            image = image.convert('RGB')

        img_array = np.array(image)
        if img_array.ndim == 3:
            return cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        return img_array

    def detect_handwriting(self, image):
        """Estimate whether *image* contains handwriting.

        The heuristic runs Canny edge detection and flags the image as
        handwritten when both the fraction of edge pixels and the number of
        external contours exceed the class thresholds.

        Returns:
            dict with keys ``is_handwritten`` (bool), ``confidence`` (float,
            clamped to 0-1), ``edge_ratio`` (float) and ``contour_count``
            (int).  On failure the same keys are present with zero/False
            values, plus ``error`` holding the exception message.
        """
        try:
            gray = self._to_grayscale(image)
            edges = cv2.Canny(gray, 50, 150)

            # Plain Python numbers keep the result JSON-serialisable.
            edge_ratio = float(np.count_nonzero(edges)) / edges.size

            # findContours returns (contours, hierarchy) on OpenCV 4.x but
            # (image, contours, hierarchy) on 3.x; [-2] is the contour list
            # in both layouts.
            contours = cv2.findContours(
                edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )[-2]
            contour_count = len(contours)

            is_handwritten = (
                edge_ratio > self.EDGE_RATIO_THRESHOLD
                and contour_count > self.MIN_CONTOUR_COUNT
            )

            return {
                'is_handwritten': bool(is_handwritten),
                # Scale the ratio up, then clamp so the value really is 0-1.
                'confidence': min(1.0, edge_ratio * 10),
                'edge_ratio': edge_ratio,
                'contour_count': contour_count,
            }
        except Exception as e:
            # Same keys as the success path so callers can index unconditionally.
            return {
                'is_handwritten': False,
                'confidence': 0,
                'edge_ratio': 0,
                'contour_count': 0,
                'error': str(e),
            }

    def extract_text(self, image, enhance=True):
        """Extract text from *image* using standard OCR.

        Args:
            image: Image to recognise.
            enhance: When true, binarise the image first (see
                ``_enhance_image``).

        Returns:
            The recognised text, ``"OCR not available"`` when pytesseract is
            missing, or an ``"OCR error: ..."`` string on failure.
        """
        if not self.pytesseract:
            return "OCR not available"

        try:
            if enhance:
                image = self._enhance_image(image)
            return self.pytesseract.image_to_string(image)
        except Exception as e:
            return f"OCR error: {str(e)}"

    def extract_from_handwriting(self, image):
        """Extract text from a handwritten *image*.

        The image is adaptively thresholded and denoised, then recognised
        with ``HANDWRITING_CONFIG``.

        Returns:
            The recognised text, ``"OCR not available"`` when pytesseract is
            missing, or a ``"Handwriting OCR error: ..."`` string on failure.
        """
        if not self.pytesseract:
            return "OCR not available"

        try:
            enhanced = self._enhance_for_handwriting(image)
            return self.pytesseract.image_to_string(
                enhanced, config=self.HANDWRITING_CONFIG
            )
        except Exception as e:
            return f"Handwriting OCR error: {str(e)}"

    def extract_text_with_confidence(self, image, min_confidence=None):
        """Extract text together with an average confidence score.

        Args:
            image: Image to recognise.
            min_confidence: Keep only words whose confidence is strictly
                greater than this value.  Defaults to
                ``MIN_WORD_CONFIDENCE``.

        Returns:
            dict with ``text`` (kept words joined by single spaces),
            ``confidence`` (mean confidence of the kept words, 0 when none
            were kept) and ``word_count``.  On failure ``text`` carries an
            ``"Error: ..."`` message and the other fields are 0.
        """
        if not self.pytesseract:
            return {'text': 'OCR not available', 'confidence': 0, 'word_count': 0}

        if min_confidence is None:
            min_confidence = self.MIN_WORD_CONFIDENCE

        try:
            data = self.pytesseract.image_to_data(
                image, output_type=self.pytesseract.Output.DICT
            )

            words = []
            confidences = []
            for raw_word, raw_conf in zip(data['text'], data['conf']):
                # Confidences may arrive as ints, floats or numeric strings
                # (possibly fractional); float() accepts all of them, whereas
                # int('96.5') would abort the whole extraction.
                try:
                    score = float(raw_conf)
                except (TypeError, ValueError):
                    continue

                word = str(raw_word).strip()
                # Blank tokens would inflate word_count and pad the text.
                if score > min_confidence and word:
                    words.append(word)
                    confidences.append(score)

            avg_confidence = sum(confidences) / len(confidences) if confidences else 0

            return {
                'text': ' '.join(words),
                'confidence': avg_confidence,
                'word_count': len(words),
            }
        except Exception as e:
            return {
                'text': f"Error: {str(e)}",
                'confidence': 0,
                'word_count': 0
            }

    def _enhance_image(self, image):
        """Binarise *image* with Otsu thresholding for better OCR.

        Returns a PIL image, or the original *image* unchanged if any step
        fails (enhancement is optional, so failure must not block OCR).
        """
        try:
            gray = self._to_grayscale(image)
            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            return Image.fromarray(binary)
        except Exception:
            return image

    def _enhance_for_handwriting(self, image):
        """Prepare *image* for handwriting recognition.

        Applies Gaussian adaptive thresholding (block size 11, constant 2)
        followed by non-local-means denoising.  Returns a PIL image, or the
        original *image* unchanged if any step fails.
        """
        try:
            gray = self._to_grayscale(image)
            binary = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )
            denoised = cv2.fastNlMeansDenoising(binary)
            return Image.fromarray(denoised)
        except Exception:
            return image