|
|
"""
|
|
|
OCR Utilities for document processing
|
|
|
"""
|
|
|
|
|
|
import cv2
|
|
|
import numpy as np
|
|
|
from PIL import Image
|
|
|
|
|
|
class OCRProcessor:
    """Handle OCR processing for images, including handwriting detection.

    ``pytesseract`` is imported lazily in ``__init__``. When the import
    fails, ``self.pytesseract`` is ``None`` and the text-extraction methods
    return an "OCR not available" placeholder instead of raising.
    """

    # Heuristic thresholds used by detect_handwriting().
    EDGE_RATIO_THRESHOLD = 0.05
    MIN_CONTOUR_COUNT = 20
    # Lower / upper hysteresis thresholds passed to cv2.Canny.
    CANNY_THRESHOLDS = (50, 150)
    # Words whose Tesseract confidence is not above this value are dropped.
    MIN_WORD_CONFIDENCE = 30
    # Tesseract options used for handwriting: default OCR engine mode and
    # page segmentation mode 6 (see the Tesseract command-line docs).
    HANDWRITING_CONFIG = r'--oem 3 --psm 6'

    def __init__(self):
        """Bind ``pytesseract`` if it is installed, otherwise store ``None``."""
        try:
            import pytesseract
            self.pytesseract = pytesseract
        except ImportError:
            self.pytesseract = None
            print("Warning: pytesseract not available")

    @staticmethod
    def _to_grayscale(image):
        """Return *image* as a NumPy array, converted to grayscale if 3-D.

        A 3-dimensional array is treated as RGB(A) and converted with
        ``cv2.COLOR_RGB2GRAY``; any other array is returned unchanged.
        """
        pixels = np.array(image)
        if pixels.ndim == 3:
            return cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
        return pixels

    def detect_handwriting(self, image):
        """Estimate whether *image* contains handwriting.

        The heuristic runs Canny edge detection and flags the image as
        handwritten when both the fraction of edge pixels and the number of
        external contours exceed the class thresholds.

        Args:
            image: Anything ``numpy.array`` can convert (e.g. a PIL image).

        Returns:
            dict with keys ``is_handwritten`` (bool), ``confidence``
            (``edge_ratio * 10`` capped at 1.0), ``edge_ratio`` (float) and
            ``contour_count`` (int). On failure the same keys are returned
            with falsy/zero values plus an ``error`` message.
        """
        try:
            gray = self._to_grayscale(image)
            low, high = self.CANNY_THRESHOLDS
            edges = cv2.Canny(gray, low, high)

            # Guard the division so an empty edge map yields 0 rather than NaN.
            if edges.size:
                edge_ratio = float(np.count_nonzero(edges)) / edges.size
            else:
                edge_ratio = 0.0

            # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x
            # return (contours, hierarchy); contours is always second-to-last.
            found = cv2.findContours(
                edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )
            contour_count = len(found[-2])

            # bool() keeps the value a plain Python bool (JSON-serializable)
            # instead of a numpy.bool_.
            is_handwritten = bool(
                edge_ratio > self.EDGE_RATIO_THRESHOLD
                and contour_count > self.MIN_CONTOUR_COUNT
            )

            return {
                'is_handwritten': is_handwritten,
                # Cap the score so it stays within [0, 1].
                'confidence': min(1.0, edge_ratio * 10),
                'edge_ratio': edge_ratio,
                'contour_count': contour_count,
            }
        except Exception as e:
            # Best-effort API: report the failure in the result rather than
            # raising, and keep the same keys as the success path.
            return {
                'is_handwritten': False,
                'confidence': 0,
                'edge_ratio': 0,
                'contour_count': 0,
                'error': str(e),
            }

    def extract_text(self, image, enhance=True):
        """Extract text from *image* using standard OCR.

        Args:
            image: Image accepted by ``pytesseract.image_to_string``.
            enhance: When true, binarize the image with ``_enhance_image``
                before running OCR.

        Returns:
            The recognized text, ``"OCR not available"`` when pytesseract is
            missing, or an ``"OCR error: ..."`` string if OCR raised.
        """
        if self.pytesseract is None:
            return "OCR not available"

        try:
            if enhance:
                image = self._enhance_image(image)
            return self.pytesseract.image_to_string(image)
        except Exception as e:
            return f"OCR error: {str(e)}"

    def extract_from_handwriting(self, image):
        """Extract text from a handwritten *image*.

        The image is preprocessed with ``_enhance_for_handwriting`` and
        passed to Tesseract with ``HANDWRITING_CONFIG``.

        Returns:
            The recognized text, ``"OCR not available"`` when pytesseract is
            missing, or a ``"Handwriting OCR error: ..."`` string if OCR
            raised.
        """
        if self.pytesseract is None:
            return "OCR not available"

        try:
            enhanced = self._enhance_for_handwriting(image)
            return self.pytesseract.image_to_string(
                enhanced, config=self.HANDWRITING_CONFIG
            )
        except Exception as e:
            return f"Handwriting OCR error: {str(e)}"

    def extract_text_with_confidence(self, image):
        """Extract text together with an average word confidence.

        Only non-blank words whose confidence is above
        ``MIN_WORD_CONFIDENCE`` are kept.

        Returns:
            dict with ``text`` (kept words joined by spaces), ``confidence``
            (mean confidence of the kept words, 0 if none) and
            ``word_count``. When pytesseract is missing or OCR raises, the
            ``text`` field carries the message and the numbers are 0.
        """
        if self.pytesseract is None:
            return {'text': 'OCR not available', 'confidence': 0, 'word_count': 0}

        try:
            data = self.pytesseract.image_to_data(
                image, output_type=self.pytesseract.Output.DICT
            )

            words = []
            confidences = []
            for raw_conf, raw_word in zip(data['conf'], data['text']):
                # Confidence may arrive as int, float, or a string such as
                # '-1' or '96.5'; int(float(...)) accepts all of them.
                try:
                    score = int(float(raw_conf))
                except (TypeError, ValueError, OverflowError):
                    continue
                if score <= self.MIN_WORD_CONFIDENCE:
                    continue

                # Tokens can be blank or non-string; skip blanks so they do
                # not inflate word_count, and coerce so join() cannot fail.
                word = str(raw_word).strip()
                if not word:
                    continue

                words.append(word)
                confidences.append(score)

            if confidences:
                avg_confidence = sum(confidences) / len(confidences)
            else:
                avg_confidence = 0

            return {
                'text': ' '.join(words),
                'confidence': avg_confidence,
                'word_count': len(words),
            }
        except Exception as e:
            return {
                'text': f"Error: {str(e)}",
                'confidence': 0,
                'word_count': 0,
            }

    def _enhance_image(self, image):
        """Binarize *image* with Otsu thresholding for better OCR.

        Returns a PIL image of the thresholded result, or the original
        *image* unchanged if any preprocessing step fails.
        """
        try:
            gray = self._to_grayscale(image)
            _, binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            return Image.fromarray(binary)
        except Exception:
            # Deliberate best-effort: OCR can still run on the raw image.
            return image

    def _enhance_for_handwriting(self, image):
        """Preprocess *image* for handwriting recognition.

        Applies Gaussian adaptive thresholding followed by non-local-means
        denoising. Returns a PIL image, or the original *image* unchanged if
        any preprocessing step fails.
        """
        try:
            gray = self._to_grayscale(image)
            binary = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )
            denoised = cv2.fastNlMeansDenoising(binary)
            return Image.fromarray(denoised)
        except Exception:
            # Deliberate best-effort: OCR can still run on the raw image.
            return image
|
|
|
|