"""
Image OCR extraction using EasyOCR (primary) and Tesseract (fallback).
Includes advanced image preprocessing for maximum accuracy.
"""
import time
import os
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
from models.schemas import ExtractionResult, DocumentMetadata
import config
# --- Optional engine detection ---
# Each backend is optional: the import is attempted once at module load and a
# boolean flag records whether it succeeded, so callers can test the flag
# instead of touching a module name that may not be bound.
try:
    import google.generativeai as genai
except ImportError:
    GEMINI_AVAILABLE = False
else:
    GEMINI_AVAILABLE = True

try:
    import easyocr
except ImportError:
    EASYOCR_AVAILABLE = False
else:
    EASYOCR_AVAILABLE = True

try:
    import pytesseract
except ImportError:
    TESSERACT_AVAILABLE = False
else:
    TESSERACT_AVAILABLE = True
# Module-level cache for the shared EasyOCR reader; stays None until first use.
_EASY_READER = None


def get_easyocr_reader():
    """Return the shared EasyOCR reader, building it on the first call.

    The reader is constructed from ``config.EASYOCR_LANGS`` and
    ``config.EASYOCR_GPU`` and cached in the module-level ``_EASY_READER``.

    Returns:
        The cached reader, or ``None`` when EasyOCR is not importable or
        when constructing the reader raised (the error is printed).
    """
    global _EASY_READER

    # Nothing to build: either it is already cached or the library is absent.
    if _EASY_READER is not None or not EASYOCR_AVAILABLE:
        return _EASY_READER

    try:
        _EASY_READER = easyocr.Reader(config.EASYOCR_LANGS, gpu=config.EASYOCR_GPU)
    except Exception as e:
        print(f"Error initializing EasyOCR: {e}")
        return None
    return _EASY_READER
def _configure_tesseract():
    """Point pytesseract at the configured binary and report usability.

    Returns:
        ``True`` when pytesseract is importable and either an explicit
        ``config.TESSERACT_CMD`` was applied (the path itself is not
        verified here) or the version probe of the default binary
        succeeded; ``False`` otherwise.
    """
    configured_cmd = config.TESSERACT_CMD

    if not TESSERACT_AVAILABLE:
        return False

    if configured_cmd:
        pytesseract.pytesseract.tesseract_cmd = config.TESSERACT_CMD
        return True

    # No explicit path: probe whatever binary pytesseract finds by default.
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return False
    return True
def _preprocess_image(image: Image.Image) -> Image.Image:
    """Return a cleaned-up grayscale copy of *image* for OCR.

    Pipeline: grayscale -> autocontrast -> upscale small images ->
    sharpen -> contrast boost (factor 1.8) -> 3x3 median filter.

    Args:
        image: The source PIL image.

    Returns:
        The processed PIL image in mode ``"L"``.
    """
    # Grayscale conversion (skipped when the image is already single-channel).
    gray = image if image.mode == "L" else image.convert("L")

    # Stretch the histogram to even out lighting.
    gray = ImageOps.autocontrast(gray)

    # Upscale when either side is under 1500 px: at least 2x, and enough that
    # both sides reach 1800 px.
    width, height = gray.size
    if min(width, height) < 1500:
        scale = max(1800 / width, 1800 / height, 2.0)
        target_size = (int(width * scale), int(height * scale))
        gray = gray.resize(target_size, Image.Resampling.LANCZOS)

    # Sharpen edges, then boost contrast.
    sharpened = gray.filter(ImageFilter.SHARPEN)
    boosted = ImageEnhance.Contrast(sharpened).enhance(1.8)

    # Median filter last, to remove speckle noise.
    return boosted.filter(ImageFilter.MedianFilter(size=3))
def _reconstruct_from_boxes(results: list) -> str:
    """Rebuild a text layout from OCR bounding boxes.

    Detections are ordered top-to-bottom, grouped into visual lines by
    comparing vertical centres, ordered left-to-right within each line, and
    joined with a number of spaces proportional to the horizontal gap.

    Args:
        results: Sequence of detections. Each item is indexed as
            ``item[0]`` = box corner points (``[0]`` top-left, ``[1]``
            top-right, ``[2]`` bottom-right, each ``(x, y)``) and
            ``item[1]`` = recognised text. The list is not modified.

    Returns:
        The reconstructed text, one visual line per output line; an empty
        string when there are no detections.
    """
    if not results:
        return ""

    # Work on a sorted copy so the caller's list keeps its original order.
    ordered = sorted(results, key=lambda item: item[0][0][1])

    # Group consecutive detections into lines.
    lines = [[ordered[0]]]
    for prev, curr in zip(ordered, ordered[1:]):
        prev_box, curr_box = prev[0], curr[0]
        prev_y_center = (prev_box[0][1] + prev_box[2][1]) / 2
        curr_y_center = (curr_box[0][1] + curr_box[2][1]) / 2
        # Same line when the centres differ by less than half the previous
        # box's height.
        height = prev_box[2][1] - prev_box[0][1]
        if abs(curr_y_center - prev_y_center) < height * 0.5:
            lines[-1].append(curr)
        else:
            lines.append([curr])

    final_text = []
    for line in lines:
        # Left-to-right reading order within the line.
        line.sort(key=lambda item: item[0][0][0])
        pieces = []
        for i, res in enumerate(line):
            if i > 0:
                prev_right = line[i - 1][0][1][0]
                curr_left = res[0][0][0]
                gap = curr_left - prev_right
                # Estimate character width from the current box; a
                # degenerate (zero-width) box falls back to a single space
                # instead of dividing by zero.
                char_width = (res[0][1][0] - res[0][0][0]) / (len(res[1]) or 1)
                num_spaces = int(gap / (char_width * 1.5)) if char_width > 0 else 0
                pieces.append(" " * max(1, num_spaces))
            pieces.append(res[1])
        # The gap strings already carry the spacing, so join without an
        # extra separator (otherwise every gap grows by two spaces).
        final_text.append("".join(pieces))
    return "\n".join(final_text)
def extract_image_gemini(file_path: str) -> ExtractionResult:
    """Extract text from an image with the configured Gemini model.

    The model named by ``config.GEMINI_MODEL_NAME`` is prompted to transcribe
    the image while preserving its layout.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A successful ``ExtractionResult`` carrying the text and metadata, or
        a failed one when the API key is missing, the response text is
        empty, or any step raises (the error is printed). This function
        always returns a result object.
    """
    if not config.GEMINI_API_KEY:
        return ExtractionResult(success=False, error_message="Gemini API Key missing", raw_text="", metadata=DocumentMetadata())

    start_time = time.time()
    try:
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel(config.GEMINI_MODEL_NAME)

        # Prompt for extraction with layout preservation
        prompt = (
            "Perform OCR on this image. Extract EVERY bit of text correctly. "
            "Maintain the original layout, columns, and spacing exactly as they appear. "
            "Do not add any explanations, markdown, or commentary. Output only the extracted text."
        )

        # Keep the file open only for the request, so the handle is released
        # even when the API call raises.
        with Image.open(file_path) as image:
            image_width, image_height = image.size
            response = model.generate_content([prompt, image])

        text = response.text.strip()
        if text:
            elapsed = (time.time() - start_time) * 1000
            metadata = DocumentMetadata(
                title=os.path.basename(file_path),
                page_count=1,
                word_count=len(text.split()),
                character_count=len(text),
                file_type="Image (Gemini AI)",
                extra={
                    "image_width": image_width,
                    "image_height": image_height,
                    "ocr_engine": "Gemini 1.5 Flash",
                    "accuracy": "Perfect (Vision-Language Model)"
                }
            )
            return ExtractionResult(
                raw_text=text,
                metadata=metadata,
                success=True,
                extraction_time_ms=elapsed
            )
    except Exception as e:
        print(f"Gemini OCR failed: {e}")

    # Reached on an exception or an empty response.
    return ExtractionResult(success=False, error_message="Gemini failed", raw_text="", metadata=DocumentMetadata())
def _run_easyocr(file_path, start_time):
    """Run EasyOCR on *file_path*; return an ExtractionResult, or None if it yields nothing."""
    try:
        reader = get_easyocr_reader()
        if not reader:
            return None

        # Original dimensions are only needed for the metadata.
        with Image.open(file_path) as img:
            original_size = img.size

        # The raw file is handed to EasyOCR (no preprocessing step here).
        results = reader.readtext(
            file_path,
            detail=1,
            paragraph=False,  # Individual boxes are needed for layout reconstruction
            canvas_size=1200,  # Shrunk to detect huge fonts (like certificate names) that CRAFT misses
            contrast_ths=0.1  # Reset to 0.1 so colored/light text isn't dropped
        )

        # Reconstruct full layout from bounding boxes
        text = _reconstruct_from_boxes(results)
        if not text.strip():
            return None

        elapsed = (time.time() - start_time) * 1000
        metadata = DocumentMetadata(
            title=os.path.basename(file_path),
            page_count=1,
            word_count=len(text.split()),
            character_count=len(text),
            file_type="Image (EasyOCR)",
            extra={
                "image_width": original_size[0],
                "image_height": original_size[1],
                "ocr_engine": "EasyOCR",
                "accuracy": "High (Deep Learning)"
            }
        )
        return ExtractionResult(
            raw_text=text.strip(),
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed
        )
    except Exception as e:
        print(f"EasyOCR extraction failed, falling back to Tesseract: {e}")
        return None


def _run_tesseract(file_path, start_time):
    """Run Tesseract on *file_path*; return an ExtractionResult, or None if it yields nothing."""
    try:
        custom_config = f"--oem 3 --psm 6 -l {config.TESSERACT_LANG}"

        # The context manager releases the file handle once OCR is finished.
        with Image.open(file_path) as image:
            original_size = image.size
            processed_image = _preprocess_image(image)
            text = pytesseract.image_to_string(processed_image, config=custom_config)

            # Confidence is best-effort: a failure here must not discard the text.
            try:
                data = pytesseract.image_to_data(processed_image, config=custom_config, output_type=pytesseract.Output.DICT)
                # Parse as float: confidences may arrive as ints, floats or
                # fractional strings, and int("96.06") would raise.
                confidences = [conf for conf in (float(c) for c in data["conf"]) if conf > 0]
                avg_confidence = sum(confidences) / len(confidences) if confidences else 0
            except Exception:
                avg_confidence = 0

        elapsed = (time.time() - start_time) * 1000
        if not text.strip():
            return None

        metadata = DocumentMetadata(
            title=os.path.basename(file_path),
            page_count=1,
            word_count=len(text.split()),
            character_count=len(text),
            file_type="Image (Tesseract)",
            extra={
                "image_width": original_size[0],
                "image_height": original_size[1],
                "ocr_confidence": round(avg_confidence, 2),
                "ocr_engine": "Tesseract"
            }
        )
        return ExtractionResult(
            raw_text=text.strip(),
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed
        )
    except Exception as e:
        print(f"Tesseract extraction failed: {e}")
        return None


def extract_image(file_path: str) -> ExtractionResult:
    """Extract text from an image using the best available OCR engine.

    Engines are tried in order: Gemini, then EasyOCR, then Tesseract. The
    first engine that produces non-empty text wins; engine errors are
    printed and the next engine is tried.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A successful ``ExtractionResult`` from the first engine that
        produced text, otherwise a failed result whose ``error_message``
        reflects which local OCR libraries are installed.
    """
    start_time = time.time()

    # 0. Gemini (tried first when the library and configuration allow it)
    if GEMINI_AVAILABLE and config.is_gemini_available():
        result = extract_image_gemini(file_path)
        if result is not None and result.success:
            return result

    # 1. EasyOCR (preferred local engine)
    if EASYOCR_AVAILABLE:
        result = _run_easyocr(file_path, start_time)
        if result is not None:
            return result

    # 2. Fallback to Tesseract
    if TESSERACT_AVAILABLE and _configure_tesseract():
        result = _run_tesseract(file_path, start_time)
        if result is not None:
            return result

    # 3. Failure cases
    elapsed = (time.time() - start_time) * 1000
    if not EASYOCR_AVAILABLE and not TESSERACT_AVAILABLE:
        error_msg = "No OCR libraries installed. Please run 'pip install easyocr'."
    elif not EASYOCR_AVAILABLE and TESSERACT_AVAILABLE:
        error_msg = "EasyOCR is not installed, and Tesseract binary was not found or failed. Please run 'pip install easyocr' for best results."
    elif EASYOCR_AVAILABLE and not TESSERACT_AVAILABLE:
        error_msg = "EasyOCR failed to extract text, and Tesseract is not installed."
    else:
        error_msg = "OCR extraction failed. Both EasyOCR and Tesseract engines were unable to extract text from this image."

    return ExtractionResult(
        raw_text="",
        metadata=DocumentMetadata(file_type="Image (OCR)"),
        success=False,
        error_message=error_msg,
        extraction_time_ms=elapsed,
    )
|