Spaces:
Sleeping
Sleeping
fix: Multi-strategy OCR for better colored/gradient text extraction
Browse files
app.py
CHANGED
|
@@ -257,7 +257,7 @@ def pdf_to_word():
|
|
| 257 |
|
| 258 |
@app.route('/image-to-text', methods=['POST'])
|
| 259 |
def image_to_text():
|
| 260 |
-
"""Extract text from image using Tesseract OCR with
|
| 261 |
|
| 262 |
if 'file' not in request.files:
|
| 263 |
return jsonify({'error': 'No file provided'}), 400
|
|
@@ -277,66 +277,81 @@ def image_to_text():
|
|
| 277 |
if image.mode != 'RGB':
|
| 278 |
image = image.convert('RGB')
|
| 279 |
|
| 280 |
-
# Convert PIL Image to numpy array for preprocessing
|
| 281 |
import numpy as np
|
| 282 |
import cv2
|
| 283 |
|
| 284 |
img_array = np.array(image)
|
| 285 |
|
| 286 |
-
#
|
| 287 |
-
|
| 288 |
-
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
-
#
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
|
|
|
| 300 |
|
| 301 |
-
#
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
#
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
#
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
-
#
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
-
|
|
|
|
|
|
|
| 335 |
|
| 336 |
logger.info(f"OCR successful! Extracted {len(text)} characters")
|
| 337 |
|
| 338 |
# Create text file
|
| 339 |
-
text_content = f"Extracted Text from {file.filename}\n\n{text
|
| 340 |
|
| 341 |
# Return as downloadable text file
|
| 342 |
buffer = BytesIO()
|
|
|
|
| 257 |
|
| 258 |
@app.route('/image-to-text', methods=['POST'])
|
| 259 |
def image_to_text():
|
| 260 |
+
"""Extract text from image using Tesseract OCR with smart preprocessing"""
|
| 261 |
|
| 262 |
if 'file' not in request.files:
|
| 263 |
return jsonify({'error': 'No file provided'}), 400
|
|
|
|
| 277 |
if image.mode != 'RGB':
|
| 278 |
image = image.convert('RGB')
|
| 279 |
|
|
|
|
| 280 |
import numpy as np
|
| 281 |
import cv2
|
| 282 |
|
| 283 |
img_array = np.array(image)
|
| 284 |
|
| 285 |
+
# Try multiple OCR strategies and pick the best result
|
| 286 |
+
results = []
|
|
|
|
| 287 |
|
| 288 |
+
# Strategy 1: Original image (best for colored text, graphics)
|
| 289 |
+
try:
|
| 290 |
+
config1 = r'--oem 3 --psm 3'
|
| 291 |
+
text1 = pytesseract.image_to_string(image, config=config1, lang='eng')
|
| 292 |
+
results.append(('original', text1, len(text1.strip())))
|
| 293 |
+
except Exception as e:
|
| 294 |
+
logger.warning(f"Strategy 1 failed: {e}")
|
| 295 |
|
| 296 |
+
# Strategy 2: Grayscale (good for normal documents)
|
| 297 |
+
try:
|
| 298 |
+
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
| 299 |
+
gray_img = Image.fromarray(gray)
|
| 300 |
+
config2 = r'--oem 3 --psm 6'
|
| 301 |
+
text2 = pytesseract.image_to_string(gray_img, config=config2, lang='eng')
|
| 302 |
+
results.append(('grayscale', text2, len(text2.strip())))
|
| 303 |
+
except Exception as e:
|
| 304 |
+
logger.warning(f"Strategy 2 failed: {e}")
|
| 305 |
|
| 306 |
+
# Strategy 3: High contrast (for faded text)
|
| 307 |
+
try:
|
| 308 |
+
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
| 309 |
+
# Increase contrast
|
| 310 |
+
alpha = 1.5 # Contrast control
|
| 311 |
+
beta = 0 # Brightness control
|
| 312 |
+
contrast = cv2.convertScaleAbs(gray, alpha=alpha, beta=beta)
|
| 313 |
+
contrast_img = Image.fromarray(contrast)
|
| 314 |
+
config3 = r'--oem 3 --psm 3'
|
| 315 |
+
text3 = pytesseract.image_to_string(contrast_img, config=config3, lang='eng')
|
| 316 |
+
results.append(('contrast', text3, len(text3.strip())))
|
| 317 |
+
except Exception as e:
|
| 318 |
+
logger.warning(f"Strategy 3 failed: {e}")
|
| 319 |
+
|
| 320 |
+
# Pick the result with the most content
|
| 321 |
+
if not results:
|
| 322 |
+
raise Exception("All OCR strategies failed")
|
| 323 |
+
|
| 324 |
+
# Sort by text length (more text usually means better recognition)
|
| 325 |
+
results.sort(key=lambda x: x[2], reverse=True)
|
| 326 |
+
best_strategy, raw_text, _ = results[0]
|
| 327 |
+
|
| 328 |
+
logger.info(f"Best strategy: {best_strategy} with {len(raw_text)} characters")
|
| 329 |
+
|
| 330 |
+
# Clean up the text
|
| 331 |
+
lines = []
|
| 332 |
+
for line in raw_text.split('\n'):
|
| 333 |
+
# Strip whitespace
|
| 334 |
+
line = line.strip()
|
| 335 |
+
|
| 336 |
+
# Skip empty lines
|
| 337 |
+
if not line:
|
| 338 |
+
continue
|
| 339 |
|
| 340 |
+
# Skip lines that are mostly noise (too many special chars)
|
| 341 |
+
alnum = sum(c.isalnum() or c in ' .,!?-$%()' for c in line)
|
| 342 |
+
if len(line) > 0 and (alnum / len(line)) > 0.4:
|
| 343 |
+
lines.append(line)
|
| 344 |
+
|
| 345 |
+
text = '\n'.join(lines)
|
| 346 |
|
| 347 |
+
# If result is still too short, try without filtering
|
| 348 |
+
if len(text) < 20 and len(raw_text.strip()) > len(text):
|
| 349 |
+
text = raw_text.strip()
|
| 350 |
|
| 351 |
logger.info(f"OCR successful! Extracted {len(text)} characters")
|
| 352 |
|
| 353 |
# Create text file
|
| 354 |
+
text_content = f"Extracted Text from {file.filename}\n\n{text}"
|
| 355 |
|
| 356 |
# Return as downloadable text file
|
| 357 |
buffer = BytesIO()
|