omthakur1 committed on
Commit
d189c3a
·
1 Parent(s): 19d3640

fix: Multi-strategy OCR for better colored/gradient text extraction

Browse files
Files changed (1) hide show
  1. app.py +62 -47
app.py CHANGED
@@ -257,7 +257,7 @@ def pdf_to_word():
257
 
258
  @app.route('/image-to-text', methods=['POST'])
259
  def image_to_text():
260
- """Extract text from image using Tesseract OCR with advanced preprocessing"""
261
 
262
  if 'file' not in request.files:
263
  return jsonify({'error': 'No file provided'}), 400
@@ -277,66 +277,81 @@ def image_to_text():
277
  if image.mode != 'RGB':
278
  image = image.convert('RGB')
279
 
280
- # Convert PIL Image to numpy array for preprocessing
281
  import numpy as np
282
  import cv2
283
 
284
  img_array = np.array(image)
285
 
286
- # Image preprocessing for better OCR
287
- # 1. Convert to grayscale
288
- gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
289
 
290
- # 2. Apply slight Gaussian blur to reduce noise
291
- blurred = cv2.GaussianBlur(gray, (3, 3), 0)
 
 
 
 
 
292
 
293
- # 3. Apply adaptive thresholding for better contrast
294
- thresh = cv2.adaptiveThreshold(
295
- blurred, 255,
296
- cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
297
- cv2.THRESH_BINARY,
298
- 11, 2
299
- )
 
 
300
 
301
- # 4. Apply morphological operations to remove small noise
302
- kernel = np.ones((1, 1), np.uint8)
303
- processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
304
- processed = cv2.medianBlur(processed, 1)
305
-
306
- # Convert back to PIL Image for Tesseract
307
- processed_image = Image.fromarray(processed)
308
-
309
- # Configure Tesseract for better accuracy
310
- # PSM 3 = Fully automatic page segmentation, but no OSD
311
- # PSM 6 = Assume a single uniform block of text
312
- # OEM 3 = Default, based on what is available (LSTM + Legacy)
313
- custom_config = r'--oem 3 --psm 3 -c preserve_interword_spaces=1'
314
-
315
- # Perform OCR with configuration
316
- text = pytesseract.image_to_string(processed_image, config=custom_config, lang='eng')
317
-
318
- # Clean up the extracted text
319
- # Remove excessive whitespace and empty lines
320
- lines = [line.strip() for line in text.split('\n')]
321
- cleaned_lines = [line for line in lines if line and len(line) > 0]
322
-
323
- # Filter out lines with too many special characters (likely errors)
324
- filtered_lines = []
325
- for line in cleaned_lines:
326
- # Count alphanumeric vs special chars
327
- alnum_count = sum(c.isalnum() or c.isspace() for c in line)
328
- special_count = len(line) - alnum_count
 
 
 
 
 
329
 
330
- # Keep line if it has reasonable ratio of alphanumeric characters
331
- if len(line) > 0 and (alnum_count / len(line)) > 0.5:
332
- filtered_lines.append(line)
 
 
 
333
 
334
- text = '\n'.join(filtered_lines)
 
 
335
 
336
  logger.info(f"OCR successful! Extracted {len(text)} characters")
337
 
338
  # Create text file
339
- text_content = f"Extracted Text from {file.filename}\n\n{text.strip()}"
340
 
341
  # Return as downloadable text file
342
  buffer = BytesIO()
 
257
 
258
  @app.route('/image-to-text', methods=['POST'])
259
  def image_to_text():
260
+ """Extract text from image using Tesseract OCR with smart preprocessing"""
261
 
262
  if 'file' not in request.files:
263
  return jsonify({'error': 'No file provided'}), 400
 
277
  if image.mode != 'RGB':
278
  image = image.convert('RGB')
279
 
 
280
  import numpy as np
281
  import cv2
282
 
283
  img_array = np.array(image)
284
 
285
+ # Try multiple OCR strategies and pick the best result
286
+ results = []
 
287
 
288
+ # Strategy 1: Original image (best for colored text, graphics)
289
+ try:
290
+ config1 = r'--oem 3 --psm 3'
291
+ text1 = pytesseract.image_to_string(image, config=config1, lang='eng')
292
+ results.append(('original', text1, len(text1.strip())))
293
+ except Exception as e:
294
+ logger.warning(f"Strategy 1 failed: {e}")
295
 
296
+ # Strategy 2: Grayscale (good for normal documents)
297
+ try:
298
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
299
+ gray_img = Image.fromarray(gray)
300
+ config2 = r'--oem 3 --psm 6'
301
+ text2 = pytesseract.image_to_string(gray_img, config=config2, lang='eng')
302
+ results.append(('grayscale', text2, len(text2.strip())))
303
+ except Exception as e:
304
+ logger.warning(f"Strategy 2 failed: {e}")
305
 
306
+ # Strategy 3: High contrast (for faded text)
307
+ try:
308
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
309
+ # Increase contrast
310
+ alpha = 1.5 # Contrast control
311
+ beta = 0 # Brightness control
312
+ contrast = cv2.convertScaleAbs(gray, alpha=alpha, beta=beta)
313
+ contrast_img = Image.fromarray(contrast)
314
+ config3 = r'--oem 3 --psm 3'
315
+ text3 = pytesseract.image_to_string(contrast_img, config=config3, lang='eng')
316
+ results.append(('contrast', text3, len(text3.strip())))
317
+ except Exception as e:
318
+ logger.warning(f"Strategy 3 failed: {e}")
319
+
320
+ # Pick the result with the most content
321
+ if not results:
322
+ raise Exception("All OCR strategies failed")
323
+
324
+ # Sort by text length (more text usually means better recognition)
325
+ results.sort(key=lambda x: x[2], reverse=True)
326
+ best_strategy, raw_text, _ = results[0]
327
+
328
+ logger.info(f"Best strategy: {best_strategy} with {len(raw_text)} characters")
329
+
330
+ # Clean up the text
331
+ lines = []
332
+ for line in raw_text.split('\n'):
333
+ # Strip whitespace
334
+ line = line.strip()
335
+
336
+ # Skip empty lines
337
+ if not line:
338
+ continue
339
 
340
+ # Skip lines that are mostly noise (too many special chars)
341
+ alnum = sum(c.isalnum() or c in ' .,!?-$%()' for c in line)
342
+ if len(line) > 0 and (alnum / len(line)) > 0.4:
343
+ lines.append(line)
344
+
345
+ text = '\n'.join(lines)
346
 
347
+ # If result is still too short, try without filtering
348
+ if len(text) < 20 and len(raw_text.strip()) > len(text):
349
+ text = raw_text.strip()
350
 
351
  logger.info(f"OCR successful! Extracted {len(text)} characters")
352
 
353
  # Create text file
354
+ text_content = f"Extracted Text from {file.filename}\n\n{text}"
355
 
356
  # Return as downloadable text file
357
  buffer = BytesIO()