mike23415 committed on
Commit
236b2b6
·
verified ·
1 Parent(s): 7906dde

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -34
app.py CHANGED
@@ -265,55 +265,83 @@ def post_process_text(text):
265
 
266
  return processed_text
267
 
268
- def extract_text_tesseract_improved(image, lang='eng', psm=6):
269
  """
270
- Extract text using PyTesseract with improved settings for documents
271
  """
272
  try:
273
- # Configure Tesseract with improved settings
274
- if psm == 6: # Block of text
275
- # Fixed: Removed problematic quotes from whitelist and use simpler config
276
- custom_config = f'--oem 3 --psm {psm} -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,!?-:;()[]{{}}=+×÷%/'
277
- else:
278
- custom_config = f'--oem 3 --psm {psm}'
279
-
280
- # Extract text
281
- text = pytesseract.image_to_string(image, lang=lang, config=custom_config)
282
-
283
- # Get confidence scores
284
- data = pytesseract.image_to_data(image, lang=lang, config=custom_config, output_type=pytesseract.Output.DICT)
285
-
286
- # Calculate average confidence
287
- confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
288
- avg_confidence = sum(confidences) / len(confidences) if confidences else 0
289
-
290
- # Post-process the text
291
- cleaned_text = post_process_text(text)
292
 
293
- return {
294
- 'text': cleaned_text,
295
- 'raw_text': text, # Keep original for comparison
296
- 'confidence': avg_confidence / 100.0,
297
- 'word_count': len([w for w in data['text'] if w.strip()])
298
- }
 
 
 
 
 
299
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  except Exception as e:
301
- logger.error(f"Tesseract OCR error: {e}")
302
  return {'text': '', 'raw_text': '', 'confidence': 0.0, 'word_count': 0}
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  def process_image_smart_improved(image, enhance_type="default"):
305
  """
306
- Smart processing with improved text handling
307
  """
308
  try:
309
  # First, try with advanced preprocessing
310
  processed_img = preprocess_image_advanced(image, enhance_type)
311
 
312
- # Try different approaches
313
  results = []
314
 
315
  # Mode 6: Block of text (best for documents)
316
- result = extract_text_tesseract_improved(processed_img, psm=6)
317
  if result['text']:
318
  results.append(('psm_6', result))
319
 
@@ -321,19 +349,19 @@ def process_image_smart_improved(image, enhance_type="default"):
321
  if not results or results[0][1]['confidence'] < 0.6:
322
  if enhance_type != "document":
323
  doc_processed = preprocess_image_advanced(image, "document")
324
- result = extract_text_tesseract_improved(doc_processed, psm=6)
325
  if result['text'] and result['confidence'] > (results[0][1]['confidence'] if results else 0):
326
  results = [('psm_6_document', result)]
327
 
328
  # Try other PSM modes if still poor results
329
  if not results or results[0][1]['confidence'] < 0.5:
330
  # Mode 4: Single column of text
331
- result = extract_text_tesseract_improved(processed_img, psm=4)
332
  if result['text']:
333
  results.append(('psm_4', result))
334
 
335
  # Mode 13: Single text line
336
- result = extract_text_tesseract_improved(processed_img, psm=13)
337
  if result['text']:
338
  results.append(('psm_13', result))
339
 
@@ -356,6 +384,85 @@ def process_image_smart_improved(image, enhance_type="default"):
356
  'method': 'error', 'preprocessing': enhance_type
357
  }
358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  @app.route('/')
360
  def home():
361
  """Root endpoint"""
 
265
 
266
  return processed_text
267
 
268
def extract_text_tesseract_adaptive(image, lang='eng', psm=6):
    """
    Run Tesseract OCR, falling back through progressively simpler configs.

    Strategies are tried in order; the first one that yields non-blank text
    is returned:
      1. "whitelist"    - output restricted to a conservative character set
      2. "no_whitelist" - no whitelist, with tessedit_do_invert=0
      3. "basic"        - plain --oem/--psm config; its result is returned
                          even when the recognised text is empty

    Args:
        image: Image handed straight to pytesseract.
        lang: Tesseract language code(s).
        psm: Tesseract page segmentation mode.

    Returns:
        The dict produced by process_ocr_result() for the winning strategy,
        or {'text': '', 'raw_text': '', 'confidence': 0.0, 'word_count': 0}
        when every strategy raised.
    """
    import shlex  # local import: only needed to quote the whitelist argument

    try:
        whitelist_chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,!?-:;()[]{}=+×÷%/'
        # pytesseract splits the config string shell-style, so the space inside
        # the whitelist must be quoted; unquoted, everything after the space is
        # passed to Tesseract as a separate argument and the punctuation never
        # reaches the whitelist.
        # NOTE(review): shlex.quote emits POSIX quoting - confirm behaviour if
        # this service is ever run on Windows.
        whitelist_arg = shlex.quote(f'tessedit_char_whitelist={whitelist_chars}')
        base_config = f'--oem 3 --psm {psm}'

        preferred = (
            ("whitelist", "Strategy 1 (whitelist)", f'{base_config} -c {whitelist_arg}'),
            ("no_whitelist", "Strategy 2 (no whitelist)", f'{base_config} -c tessedit_do_invert=0'),
        )

        for strategy, label, custom_config in preferred:
            try:
                text = pytesseract.image_to_string(image, lang=lang, config=custom_config)
                if not text.strip():
                    # Nothing recognised: skip the second Tesseract pass and
                    # move on to the next strategy.
                    continue
                data = pytesseract.image_to_data(
                    image, lang=lang, config=custom_config,
                    output_type=pytesseract.Output.DICT,
                )
                logger.info(f"{label} successful")
                return process_ocr_result(text, data, strategy)
            except Exception as e:
                logger.warning(f"{label} failed: {e}")

        # Strategy 3: basic configuration, accepted even if the text is blank
        try:
            custom_config = base_config
            text = pytesseract.image_to_string(image, lang=lang, config=custom_config)
            data = pytesseract.image_to_data(
                image, lang=lang, config=custom_config,
                output_type=pytesseract.Output.DICT,
            )
            logger.info("Strategy 3 (basic) used as fallback")
            return process_ocr_result(text, data, "basic")
        except Exception as e:
            logger.error(f"All OCR strategies failed: {e}")
            return {'text': '', 'raw_text': '', 'confidence': 0.0, 'word_count': 0}

    except Exception as e:
        # Safety net for anything raised outside the per-strategy handlers
        logger.error(f"Adaptive OCR error: {e}")
        return {'text': '', 'raw_text': '', 'confidence': 0.0, 'word_count': 0}
314
 
315
def process_ocr_result(text, data, strategy):
    """
    Build the standard OCR result dict from raw Tesseract output.

    Args:
        text: Raw recognised text (kept verbatim under 'raw_text').
        data: Dict whose 'conf' list holds per-entry confidences and whose
            'text' list holds the recognised tokens.
        strategy: Label of the OCR strategy that produced this output.

    Returns:
        Dict with keys 'text' (post-processed), 'raw_text', 'confidence'
        (mean of the positive confidences, scaled to 0-1), 'word_count'
        (number of non-blank tokens) and 'strategy'.
    """
    # Confidences may arrive as ints, floats or decimal strings such as
    # '96.06'; int() would raise on the latter, so parse as float and skip
    # anything unparseable. Non-positive values are left out of the mean,
    # as before.
    confidences = []
    for conf in data['conf']:
        try:
            value = float(conf)
        except (TypeError, ValueError):
            continue
        if value > 0:
            confidences.append(value)

    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

    # Post-process the text
    cleaned_text = post_process_text(text)

    return {
        'text': cleaned_text,
        'raw_text': text,
        'confidence': avg_confidence / 100.0,
        'word_count': sum(1 for w in data['text'] if w.strip()),
        'strategy': strategy,
    }
331
+
332
  def process_image_smart_improved(image, enhance_type="default"):
333
  """
334
+ Smart processing with adaptive OCR strategies
335
  """
336
  try:
337
  # First, try with advanced preprocessing
338
  processed_img = preprocess_image_advanced(image, enhance_type)
339
 
340
+ # Try different approaches with adaptive OCR
341
  results = []
342
 
343
  # Mode 6: Block of text (best for documents)
344
+ result = extract_text_tesseract_adaptive(processed_img, psm=6)
345
  if result['text']:
346
  results.append(('psm_6', result))
347
 
 
349
  if not results or results[0][1]['confidence'] < 0.6:
350
  if enhance_type != "document":
351
  doc_processed = preprocess_image_advanced(image, "document")
352
+ result = extract_text_tesseract_adaptive(doc_processed, psm=6)
353
  if result['text'] and result['confidence'] > (results[0][1]['confidence'] if results else 0):
354
  results = [('psm_6_document', result)]
355
 
356
  # Try other PSM modes if still poor results
357
  if not results or results[0][1]['confidence'] < 0.5:
358
  # Mode 4: Single column of text
359
+ result = extract_text_tesseract_adaptive(processed_img, psm=4)
360
  if result['text']:
361
  results.append(('psm_4', result))
362
 
363
  # Mode 13: Single text line
364
+ result = extract_text_tesseract_adaptive(processed_img, psm=13)
365
  if result['text']:
366
  results.append(('psm_13', result))
367
 
 
384
  'method': 'error', 'preprocessing': enhance_type
385
  }
386
 
387
+ # Alternative: Image-specific preprocessing detector
388
def detect_image_type(image):
    """
    Choose a preprocessing mode from simple image statistics.

    Returns "enhance" when the longest side is under 600 or the grayscale
    standard deviation is under 50, "document" when more than 10% of the
    pixels are Canny edges, and "default" otherwise or if analysis raises.
    """
    try:
        # PIL images are converted to an RGB array; anything else is used as-is
        pixels = np.array(image.convert('RGB')) if isinstance(image, Image.Image) else image

        # Three-dimensional input is reduced to a single grayscale channel
        if len(pixels.shape) == 3:
            gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
        else:
            gray = pixels
        height, width = gray.shape

        small = max(height, width) < 600
        low_contrast = gray.std() < 50

        # Fraction of pixels flagged as edges, used as a rough "mostly text" signal
        edge_map = cv2.Canny(gray, 50, 150)
        edge_fraction = np.sum(edge_map > 0) / (height * width)

        if small or low_contrast:
            return "enhance"
        if edge_fraction > 0.1:
            return "document"
        return "default"

    except Exception as e:
        logger.warning(f"Image type detection failed: {e}")
        return "default"
426
+
427
+ # Enhanced OCR endpoint with auto-detection
428
+ def ocr_endpoint_enhanced():
429
+ """
430
+ OCR endpoint with automatic image type detection
431
+ """
432
+ try:
433
+ logger.info("OCR request received")
434
+
435
+ # ... (existing parameter handling code) ...
436
+
437
+ # Auto-detect optimal enhancement if not specified
438
+ if enhancement == 'auto':
439
+ enhancement = detect_image_type(image)
440
+ logger.info(f"Auto-detected enhancement type: {enhancement}")
441
+
442
+ # Process image with improved OCR
443
+ logger.info("Starting adaptive OCR processing")
444
+ result = process_image_smart_improved(image, enhancement)
445
+
446
+ # Add debugging info
447
+ response = {
448
+ "success": True,
449
+ "text": result['text'],
450
+ "confidence": round(result['confidence'], 3),
451
+ "character_count": len(result['text']),
452
+ "word_count": result.get('word_count', 0),
453
+ "method_used": result.get('method', 'unknown'),
454
+ "preprocessing_used": result.get('preprocessing', 'unknown'),
455
+ "ocr_strategy": result.get('strategy', 'unknown'), # New field
456
+ "language": language,
457
+ "engine": "PyTesseract Adaptive"
458
+ }
459
+
460
+ return jsonify(response)
461
+
462
+ except Exception as e:
463
+ logger.error(f"OCR processing error: {str(e)}")
464
+ return jsonify({"error": str(e), "success": False}), 500
465
+
466
  @app.route('/')
467
  def home():
468
  """Root endpoint"""