Spaces:

mike23415
/

Orc1

Sleeping

App Files Files Community

mike23415 committed on May 31, 2025

Commit

236b2b6

verified ·

1 Parent(s): 7906dde

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -34

app.py CHANGED Viewed

@@ -265,55 +265,83 @@ def post_process_text(text):
     return processed_text
-def extract_text_tesseract_improved(image, lang='eng', psm=6):
     """
-    Extract text using PyTesseract with improved settings for documents
-
-    For psm 6 the Tesseract config adds a character whitelist; any other
-    psm uses only `--oem 3 --psm <psm>`. The image is run through both
-    image_to_string (for the text) and image_to_data (for per-entry
-    confidences and the word count) with the same config.
-
-    Args:
-        image: Image passed straight to pytesseract.
-        lang: Tesseract language code (default 'eng').
-        psm: Tesseract page segmentation mode (default 6).
-
-    Returns:
-        dict with 'text' (output of post_process_text), 'raw_text'
-        (unprocessed OCR output), 'confidence' (mean of the positive
-        confidence entries divided by 100, or 0.0 if there are none) and
-        'word_count' (number of non-blank entries in data['text']).
-        On any exception the error is logged and a dict with empty text,
-        0.0 confidence and 0 word count is returned.
     """
     try:
-        # Configure Tesseract with improved settings
-        if psm == 6:  # Block of text
-            # Fixed: Removed problematic quotes from whitelist and use simpler config
-            custom_config = f'--oem 3 --psm {psm} -c tessedit_char_whitelist=0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,!?-:;()[]{{}}=+×÷%/'
-        else:
-            custom_config = f'--oem 3 --psm {psm}'
-        # Extract text
-        text = pytesseract.image_to_string(image, lang=lang, config=custom_config)
-        # Get confidence scores
-        data = pytesseract.image_to_data(image, lang=lang, config=custom_config, output_type=pytesseract.Output.DICT)
-        # Calculate average confidence
-        # Only entries whose confidence converts to an int greater than 0 are averaged
-        confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
-        avg_confidence = sum(confidences) / len(confidences) if confidences else 0
-        # Post-process the text
-        cleaned_text = post_process_text(text)
-        return {
-            'text': cleaned_text,
-            'raw_text': text,  # Keep original for comparison
-            'confidence': avg_confidence / 100.0,
-            'word_count': len([w for w in data['text'] if w.strip()])
-        }
     except Exception as e:
-        logger.error(f"Tesseract OCR error: {e}")
         return {'text': '', 'raw_text': '', 'confidence': 0.0, 'word_count': 0}
 def process_image_smart_improved(image, enhance_type="default"):
     """
-    Smart processing with improved text handling
     """
     try:
         # First, try with advanced preprocessing
         processed_img = preprocess_image_advanced(image, enhance_type)
-        # Try different approaches
         results = []
         # Mode 6: Block of text (best for documents)
-        result = extract_text_tesseract_improved(processed_img, psm=6)
         if result['text']:
             results.append(('psm_6', result))
@@ -321,19 +349,19 @@ def process_image_smart_improved(image, enhance_type="default"):
         if not results or results[0][1]['confidence'] < 0.6:
             if enhance_type != "document":
                 doc_processed = preprocess_image_advanced(image, "document")
-                result = extract_text_tesseract_improved(doc_processed, psm=6)
                 if result['text'] and result['confidence'] > (results[0][1]['confidence'] if results else 0):
                     results = [('psm_6_document', result)]
         # Try other PSM modes if still poor results
         if not results or results[0][1]['confidence'] < 0.5:
             # Mode 4: Single column of text
-            result = extract_text_tesseract_improved(processed_img, psm=4)
             if result['text']:
                 results.append(('psm_4', result))
             # Mode 13: Single text line
-            result = extract_text_tesseract_improved(processed_img, psm=13)
             if result['text']:
                 results.append(('psm_13', result))
@@ -356,6 +384,85 @@ def process_image_smart_improved(image, enhance_type="default"):
             'method': 'error', 'preprocessing': enhance_type
         }
 @app.route('/')
 def home():
     """Root endpoint"""

     return processed_text
+def extract_text_tesseract_adaptive(image, lang='eng', psm=6):
     """
+    Adaptive OCR that tries multiple configurations for different image types
+
+    Runs up to three Tesseract configurations in order and returns the
+    result of the first one that yields non-blank text:
+      1. `--oem 3` with a character whitelist,
+      2. `--oem 3` with `tessedit_do_invert=0` and no whitelist,
+      3. plain `--oem 3`, whose result is returned even if the text is blank.
+    A strategy that raises is logged and the next one is tried.
+
+    Args:
+        image: Image passed straight to pytesseract.
+        lang: Tesseract language code (default 'eng').
+        psm: Tesseract page segmentation mode (default 6).
+
+    Returns:
+        The dict built by process_ocr_result() for the strategy that was
+        used. If Strategy 3 raises (or anything escapes the inner handlers),
+        a dict with empty 'text'/'raw_text', 0.0 'confidence' and 0
+        'word_count' is returned; that fallback dict has no 'strategy' key.
     """
     try:
+        # Strategy 1: Try with conservative whitelist first
+        try:
+            # NOTE(review): the whitelist contains a space and is embedded
+            # unquoted in the config string - confirm the whole whitelist
+            # reaches Tesseract as a single argument.
+            whitelist_chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,!?-:;()[]{}=+×÷%/'
+            custom_config = f'--oem 3 --psm {psm} -c tessedit_char_whitelist={whitelist_chars}'
+            text = pytesseract.image_to_string(image, lang=lang, config=custom_config)
+            data = pytesseract.image_to_data(image, lang=lang, config=custom_config, output_type=pytesseract.Output.DICT)
+            # Check if we got reasonable results
+            # (the length test is implied by the truthiness of text.strip())
+            if text.strip() and len(text.strip()) > 0:
+                logger.info("Strategy 1 (whitelist) successful")
+                return process_ocr_result(text, data, "whitelist")
+        except Exception as e:
+            logger.warning(f"Strategy 1 (whitelist) failed: {e}")
+        # Strategy 2: Try without whitelist but with other optimizations
+        try:
+            custom_config = f'--oem 3 --psm {psm} -c tessedit_do_invert=0'
+            text = pytesseract.image_to_string(image, lang=lang, config=custom_config)
+            data = pytesseract.image_to_data(image, lang=lang, config=custom_config, output_type=pytesseract.Output.DICT)
+            if text.strip() and len(text.strip()) > 0:
+                logger.info("Strategy 2 (no whitelist) successful")
+                return process_ocr_result(text, data, "no_whitelist")
+        except Exception as e:
+            logger.warning(f"Strategy 2 (no whitelist) failed: {e}")
+        # Strategy 3: Basic configuration as fallback
+        try:
+            custom_config = f'--oem 3 --psm {psm}'
+            text = pytesseract.image_to_string(image, lang=lang, config=custom_config)
+            data = pytesseract.image_to_data(image, lang=lang, config=custom_config, output_type=pytesseract.Output.DICT)
+            # No blank-text check here: the last strategy's result is always returned
+            logger.info("Strategy 3 (basic) used as fallback")
+            return process_ocr_result(text, data, "basic")
+        except Exception as e:
+            logger.error(f"All OCR strategies failed: {e}")
+            return {'text': '', 'raw_text': '', 'confidence': 0.0, 'word_count': 0}
     except Exception as e:
+        logger.error(f"Adaptive OCR error: {e}")
         return {'text': '', 'raw_text': '', 'confidence': 0.0, 'word_count': 0}
+def process_ocr_result(text, data, strategy):
+    """Helper function to process OCR results consistently"""
+    # Calculate average confidence
+    confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
+    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
+    # Post-process the text
+    cleaned_text = post_process_text(text)
+    return {
+        'text': cleaned_text,
+        'raw_text': text,
+        'confidence': avg_confidence / 100.0,
+        'word_count': len([w for w in data['text'] if w.strip()]),
+        'strategy': strategy
+    }
 def process_image_smart_improved(image, enhance_type="default"):
     """
+    Smart processing with adaptive OCR strategies
     """
     try:
         # First, try with advanced preprocessing
         processed_img = preprocess_image_advanced(image, enhance_type)
+        # Try different approaches with adaptive OCR
         results = []
         # Mode 6: Block of text (best for documents)
+        result = extract_text_tesseract_adaptive(processed_img, psm=6)
         if result['text']:
             results.append(('psm_6', result))
         if not results or results[0][1]['confidence'] < 0.6:
             if enhance_type != "document":
                 doc_processed = preprocess_image_advanced(image, "document")
+                result = extract_text_tesseract_adaptive(doc_processed, psm=6)
                 if result['text'] and result['confidence'] > (results[0][1]['confidence'] if results else 0):
                     results = [('psm_6_document', result)]
         # Try other PSM modes if still poor results
         if not results or results[0][1]['confidence'] < 0.5:
             # Mode 4: Single column of text
+            result = extract_text_tesseract_adaptive(processed_img, psm=4)
             if result['text']:
                 results.append(('psm_4', result))
             # Mode 13: Single text line
+            result = extract_text_tesseract_adaptive(processed_img, psm=13)
             if result['text']:
                 results.append(('psm_13', result))
             'method': 'error', 'preprocessing': enhance_type
         }
+# Alternative: Image-specific preprocessing detector
+def detect_image_type(image):
+    """
+    Detect image characteristics to choose optimal processing
+    """
+    try:
+        # Convert to numpy array for analysis
+        if isinstance(image, Image.Image):
+            img_array = np.array(image.convert('RGB'))
+        else:
+            img_array = image
+        # Calculate image statistics
+        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) if len(img_array.shape) == 3 else img_array
+        height, width = gray.shape
+        # Check image size
+        is_small = max(height, width) < 600
+        # Check contrast
+        contrast = gray.std()
+        is_low_contrast = contrast < 50
+        # Check if mostly text (high edge density in certain patterns)
+        edges = cv2.Canny(gray, 50, 150)
+        edge_density = np.sum(edges > 0) / (height * width)
+        is_text_heavy = edge_density > 0.1
+        # Determine optimal enhancement
+        if is_small or is_low_contrast:
+            return "enhance"
+        elif is_text_heavy:
+            return "document"
+        else:
+            return "default"
+    except Exception as e:
+        logger.warning(f"Image type detection failed: {e}")
+        return "default"
+# Enhanced OCR endpoint with auto-detection
+def ocr_endpoint_enhanced():
+    """
+    OCR endpoint with automatic image type detection
+
+    When the enhancement setting is 'auto' it is replaced by the value
+    returned from detect_image_type(image); the image is then processed by
+    process_image_smart_improved() and the result is returned as JSON.
+
+    Returns:
+        jsonify(response) on success, or a JSON body with "error" and
+        "success": False together with status 500 when anything raises.
+
+    NOTE(review): `enhancement`, `image` and `language` are not defined in
+    the visible body; they are presumably set by the elided parameter
+    handling - confirm before wiring this up. No route decorator is shown
+    on this function either.
+    """
+    try:
+        logger.info("OCR request received")
+        # ... (existing parameter handling code) ...
+        # Auto-detect optimal enhancement if not specified
+        if enhancement == 'auto':
+            enhancement = detect_image_type(image)
+            logger.info(f"Auto-detected enhancement type: {enhancement}")
+        # Process image with improved OCR
+        logger.info("Starting adaptive OCR processing")
+        result = process_image_smart_improved(image, enhancement)
+        # Add debugging info
+        # 'text' and 'confidence' are required keys; the .get() lookups fall
+        # back to 0 / 'unknown' when the result omits them
+        response = {
+            "success": True,
+            "text": result['text'],
+            "confidence": round(result['confidence'], 3),
+            "character_count": len(result['text']),
+            "word_count": result.get('word_count', 0),
+            "method_used": result.get('method', 'unknown'),
+            "preprocessing_used": result.get('preprocessing', 'unknown'),
+            "ocr_strategy": result.get('strategy', 'unknown'),  # New field
+            "language": language,
+            "engine": "PyTesseract Adaptive"
+        }
+        return jsonify(response)
+    except Exception as e:
+        logger.error(f"OCR processing error: {str(e)}")
+        return jsonify({"error": str(e), "success": False}), 500
 @app.route('/')
 def home():
     """Root endpoint"""