Spaces:

omthakur1
/

python-doc-convert

Sleeping

App Files Files Community

omthakur1 committed on Feb 10

Commit

d189c3a

1 Parent(s): 19d3640

fix: Multi-strategy OCR for better colored/gradient text extraction

Browse files

Files changed (1) hide show

app.py +62 -47

app.py CHANGED Viewed

@@ -257,7 +257,7 @@ def pdf_to_word():
 @app.route('/image-to-text', methods=['POST'])
 def image_to_text():
-    """Extract text from image using Tesseract OCR with advanced preprocessing"""
     if 'file' not in request.files:
         return jsonify({'error': 'No file provided'}), 400
@@ -277,66 +277,81 @@ def image_to_text():
         if image.mode != 'RGB':
             image = image.convert('RGB')
-        # Convert PIL Image to numpy array for preprocessing
         import numpy as np
         import cv2
         img_array = np.array(image)
-        # Image preprocessing for better OCR
-        # 1. Convert to grayscale
-        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-        # 2. Apply slight Gaussian blur to reduce noise
-        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
-        # 3. Apply adaptive thresholding for better contrast
-        thresh = cv2.adaptiveThreshold(
-            blurred, 255,
-            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY,
-            11, 2
-        )
-        # 4. Apply morphological operations to remove small noise
-        kernel = np.ones((1, 1), np.uint8)
-        processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
-        processed = cv2.medianBlur(processed, 1)
-        # Convert back to PIL Image for Tesseract
-        processed_image = Image.fromarray(processed)
-        # Configure Tesseract for better accuracy
-        # PSM 3 = Fully automatic page segmentation, but no OSD
-        # PSM 6 = Assume a single uniform block of text
-        # OEM 3 = Default, based on what is available (LSTM + Legacy)
-        custom_config = r'--oem 3 --psm 3 -c preserve_interword_spaces=1'
-        # Perform OCR with configuration
-        text = pytesseract.image_to_string(processed_image, config=custom_config, lang='eng')
-        # Clean up the extracted text
-        # Remove excessive whitespace and empty lines
-        lines = [line.strip() for line in text.split('\n')]
-        cleaned_lines = [line for line in lines if line and len(line) > 0]
-        # Filter out lines with too many special characters (likely errors)
-        filtered_lines = []
-        for line in cleaned_lines:
-            # Count alphanumeric vs special chars
-            alnum_count = sum(c.isalnum() or c.isspace() for c in line)
-            special_count = len(line) - alnum_count
-            # Keep line if it has reasonable ratio of alphanumeric characters
-            if len(line) > 0 and (alnum_count / len(line)) > 0.5:
-                filtered_lines.append(line)
-        text = '\n'.join(filtered_lines)
         logger.info(f"OCR successful! Extracted {len(text)} characters")
         # Create text file
-        text_content = f"Extracted Text from {file.filename}\n\n{text.strip()}"
         # Return as downloadable text file
         buffer = BytesIO()

 @app.route('/image-to-text', methods=['POST'])
 def image_to_text():
+    """Extract text from image using Tesseract OCR with smart preprocessing"""
     if 'file' not in request.files:
         return jsonify({'error': 'No file provided'}), 400
         if image.mode != 'RGB':
             image = image.convert('RGB')
         import numpy as np
         import cv2
         img_array = np.array(image)
+        # Try multiple OCR strategies and pick the best result
+        results = []
+        # Strategy 1: Original image (best for colored text, graphics)
+        try:
+            config1 = r'--oem 3 --psm 3'
+            text1 = pytesseract.image_to_string(image, config=config1, lang='eng')
+            results.append(('original', text1, len(text1.strip())))
+        except Exception as e:
+            logger.warning(f"Strategy 1 failed: {e}")
+        # Strategy 2: Grayscale (good for normal documents)
+        try:
+            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            gray_img = Image.fromarray(gray)
+            config2 = r'--oem 3 --psm 6'
+            text2 = pytesseract.image_to_string(gray_img, config=config2, lang='eng')
+            results.append(('grayscale', text2, len(text2.strip())))
+        except Exception as e:
+            logger.warning(f"Strategy 2 failed: {e}")
+        # Strategy 3: High contrast (for faded text)
+        try:
+            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+            # Increase contrast
+            alpha = 1.5  # Contrast control
+            beta = 0     # Brightness control
+            contrast = cv2.convertScaleAbs(gray, alpha=alpha, beta=beta)
+            contrast_img = Image.fromarray(contrast)
+            config3 = r'--oem 3 --psm 3'
+            text3 = pytesseract.image_to_string(contrast_img, config=config3, lang='eng')
+            results.append(('contrast', text3, len(text3.strip())))
+        except Exception as e:
+            logger.warning(f"Strategy 3 failed: {e}")
+        # Pick the result with the most content
+        if not results:
+            raise Exception("All OCR strategies failed")
+        # Sort by text length (more text usually means better recognition)
+        results.sort(key=lambda x: x[2], reverse=True)
+        best_strategy, raw_text, _ = results[0]
+        logger.info(f"Best strategy: {best_strategy} with {len(raw_text)} characters")
+        # Clean up the text
+        lines = []
+        for line in raw_text.split('\n'):
+            # Strip whitespace
+            line = line.strip()
+            # Skip empty lines
+            if not line:
+                continue
+            # Skip lines that are mostly noise (too many special chars)
+            alnum = sum(c.isalnum() or c in ' .,!?-$%()' for c in line)
+            if len(line) > 0 and (alnum / len(line)) > 0.4:
+                lines.append(line)
+        text = '\n'.join(lines)
+        # If result is still too short, try without filtering
+        if len(text) < 20 and len(raw_text.strip()) > len(text):
+            text = raw_text.strip()
         logger.info(f"OCR successful! Extracted {len(text)} characters")
         # Create text file
+        text_content = f"Extracted Text from {file.filename}\n\n{text}"
         # Return as downloadable text file
         buffer = BytesIO()