Spaces:

omthakur1
/

python-doc-convert

Sleeping

App Files Files Community

omthakur1 committed on Feb 10

Commit

19d3640

1 Parent(s): 5ec4d20

feat: Advanced OCR with image preprocessing for accurate text extraction

Browse files

Files changed (2) hide show

app.py +60 -3
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -257,7 +257,7 @@ def pdf_to_word():
 @app.route('/image-to-text', methods=['POST'])
 def image_to_text():
-    """Extract text from image using Tesseract OCR"""
     if 'file' not in request.files:
         return jsonify({'error': 'No file provided'}), 400
@@ -273,8 +273,65 @@ def image_to_text():
         logger.info(f"Extracting text from image ({image.size})...")
-        # Perform OCR
-        text = pytesseract.image_to_string(image)
         logger.info(f"OCR successful! Extracted {len(text)} characters")

 @app.route('/image-to-text', methods=['POST'])
 def image_to_text():
+    """Extract text from image using Tesseract OCR with advanced preprocessing"""
     if 'file' not in request.files:
         return jsonify({'error': 'No file provided'}), 400
         logger.info(f"Extracting text from image ({image.size})...")
+        # Convert to RGB if necessary
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        # Convert PIL Image to numpy array for preprocessing
+        import numpy as np
+        import cv2
+        img_array = np.array(image)
+        # Image preprocessing for better OCR
+        # 1. Convert to grayscale
+        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        # 2. Apply slight Gaussian blur to reduce noise
+        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
+        # 3. Apply adaptive thresholding for better contrast
+        thresh = cv2.adaptiveThreshold(
+            blurred, 255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY,
+            11, 2
+        )
+        # 4. Apply morphological operations to remove small noise
+        kernel = np.ones((1, 1), np.uint8)
+        processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
+        processed = cv2.medianBlur(processed, 1)
+        # Convert back to PIL Image for Tesseract
+        processed_image = Image.fromarray(processed)
+        # Configure Tesseract for better accuracy
+        # PSM 3 = Fully automatic page segmentation, but no OSD
+        # PSM 6 = Assume a single uniform block of text
+        # OEM 3 = Default, based on what is available (LSTM + Legacy)
+        custom_config = r'--oem 3 --psm 3 -c preserve_interword_spaces=1'
+        # Perform OCR with configuration
+        text = pytesseract.image_to_string(processed_image, config=custom_config, lang='eng')
+        # Clean up the extracted text
+        # Remove excessive whitespace and empty lines
+        lines = [line.strip() for line in text.split('\n')]
+        cleaned_lines = [line for line in lines if line and len(line) > 0]
+        # Filter out lines with too many special characters (likely errors)
+        filtered_lines = []
+        for line in cleaned_lines:
+            # Count alphanumeric vs special chars
+            alnum_count = sum(c.isalnum() or c.isspace() for c in line)
+            special_count = len(line) - alnum_count
+            # Keep line if it has reasonable ratio of alphanumeric characters
+            if len(line) > 0 and (alnum_count / len(line)) > 0.5:
+                filtered_lines.append(line)
+        text = '\n'.join(filtered_lines)
         logger.info(f"OCR successful! Extracted {len(text)} characters")

requirements.txt CHANGED Viewed

@@ -6,3 +6,5 @@ PyPDF2==3.0.1
 pytesseract==0.3.10
 Pillow==10.2.0
 pdf2docx==0.5.8

 pytesseract==0.3.10
 Pillow==10.2.0
 pdf2docx==0.5.8
+opencv-python-headless==4.8.1.78
+numpy==1.24.3