mcp_ocr_json

Sleeping

App Files Community

Vachudev committed on Dec 5, 2025

Commit

006541d

verified ·

1 Parent(s): 7880958

added ocr_preprocessing_engine call

Browse files

Files changed (1) hide show

ocr_engine.py +50 -15

ocr_engine.py CHANGED Viewed

@@ -3,41 +3,76 @@ from pdf2image import convert_from_path
 from PIL import Image
 import os
 import logging
 logger = logging.getLogger("ocr_engine")
 def extract_text_from_file(file_path: str) -> str:
     """
-    Extracts text from a PDF or Image file using Tesseract.
     """
     if not os.path.exists(file_path):
         return ""
     text_content = ""
     try:
-        # Handle PDF
         if file_path.lower().endswith('.pdf'):
             try:
-                # Convert PDF pages to images
-                images = convert_from_path(file_path)
-                for i, image in enumerate(images):
-                    page_text = pytesseract.image_to_string(image)
-                    text_content += f"--- Page {i+1} ---\n{page_text}\n"
             except Exception as e:
-                logger.error(f"Error converting PDF: {e}")
                 return f"Error reading PDF: {str(e)}"
-        # Handle Images (JPG, PNG, etc.)
         elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
             try:
-                image = Image.open(file_path)
-                text_content = pytesseract.image_to_string(image)
             except Exception as e:
-                logger.error(f"Error reading image: {e}")
                 return f"Error reading image: {str(e)}"
         else:
             return "Unsupported file format. Please upload PDF or Image."
     except Exception as e:
         logger.error(f"OCR Critical Error: {e}")
         return f"OCR Failed: {str(e)}"

 from PIL import Image
 import os
 import logging
+# Import the Robust Vision logic
+# Ensure ocr_preprocessing_engine.py is in the same directory
+try:
+    from ocr_preprocessing_engine import preprocess_image
+except ImportError:
+    # Fail-safe if the module is missing
+    logging.warning("ocr_preprocessing_engine not found. Using raw OCR only.")
+    def preprocess_image(img, page_num): return img
 logger = logging.getLogger("ocr_engine")
 def extract_text_from_file(file_path: str) -> str:
     """
+    Extracts text using a Hybrid Pipeline:
+    1. Attempt Robust Preprocessing (Deskew -> Denoise -> Adaptive Threshold).
+    2. Fallback to Raw Image if preprocessing raises or yields fewer than 10 characters of text.
+    Ref: Tesseract best practices for DPI and Preprocessing [3], [1].
     """
     if not os.path.exists(file_path):
         return ""
     text_content = ""
+    images = []
     try:
+        # 1. Image Loading & DPI Scaling
+        # Tesseract works best at 300 DPI or higher [3]; PDFs are rasterized at 300 DPI, images are used as-is.
         if file_path.lower().endswith('.pdf'):
             try:
+                images = convert_from_path(file_path, dpi=300)
             except Exception as e:
                 return f"Error reading PDF: {str(e)}"
         elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
             try:
+                images = [Image.open(file_path)]
             except Exception as e:
                 return f"Error reading image: {str(e)}"
         else:
             return "Unsupported file format. Please upload PDF or Image."
+        # 2. Page-by-Page Extraction
+        for i, raw_img in enumerate(images):
+            page_num = i + 1
+            # Tesseract Configuration
+            # --psm 4: Assume variable size text (good for single-column invoices) [4]
+            # --oem 3: Default engine mode (chosen by Tesseract based on what is available)
+            custom_config = r'--oem 3 --psm 4'
+            page_text = ""
+            # --- STRATEGY A: ROBUST PREPROCESSING ---
+            try:
+                # Apply the "Make OCR Work" pipeline (Deskew, Denoise, Threshold) [5], [6]
+                processed_img = preprocess_image(raw_img, page_num)
+                page_text = pytesseract.image_to_string(processed_img, config=custom_config)
+            except Exception as e:
+                logger.warning(f"Page {page_num}: Preprocessing failed ({e}). Skipping to fallback.")
+            # --- STRATEGY B: FALLBACK MECHANISM ---
+            # If preprocessing was too aggressive (e.g., thresholding wiped the text),
+            # rely on Tesseract's internal Otsu binarization [3], [1].
+            if len(page_text.strip()) < 10:
+                logger.info(f"Page {page_num}: Low confidence extraction. Retrying with raw image...")
+                page_text = pytesseract.image_to_string(raw_img, config=custom_config)
+            text_content += f"--- Page {page_num} ---\n{page_text}\n"
     except Exception as e:
         logger.error(f"OCR Critical Error: {e}")
         return f"OCR Failed: {str(e)}"