mcp_ocr_json / ocr_engine.py
Vachudev's picture
added ocr_preprocessing_engine call
006541d verified
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
# The module logger is created before the optional import so that the
# fallback path reports through it rather than through the root logger
# (module-level logging.warning() implicitly calls logging.basicConfig()).
logger = logging.getLogger("ocr_engine")

# Import the robust vision preprocessing logic.
# ocr_preprocessing_engine.py must be importable (e.g. located in the same
# directory as this file).
try:
    from ocr_preprocessing_engine import preprocess_image
except ImportError:
    # Fail-safe if the module is missing: OCR still runs, just on raw images.
    logger.warning("ocr_preprocessing_engine not found. Using raw OCR only.")

    def preprocess_image(img, page_num):
        """Return ``img`` unchanged.

        Stand-in with the same call signature as the real preprocessing
        function, used only when ``ocr_preprocessing_engine`` cannot be
        imported. ``page_num`` is accepted for compatibility and ignored.
        """
        return img
def extract_text_from_file(file_path: str) -> str:
    """
    Extract text from a PDF or image file using Tesseract OCR.

    Hybrid pipeline, applied page by page:
    1. Run ``preprocess_image`` on the page and OCR the result.
    2. If that step raises, or produces fewer than 10 characters after
       stripping whitespace, OCR the untouched page image instead. The
       preprocessed text is only replaced when the raw pass actually
       returns something (or when there was nothing to lose).

    PDFs are rasterised at 300 DPI. Image inputs are opened with
    ``Image.open`` and wrapped in a one-element list, so they contribute
    exactly one page.

    Args:
        file_path: Path to a ``.pdf``, ``.png``, ``.jpg``, ``.jpeg``,
            ``.tiff`` or ``.bmp`` file. The extension check is
            case-insensitive.

    Returns:
        The concatenated page texts, each introduced by a
        ``--- Page N ---`` header line, with leading/trailing whitespace
        stripped. Problems are reported through the return value instead of
        being raised:
        * ``""`` when ``file_path`` does not exist;
        * ``"Unsupported file format. Please upload PDF or Image."`` for
          any other extension;
        * ``"Error reading PDF: ..."`` / ``"Error reading image: ..."``
          when the file cannot be loaded;
        * ``"OCR Failed: ..."`` for any other error during extraction.
    """
    if not os.path.exists(file_path):
        return ""

    # Tesseract configuration (identical for every page, so built once):
    #   --oem 3: default engine mode, based on what is available
    #   --psm 4: assume a single column of text of variable sizes
    custom_config = r'--oem 3 --psm 4'
    lowered_path = file_path.lower()
    images = []
    page_sections = []

    try:
        # 1. Image loading. PDFs are rendered at 300 DPI.
        if lowered_path.endswith('.pdf'):
            try:
                images = convert_from_path(file_path, dpi=300)
            except Exception as e:
                return f"Error reading PDF: {str(e)}"
        elif lowered_path.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
            try:
                images = [Image.open(file_path)]
            except Exception as e:
                return f"Error reading image: {str(e)}"
        else:
            return "Unsupported file format. Please upload PDF or Image."

        # 2. Page-by-page extraction
        for page_num, raw_img in enumerate(images, start=1):
            page_text = ""

            # --- STRATEGY A: preprocessing followed by OCR ---
            try:
                processed_img = preprocess_image(raw_img, page_num)
                page_text = pytesseract.image_to_string(
                    processed_img, config=custom_config
                )
            except Exception as e:
                # Best effort: a preprocessing failure must not lose the page.
                logger.warning(
                    "Page %s: Preprocessing failed (%s). Skipping to fallback.",
                    page_num,
                    e,
                )

            # --- STRATEGY B: fallback to the raw image ---
            # Fewer than 10 characters is treated as a failed extraction
            # (e.g. preprocessing wiped the text), so let Tesseract work on
            # the original image.
            if len(page_text.strip()) < 10:
                logger.info(
                    "Page %s: Low confidence extraction. Retrying with raw image...",
                    page_num,
                )
                raw_text = pytesseract.image_to_string(
                    raw_img, config=custom_config
                )
                # Never trade a short-but-present result for an empty one.
                if raw_text.strip() or not page_text.strip():
                    page_text = raw_text

            page_sections.append(f"--- Page {page_num} ---\n{page_text}\n")
    except Exception as e:
        # Top-level boundary: report the failure to the caller as text and
        # keep the traceback in the log.
        logger.exception("OCR Critical Error: %s", e)
        return f"OCR Failed: {str(e)}"
    finally:
        # Release file handles / pixel buffers held by the loaded pages on
        # every exit path (success, early return, or error).
        for loaded_img in images:
            try:
                loaded_img.close()
            except Exception as close_err:
                logger.debug("Could not close page image: %s", close_err)

    return "".join(page_sections).strip()