mcp_ocr_json

Sleeping

App Files Files Community

mcp_ocr_json / ocr_engine.py

Vachudev

added ocr_preprocessing_engine call

006541d verified about 2 months ago

raw

history blame contribute delete

3.09 kB

	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import os
	import logging

	# Module-level logger. It is created *before* the optional import below so
	# that the fallback warning is emitted through "ocr_engine" instead of the
	# root logger (root-level logging.warning() implicitly calls basicConfig()
	# when no handlers exist, which is an unwanted import-time side effect).
	logger = logging.getLogger("ocr_engine")

	# Import the preprocessing step used before OCR.
	# ocr_preprocessing_engine.py is expected to be importable (e.g. located in
	# the same directory as this module).
	try:
	    from ocr_preprocessing_engine import preprocess_image
	except ImportError:
	    # Fail-safe if the module is missing: OCR still works, just without
	    # any preprocessing of the page images.
	    logger.warning("ocr_preprocessing_engine not found. Using raw OCR only.")

	    def preprocess_image(img, page_num):
	        """Identity stand-in for the missing preprocessing module.

	        Accepts the same arguments as the real ``preprocess_image`` call
	        made by this module and returns ``img`` unchanged; ``page_num`` is
	        ignored.
	        """
	        return img

	# Tesseract configuration shared by every page (hoisted: it never changes).
	#   --oem 3: default OCR engine mode, i.e. whatever engine is available.
	#   --psm 4: assume a single column of text of variable sizes
	#            (a reasonable fit for single-column invoices).
	_TESSERACT_CONFIG = r'--oem 3 --psm 4'

	# A preprocessed result with fewer stripped characters than this is treated
	# as a failed extraction and triggers a retry on the raw image.
	_MIN_USEFUL_CHARS = 10

	# Image extensions handed to PIL (compared case-insensitively).
	_IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp')


	def _ocr_page(raw_img, page_num: int) -> str:
	    """Run OCR on a single page, preferring the preprocessed image."""
	    page_text = ""

	    # --- STRATEGY A: PREPROCESSING ---
	    # Any failure here (in preprocessing or in the OCR call on the
	    # preprocessed image) is logged and handled by the fallback below.
	    try:
	        processed_img = preprocess_image(raw_img, page_num)
	        page_text = pytesseract.image_to_string(processed_img, config=_TESSERACT_CONFIG)
	    except Exception as e:
	        logger.warning(f"Page {page_num}: Preprocessing failed ({e}). Skipping to fallback.")

	    # --- STRATEGY B: FALLBACK ---
	    # If preprocessing produced (almost) nothing, e.g. because it was too
	    # aggressive, retry on the untouched image and let Tesseract binarize
	    # it itself. Errors raised here propagate to the caller.
	    if len(page_text.strip()) < _MIN_USEFUL_CHARS:
	        logger.info(f"Page {page_num}: Low confidence extraction. Retrying with raw image...")
	        raw_text = pytesseract.image_to_string(raw_img, config=_TESSERACT_CONFIG)
	        # Do not throw away a short-but-real preprocessed result in favour
	        # of an even shorter raw one; on a tie the raw result wins.
	        if len(raw_text.strip()) >= len(page_text.strip()):
	            page_text = raw_text

	    return page_text


	def extract_text_from_file(file_path: str) -> str:
	    """Extract text from a PDF or image file with Tesseract.

	    Each page is first passed through ``preprocess_image`` and OCR'd; if
	    that yields fewer than ``_MIN_USEFUL_CHARS`` characters, OCR is retried
	    on the raw page image and the result with more content is kept.
	    PDFs are rasterized at 300 DPI before OCR.

	    Args:
	        file_path: Path to a ``.pdf`` file or to an image with one of the
	            extensions in ``_IMAGE_SUFFIXES`` (case-insensitive).

	    Returns:
	        The text of all pages, each preceded by a ``--- Page N ---`` header,
	        with surrounding whitespace stripped. Returns ``""`` when the path
	        is empty/``None`` or does not exist. Problems are reported as a
	        human-readable message in the return value (``"Error reading ..."``,
	        ``"Unsupported file format. ..."``, ``"OCR Failed: ..."``) rather
	        than raised.
	    """
	    if not file_path or not os.path.exists(file_path):
	        return ""

	    images = []

	    try:
	        # 1. Image loading
	        lowered = os.fspath(file_path).lower()
	        if lowered.endswith('.pdf'):
	            try:
	                images = convert_from_path(file_path, dpi=300)
	            except Exception as e:
	                return f"Error reading PDF: {str(e)}"
	        elif lowered.endswith(_IMAGE_SUFFIXES):
	            try:
	                images = [Image.open(file_path)]
	            except Exception as e:
	                return f"Error reading image: {str(e)}"
	        else:
	            return "Unsupported file format. Please upload PDF or Image."

	        # 2. Page-by-page extraction (pages are numbered from 1)
	        page_chunks = [
	            f"--- Page {page_num} ---\n{_ocr_page(raw_img, page_num)}\n"
	            for page_num, raw_img in enumerate(images, start=1)
	        ]

	    except Exception as e:
	        logger.exception("OCR Critical Error: %s", e)
	        return f"OCR Failed: {str(e)}"

	    finally:
	        # Release the file handles / buffers held by the loaded images on
	        # every exit path; a failure to close must not mask the result.
	        for img in images:
	            try:
	                img.close()
	            except Exception:
	                logger.debug("Failed to close a page image", exc_info=True)

	    return "".join(page_chunks).strip()