mcp_ocr_json

Sleeping

App Files Files Community

mcp_ocr_json / ocr_preprocessing_engine.py

Vachudev

Update ocr_preprocessing_engine.py

7880958 verified about 2 months ago

raw

history blame contribute delete

3.92 kB

	import cv2
	import numpy as np
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image
	import os
	import logging
	import tempfile # Added for safe path handling

	# Module-level logger registered under the fixed name "ocr_preprocessor".
	logger = logging.getLogger("ocr_preprocessor")

	# Toggle to save images for debugging: when True, preprocess_image writes
	# each processed page as a PNG into the system temp directory.
	DEBUG_SAVE_IMAGES = True

	def _skew_correction_angle(rect_angle: float) -> float:
	    """Return the rotation (degrees) that cancels the skew from cv2.minAreaRect.

	    The rectangle angle is only meaningful modulo 90 degrees, and OpenCV
	    releases differ in the range they report (older builds use [-90, 0),
	    newer ones (0, 90]). Folding the value into [-45, 45) first makes the
	    result independent of that convention; the correction is the negated skew.
	    """
	    skew = (rect_angle + 45.0) % 90.0 - 45.0
	    return -skew


	def preprocess_image(image: Image.Image, page_num: int) -> Image.Image:
	    """
	    Apply the OCR-enhancement steps to one page and return the result.

	    Steps, in the order they are executed:
	    1. Grayscale conversion
	    2. Normalization (contrast stretching to the 0-255 range)
	    3. Denoising (3x3 Gaussian blur)
	    4. Thresholding (adaptive Gaussian binarization)
	    5. Deskewing (rotation correction, applied only when |angle| > 0.5 deg)

	    Saves debug images to the system temp directory to avoid permission
	    errors in HF Spaces (only when DEBUG_SAVE_IMAGES is true).

	    Args:
	        image: Source page as a PIL image, in any mode convertible to RGB.
	        page_num: Page number, used only to name the debug image file.

	    Returns:
	        The binarized (and possibly rotated) page as a PIL image.
	    """
	    # 1. Convert to grayscale. Force RGB first: COLOR_RGB2GRAY rejects the
	    # 2-D arrays produced by grayscale, palette or bilevel inputs.
	    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
	    gray = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2GRAY)

	    # 2. Normalization: stretch intensities to the full 0-255 range.
	    gray = cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX)

	    # 3. Denoising (3x3 kernel)
	    denoised = cv2.GaussianBlur(gray, (3, 3), 0)

	    # 4. Adaptive thresholding
	    binary = cv2.adaptiveThreshold(
	        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	        cv2.THRESH_BINARY, 11, 2
	    )

	    # 5. Deskewing: fit a minimum-area rectangle around the dark pixels
	    # (white after inversion) and rotate by the angle that levels it.
	    inverted_binary = cv2.bitwise_not(binary)
	    coords = np.column_stack(np.where(inverted_binary > 0))

	    if coords.size > 0:
	        angle = _skew_correction_angle(cv2.minAreaRect(coords)[-1])

	        # Skip sub-half-degree corrections; resampling would only blur text.
	        if abs(angle) > 0.5:
	            (h, w) = binary.shape[:2]
	            center = (w // 2, h // 2)
	            M = cv2.getRotationMatrix2D(center, angle, 1.0)
	            binary = cv2.warpAffine(
	                binary, M, (w, h),
	                flags=cv2.INTER_CUBIC,
	                borderMode=cv2.BORDER_REPLICATE
	            )

	    # --- DEBUG SAVING LOGIC ---
	    if DEBUG_SAVE_IMAGES:
	        try:
	            # Use the system temp directory (/tmp in Linux/HF Spaces)
	            temp_dir = tempfile.gettempdir()
	            debug_filename = f"debug_page_{page_num}_processed.png"
	            debug_path = os.path.join(temp_dir, debug_filename)

	            # imwrite reports failure through its return value, not by
	            # raising, so check it before claiming the file was saved.
	            if cv2.imwrite(debug_path, binary):
	                logger.info("Debug image saved to: %s", debug_path)
	            else:
	                logger.warning(
	                    "Could not save debug image: write to %s failed", debug_path
	                )
	        except Exception as e:
	            # Debug output is best-effort and must never break OCR.
	            logger.warning("Could not save debug image: %s", e)
	    # --------------------------

	    return Image.fromarray(binary)

	def extract_text_with_preprocessing(file_path: str) -> str:
	    """
	    Run the OCR pipeline on a file and return the recognized text.

	    Pipeline: PDF -> 300 DPI images -> preprocessing -> Tesseract.
	    A path ending in ".pdf" (case-insensitive) is rendered page by page at
	    300 DPI; any other file is opened as a single image. Each page is
	    preprocessed and passed to Tesseract with "--oem 3 --psm 4". If that
	    fails or yields fewer than 10 non-whitespace-trimmed characters, the raw
	    page is OCR'd as well and the longer of the two results is kept.

	    Args:
	        file_path: Path to a PDF or image file.

	    Returns:
	        The text of all pages, each prefixed with a "--- Page N ---" header;
	        an empty string if the path is empty or does not exist; or a string
	        starting with "Error processing file: " if the pipeline raised.
	    """
	    if not file_path or not os.path.exists(file_path):
	        return ""

	    custom_config = r'--oem 3 --psm 4'
	    page_chunks = []
	    opened_image = None
	    try:
	        if file_path.lower().endswith('.pdf'):
	            # Convert PDF to images at 300 DPI
	            images = convert_from_path(file_path, dpi=300)
	        else:
	            # Keep a handle so the underlying file is closed in `finally`.
	            opened_image = Image.open(file_path)
	            images = [opened_image]

	        for page_num, raw_img in enumerate(images, start=1):
	            # Try robust preprocessing first
	            try:
	                processed_img = preprocess_image(raw_img, page_num)
	                page_text = pytesseract.image_to_string(
	                    processed_img, config=custom_config
	                )
	            except Exception as e:
	                logger.warning("Preprocessing failed: %s", e)
	                page_text = ""

	            # Fall back to the raw image when preprocessing fails or yields
	            # almost no text, but never replace a short result with an even
	            # shorter one.
	            if len(page_text.strip()) < 10:
	                logger.warning(
	                    "Page %d: Low confidence. Retrying with raw image.", page_num
	                )
	                raw_text = pytesseract.image_to_string(raw_img, config=custom_config)
	                if len(raw_text.strip()) >= len(page_text.strip()):
	                    page_text = raw_text

	            page_chunks.append(f"--- Page {page_num} ---\n{page_text}\n")

	    except Exception as e:
	        # Top-level boundary: report the failure to the caller as text.
	        logger.exception("OCR Pipeline Error: %s", e)
	        return f"Error processing file: {str(e)}"
	    finally:
	        if opened_image is not None:
	            opened_image.close()

	    return "".join(page_chunks).strip()