Spaces:

dipan004
/

Proofly

Sleeping

App Files Files Community

Proofly / agents /text_extraction_agent.py

Dipan04

Initial clean commit for Hugging Face Space

2c41dce about 1 month ago

raw

history blame contribute delete

6.57 kB

	"""
	Text Extraction Agent (OCR)
	Deterministic preprocessing agent for extracting text from images.
	PDF input is not OCR'd yet: only image MIME types are accepted.
	Uses Tesseract OCR - a deterministic, non-AI algorithm.
	"""

	from typing import Dict, Any, Optional
	import io
	import logging

	from core.agent_base import Agent
	from core.errors import (
	OCRNotApplicableError,
	OCRProcessingError,
	OCRDependencyMissingError
	)
	from config.settings import settings

	# Optional imports - OCR is skipped at runtime if pytesseract/Pillow are missing
	try:
	import pytesseract
	from PIL import Image
	TESSERACT_AVAILABLE = True
	except ImportError:
	TESSERACT_AVAILABLE = False

	logger = logging.getLogger(__name__)


	class TextExtractionAgent(Agent):
	    """
	    Extracts text from images using Tesseract OCR.

	    This is a deterministic preprocessing step, not AI. Only the image MIME
	    types listed in ``OCR_SUPPORTED_TYPES`` are processed; every other
	    content type (including PDF) is passed through with OCR skipped.
	    """

	    # Content types that support OCR
	    OCR_SUPPORTED_TYPES = {
	        "image/png",
	        "image/jpeg",
	        "image/jpg",
	        "image/tiff",
	        "image/bmp",
	        "image/gif",
	    }

	    def __init__(self):
	        super().__init__()
	        self._check_dependencies()

	    def _check_dependencies(self):
	        """
	        Log whether the OCR stack is usable.

	        Only emits log messages; a missing Python package or Tesseract binary
	        is reported as a warning and does not raise.
	        """
	        if not TESSERACT_AVAILABLE:
	            logger.warning(
	                "Tesseract dependencies not available. "
	                "Install with: pip install pytesseract pillow"
	            )
	            return

	        try:
	            # Verify Tesseract binary is accessible
	            pytesseract.get_tesseract_version()
	        except Exception as e:
	            logger.warning(
	                "Tesseract binary not found in PATH: %s. "
	                "OCR will be skipped for all inputs.",
	                e,
	            )
	        else:
	            logger.info("Tesseract OCR is available and ready")

	    def execute(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
	        """
	        Extract text from content if applicable.

	        Expected input_data:
	            {
	                "content": bytes,
	                "content_type": str,
	                "size": int,
	                ...other fields from validation...
	            }

	        Returns:
	            A shallow copy of input_data extended with:
	            {
	                "extracted_text": str | None,
	                "ocr_engine": str | None,
	                "ocr_status": "success" | "skipped" | "failed",
	                "ocr_confidence": float | None,  # Future enhancement
	                "ocr_error": str,  # only present when an error was recorded
	            }
	        """
	        # Skip if OCR is globally disabled
	        if not settings.OCR_ENABLED:
	            logger.debug("OCR is disabled in settings")
	            return self._skip_ocr(input_data, "disabled")

	        # Check if dependencies are available
	        if not TESSERACT_AVAILABLE:
	            logger.debug("OCR dependencies not available")
	            return self._skip_ocr(input_data, "dependencies_missing")

	        # `or ""` also covers an explicit None stored under the key.
	        content_type = input_data.get("content_type") or ""
	        content = input_data.get("content")

	        # Check if content type supports OCR
	        if not self._is_ocr_applicable(content_type):
	            logger.debug("OCR not applicable for content type: %s", content_type)
	            return self._skip_ocr(input_data, "not_applicable")

	        # Attempt OCR extraction
	        try:
	            extracted_text = self._extract_text(content, content_type)
	        except OCRDependencyMissingError as e:
	            # A missing Tesseract binary is a dependency problem, not a
	            # per-input failure: report it as skipped, matching both the
	            # startup warning and the missing-package path above.
	            logger.warning("OCR skipped, dependency missing: %s", e)
	            return self._skip_ocr(input_data, "dependencies_missing", error=str(e))
	        except Exception as e:
	            logger.error("OCR processing failed: %s", e)
	            return self._skip_ocr(input_data, "failed", error=str(e))

	        result = input_data.copy()
	        result.update({
	            "extracted_text": extracted_text,
	            "ocr_engine": "tesseract",
	            "ocr_status": "success",
	            "ocr_confidence": None,  # Tesseract confidence available but not used in MVP
	        })

	        logger.info("OCR successful: extracted %d characters", len(extracted_text))
	        return result

	    def _is_ocr_applicable(self, content_type: str) -> bool:
	        """
	        Check if OCR is applicable for this content type.

	        The comparison is case-insensitive and ignores MIME parameters and
	        surrounding whitespace, so ``"Image/PNG; charset=binary"`` matches
	        ``"image/png"``.

	        Args:
	            content_type: MIME type of the content

	        Returns:
	            True if OCR should be attempted; False for unsupported, empty,
	            or non-string content types
	        """
	        if not isinstance(content_type, str) or not content_type:
	            return False

	        # Keep only the "type/subtype" part, dropping parameters such as "; charset=...".
	        media_type = content_type.split(";", 1)[0].strip().lower()
	        return media_type in self.OCR_SUPPORTED_TYPES

	    def _extract_text(self, content: bytes, content_type: str) -> str:
	        """
	        Extract text using Tesseract OCR.

	        Args:
	            content: Image bytes
	            content_type: MIME type (currently unused; kept for interface stability)

	        Returns:
	            Extracted text string with surrounding whitespace stripped
	            (may be empty if nothing was recognised)

	        Raises:
	            OCRDependencyMissingError: If the Tesseract binary cannot be found
	            OCRProcessingError: If content is empty or extraction fails
	        """
	        if not content:
	            # Fail early with a clear message instead of an opaque image-decoding error.
	            raise OCRProcessingError("Text extraction failed: no content provided")

	        try:
	            # The context manager releases the decoded image once OCR is done.
	            with Image.open(io.BytesIO(content)) as image:
	                # Perform OCR with configured language
	                text = pytesseract.image_to_string(
	                    image,
	                    lang=settings.OCR_LANGUAGE,
	                    config="--psm 3",  # Fully automatic page segmentation
	                )
	        except pytesseract.TesseractNotFoundError as e:
	            raise OCRDependencyMissingError(
	                "Tesseract binary not found. Please install Tesseract OCR."
	            ) from e
	        except Exception as e:
	            raise OCRProcessingError(
	                f"Text extraction failed: {e}"
	            ) from e

	        # Clean up extracted text
	        text = text.strip()

	        if not text:
	            logger.warning("OCR completed but no text was extracted")

	        return text

	    def _skip_ocr(
	        self,
	        input_data: Dict[str, Any],
	        reason: str,
	        error: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """
	        Return a copy of the input data marked as not OCR-processed.

	        Args:
	            input_data: Original input data (not mutated)
	            reason: Reason for skipping; "failed" yields ocr_status "failed",
	                any other value yields "skipped"
	            error: Optional error message, stored under "ocr_error" when non-empty

	        Returns:
	            Input data with the OCR result fields set to None and
	            ocr_status = skipped/failed
	        """
	        result = input_data.copy()

	        ocr_status = "failed" if reason == "failed" else "skipped"

	        result.update({
	            "extracted_text": None,
	            "ocr_engine": None,
	            "ocr_status": ocr_status,
	            "ocr_confidence": None,
	        })

	        if error:
	            result["ocr_error"] = error

	        return result