"""
Text Extraction Agent (OCR)

Deterministic preprocessing agent for extracting text from images.
Uses Tesseract OCR - a deterministic, non-AI algorithm.

Only the image MIME types listed in
``TextExtractionAgent.OCR_SUPPORTED_TYPES`` are processed. PDF input is not
in that set, so it is passed through with ``ocr_status == "skipped"``.
"""

from typing import Dict, Any, Optional
import io
import logging

from core.agent_base import Agent
from core.errors import (
    OCRNotApplicableError,
    OCRProcessingError,
    OCRDependencyMissingError
)
from config.settings import settings

# Optional imports - when they are missing the agent degrades to "skipped"
# instead of failing at import time.
try:
    import pytesseract
    from PIL import Image
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False

logger = logging.getLogger(__name__)


class TextExtractionAgent(Agent):
    """
    Extracts text from images using Tesseract OCR.

    This is a deterministic preprocessing step, not AI. The agent never
    raises from ``execute``: every outcome is reported through the
    ``ocr_status`` key of the returned dict.
    """

    # Content types that support OCR
    OCR_SUPPORTED_TYPES = {
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/tiff",
        "image/bmp",
        "image/gif",
    }

    # Result of the start-up probe for the Tesseract binary. The class-level
    # default keeps instances that were created without running __init__
    # (or before the probe ran) on the "attempt OCR" path.
    _tesseract_binary_available = True

    def __init__(self):
        super().__init__()
        self._check_dependencies()

    def _check_dependencies(self):
        """
        Probe for the OCR dependencies and remember the outcome.

        Logs a warning when either the Python packages or the Tesseract
        binary are unavailable, and records the result in
        ``self._tesseract_binary_available`` so that ``execute`` really
        skips OCR, as the warning message promises.
        """
        if not TESSERACT_AVAILABLE:
            self._tesseract_binary_available = False
            logger.warning(
                "Tesseract dependencies not available. "
                "Install with: pip install pytesseract pillow"
            )
            return

        try:
            # Verify Tesseract binary is accessible
            pytesseract.get_tesseract_version()
        except Exception as e:
            self._tesseract_binary_available = False
            logger.warning(
                "Tesseract binary not found in PATH: %s. "
                "OCR will be skipped for all inputs.",
                e,
            )
        else:
            self._tesseract_binary_available = True
            logger.info("Tesseract OCR is available and ready")

    def execute(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract text from content if applicable.

        Expected input_data:
        {
            "content": bytes,
            "content_type": str,
            "size": int,
            ...other fields from validation...
        }

        Returns:
        {
            "extracted_text": str | None,
            "ocr_engine": str | None,
            "ocr_status": "success" | "skipped" | "failed",
            "ocr_confidence": float | None,  # Future enhancement
            "ocr_error": str,  # only present when ocr_status == "failed"
            ...passes through input_data...
        }

        The input dict is never mutated; a shallow copy is returned.
        """
        # Skip if OCR is globally disabled
        if not settings.OCR_ENABLED:
            logger.debug("OCR is disabled in settings")
            return self._skip_ocr(input_data, "disabled")

        # Check if dependencies are available
        if not TESSERACT_AVAILABLE:
            logger.debug("OCR dependencies not available")
            return self._skip_ocr(input_data, "dependencies_missing")

        # Honour the start-up probe: without the binary every attempt would
        # only fail, so skip instead of reporting a failure per input.
        if not self._tesseract_binary_available:
            logger.debug("Tesseract binary not available")
            return self._skip_ocr(input_data, "dependencies_missing")

        # "or ''" also covers a key that is present but set to None.
        content_type = input_data.get("content_type") or ""
        content = input_data.get("content")

        # Check if content type supports OCR
        if not self._is_ocr_applicable(content_type):
            logger.debug(
                "OCR not applicable for content type: %s", content_type
            )
            return self._skip_ocr(input_data, "not_applicable")

        # Attempt OCR extraction. This is the agent's outer boundary, so any
        # error is converted into a "failed" result rather than propagated.
        try:
            extracted_text = self._extract_text(content, content_type)
        except Exception as e:
            logger.error("OCR processing failed: %s", e)
            return self._skip_ocr(input_data, "failed", error=str(e))

        result = input_data.copy()
        result.update({
            "extracted_text": extracted_text,
            "ocr_engine": "tesseract",
            "ocr_status": "success",
            # Tesseract confidence available but not used in MVP
            "ocr_confidence": None,
        })
        logger.info(
            "OCR successful: extracted %d characters", len(extracted_text)
        )
        return result

    def _is_ocr_applicable(self, content_type: str) -> bool:
        """
        Check if OCR is applicable for this content type.

        Args:
            content_type: MIME type of the content

        Returns:
            True if OCR should be attempted. Non-string values (for example
            None) are never applicable.
        """
        if not isinstance(content_type, str):
            return False
        return content_type.lower() in self.OCR_SUPPORTED_TYPES

    def _extract_text(self, content: bytes, content_type: str) -> str:
        """
        Extract text using Tesseract OCR.

        Args:
            content: Image bytes
            content_type: MIME type (currently unused; the image format is
                detected from the bytes themselves)

        Returns:
            Extracted text string with surrounding whitespace removed
            (may be empty when nothing was recognised)

        Raises:
            OCRDependencyMissingError: If the Tesseract binary is missing
            OCRProcessingError: If there is no content or extraction fails
        """
        if not content:
            raise OCRProcessingError(
                "Text extraction failed: no image content provided"
            )

        try:
            # Convert bytes to PIL Image; the context manager releases the
            # image's resources even when OCR raises.
            with Image.open(io.BytesIO(content)) as image:
                # Perform OCR with configured language
                text = pytesseract.image_to_string(
                    image,
                    lang=settings.OCR_LANGUAGE,
                    config='--psm 3'  # Fully automatic page segmentation
                )
        except pytesseract.TesseractNotFoundError as e:
            raise OCRDependencyMissingError(
                "Tesseract binary not found. Please install Tesseract OCR."
            ) from e
        except Exception as e:
            raise OCRProcessingError(
                f"Text extraction failed: {str(e)}"
            ) from e

        # Clean up extracted text
        text = text.strip()

        if not text:
            logger.warning("OCR completed but no text was extracted")

        return text

    def _skip_ocr(
        self,
        input_data: Dict[str, Any],
        reason: str,
        error: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Return input data with OCR skipped.

        Args:
            input_data: Original input data (not mutated)
            reason: Reason for skipping; only the value "failed" changes the
                outcome, every other value yields status "skipped"
            error: Optional error message, stored under "ocr_error" when
                non-empty

        Returns:
            Copy of the input data with OCR status = skipped/failed and the
            remaining OCR fields set to None
        """
        result = input_data.copy()

        ocr_status = "failed" if reason == "failed" else "skipped"

        result.update({
            "extracted_text": None,
            "ocr_engine": None,
            "ocr_status": ocr_status,
            "ocr_confidence": None,
        })

        if error:
            result["ocr_error"] = error

        return result