Spaces:

dipan004
/

Proofly

Sleeping

File size: 6,572 Bytes

2c41dce

"""
Text Extraction Agent (OCR)
Deterministic preprocessing agent for extracting text from images; PDF content types are not currently in the agent's supported set.
Uses Tesseract OCR - a deterministic, non-AI algorithm.
"""

from typing import Dict, Any, Optional
import io
import logging

from core.agent_base import Agent
from core.errors import (
    OCRNotApplicableError,
    OCRProcessingError,
    OCRDependencyMissingError
)
from config.settings import settings

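# Optional imports - attempted once at module load; if they are missing,
# TESSERACT_AVAILABLE is False and OCR is skipped instead of failing the import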
try:
    import pytesseract
    from PIL import Image
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False

logger = logging.getLogger(__name__)


class TextExtractionAgent(Agent):
    """
    Extracts text from raster images using Tesseract OCR.

    This is a deterministic preprocessing step, not AI. Only the MIME types
    listed in ``OCR_SUPPORTED_TYPES`` are processed; every other content type
    (including PDFs, which are not in that set) is passed through unchanged
    with ``ocr_status = "skipped"``.
    """

    # Content types that support OCR
    OCR_SUPPORTED_TYPES = {
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/tiff",
        "image/bmp",
        "image/gif",
    }

    # Result of the start-up probe for the Tesseract binary. The class-level
    # default keeps instances that never ran __init__ on the "attempt OCR"
    # path; _check_dependencies() overwrites it per instance.
    _tesseract_ready = True

    def __init__(self):
        super().__init__()
        self._check_dependencies()

    def _check_dependencies(self):
        """
        Probe for the Python bindings and the Tesseract binary.

        Records the outcome in ``self._tesseract_ready`` so that ``execute``
        can actually skip OCR (as the warning below promises) instead of
        attempting and failing on every input.
        """
        self._tesseract_ready = False

        if not TESSERACT_AVAILABLE:
            logger.warning(
                "Tesseract dependencies not available. "
                "Install with: pip install pytesseract pillow"
            )
            return

        try:
            # Verify Tesseract binary is accessible
            pytesseract.get_tesseract_version()
        except Exception as e:
            logger.warning(
                "Tesseract binary not found in PATH: %s. "
                "OCR will be skipped for all inputs.",
                e,
            )
            return

        self._tesseract_ready = True
        logger.info("Tesseract OCR is available and ready")

    def execute(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract text from content if applicable.

        Expected input_data:
            {
                "content": bytes,
                "content_type": str,
                "size": int,
                ...other fields from validation...
            }

        Returns:
            A shallow copy of input_data extended with:
            {
                "extracted_text": str | None,
                "ocr_engine": str | None,
                "ocr_status": "success" | "skipped" | "failed",
                "ocr_confidence": float | None,  # Future enhancement
                "ocr_error": str,  # only present when an error was recorded
            }
        """
        # Skip if OCR is globally disabled
        if not settings.OCR_ENABLED:
            logger.debug("OCR is disabled in settings")
            return self._skip_ocr(input_data, "disabled")

        # Skip if either the Python packages or the Tesseract binary are
        # missing (the latter is determined once in _check_dependencies)
        if not TESSERACT_AVAILABLE or not self._tesseract_ready:
            logger.debug("OCR dependencies not available")
            return self._skip_ocr(input_data, "dependencies_missing")

        content_type = input_data.get("content_type", "")
        content = input_data.get("content")

        # Check if content type supports OCR
        if not self._is_ocr_applicable(content_type):
            logger.debug("OCR not applicable for content type: %s", content_type)
            return self._skip_ocr(input_data, "not_applicable")

        # Attempt OCR extraction; any failure is reported in the result
        # rather than propagated, so the pipeline can continue without text
        try:
            extracted_text = self._extract_text(content, content_type)
        except Exception as e:
            logger.error("OCR processing failed: %s", e)
            return self._skip_ocr(input_data, "failed", error=str(e))

        result = input_data.copy()
        result.update({
            "extracted_text": extracted_text,
            "ocr_engine": "tesseract",
            "ocr_status": "success",
            "ocr_confidence": None,  # Tesseract confidence available but not used in MVP
        })

        logger.info(
            "OCR successful: extracted %d characters", len(extracted_text)
        )
        return result

    def _is_ocr_applicable(self, content_type: str) -> bool:
        """
        Check if OCR is applicable for this content type.

        The value is normalised before the lookup: MIME parameters (anything
        after ``;``), surrounding whitespace and letter case are ignored.

        Args:
            content_type: MIME type of the content

        Returns:
            True if OCR should be attempted; False for unsupported types and
            for non-string values such as None
        """
        if not isinstance(content_type, str):
            return False

        mime_type = content_type.split(";", 1)[0].strip().lower()
        return mime_type in self.OCR_SUPPORTED_TYPES

    def _extract_text(self, content: bytes, content_type: str) -> str:
        """
        Extract text using Tesseract OCR.

        Args:
            content: Image bytes
            content_type: MIME type (currently unused by the extraction itself)

        Returns:
            Extracted text string with leading/trailing whitespace removed
            (may be empty if Tesseract found no text)

        Raises:
            OCRDependencyMissingError: If the Tesseract binary cannot be found
            OCRProcessingError: If content is missing/empty or extraction fails
        """
        if not content:
            # Fail with an explicit message instead of Pillow's generic
            # "cannot identify image file" for empty input
            raise OCRProcessingError(
                "Text extraction failed: no content provided"
            )

        try:
            # Convert bytes to PIL Image; the context manager releases the
            # image's underlying resources once OCR has finished
            with Image.open(io.BytesIO(content)) as image:
                # Perform OCR with configured language
                text = pytesseract.image_to_string(
                    image,
                    lang=settings.OCR_LANGUAGE,
                    config='--psm 3'  # Fully automatic page segmentation
                )
        except pytesseract.TesseractNotFoundError as e:
            raise OCRDependencyMissingError(
                "Tesseract binary not found. Please install Tesseract OCR."
            ) from e
        except Exception as e:
            raise OCRProcessingError(
                f"Text extraction failed: {str(e)}"
            ) from e

        # Clean up extracted text
        text = text.strip()

        if not text:
            logger.warning("OCR completed but no text was extracted")

        return text

    def _skip_ocr(
        self,
        input_data: Dict[str, Any],
        reason: str,
        error: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Return input data with OCR skipped.

        Args:
            input_data: Original input data (not mutated; a shallow copy is returned)
            reason: Reason for skipping; "failed" maps to ocr_status "failed",
                any other value maps to "skipped"
            error: Optional error message, stored under "ocr_error" when given

        Returns:
            Input data with OCR status = skipped/failed
        """
        result = input_data.copy()

        ocr_status = "failed" if reason == "failed" else "skipped"

        result.update({
            "extracted_text": None,
            "ocr_engine": None,
            "ocr_status": ocr_status,
            "ocr_confidence": None,
        })

        # Compare against None so an empty error message is still recorded
        if error is not None:
            result["ocr_error"] = error

        return result