File size: 6,572 Bytes
2c41dce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
Text Extraction Agent (OCR)
Deterministic preprocessing agent for extracting text from images (PDF content types are not in the agent's supported set).
Uses Tesseract OCR - a deterministic, non-AI algorithm.
"""

from typing import Dict, Any, Optional
import io
import logging

from core.agent_base import Agent
from core.errors import (
    OCRNotApplicableError,
    OCRProcessingError,
    OCRDependencyMissingError
)
from config.settings import settings

# Optional imports - loaded eagerly; OCR is skipped when pytesseract/Pillow are not installed
try:
    import pytesseract
    from PIL import Image
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False

logger = logging.getLogger(__name__)


class TextExtractionAgent(Agent):
    """
    Extracts text from images using Tesseract OCR.

    This is a deterministic preprocessing step, not AI. Only the MIME types
    listed in ``OCR_SUPPORTED_TYPES`` are processed; any other content type
    (PDF included, since it is not in that set) is passed through with
    ``ocr_status`` set to ``"skipped"``.
    """

    # Content types that support OCR
    OCR_SUPPORTED_TYPES = {
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/tiff",
        "image/bmp",
        "image/gif",
    }

    def __init__(self) -> None:
        super().__init__()
        self._check_dependencies()

    def _check_dependencies(self) -> None:
        """
        Log whether the Tesseract Python bindings and binary are usable.

        This is purely informational: nothing is raised and no state is
        stored. ``execute`` re-detects a missing dependency on each call.
        """
        if not TESSERACT_AVAILABLE:
            logger.warning(
                "Tesseract dependencies not available. "
                "Install with: pip install pytesseract pillow"
            )
            return

        try:
            # Verify Tesseract binary is accessible
            pytesseract.get_tesseract_version()
            logger.info("Tesseract OCR is available and ready")
        except Exception as e:
            logger.warning(
                "Tesseract binary not found in PATH: %s. "
                "OCR will be skipped for all inputs.",
                e,
            )

    def execute(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract text from content if applicable.

        Expected input_data:
            {
                "content": bytes,
                "content_type": str,
                "size": int,
                ...other fields from validation...
            }

        Returns:
            {
                "extracted_text": str | None,
                "ocr_engine": str | None,
                "ocr_status": "success" | "skipped" | "failed",
                "ocr_confidence": float | None,  # Future enhancement
                "ocr_error": str,  # only present when an error was recorded
                ...passes through input_data...
            }

        OCR never raises out of this method: a missing dependency yields
        ``"skipped"`` and any other extraction error yields ``"failed"``.
        """
        # Skip if OCR is globally disabled
        if not settings.OCR_ENABLED:
            logger.debug("OCR is disabled in settings")
            return self._skip_ocr(input_data, "disabled")

        # Check if dependencies are available
        if not TESSERACT_AVAILABLE:
            logger.debug("OCR dependencies not available")
            return self._skip_ocr(input_data, "dependencies_missing")

        # No default here: a missing key and an explicit None are both
        # handled by _is_ocr_applicable instead of crashing on .lower().
        content_type = input_data.get("content_type")
        content = input_data.get("content")

        # Check if content type supports OCR
        if not self._is_ocr_applicable(content_type):
            logger.debug(
                "OCR not applicable for content type: %s", content_type
            )
            return self._skip_ocr(input_data, "not_applicable")

        # Attempt OCR extraction
        try:
            extracted_text = self._extract_text(content, content_type)

            result = input_data.copy()
            result.update({
                "extracted_text": extracted_text,
                "ocr_engine": "tesseract",
                "ocr_status": "success",
                "ocr_confidence": None,  # Tesseract confidence available but not used in MVP
            })

            logger.info(
                "OCR successful: extracted %d characters",
                len(extracted_text),
            )
            return result

        except OCRDependencyMissingError as e:
            # A missing Tesseract binary is an environment problem, not a
            # problem with this input: report it as skipped, matching the
            # startup warning and the missing-Python-package path above.
            logger.warning("OCR skipped, Tesseract is unavailable: %s", e)
            return self._skip_ocr(
                input_data, "dependencies_missing", error=str(e)
            )
        except Exception as e:
            # exc_info keeps the traceback (and chained cause) in the log.
            logger.error("OCR processing failed: %s", e, exc_info=True)
            return self._skip_ocr(input_data, "failed", error=str(e))

    def _is_ocr_applicable(self, content_type: Optional[str]) -> bool:
        """
        Check if OCR is applicable for this content type.

        The comparison ignores case, surrounding whitespace and any MIME
        parameters (e.g. ``"image/png; charset=binary"``).

        Args:
            content_type: MIME type of the content; ``None`` or a
                non-string value is treated as not applicable

        Returns:
            True if OCR should be attempted
        """
        if not isinstance(content_type, str):
            return False

        # Keep only the "type/subtype" part, dropping ";param=value" suffixes
        mime_type = content_type.split(";", 1)[0].strip().lower()
        return mime_type in self.OCR_SUPPORTED_TYPES

    def _extract_text(self, content: bytes, content_type: str) -> str:
        """
        Extract text using Tesseract OCR.

        Args:
            content: Image bytes
            content_type: MIME type (currently unused by the extraction)

        Returns:
            Extracted text string, stripped of leading/trailing whitespace;
            may be empty when nothing was recognised

        Raises:
            OCRDependencyMissingError: If pytesseract/Pillow are not
                installed or the Tesseract binary cannot be found
            OCRProcessingError: If extraction fails for any other reason
        """
        # Guard direct callers: without this, the names below would raise
        # NameError when the optional imports are absent.
        if not TESSERACT_AVAILABLE:
            raise OCRDependencyMissingError(
                "Tesseract dependencies not available. "
                "Install with: pip install pytesseract pillow"
            )

        try:
            # Convert bytes to PIL Image; the context manager releases the
            # image's resources once OCR is done.
            with Image.open(io.BytesIO(content)) as image:
                # Perform OCR with configured language
                text = pytesseract.image_to_string(
                    image,
                    lang=settings.OCR_LANGUAGE,
                    config='--psm 3'  # Fully automatic page segmentation
                )
        except pytesseract.TesseractNotFoundError as e:
            raise OCRDependencyMissingError(
                "Tesseract binary not found. Please install Tesseract OCR."
            ) from e
        except Exception as e:
            raise OCRProcessingError(
                f"Text extraction failed: {str(e)}"
            ) from e

        # Clean up extracted text
        text = text.strip()

        if not text:
            logger.warning("OCR completed but no text was extracted")

        return text

    def _skip_ocr(
        self,
        input_data: Dict[str, Any],
        reason: str,
        error: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Return a copy of the input data marked as not OCR-processed.

        Args:
            input_data: Original input data (not mutated; a shallow copy is
                returned)
            reason: Reason for skipping; ``"failed"`` produces
                ``ocr_status == "failed"``, anything else ``"skipped"``
            error: Optional error message, stored under ``"ocr_error"`` when
                non-empty

        Returns:
            Input data with OCR status = skipped/failed
        """
        result = input_data.copy()

        ocr_status = "failed" if reason == "failed" else "skipped"

        result.update({
            "extracted_text": None,
            "ocr_engine": None,
            "ocr_status": ocr_status,
            "ocr_confidence": None,
        })

        if error:
            result["ocr_error"] = error

        return result