|
|
""" |
|
|
Text Extraction Agent (OCR) |
|
|
Deterministic preprocessing agent for extracting text from images and PDFs. |
|
|
Uses Tesseract OCR - a deterministic, non-AI algorithm. |
|
|
""" |
|
|
|
|
|
from typing import Dict, Any, Optional |
|
|
import io |
|
|
import logging |
|
|
|
|
|
from core.agent_base import Agent |
|
|
from core.errors import ( |
|
|
OCRNotApplicableError, |
|
|
OCRProcessingError, |
|
|
OCRDependencyMissingError |
|
|
) |
|
|
from config.settings import settings |
|
|
|
|
|
|
|
|
# The OCR stack is optional: when either pytesseract or Pillow cannot be
# imported, the module still loads and records that fact in
# TESSERACT_AVAILABLE so callers can branch on it instead of crashing.
try:
    import pytesseract
    from PIL import Image
except ImportError:
    TESSERACT_AVAILABLE = False
else:
    TESSERACT_AVAILABLE = True

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class TextExtractionAgent(Agent):
    """
    Extracts text from images using Tesseract OCR.

    This is a deterministic preprocessing step, not AI. Only the MIME types
    listed in ``OCR_SUPPORTED_TYPES`` are sent to Tesseract; any other input
    (PDFs included, since no PDF type is in that set) is passed through with
    ``ocr_status == "skipped"``.
    """

    # MIME types handed to Tesseract. "image/jpg" is a non-standard alias of
    # "image/jpeg" and is accepted alongside it.
    OCR_SUPPORTED_TYPES = {
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/tiff",
        "image/bmp",
        "image/gif",
    }

    # Whether the Tesseract binary is believed to be usable. Optimistic by
    # default so that an instance whose dependency probe never ran still
    # attempts OCR; flipped to False by _check_dependencies() when the
    # binary cannot be found.
    _tesseract_binary_found = True

    def __init__(self):
        super().__init__()
        self._check_dependencies()

    def _check_dependencies(self):
        """
        Probe for the OCR dependencies and log the outcome.

        Never raises. When the Python packages are importable but the
        Tesseract binary is missing, the result is remembered so that
        ``execute`` skips OCR instead of failing on every input.
        """
        if not TESSERACT_AVAILABLE:
            logger.warning(
                "Tesseract dependencies not available. "
                "Install with: pip install pytesseract pillow"
            )
            return

        try:
            pytesseract.get_tesseract_version()
        except pytesseract.TesseractNotFoundError as exc:
            # Remember the failure so the promise made in the log message
            # ("OCR will be skipped") is actually kept by execute().
            self._tesseract_binary_found = False
            logger.warning(
                "Tesseract binary not found in PATH: %s. "
                "OCR will be skipped for all inputs.",
                exc,
            )
        except Exception as exc:
            # The binary may exist even though the version probe failed, so
            # do not disable OCR here; a real failure surfaces per input.
            logger.warning(
                "Could not determine Tesseract version: %s. "
                "OCR will still be attempted.",
                exc,
            )
        else:
            logger.info("Tesseract OCR is available and ready")

    def execute(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract text from content if applicable.

        Expected input_data:
            {
                "content": bytes,
                "content_type": str,
                "size": int,
                ...other fields from validation...
            }

        Returns:
            A shallow copy of ``input_data`` extended with:
            {
                "extracted_text": str | None,
                "ocr_engine": str | None,
                "ocr_status": "success" | "skipped" | "failed",
                "ocr_confidence": float | None,  # Future enhancement
                "ocr_error": str,  # only present when OCR failed
            }

        OCR is skipped when it is disabled in settings, when the OCR
        dependencies (Python packages or the Tesseract binary) are missing,
        or when the content type is not supported. Extraction errors are
        never propagated; they are reported as ``ocr_status == "failed"``.
        """
        if not settings.OCR_ENABLED:
            logger.debug("OCR is disabled in settings")
            return self._skip_ocr(input_data, "disabled")

        if not TESSERACT_AVAILABLE:
            logger.debug("OCR dependencies not available")
            return self._skip_ocr(input_data, "dependencies_missing")

        if not self._tesseract_binary_found:
            logger.debug("Tesseract binary not available")
            return self._skip_ocr(input_data, "dependencies_missing")

        # `or ""` also covers a key that is present but set to None, which
        # dict.get's default argument alone would not.
        content_type = input_data.get("content_type") or ""
        content = input_data.get("content")

        if not self._is_ocr_applicable(content_type):
            logger.debug(
                "OCR not applicable for content type: %s", content_type
            )
            return self._skip_ocr(input_data, "not_applicable")

        # Keep the try body to the single call that is expected to fail.
        try:
            extracted_text = self._extract_text(content, content_type)
        except Exception as exc:
            logger.error("OCR processing failed: %s", exc)
            return self._skip_ocr(input_data, "failed", error=str(exc))

        result = input_data.copy()
        result.update({
            "extracted_text": extracted_text,
            "ocr_engine": "tesseract",
            "ocr_status": "success",
            "ocr_confidence": None,
        })

        logger.info(
            "OCR successful: extracted %d characters", len(extracted_text)
        )
        return result

    def _is_ocr_applicable(self, content_type: str) -> bool:
        """
        Check if OCR is applicable for this content type.

        Matching is case-insensitive and ignores MIME parameters, so
        ``"Image/PNG; charset=binary"`` is treated as ``"image/png"``.

        Args:
            content_type: MIME type of the content

        Returns:
            True if OCR should be attempted; False for unsupported types
            and for values that are not strings.
        """
        if not isinstance(content_type, str):
            return False

        # Drop any ";param=value" suffix before comparing.
        media_type = content_type.split(";", 1)[0].strip().lower()
        return media_type in self.OCR_SUPPORTED_TYPES

    def _extract_text(self, content: bytes, content_type: str) -> str:
        """
        Extract text using Tesseract OCR.

        Args:
            content: Image bytes
            content_type: MIME type (currently unused by the extraction)

        Returns:
            Extracted text with surrounding whitespace stripped; may be an
            empty string when Tesseract recognises nothing.

        Raises:
            OCRDependencyMissingError: If the Tesseract binary is missing
            OCRProcessingError: If content is empty or extraction fails
        """
        if not content:
            # Fail with a clear message instead of PIL's generic
            # "cannot identify image file" for an empty buffer.
            raise OCRProcessingError(
                "Text extraction failed: no content provided"
            )

        try:
            # The context manager releases the image's resources even when
            # Tesseract raises.
            with Image.open(io.BytesIO(content)) as image:
                text = pytesseract.image_to_string(
                    image,
                    lang=settings.OCR_LANGUAGE,
                    config='--psm 3'
                )
        except pytesseract.TesseractNotFoundError as exc:
            raise OCRDependencyMissingError(
                "Tesseract binary not found. Please install Tesseract OCR."
            ) from exc
        except Exception as exc:
            raise OCRProcessingError(
                f"Text extraction failed: {exc}"
            ) from exc

        text = text.strip()
        if not text:
            logger.warning("OCR completed but no text was extracted")

        return text

    def _skip_ocr(
        self,
        input_data: Dict[str, Any],
        reason: str,
        error: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Return input data with OCR skipped.

        Args:
            input_data: Original input data (not mutated; a shallow copy
                is returned)
            reason: Reason for skipping; the value "failed" yields
                ``ocr_status == "failed"``, anything else "skipped"
            error: Optional error message, stored under "ocr_error"

        Returns:
            Input data with OCR status = skipped/failed
        """
        result = input_data.copy()

        ocr_status = "failed" if reason == "failed" else "skipped"

        result.update({
            "extracted_text": None,
            "ocr_engine": None,
            "ocr_status": ocr_status,
            "ocr_confidence": None,
        })

        if error is not None:
            result["ocr_error"] = error

        return result