Proofly / agents /text_extraction_agent.py
Dipan04's picture
Initial clean commit for Hugging Face Space
2c41dce
"""
Text Extraction Agent (OCR)
Deterministic preprocessing agent for extracting text from images (PDF input is currently passed through without OCR).
Uses Tesseract OCR - a deterministic, non-AI algorithm.
"""
from typing import Dict, Any, Optional
import io
import logging
from core.agent_base import Agent
from core.errors import (
OCRNotApplicableError,
OCRProcessingError,
OCRDependencyMissingError
)
from config.settings import settings
# Optional imports - OCR is skipped when pytesseract/Pillow are not installed
try:
import pytesseract
from PIL import Image
TESSERACT_AVAILABLE = True
except ImportError:
TESSERACT_AVAILABLE = False
logger = logging.getLogger(__name__)
class TextExtractionAgent(Agent):
    """
    Extracts text from images using Tesseract OCR.

    This is a deterministic preprocessing step, not AI. Only the MIME types
    listed in OCR_SUPPORTED_TYPES are processed; any other content type
    (PDFs included) is passed through with ocr_status "skipped".
    """

    # Content types that support OCR
    OCR_SUPPORTED_TYPES = {
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/tiff",
        "image/bmp",
        "image/gif",
    }

    def __init__(self):
        super().__init__()
        self._check_dependencies()

    def _check_dependencies(self):
        """
        Log whether the OCR toolchain looks usable.

        Exceptions raised by the Tesseract version probe are logged as
        warnings instead of propagating, so construction of the agent does
        not fail because of them. Nothing is cached: execute() still attempts
        OCR and reports "failed" if the binary is missing.
        """
        if not TESSERACT_AVAILABLE:
            logger.warning(
                "Tesseract dependencies not available. "
                "Install with: pip install pytesseract pillow"
            )
            return

        try:
            # Verify Tesseract binary is accessible
            pytesseract.get_tesseract_version()
        except Exception as e:
            # The message states what execute() actually does in this case:
            # it attempts OCR and reports a failure, it does not skip.
            logger.warning(
                "Tesseract binary not found in PATH: %s. "
                "OCR attempts will fail until Tesseract is installed.",
                e,
            )
        else:
            logger.info("Tesseract OCR is available and ready")

    def execute(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract text from content if applicable.

        Expected input_data:
            {
                "content": bytes,
                "content_type": str,
                "size": int,
                ...other fields from validation...
            }

        Returns:
            A shallow copy of input_data extended with:
            {
                "extracted_text": str | None,
                "ocr_engine": str | None,
                "ocr_status": "success" | "skipped" | "failed",
                "ocr_confidence": float | None,  # Future enhancement
                "ocr_error": str,  # only present when OCR failed
            }
        """
        # Skip if OCR is globally disabled
        if not settings.OCR_ENABLED:
            logger.debug("OCR is disabled in settings")
            return self._skip_ocr(input_data, "disabled")

        # Check if dependencies are available
        if not TESSERACT_AVAILABLE:
            logger.debug("OCR dependencies not available")
            return self._skip_ocr(input_data, "dependencies_missing")

        # `or ""` also covers a key that is present but set to None, which
        # would otherwise crash on .lower() outside the try block below.
        content_type = input_data.get("content_type") or ""
        content = input_data.get("content")

        # Check if content type supports OCR
        if not self._is_ocr_applicable(content_type):
            logger.debug(
                "OCR not applicable for content type: %s", content_type
            )
            return self._skip_ocr(input_data, "not_applicable")

        # Attempt OCR extraction; any failure is reported in the result
        # rather than raised, so the pipeline can continue.
        try:
            extracted_text = self._extract_text(content, content_type)
        except Exception as e:
            logger.error("OCR processing failed: %s", e)
            return self._skip_ocr(input_data, "failed", error=str(e))

        result = input_data.copy()
        result.update({
            "extracted_text": extracted_text,
            "ocr_engine": "tesseract",
            "ocr_status": "success",
            "ocr_confidence": None,  # Tesseract confidence available but not used in MVP
        })
        logger.info(
            "OCR successful: extracted %d characters", len(extracted_text)
        )
        return result

    def _is_ocr_applicable(self, content_type: str) -> bool:
        """
        Check if OCR is applicable for this content type.

        The comparison is case-insensitive and ignores surrounding whitespace
        and MIME parameters (e.g. "image/png; charset=binary").

        Args:
            content_type: MIME type of the content

        Returns:
            True if OCR should be attempted; False for unsupported or
            non-string values
        """
        if not isinstance(content_type, str):
            return False
        # Keep only the "type/subtype" part before any ";parameter" suffix.
        media_type = content_type.split(";", 1)[0].strip().lower()
        return media_type in self.OCR_SUPPORTED_TYPES

    def _extract_text(self, content: bytes, content_type: str) -> str:
        """
        Extract text using Tesseract OCR.

        Args:
            content: Image bytes
            content_type: MIME type (currently unused by the extraction)

        Returns:
            Extracted text string with leading/trailing whitespace removed;
            may be empty if Tesseract found no text

        Raises:
            OCRDependencyMissingError: If the Tesseract binary is not found
            OCRProcessingError: If content is empty or extraction fails
        """
        if not content:
            # Fail with a clear message instead of PIL's generic
            # "cannot identify image file" error.
            raise OCRProcessingError(
                "Text extraction failed: no image content provided"
            )

        try:
            # Convert bytes to PIL Image; the context manager releases the
            # image's resources once OCR has finished.
            with Image.open(io.BytesIO(content)) as image:
                # Perform OCR with configured language
                text = pytesseract.image_to_string(
                    image,
                    lang=settings.OCR_LANGUAGE,
                    config='--psm 3'  # Fully automatic page segmentation
                )
        except pytesseract.TesseractNotFoundError as e:
            raise OCRDependencyMissingError(
                "Tesseract binary not found. Please install Tesseract OCR."
            ) from e
        except Exception as e:
            raise OCRProcessingError(
                f"Text extraction failed: {str(e)}"
            ) from e

        # Clean up extracted text
        text = text.strip()
        if not text:
            logger.warning("OCR completed but no text was extracted")
        return text

    def _skip_ocr(
        self,
        input_data: Dict[str, Any],
        reason: str,
        error: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Return input data with OCR skipped.

        Args:
            input_data: Original input data (not mutated; a shallow copy is
                returned)
            reason: Reason for skipping; "failed" yields ocr_status "failed",
                any other value yields "skipped"
            error: Optional error message, stored under "ocr_error"

        Returns:
            Input data with OCR status = skipped/failed
        """
        result = input_data.copy()
        ocr_status = "failed" if reason == "failed" else "skipped"
        result.update({
            "extracted_text": None,
            "ocr_engine": None,
            "ocr_status": ocr_status,
            "ocr_confidence": None,
        })
        # `is not None` keeps an empty error string instead of dropping it.
        if error is not None:
            result["ocr_error"] = error
        return result