Spaces:

DataQuests
/

DeepCritical

Running

File size: 8,249 Bytes

"""Image-to-text service using Gradio Client API (Multimodal-OCR3)."""

import asyncio
import tempfile
from functools import lru_cache
from pathlib import Path
from typing import Any

import numpy as np
import structlog
from gradio_client import Client, handle_file
from PIL import Image

from src.utils.config import settings
from src.utils.exceptions import ConfigurationError

logger = structlog.get_logger(__name__)


class ImageOCRService:
    """Image OCR service using prithivMLmods/Multimodal-OCR3 Gradio Space.

    The Gradio client is created lazily on first use. Blocking client calls
    are executed in a worker thread so the public methods can be awaited
    without stalling the event loop.
    """

    # Dict keys probed, in order, when an API result item is a mapping.
    _TEXT_KEYS = ("text", "content")

    def __init__(self, api_url: str | None = None, hf_token: str | None = None) -> None:
        """Initialize Image OCR service.

        Args:
            api_url: Gradio Space URL (default: settings.ocr_api_url, falling
                back to the public Multimodal-OCR3 Space URL)
            hf_token: HuggingFace token for authenticated Spaces (default: None)

        Raises:
            ConfigurationError: If API URL not configured
        """
        # Defensively access ocr_api_url - may not exist in older config versions
        default_url = (
            getattr(settings, "ocr_api_url", None)
            or "https://prithivmlmods-multimodal-ocr3.hf.space"
        )
        self.api_url = api_url or default_url
        if not self.api_url:
            raise ConfigurationError("OCR API URL not configured")
        self.hf_token = hf_token
        self.client: Client | None = None

    async def _get_client(self, hf_token: str | None = None) -> Client:
        """Get or create Gradio Client (lazy initialization).

        The cached client is reused as long as the effective token matches the
        token stored on the instance. When a different token is supplied a new
        client is built and that token becomes the instance token.

        Args:
            hf_token: HuggingFace token for authenticated Spaces (overrides instance token)

        Returns:
            Gradio Client instance
        """
        # Use provided token or instance token
        token = hf_token or self.hf_token

        if self.client is not None and token == self.hf_token:
            return self.client

        # Gradio Client uses 'token' parameter, not 'hf_token'. Only pass it
        # when a token is actually set, so anonymous use calls Client(url).
        client_kwargs: dict[str, Any] = {"token": token} if token else {}

        # Client construction performs blocking I/O; keep it off the event loop.
        client = await asyncio.to_thread(Client, self.api_url, **client_kwargs)

        # Publish the new client and remember which token it was built with.
        self.client = client
        self.hf_token = token
        return client

    async def extract_text(
        self,
        image_path: str,
        model: str | None = None,
        hf_token: str | None = None,
    ) -> str:
        """Extract text from image using Gradio API.

        Args:
            image_path: Path to image file
            model: Optional model selection. NOTE: currently only logged; it
                is not forwarded to the API call, so the API default is used.
            hf_token: HuggingFace token for authenticated Spaces (overrides
                the instance token)

        Returns:
            Extracted text string

        Raises:
            ConfigurationError: If OCR extraction fails
        """
        client = await self._get_client(hf_token=hf_token)

        logger.info(
            "extracting_text_from_image",
            image_path=image_path,
            model=model,
        )

        def _predict() -> Any:
            # handle_file wraps the path so the client uploads the file
            # before calling the /Multimodal_OCR3_generate_image endpoint.
            return client.predict(
                image_path=handle_file(image_path),
                api_name="/Multimodal_OCR3_generate_image",
            )

        try:
            # predict() blocks on network I/O, so run it in a worker thread.
            result = await asyncio.to_thread(_predict)
            extracted_text = self._extract_text_from_result(result)
        except Exception as e:
            logger.error("image_ocr_failed", error=str(e), error_type=type(e).__name__)
            raise ConfigurationError(f"Image OCR failed: {e}") from e

        logger.info(
            "image_ocr_complete",
            text_length=len(extracted_text),
        )
        return extracted_text

    async def extract_text_from_image(
        self,
        image_data: np.ndarray | Image.Image | str,
        hf_token: str | None = None,
    ) -> str:
        """Extract text from image data (numpy array, PIL Image, or file path).

        In-memory images are written to a temporary PNG file which is removed
        once extraction finishes (successfully or not). A caller-supplied file
        path is never deleted.

        Args:
            image_data: Image as numpy array, PIL Image, or file path string
            hf_token: HuggingFace token for authenticated Spaces (overrides
                the instance token)

        Returns:
            Extracted text string

        Raises:
            ValueError: If image_data is not a supported type
            ConfigurationError: If saving the image or OCR extraction fails
        """
        # A string is treated as a path owned by the caller: no temp file,
        # nothing to clean up.
        if isinstance(image_data, str):
            return await self.extract_text(image_data, hf_token=hf_token)

        if isinstance(image_data, Image.Image):
            pil_image = image_data
        elif isinstance(image_data, np.ndarray):
            pil_image = Image.fromarray(image_data)
        else:
            raise ValueError(f"Unsupported image data type: {type(image_data)}")

        # From here on the temp file belongs to us. Cleanup is decided by
        # control flow rather than by comparing the path with image_data:
        # `str != ndarray` is an elementwise comparison whose truth value
        # raises ValueError for multi-element arrays.
        temp_path = self._save_image_temp(pil_image)
        try:
            return await self.extract_text(temp_path, hf_token=hf_token)
        finally:
            try:
                Path(temp_path).unlink(missing_ok=True)
            except OSError as e:
                # Best effort: a leftover temp file must not mask the result.
                logger.warning("failed_to_cleanup_temp_file", path=temp_path, error=str(e))

    def _extract_text_from_result(self, api_result: Any) -> str:
        """Extract text from API result.

        Handles, in order: a plain string; a dict carrying a "text" or
        "content" field; a tuple or list whose first string or text-bearing
        dict item is used. Anything else falls back to its string
        representation.

        Args:
            api_result: Result from Gradio API

        Returns:
            Extracted text string (stripped), or "" if nothing usable was found
        """
        if isinstance(api_result, str):
            return api_result.strip()

        # Normalise a lone dict to a one-item sequence so it shares the
        # item-scanning logic below.
        if isinstance(api_result, dict):
            candidates: Any = (api_result,)
        elif isinstance(api_result, (tuple, list)):
            candidates = api_result
        else:
            candidates = ()

        for item in candidates:
            if isinstance(item, str):
                return item.strip()
            if isinstance(item, dict):
                for key in self._TEXT_KEYS:
                    if key in item:
                        return str(item[key]).strip()

        # Last resort for unrecognised shapes: use the string representation.
        if api_result is not None:
            text = str(api_result).strip()
            if text and text != "None":
                return text

        logger.warning("could_not_extract_text_from_result", result_type=type(api_result).__name__)
        return ""

    def _save_image_temp(self, image: Image.Image) -> str:
        """Save PIL Image to temporary file.

        The file is created with delete=False, so the caller is responsible
        for removing it. If saving fails the file is removed here before the
        error is raised.

        Args:
            image: PIL Image object

        Returns:
            Path to temporary image file

        Raises:
            ConfigurationError: If the image cannot be written
        """
        # Reserve a unique path and close the handle immediately so the image
        # writer can reopen the file by name.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
            temp_path = temp_file.name

        try:
            image.save(temp_path, "PNG")
        except Exception as e:
            logger.error("failed_to_save_image_temp", error=str(e))
            # Do not leave an empty or partial file behind on failure.
            try:
                Path(temp_path).unlink(missing_ok=True)
            except OSError as cleanup_error:
                logger.warning(
                    "failed_to_cleanup_temp_file",
                    path=temp_path,
                    error=str(cleanup_error),
                )
            raise ConfigurationError(f"Failed to save image to temp file: {e}") from e

        logger.debug("saved_image_temp", path=temp_path, size=image.size)
        return temp_path


@lru_cache(maxsize=1)
def get_image_ocr_service() -> ImageOCRService:
    """Return the shared Image OCR service, building it on first call.

    The function takes no arguments and is wrapped in a size-1 LRU cache, so
    the instance constructed on the first call is handed back on later calls.

    Returns:
        ImageOCRService instance
    """
    service = ImageOCRService()
    return service