# agentbee
#
# Sleeping
#
# File size: 10,330 Bytes
"""
Vision Tool - Image analysis using multimodal LLMs
Author: @mangobee
Date: 2026-01-02

Provides image analysis functionality using:
- Gemini 2.0 Flash (default, free tier)
- Claude Sonnet 4.5 (fallback, if configured)

Supports:
- Image file loading and encoding
- Question answering about images
- Object detection/description
- Text extraction (OCR)
- Visual reasoning
"""

import base64
import logging
from pathlib import Path
from typing import Dict, Optional
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)

from src.config.settings import Settings

# ============================================================================
# CONFIG
# ============================================================================
# Retry policy shared by the tenacity decorators on the provider functions below.
# MAX_RETRIES is passed to stop_after_attempt(), so it is the total number of
# attempts (the first call included), not the number of extra retries.
MAX_RETRIES = 3
RETRY_MIN_WAIT = 1  # seconds; lower bound passed to wait_exponential(min=...)
RETRY_MAX_WAIT = 10  # seconds; upper bound passed to wait_exponential(max=...)
# Compared against file size computed as bytes / (1024 * 1024) in load_and_encode_image().
MAX_IMAGE_SIZE_MB = 10  # Maximum image size in MB
# Lowercase suffixes (with leading dot); the loader lowercases the path suffix
# before testing membership, so matching is case-insensitive.
SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}

# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger named after this module; no handlers or levels are set here.
logger = logging.getLogger(__name__)


# ============================================================================
# Image Loading and Encoding
# ============================================================================

def load_and_encode_image(image_path: str) -> Dict[str, str]:
    """
    Load an image file from disk and return it base64-encoded.

    Checks run in this order: the path is a regular file, the extension is
    supported, the size is within the limit. The file contents are only read
    once all three checks pass.

    Args:
        image_path: Path to image file

    Returns:
        Dict with structure: {
            "data": str,          # Base64 encoded image
            "mime_type": str,     # MIME type (e.g., "image/jpeg")
            "size_mb": float,     # File size in MB (bytes / 1024 / 1024)
        }
        NOTE: the return annotation says Dict[str, str], but "size_mb" is a
        float; the annotation is left untouched for compatibility.

    Raises:
        FileNotFoundError: If the path doesn't exist or is not a regular file
        ValueError: If file is not a supported image format or too large
    """
    path = Path(image_path)

    # is_file() instead of exists(): a directory such as "photos.png/" would
    # satisfy exists() and only blow up later, on the read, with an unrelated
    # OSError.
    if not path.is_file():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Check file extension (case-insensitive)
    extension = path.suffix.lower()
    if extension not in SUPPORTED_IMAGE_FORMATS:
        # sorted() keeps the message deterministic; set iteration order is not.
        raise ValueError(
            f"Unsupported image format: {extension}. "
            f"Supported: {', '.join(sorted(SUPPORTED_IMAGE_FORMATS))}"
        )

    # Check file size before reading the contents into memory
    size_mb = path.stat().st_size / (1024 * 1024)
    if size_mb > MAX_IMAGE_SIZE_MB:
        raise ValueError(
            f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB"
        )

    # Read and encode image
    encoded = base64.b64encode(path.read_bytes()).decode('utf-8')

    # Determine MIME type. The fallback is only reachable if this table and
    # SUPPORTED_IMAGE_FORMATS drift apart.
    mime_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    mime_type = mime_types.get(extension, 'image/jpeg')

    logger.info("Image loaded: %s (%.2fMB, %s)", path.name, size_mb, mime_type)

    return {
        "data": encoded,
        "mime_type": mime_type,
        "size_mb": size_mb,
    }


# ============================================================================
# Gemini Vision
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Gemini 2.0 Flash.

    The image is loaded through load_and_encode_image() and sent together
    with the question in a single generate_content() request. The whole body
    runs again on each retry attempt.

    Args:
        image_path: Path to image file
        question: Optional question about the image; an empty or missing
            question is replaced by "Describe this image in detail."

    Returns:
        Dict with structure: {
            "answer": str,       # LLM's analysis/answer (whitespace-stripped)
            "model": "gemini-2.0-flash",
            "image_path": str,
            "question": str      # The question actually sent
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the call times out (triggers retry)
        Exception: Any other failure (missing file, SDK error, response
            without text), re-raised with a "Gemini vision failed:" prefix
            and the original exception attached as __cause__
    """
    try:
        import google.genai as genai
        from google.genai import types

        settings = Settings()
        api_key = settings.google_api_key

        if not api_key:
            raise ValueError("GOOGLE_API_KEY not configured in settings")

        # Load and encode image
        image_data = load_and_encode_image(image_path)

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"Gemini vision analysis: {Path(image_path).name} - '{question}'")

        # Configure Gemini client
        client = genai.Client(api_key=api_key)

        # The google.genai SDK takes inline images as a Part built from raw
        # bytes; a bare {"mime_type", "data"} dict holding a base64 string is
        # the older SDK's convention. Decode the helper's base64 payload back
        # to bytes and let the SDK handle the wire encoding.
        image_part = types.Part.from_bytes(
            data=base64.b64decode(image_data["data"]),
            mime_type=image_data["mime_type"],
        )

        # NOTE(review): the requested model id is the "-exp" variant while the
        # returned "model" label says "gemini-2.0-flash" - confirm which one is
        # intended.
        response = client.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=[question, image_part],
        )

        # response.text is None when the response carries no text part;
        # fail with a clear message instead of an AttributeError on .strip().
        text = response.text
        if text is None:
            raise RuntimeError("Gemini returned no text content")
        answer = text.strip()

        logger.info(f"Gemini vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "gemini-2.0-flash",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Gemini configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        # NOTE(review): only the builtin ConnectionError/TimeoutError reach
        # this branch and the retry predicate; if the SDK signals transport
        # failures with its own exception types they fall through to the
        # generic handler below and are not retried - confirm.
        logger.warning(f"Gemini connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Gemini vision error: {e}")
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"Gemini vision failed: {str(e)}") from e


# ============================================================================
# Claude Vision (Fallback)
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Claude Sonnet 4.5.

    Args:
        image_path: Path to image file
        question: Optional question about the image (default: "Describe this image")

    Returns:
        Dict with structure: {
            "answer": str,       # LLM's analysis/answer
            "model": "claude-sonnet-4.5",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
    """
    try:
        from anthropic import Anthropic

        settings = Settings()
        api_key = settings.anthropic_api_key

        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured in settings")

        # Load and encode image
        image_data = load_and_encode_image(image_path)

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"Claude vision analysis: {Path(image_path).name} - '{question}'")

        # Configure Claude client
        client = Anthropic(api_key=api_key)

        # Create message with image
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_data["mime_type"],
                                "data": image_data["data"],
                            },
                        },
                        {
                            "type": "text",
                            "text": question
                        }
                    ],
                }
            ],
        )

        answer = response.content[0].text.strip()

        logger.info(f"Claude vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "claude-sonnet-4.5",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Claude configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Claude connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Claude vision error: {e}")
        raise Exception(f"Claude vision failed: {str(e)}")


# ============================================================================
# Unified Vision Analysis
# ============================================================================

def analyze_image(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using available multimodal LLM.

    Tries Gemini first (free tier) when a Google API key is set, then falls
    back to Claude when an Anthropic API key is set. A provider without a key
    is skipped, and the error raised names only the providers that were
    actually attempted.

    Args:
        image_path: Path to image file
        question: Optional question about the image

    Returns:
        Dict with analysis results from either Gemini or Claude

    Raises:
        ValueError: If neither GOOGLE_API_KEY nor ANTHROPIC_API_KEY is set
        Exception: If every configured provider failed; the last provider
            error is attached as __cause__
    """
    settings = Settings()

    # Kept outside the except block because Python unbinds the "as" name
    # when the handler exits.
    gemini_error: Optional[Exception] = None

    # Try Gemini first (default, free tier)
    if settings.google_api_key:
        try:
            return analyze_image_gemini(image_path, question)
        except Exception as e:
            logger.warning(f"Gemini failed, trying Claude: {e}")
            gemini_error = e

    # Fallback to Claude
    if settings.anthropic_api_key:
        try:
            return analyze_image_claude(image_path, question)
        except Exception as e:
            if gemini_error is not None:
                logger.error(f"Claude also failed: {e}")
                raise Exception(
                    "Vision analysis failed - Gemini and Claude both failed"
                ) from e
            # Gemini was never attempted, so do not blame it.
            logger.error(f"Claude failed: {e}")
            raise Exception(f"Vision analysis failed - Claude failed: {e}") from e

    # Gemini was configured and failed, but there is no Claude fallback:
    # report the real failure rather than "No vision API configured".
    if gemini_error is not None:
        raise Exception(
            "Vision analysis failed - Gemini failed and Claude is not configured: "
            f"{gemini_error}"
        ) from gemini_error

    # No API keys configured
    raise ValueError(
        "No vision API configured. Please set GOOGLE_API_KEY or ANTHROPIC_API_KEY"
    )