agentbee

Running

File size: 14,524 Bytes

"""
Vision Tool - Image analysis using multimodal LLMs
Author: @mangubee
Date: 2026-01-02

Provides image analysis functionality using:
- HuggingFace Inference API (Gemma-3-27B, recommended)
- Gemini 2.0 Flash (fallback)
- Claude Sonnet 4.5 (fallback)

Supports:
- Image file loading and encoding
- Question answering about images
- Object detection/description
- Text extraction (OCR)
- Visual reasoning
"""

import os
import base64
import logging
from pathlib import Path
from typing import Dict, Optional
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)

from src.config.settings import Settings

# ============================================================================
# CONFIG
# ============================================================================
# Retry policy consumed by the tenacity @retry decorators on the provider
# functions below (stop_after_attempt / wait_exponential).
MAX_RETRIES = 3
RETRY_MIN_WAIT = 1  # seconds
RETRY_MAX_WAIT = 10  # seconds
# Limits checked by load_and_encode_image before the file is read.
MAX_IMAGE_SIZE_MB = 10  # Maximum image size in MB
# Lower-cased file extensions (including the leading dot) accepted as images.
SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
# Model id used for HuggingFace requests; the HF_VISION_MODEL environment
# variable overrides the default. Read once, at import time.
HF_VISION_MODEL = os.getenv("HF_VISION_MODEL", "google/gemma-3-27b-it:scaleway")
HF_TIMEOUT = 120  # seconds for large images

# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger named after this module; no handlers or levels are
# configured in this file.
logger = logging.getLogger(__name__)


# ============================================================================
# Image Loading and Encoding
# ============================================================================

def load_and_encode_image(image_path: str) -> Dict[str, str]:
    """
    Load an image file from disk and return it base64-encoded.

    Checks run in this order, and the first failure raises:
    1. the path must point to an existing regular file,
    2. its extension (case-insensitive) must be in SUPPORTED_IMAGE_FORMATS,
    3. its size must not exceed MAX_IMAGE_SIZE_MB.

    The MIME type is derived from the file extension only; the file contents
    are not inspected.

    Args:
        image_path: Path to image file

    Returns:
        Dict with structure: {
            "data": str,          # Base64 encoded image
            "mime_type": str,     # MIME type (e.g., "image/jpeg")
            "size_mb": float,     # File size in MB (a float, despite the
                                  # Dict[str, str] return annotation)
        }

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular file
        ValueError: If file is not a supported image format or too large
    """
    path = Path(image_path)

    # is_file() instead of exists(): a directory would otherwise pass the
    # checks below and only fail later inside the read with an unrelated error.
    if not path.is_file():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Check file extension
    extension = path.suffix.lower()
    if extension not in SUPPORTED_IMAGE_FORMATS:
        # sorted() keeps the message deterministic; set order is arbitrary.
        raise ValueError(
            f"Unsupported image format: {extension}. "
            f"Supported: {', '.join(sorted(SUPPORTED_IMAGE_FORMATS))}"
        )

    # Check file size before reading the whole file into memory
    size_mb = path.stat().st_size / (1024 * 1024)
    if size_mb > MAX_IMAGE_SIZE_MB:
        raise ValueError(
            f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB"
        )

    # Read and encode image
    encoded = base64.b64encode(path.read_bytes()).decode('utf-8')

    # Determine MIME type from the extension
    mime_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    mime_type = mime_types.get(extension, 'image/jpeg')

    logger.info(f"Image loaded: {path.name} ({size_mb:.2f}MB, {mime_type})")

    return {
        "data": encoded,
        "mime_type": mime_type,
        "size_mb": size_mb,
    }


# ============================================================================
# Gemini Vision
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Gemini 2.0 Flash.

    The call is retried (up to MAX_RETRIES attempts, exponential backoff) only
    when a ConnectionError or TimeoutError escapes; every other failure is
    raised immediately.

    Args:
        image_path: Path to image file
        question: Optional question about the image; a falsy value is replaced
            by "Describe this image in detail."

    Returns:
        Dict with structure: {
            "answer": str,       # LLM's analysis/answer
            "model": "gemini-2.0-flash",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the request times out (triggers retry)
        Exception: Any other failure, wrapped as "Gemini vision failed: ..."
            with the original exception chained as __cause__
    """
    try:
        import google.genai as genai
        from google.genai import types

        settings = Settings()
        api_key = settings.google_api_key

        if not api_key:
            raise ValueError("GOOGLE_API_KEY not configured in settings")

        # Load and encode image
        image_data = load_and_encode_image(image_path)

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"Gemini vision analysis: {Path(image_path).name} - '{question}'")

        # Configure Gemini client
        client = genai.Client(api_key=api_key)

        # The google.genai SDK takes inline images as Part objects built from
        # raw bytes, so the base64 payload from the loader is decoded again.
        image_part = types.Part.from_bytes(
            data=base64.b64decode(image_data["data"]),
            mime_type=image_data["mime_type"],
        )

        # NOTE(review): the request uses 'gemini-2.0-flash-exp' while the
        # result reports "gemini-2.0-flash" - confirm which id is intended.
        response = client.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=[question, image_part],
        )

        # response.text can be None when the response carries no text parts;
        # fail with an explicit message instead of an AttributeError.
        text = response.text
        if text is None:
            raise RuntimeError("Gemini returned no text in the response")
        answer = text.strip()

        logger.info(f"Gemini vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "gemini-2.0-flash",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Gemini configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Gemini connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Gemini vision error: {e}")
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"Gemini vision failed: {str(e)}") from e


# ============================================================================
# Claude Vision (Fallback)
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Claude Sonnet 4.5.

    The call is retried (up to MAX_RETRIES attempts, exponential backoff) only
    when a ConnectionError or TimeoutError escapes; every other failure is
    raised immediately.

    Args:
        image_path: Path to image file
        question: Optional question about the image; a falsy value is replaced
            by "Describe this image in detail."

    Returns:
        Dict with structure: {
            "answer": str,       # LLM's analysis/answer
            "model": "claude-sonnet-4.5",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured, image invalid, or the image's
            media type is not one Claude accepts (JPEG, PNG, GIF, WebP)
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the request times out (triggers retry)
        Exception: Any other failure, wrapped as "Claude vision failed: ..."
            with the original exception chained as __cause__
    """
    # Media types accepted for base64 image sources by the Anthropic API.
    claude_media_types = {"image/jpeg", "image/png", "image/gif", "image/webp"}

    try:
        from anthropic import Anthropic

        settings = Settings()
        api_key = settings.anthropic_api_key

        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured in settings")

        # Load and encode image
        image_data = load_and_encode_image(image_path)

        # The loader also accepts formats (e.g. BMP) that Claude rejects;
        # fail fast with a clear message instead of an opaque API error.
        media_type = image_data["mime_type"]
        if media_type not in claude_media_types:
            raise ValueError(
                f"Image type {media_type} is not supported by Claude. "
                f"Supported: {', '.join(sorted(claude_media_types))}"
            )

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"Claude vision analysis: {Path(image_path).name} - '{question}'")

        # Configure Claude client
        client = Anthropic(api_key=api_key)

        # NOTE(review): the request uses "claude-sonnet-4-20250514" while the
        # result reports "claude-sonnet-4.5" - confirm which model is intended.
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": image_data["data"],
                            },
                        },
                        {
                            "type": "text",
                            "text": question
                        }
                    ],
                }
            ],
        )

        # Use the first text block; guard against an empty content list or a
        # leading non-text block rather than indexing blindly.
        text_blocks = [
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ]
        if not text_blocks:
            raise RuntimeError("Claude returned no text content")
        answer = text_blocks[0].strip()

        logger.info(f"Claude vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "claude-sonnet-4.5",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Claude configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Claude connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Claude vision error: {e}")
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"Claude vision failed: {str(e)}") from e


# ============================================================================
# HuggingFace Vision
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_hf(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using HuggingFace Inference API.

    The model is taken from HF_VISION_MODEL and the request timeout from
    HF_TIMEOUT. The call is retried (up to MAX_RETRIES attempts, exponential
    backoff) only when a ConnectionError or TimeoutError escapes.

    Validated models (Phase 0 testing):
    - google/gemma-3-27b-it:scaleway (recommended, ~6s)
    - CohereLabs/aya-vision-32b (~7s)
    - Qwen/Qwen3-VL-30B-A3B-Instruct:novita (~14s)

    Args:
        image_path: Path to image file
        question: Optional question about the image; a falsy value is replaced
            by "Describe this image in detail."

    Returns:
        Dict with structure: {
            "answer": str,
            "model": str,        # value of HF_VISION_MODEL
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If HF_TOKEN not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the request times out (triggers retry)
        Exception: Any other failure, wrapped as "HF vision failed: ..."
            with the original exception chained as __cause__
    """
    try:
        from huggingface_hub import InferenceClient

        settings = Settings()
        hf_token = settings.hf_token

        if not hf_token:
            raise ValueError("HF_TOKEN not configured in settings")

        # Load and encode image
        image_data = load_and_encode_image(image_path)

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"HF vision analysis: {Path(image_path).name} - '{question}'")
        logger.info(f"Using model: {HF_VISION_MODEL}")

        # Configure HF client; HF_TIMEOUT bounds each request so a stalled
        # call cannot hang indefinitely.
        client = InferenceClient(token=hf_token, timeout=HF_TIMEOUT)

        # The image travels inline as a base64 data URL next to the question.
        data_url = f"data:{image_data['mime_type']};base64,{image_data['data']}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ]
            }
        ]

        # Call chat completion
        response = client.chat_completion(
            model=HF_VISION_MODEL,
            messages=messages,
            max_tokens=1024,
        )

        # message.content may be None; fail with an explicit message instead
        # of an AttributeError on .strip().
        content = response.choices[0].message.content
        if content is None:
            raise RuntimeError("HF model returned no text content")
        answer = content.strip()

        logger.info(f"HF vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": HF_VISION_MODEL,
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"HF configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"HF connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"HF vision error: {e}")
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"HF vision failed: {str(e)}") from e


# ============================================================================
# Unified Vision Analysis
# ============================================================================

def analyze_image(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using provider specified by LLM_PROVIDER environment variable.

    Respects LLM_PROVIDER setting (case-insensitive, default "gemini"):
    - "huggingface" -> Uses HF Inference API
    - "gemini" -> Uses Gemini 2.0 Flash
    - "claude" -> Uses Claude Sonnet 4.5
    - "groq" -> Not yet implemented

    Exactly one provider is tried; there is no fallback to another provider.

    Args:
        image_path: Path to image file
        question: Optional question about the image

    Returns:
        Dict with analysis results from selected provider

    Raises:
        ValueError: If LLM_PROVIDER is unknown, or the API key for the
            Gemini/Claude provider is not configured
        NotImplementedError: If LLM_PROVIDER is "groq"
        Exception: If the selected provider fails; the message is
            "<Provider> vision failed: ..." and the original exception is
            chained as __cause__
    """
    provider = os.getenv("LLM_PROVIDER", "gemini").lower()

    logger.info(f"Vision analysis with provider: {provider}")

    if provider == "groq":
        raise NotImplementedError("Groq vision not yet implemented (Phase 5)")

    # provider -> (label used in messages, analyzer function,
    #              Settings attribute holding the API key to pre-check,
    #              name of that key for the error message).
    # HuggingFace has no pre-check here; analyze_image_hf validates its token.
    routes = {
        "huggingface": ("HF", analyze_image_hf, None, None),
        "gemini": ("Gemini", analyze_image_gemini, "google_api_key", "GOOGLE_API_KEY"),
        "claude": ("Claude", analyze_image_claude, "anthropic_api_key", "ANTHROPIC_API_KEY"),
    }

    route = routes.get(provider)
    if route is None:
        raise ValueError(f"Unknown LLM_PROVIDER: {provider}. Valid: huggingface, gemini, claude, groq")

    label, analyzer, key_attr, key_name = route

    # Settings are only loaded for providers whose key is checked up front.
    if key_attr is not None and not getattr(Settings(), key_attr):
        raise ValueError(f"{key_name} not configured for {label} provider")

    prefix = f"{label} vision failed: "
    try:
        return analyzer(image_path, question)
    except Exception as e:
        detail = str(e)
        # The provider functions already wrap unexpected errors as a plain
        # Exception carrying this same prefix; do not prepend it twice.
        already_prefixed = type(e) is Exception and detail.startswith(prefix)
        message = detail if already_prefixed else f"{prefix}{detail}"
        logger.error(message)
        if already_prefixed:
            raise
        raise Exception(message) from e