# Source: agentbee/src/tools/vision.py
# Last commit e7b4937 by mangubee:
#   "fix: correct author name formatting in multiple files"
"""
Vision Tool - Image analysis using multimodal LLMs
Author: @mangubee
Date: 2026-01-02
Provides image analysis functionality using:
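- HuggingFace Inference API (Gemma-3-27B, recommended; LLM_PROVIDER=huggingface)
- Gemini 2.0 Flash (LLM_PROVIDER=gemini, the default)
- Claude Sonnet 4.5 (LLM_PROVIDER=claude)
Providers are selected explicitly; there is no automatic fallback between them.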
Supports:
- Image file loading and encoding
- Question answering about images
- Object detection/description
- Text extraction (OCR)
- Visual reasoning
"""
import os
import base64
import logging
from pathlib import Path
from typing import Dict, Optional
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
)
from src.config.settings import Settings
# ============================================================================
# CONFIG
# ============================================================================
# Retry policy shared by the tenacity decorators on the provider functions.
MAX_RETRIES = 3 # total attempts (passed to stop_after_attempt)
RETRY_MIN_WAIT = 1 # seconds; lower bound for the exponential backoff
RETRY_MAX_WAIT = 10 # seconds; upper bound for the exponential backoff
# Limits enforced by load_and_encode_image() before any API call is made.
MAX_IMAGE_SIZE_MB = 10 # Maximum image size in MB (on-disk size, before base64)
# Compared against the lower-cased file suffix, so entries include the dot.
SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
# Model id for the HuggingFace provider; overridable through the
# HF_VISION_MODEL environment variable (read once, at import time).
HF_VISION_MODEL = os.getenv("HF_VISION_MODEL", "google/gemma-3-27b-it:scaleway")
HF_TIMEOUT = 120 # seconds; intended HF request timeout, sized for large images
# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# ============================================================================
# Image Loading and Encoding
# ============================================================================
def load_and_encode_image(image_path: str) -> Dict:
    """
    Load an image file, validate it, and encode it as base64.

    Validation happens before the file is read: the path must exist, its
    (case-insensitive) extension must be in SUPPORTED_IMAGE_FORMATS, and its
    on-disk size must not exceed MAX_IMAGE_SIZE_MB.

    Args:
        image_path: Path to image file

    Returns:
        Dict with structure: {
            "data": str,       # Base64 encoded image
            "mime_type": str,  # MIME type (e.g., "image/jpeg")
            "size_mb": float,  # File size in MB
        }

    Raises:
        FileNotFoundError: If image doesn't exist
        ValueError: If file is not a supported image format or too large
    """
    path = Path(image_path)
    if not path.exists():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Check file extension
    extension = path.suffix.lower()
    if extension not in SUPPORTED_IMAGE_FORMATS:
        # sorted() keeps the message stable; iterating the set directly
        # yields a different order from run to run.
        supported = ', '.join(sorted(SUPPORTED_IMAGE_FORMATS))
        raise ValueError(
            f"Unsupported image format: {extension}. "
            f"Supported: {supported}"
        )

    # Check file size
    size_mb = path.stat().st_size / (1024 * 1024)
    if size_mb > MAX_IMAGE_SIZE_MB:
        raise ValueError(
            f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB"
        )

    # Read and encode image
    with open(path, 'rb') as f:
        encoded = base64.b64encode(f.read()).decode('utf-8')

    # Determine MIME type from the extension (not from file contents)
    mime_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }
    # The default only applies if SUPPORTED_IMAGE_FORMATS gains an extension
    # that is missing from the table above.
    mime_type = mime_types.get(extension, 'image/jpeg')

    logger.info(f"Image loaded: {path.name} ({size_mb:.2f}MB, {mime_type})")
    return {
        "data": encoded,
        "mime_type": mime_type,
        "size_mb": size_mb,
    }
# ============================================================================
# Gemini Vision
# ============================================================================
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Gemini 2.0 Flash.

    Args:
        image_path: Path to image file
        question: Optional question about the image
            (default: "Describe this image in detail.")

    Returns:
        Dict with structure: {
            "answer": str,  # LLM's analysis/answer
            "model": "gemini-2.0-flash",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the call times out (triggers retry)
        Exception: Any other failure (including a missing image file or a
            response without text), re-raised as "Gemini vision failed: ..."
            with the original error chained as __cause__
    """
    # NOTE(review): only the built-in ConnectionError/TimeoutError trigger a
    # retry; SDK-specific network errors may not derive from them - confirm.
    try:
        import google.genai as genai
        from google.genai import types

        settings = Settings()
        api_key = settings.google_api_key
        if not api_key:
            raise ValueError("GOOGLE_API_KEY not configured in settings")

        # Load and encode image
        image_data = load_and_encode_image(image_path)

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"Gemini vision analysis: {Path(image_path).name} - '{question}'")

        # Configure Gemini client
        client = genai.Client(api_key=api_key)

        # The google-genai SDK takes inline media as a Part built from the
        # raw bytes, so the base64 payload is decoded before being handed over.
        image_part = types.Part.from_bytes(
            data=base64.b64decode(image_data["data"]),
            mime_type=image_data["mime_type"],
        )

        # NOTE(review): the request targets 'gemini-2.0-flash-exp' while the
        # result reports "gemini-2.0-flash" - confirm which id is intended.
        response = client.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=[question, image_part],
        )

        # response.text can be None when no text part came back; fail with a
        # clear message instead of an AttributeError on .strip().
        text = response.text
        if text is None:
            raise RuntimeError("Gemini returned no text content")
        answer = text.strip()

        logger.info(f"Gemini vision successful: {len(answer)} chars")
        return {
            "answer": answer,
            "model": "gemini-2.0-flash",
            "image_path": image_path,
            "question": question,
        }
    except ValueError as e:
        logger.error(f"Gemini configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Gemini connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Gemini vision error: {e}")
        raise Exception(f"Gemini vision failed: {str(e)}") from e
# ============================================================================
# Claude Vision (Fallback)
# ============================================================================
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Claude Sonnet 4.5.

    Args:
        image_path: Path to image file
        question: Optional question about the image
            (default: "Describe this image in detail.")

    Returns:
        Dict with structure: {
            "answer": str,  # LLM's analysis/answer
            "model": "claude-sonnet-4.5",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the call times out (triggers retry)
        Exception: Any other failure (including a missing image file or a
            response without text), re-raised as "Claude vision failed: ..."
            with the original error chained as __cause__
    """
    # NOTE(review): only the built-in ConnectionError/TimeoutError trigger a
    # retry; SDK-specific network errors may not derive from them - confirm.
    try:
        from anthropic import Anthropic

        settings = Settings()
        api_key = settings.anthropic_api_key
        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured in settings")

        # Load and encode image
        # NOTE(review): SUPPORTED_IMAGE_FORMATS includes .bmp and a 10MB cap;
        # confirm both are accepted by the Messages API.
        image_data = load_and_encode_image(image_path)

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"Claude vision analysis: {Path(image_path).name} - '{question}'")

        # Configure Claude client
        client = Anthropic(api_key=api_key)

        # Create message with image. The model id is the Sonnet 4.5 snapshot so
        # the request matches the "claude-sonnet-4.5" label reported below
        # (the previous id, claude-sonnet-4-20250514, is Sonnet 4).
        response = client.messages.create(
            model="claude-sonnet-4-5-20250929",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_data["mime_type"],
                                "data": image_data["data"],
                            },
                        },
                        {
                            "type": "text",
                            "text": question,
                        },
                    ],
                }
            ],
        )

        # Collect the text blocks rather than assuming content[0] exists and
        # is a text block.
        text_parts = [
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ]
        if not text_parts:
            raise RuntimeError("Claude returned no text content")
        answer = "".join(text_parts).strip()

        logger.info(f"Claude vision successful: {len(answer)} chars")
        return {
            "answer": answer,
            "model": "claude-sonnet-4.5",
            "image_path": image_path,
            "question": question,
        }
    except ValueError as e:
        logger.error(f"Claude configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Claude connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Claude vision error: {e}")
        raise Exception(f"Claude vision failed: {str(e)}") from e
# ============================================================================
# HuggingFace Vision
# ============================================================================
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_hf(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using HuggingFace Inference API.

    The model is taken from the module-level HF_VISION_MODEL setting.

    Validated models (Phase 0 testing):
    - google/gemma-3-27b-it:scaleway (recommended, ~6s)
    - CohereLabs/aya-vision-32b (~7s)
    - Qwen/Qwen3-VL-30B-A3B-Instruct:novita (~14s)

    Args:
        image_path: Path to image file
        question: Optional question about the image
            (default: "Describe this image in detail.")

    Returns:
        Dict with structure: {
            "answer": str,
            "model": str,
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If HF_TOKEN not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the call times out (triggers retry)
        Exception: Any other failure (including a missing image file or a
            response without text), re-raised as "HF vision failed: ..."
            with the original error chained as __cause__
    """
    try:
        from huggingface_hub import InferenceClient

        settings = Settings()
        hf_token = settings.hf_token
        if not hf_token:
            raise ValueError("HF_TOKEN not configured in settings")

        # Load and encode image
        image_data = load_and_encode_image(image_path)

        # Default question
        if not question:
            question = "Describe this image in detail."

        logger.info(f"HF vision analysis: {Path(image_path).name} - '{question}'")
        logger.info(f"Using model: {HF_VISION_MODEL}")

        # Configure HF client; HF_TIMEOUT bounds each request so a stalled
        # call on a large image cannot hang indefinitely.
        client = InferenceClient(token=hf_token, timeout=HF_TIMEOUT)

        # The image travels inline as a base64 data URL
        data_url = f"data:{image_data['mime_type']};base64,{image_data['data']}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ]

        # Call chat completion
        response = client.chat_completion(
            model=HF_VISION_MODEL,
            messages=messages,
            max_tokens=1024,
        )

        # message.content may be None; fail with a clear message instead of
        # an AttributeError on .strip().
        content = response.choices[0].message.content
        if content is None:
            raise RuntimeError("HF model returned no text content")
        answer = content.strip()

        logger.info(f"HF vision successful: {len(answer)} chars")
        return {
            "answer": answer,
            "model": HF_VISION_MODEL,
            "image_path": image_path,
            "question": question,
        }
    except ValueError as e:
        logger.error(f"HF configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"HF connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"HF vision error: {e}")
        raise Exception(f"HF vision failed: {str(e)}") from e
# ============================================================================
# Unified Vision Analysis
# ============================================================================
def _run_vision_provider(label, analyzer, image_path, question):
    """Call one provider function and normalise its failure message.

    Any error is re-raised as a plain Exception prefixed with
    "<label> vision failed: ". Errors that already carry that prefix (the
    provider functions add it themselves) are re-raised untouched so the
    prefix is not duplicated.
    """
    prefix = f"{label} vision failed: "
    try:
        return analyzer(image_path, question)
    except Exception as e:
        message = str(e)
        if message.startswith(prefix):
            logger.error(message)
            raise
        logger.error(f"{prefix}{message}")
        raise Exception(f"{prefix}{message}") from e


def analyze_image(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using provider specified by LLM_PROVIDER environment variable.

    Respects LLM_PROVIDER setting (case-insensitive, default "gemini"):
    - "huggingface" -> Uses HF Inference API
    - "gemini" -> Uses Gemini 2.0 Flash
    - "claude" -> Uses Claude Sonnet 4.5
    - "groq" -> Not yet implemented

    Args:
        image_path: Path to image file
        question: Optional question about the image

    Returns:
        Dict with analysis results from selected provider

    Raises:
        ValueError: If LLM_PROVIDER is unknown, or the Gemini/Claude API key
            is not configured
        NotImplementedError: If LLM_PROVIDER is "groq"
        Exception: If the selected provider fails; the message is prefixed
            with "<provider> vision failed: "
    """
    # strip() tolerates stray whitespace around the configured value
    provider = os.getenv("LLM_PROVIDER", "gemini").strip().lower()
    settings = Settings()
    logger.info(f"Vision analysis with provider: {provider}")

    # Route to selected provider (each fails independently - NO fallback chains)
    if provider == "huggingface":
        return _run_vision_provider("HF", analyze_image_hf, image_path, question)

    if provider == "gemini":
        if not settings.google_api_key:
            raise ValueError("GOOGLE_API_KEY not configured for Gemini provider")
        return _run_vision_provider("Gemini", analyze_image_gemini, image_path, question)

    if provider == "claude":
        if not settings.anthropic_api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured for Claude provider")
        return _run_vision_provider("Claude", analyze_image_claude, image_path, question)

    if provider == "groq":
        raise NotImplementedError("Groq vision not yet implemented (Phase 5)")

    raise ValueError(f"Unknown LLM_PROVIDER: {provider}. Valid: huggingface, gemini, claude, groq")