| | """ |
| | Vision Tool - Image analysis using multimodal LLMs |
| | Author: @mangobee |
| | Date: 2026-01-02 |
| | |
| | Provides image analysis functionality using: |
| | - Gemini 2.0 Flash (default, free tier) |
| | - Claude Sonnet 4.5 (fallback, if configured) |
| | |
| | Supports: |
| | - Image file loading and encoding |
| | - Question answering about images |
| | - Object detection/description |
| | - Text extraction (OCR) |
| | - Visual reasoning |
| | """ |
| |
|
| | import base64 |
| | import logging |
| | from pathlib import Path |
| | from typing import Dict, Optional |
| | from tenacity import ( |
| | retry, |
| | stop_after_attempt, |
| | wait_exponential, |
| | retry_if_exception_type, |
| | ) |
| |
|
| | from src.config.settings import Settings |
| |
|
| | |
| | |
| | |
| | MAX_RETRIES = 3 |
| | RETRY_MIN_WAIT = 1 |
| | RETRY_MAX_WAIT = 10 |
| | MAX_IMAGE_SIZE_MB = 10 |
| | SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'} |
| |
|
| | |
| | |
| | |
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
| | |
| | |
| | |
| |
|
def load_and_encode_image(image_path: str) -> Dict[str, str]:
    """
    Read an image from disk and return it as a base64-encoded payload.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        Dict with structure: {
            "data": str,       # Base64 encoded image
            "mime_type": str,  # MIME type (e.g., "image/jpeg")
            "size_mb": float,  # File size in MB
        }

    Raises:
        FileNotFoundError: If the image does not exist.
        ValueError: If the file is not a supported image format or too large.
    """
    # Extension -> MIME type table; the format check below guarantees the
    # lookup key is always one of these (the .get default is a safety net).
    extension_to_mime = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }

    path = Path(image_path)

    if not path.exists():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    extension = path.suffix.lower()
    if extension not in SUPPORTED_IMAGE_FORMATS:
        raise ValueError(
            f"Unsupported image format: {extension}. "
            f"Supported: {', '.join(SUPPORTED_IMAGE_FORMATS)}"
        )

    # Enforce the size cap before pulling the whole file into memory.
    size_mb = path.stat().st_size / (1024 * 1024)
    if size_mb > MAX_IMAGE_SIZE_MB:
        raise ValueError(
            f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB"
        )

    encoded = base64.b64encode(path.read_bytes()).decode('utf-8')
    mime_type = extension_to_mime.get(extension, 'image/jpeg')

    logger.info(f"Image loaded: {path.name} ({size_mb:.2f}MB, {mime_type})")

    return {
        "data": encoded,
        "mime_type": mime_type,
        "size_mb": size_mb,
    }
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Gemini 2.0 Flash.

    Args:
        image_path: Path to image file
        question: Optional question about the image (default: "Describe this image")

    Returns:
        Dict with structure: {
            "answer": str,        # LLM's analysis/answer
            "model": "gemini-2.0-flash",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
    """
    try:
        # Imported lazily so the module can load even when the SDK is absent.
        import google.genai as genai

        settings = Settings()
        api_key = settings.google_api_key

        if not api_key:
            raise ValueError("GOOGLE_API_KEY not configured in settings")

        # Validates existence, format and size; raises before any API call.
        image_data = load_and_encode_image(image_path)

        if not question:
            question = "Describe this image in detail."

        logger.info(f"Gemini vision analysis: {Path(image_path).name} - '{question}'")

        client = genai.Client(api_key=api_key)

        # NOTE(review): the image part is passed as a plain dict carrying a
        # base64 string — confirm this matches the installed google-genai
        # SDK's expected inline-data shape (e.g. types.Part.from_bytes).
        response = client.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=[
                question,
                {
                    "mime_type": image_data["mime_type"],
                    "data": image_data["data"]
                }
            ]
        )

        answer = response.text.strip()

        logger.info(f"Gemini vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "gemini-2.0-flash",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Gemini configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Gemini connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Gemini vision error: {e}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Gemini vision failed: {str(e)}") from e
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Claude Sonnet 4.5.

    Args:
        image_path: Path to image file
        question: Optional question about the image (default: "Describe this image")

    Returns:
        Dict with structure: {
            "answer": str,        # LLM's analysis/answer
            "model": "claude-sonnet-4.5",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
    """
    try:
        # Imported lazily so the module can load even when the SDK is absent.
        from anthropic import Anthropic

        settings = Settings()
        api_key = settings.anthropic_api_key

        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured in settings")

        # Validates existence, format and size; raises before any API call.
        image_data = load_and_encode_image(image_path)

        if not question:
            question = "Describe this image in detail."

        logger.info(f"Claude vision analysis: {Path(image_path).name} - '{question}'")

        client = Anthropic(api_key=api_key)

        # Anthropic Messages API: image block first, then the text question.
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_data["mime_type"],
                                "data": image_data["data"],
                            },
                        },
                        {
                            "type": "text",
                            "text": question
                        }
                    ],
                }
            ],
        )

        answer = response.content[0].text.strip()

        logger.info(f"Claude vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "claude-sonnet-4.5",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Claude configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Claude connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Claude vision error: {e}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Claude vision failed: {str(e)}") from e
| |
|
| |
|
| | |
| | |
| | |
| |
|
def analyze_image(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using available multimodal LLM.

    Tries Gemini first (free tier), falls back to Claude if configured.

    Args:
        image_path: Path to image file
        question: Optional question about the image

    Returns:
        Dict with analysis results from either Gemini or Claude

    Raises:
        ValueError: If neither GOOGLE_API_KEY nor ANTHROPIC_API_KEY is configured
        Exception: If all configured providers fail
    """
    settings = Settings()

    # Remember the Gemini failure so we can surface it accurately if no
    # fallback is available (previously this path raised a misleading
    # "No vision API configured" error).
    gemini_error: Optional[Exception] = None

    if settings.google_api_key:
        try:
            return analyze_image_gemini(image_path, question)
        except Exception as e:
            gemini_error = e
            logger.warning(f"Gemini failed, trying Claude: {e}")

    if settings.anthropic_api_key:
        try:
            return analyze_image_claude(image_path, question)
        except Exception as e:
            logger.error(f"Claude also failed: {e}")
            raise Exception(
                "Vision analysis failed - Gemini and Claude both failed"
            ) from e

    if gemini_error is not None:
        # Gemini was configured and failed, and Claude is not configured.
        raise Exception(
            f"Vision analysis failed - Gemini failed and no Claude fallback "
            f"configured: {gemini_error}"
        ) from gemini_error

    raise ValueError(
        "No vision API configured. Please set GOOGLE_API_KEY or ANTHROPIC_API_KEY"
    )
| |
|