agentbee

Running

App Files Files Community

agentbee / src /tools /vision.py

mangubee

fix: correct author name formatting in multiple files

e7b4937 9 days ago

raw

history blame contribute delete

14.5 kB

	"""
	Vision Tool - Image analysis using multimodal LLMs
	Author: @mangubee
	Date: 2026-01-02

	Provides image analysis functionality using:
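	- HuggingFace Inference API (Gemma-3-27B, recommended)
	- Gemini 2.0 Flash (alternative provider)
	- Claude Sonnet 4.5 (alternative provider)
	The provider is selected via the LLM_PROVIDER environment variable; there is
	no automatic fallback from one provider to another.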

	Supports:
	- Image file loading and encoding
	- Question answering about images
	- Object detection/description
	- Text extraction (OCR)
	- Visual reasoning
	"""

	import base64
	import logging
	import os
	from pathlib import Path
	from typing import Any, Dict, Optional

	from tenacity import (
	    retry,
	    retry_if_exception_type,
	    stop_after_attempt,
	    wait_exponential,
	)

	from src.config.settings import Settings

	# ============================================================================
	# CONFIG
	# ============================================================================
	# Retry policy shared by the provider functions: MAX_RETRIES is passed to
	# tenacity's stop_after_attempt, and the two wait bounds are the min/max
	# arguments of wait_exponential.
	MAX_RETRIES = 3
	RETRY_MIN_WAIT = 1 # seconds
	RETRY_MAX_WAIT = 10 # seconds
	# Files larger than this are rejected with ValueError by load_and_encode_image.
	MAX_IMAGE_SIZE_MB = 10 # Maximum image size in MB
	# Lower-case file extensions (leading dot included) accepted by
	# load_and_encode_image; the check is done against path.suffix.lower().
	SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'}
	# Model id used for HuggingFace chat completion. Read from the environment
	# once, at import time; falls back to the Gemma 3 27B instruct model.
	HF_VISION_MODEL = os.getenv("HF_VISION_MODEL", "google/gemma-3-27b-it:scaleway")
	# Timeout budget, in seconds, for HuggingFace inference requests.
	HF_TIMEOUT = 120 # seconds for large images

	# ============================================================================
	# Logging Setup
	# ============================================================================
	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)


	# ============================================================================
	# Image Loading and Encoding
	# ============================================================================

	def load_and_encode_image(image_path: str) -> Dict[str, Any]:
	    """
	    Load an image file from disk and return it base64-encoded.

	    Validation happens before any bytes are read: the path must exist, must be
	    a regular file, must carry a supported extension, and must not exceed
	    MAX_IMAGE_SIZE_MB.

	    Args:
	        image_path: Path to image file

	    Returns:
	        Dict with structure: {
	            "data": str,       # Base64 encoded image (ASCII text)
	            "mime_type": str,  # MIME type (e.g., "image/jpeg")
	            "size_mb": float,  # File size in MB (bytes / 1024**2)
	        }

	    Raises:
	        FileNotFoundError: If image doesn't exist
	        ValueError: If the path is not a regular file, is not a supported
	            image format, or is too large
	    """
	    path = Path(image_path)

	    if not path.exists():
	        raise FileNotFoundError(f"Image file not found: {image_path}")

	    # A directory named like an image would pass the extension check and only
	    # fail inside open() with an unrelated OSError; reject it up front.
	    if not path.is_file():
	        raise ValueError(f"Not a file: {image_path}")

	    # Check file extension (case-insensitive)
	    extension = path.suffix.lower()
	    if extension not in SUPPORTED_IMAGE_FORMATS:
	        # sorted() keeps the message stable; set iteration order is not.
	        raise ValueError(
	            f"Unsupported image format: {extension}. "
	            f"Supported: {', '.join(sorted(SUPPORTED_IMAGE_FORMATS))}"
	        )

	    # Check file size before reading the content into memory
	    size_mb = path.stat().st_size / (1024 * 1024)
	    if size_mb > MAX_IMAGE_SIZE_MB:
	        raise ValueError(
	            f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB"
	        )

	    # Read and encode image
	    encoded = base64.b64encode(path.read_bytes()).decode('utf-8')

	    # Determine MIME type from the (already validated) extension
	    mime_types = {
	        '.jpg': 'image/jpeg',
	        '.jpeg': 'image/jpeg',
	        '.png': 'image/png',
	        '.gif': 'image/gif',
	        '.webp': 'image/webp',
	        '.bmp': 'image/bmp',
	    }
	    mime_type = mime_types.get(extension, 'image/jpeg')

	    logger.info(f"Image loaded: {path.name} ({size_mb:.2f}MB, {mime_type})")

	    return {
	        "data": encoded,
	        "mime_type": mime_type,
	        "size_mb": size_mb,
	    }


	# ============================================================================
	# Gemini Vision
	# ============================================================================

	@retry(
	    stop=stop_after_attempt(MAX_RETRIES),
	    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
	    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
	    reraise=True,
	)
	def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict:
	    """
	    Analyze image using Gemini 2.0 Flash.

	    Args:
	        image_path: Path to image file
	        question: Optional question about the image
	            (default: "Describe this image in detail.")

	    Returns:
	        Dict with structure: {
	            "answer": str,  # LLM's analysis/answer
	            "model": "gemini-2.0-flash",
	            "image_path": str,
	            "question": str
	        }

	    Raises:
	        ValueError: If API key not configured or image invalid
	        ConnectionError: If API connection fails (triggers retry)
	        TimeoutError: If the request times out (triggers retry)
	        Exception: Any other failure, re-raised with a
	            "Gemini vision failed: ..." message and the original error
	            attached as __cause__
	    """
	    # NOTE(review): only built-in ConnectionError/TimeoutError trigger a retry;
	    # confirm the SDK actually surfaces network failures as those types.
	    try:
	        import google.genai as genai
	        from google.genai import types as genai_types

	        settings = Settings()
	        api_key = settings.google_api_key

	        if not api_key:
	            raise ValueError("GOOGLE_API_KEY not configured in settings")

	        # Load and encode image
	        image_data = load_and_encode_image(image_path)

	        # Default question
	        if not question:
	            question = "Describe this image in detail."

	        logger.info(f"Gemini vision analysis: {Path(image_path).name} - '{question}'")

	        # Configure Gemini client
	        client = genai.Client(api_key=api_key)

	        # Part.from_bytes is the documented way to attach inline image data in
	        # google.genai and expects raw bytes, so the base64 text produced by
	        # load_and_encode_image is decoded back first.
	        image_part = genai_types.Part.from_bytes(
	            data=base64.b64decode(image_data["data"]),
	            mime_type=image_data["mime_type"],
	        )

	        # NOTE(review): the requested model id is the "-exp" variant while the
	        # returned "model" field says "gemini-2.0-flash" -- confirm which one
	        # is intended.
	        response = client.models.generate_content(
	            model='gemini-2.0-flash-exp',
	            contents=[question, image_part],
	        )

	        # Guard against a missing text value so the caller gets a clear
	        # message instead of an AttributeError from .strip().
	        text = response.text
	        if text is None:
	            raise RuntimeError("Gemini returned no text content")
	        answer = text.strip()

	        logger.info(f"Gemini vision successful: {len(answer)} chars")

	        return {
	            "answer": answer,
	            "model": "gemini-2.0-flash",
	            "image_path": image_path,
	            "question": question,
	        }

	    except ValueError as e:
	        logger.error(f"Gemini configuration/input error: {e}")
	        raise
	    except (ConnectionError, TimeoutError) as e:
	        logger.warning(f"Gemini connection error (will retry): {e}")
	        raise
	    except Exception as e:
	        logger.error(f"Gemini vision error: {e}")
	        # Chain the cause so the original traceback is preserved.
	        raise Exception(f"Gemini vision failed: {str(e)}") from e


	# ============================================================================
	# Claude Vision (selected via LLM_PROVIDER=claude)
	# ============================================================================

	@retry(
	    stop=stop_after_attempt(MAX_RETRIES),
	    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
	    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
	    reraise=True,
	)
	def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict:
	    """
	    Analyze image using Claude Sonnet 4.5.

	    Args:
	        image_path: Path to image file
	        question: Optional question about the image
	            (default: "Describe this image in detail.")

	    Returns:
	        Dict with structure: {
	            "answer": str,  # LLM's analysis/answer
	            "model": "claude-sonnet-4.5",
	            "image_path": str,
	            "question": str
	        }

	    Raises:
	        ValueError: If API key not configured, image invalid, or the image
	            type is not one Claude accepts (JPEG, PNG, GIF, WebP)
	        ConnectionError: If API connection fails (triggers retry)
	        TimeoutError: If the request times out (triggers retry)
	        Exception: Any other failure, re-raised with a
	            "Claude vision failed: ..." message and the original error
	            attached as __cause__
	    """
	    # Media types accepted by Anthropic image content blocks. BMP passes the
	    # local loader but would be rejected by the API, so it is refused here.
	    claude_media_types = ("image/jpeg", "image/png", "image/gif", "image/webp")

	    try:
	        from anthropic import Anthropic

	        settings = Settings()
	        api_key = settings.anthropic_api_key

	        if not api_key:
	            raise ValueError("ANTHROPIC_API_KEY not configured in settings")

	        # Load and encode image
	        image_data = load_and_encode_image(image_path)

	        if image_data["mime_type"] not in claude_media_types:
	            raise ValueError(
	                f"Image type {image_data['mime_type']} is not supported by Claude. "
	                f"Supported: {', '.join(claude_media_types)}"
	            )

	        # Default question
	        if not question:
	            question = "Describe this image in detail."

	        logger.info(f"Claude vision analysis: {Path(image_path).name} - '{question}'")

	        # Configure Claude client
	        client = Anthropic(api_key=api_key)

	        # NOTE(review): the requested model id is "claude-sonnet-4-20250514"
	        # while the returned "model" field says "claude-sonnet-4.5" -- confirm
	        # which one is intended.
	        response = client.messages.create(
	            model="claude-sonnet-4-20250514",
	            max_tokens=1024,
	            messages=[
	                {
	                    "role": "user",
	                    "content": [
	                        {
	                            "type": "image",
	                            "source": {
	                                "type": "base64",
	                                "media_type": image_data["mime_type"],
	                                "data": image_data["data"],
	                            },
	                        },
	                        {
	                            "type": "text",
	                            "text": question,
	                        },
	                    ],
	                }
	            ],
	        )

	        # Take the first text block rather than assuming content[0] exists and
	        # is text; an empty or non-text response gets a clear error.
	        text = next(
	            (
	                block.text
	                for block in response.content
	                if getattr(block, "type", None) == "text"
	            ),
	            None,
	        )
	        if text is None:
	            raise RuntimeError("Claude returned no text content")
	        answer = text.strip()

	        logger.info(f"Claude vision successful: {len(answer)} chars")

	        return {
	            "answer": answer,
	            "model": "claude-sonnet-4.5",
	            "image_path": image_path,
	            "question": question,
	        }

	    except ValueError as e:
	        logger.error(f"Claude configuration/input error: {e}")
	        raise
	    except (ConnectionError, TimeoutError) as e:
	        logger.warning(f"Claude connection error (will retry): {e}")
	        raise
	    except Exception as e:
	        logger.error(f"Claude vision error: {e}")
	        # Chain the cause so the original traceback is preserved.
	        raise Exception(f"Claude vision failed: {str(e)}") from e


	# ============================================================================
	# HuggingFace Vision
	# ============================================================================

	@retry(
	    stop=stop_after_attempt(MAX_RETRIES),
	    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
	    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
	    reraise=True,
	)
	def analyze_image_hf(image_path: str, question: Optional[str] = None) -> Dict:
	    """
	    Analyze image using HuggingFace Inference API.

	    The model is taken from HF_VISION_MODEL and each request is bounded by
	    HF_TIMEOUT seconds.

	    Validated models (Phase 0 testing):
	    - google/gemma-3-27b-it:scaleway (recommended, ~6s)
	    - CohereLabs/aya-vision-32b (~7s)
	    - Qwen/Qwen3-VL-30B-A3B-Instruct:novita (~14s)

	    Args:
	        image_path: Path to image file
	        question: Optional question about the image
	            (default: "Describe this image in detail.")

	    Returns:
	        Dict with structure: {
	            "answer": str,
	            "model": str,  # value of HF_VISION_MODEL
	            "image_path": str,
	            "question": str
	        }

	    Raises:
	        ValueError: If HF_TOKEN not configured or image invalid
	        ConnectionError: If API connection fails (triggers retry)
	        TimeoutError: If the request times out (triggers retry)
	        Exception: Any other failure, re-raised with an
	            "HF vision failed: ..." message and the original error attached
	            as __cause__
	    """
	    try:
	        from huggingface_hub import InferenceClient

	        settings = Settings()
	        hf_token = settings.hf_token

	        if not hf_token:
	            raise ValueError("HF_TOKEN not configured in settings")

	        # Load and encode image
	        image_data = load_and_encode_image(image_path)

	        # Default question
	        if not question:
	            question = "Describe this image in detail."

	        logger.info(f"HF vision analysis: {Path(image_path).name} - '{question}'")
	        logger.info(f"Using model: {HF_VISION_MODEL}")

	        # Apply the module's timeout budget; without it HF_TIMEOUT was defined
	        # but never took effect.
	        client = InferenceClient(token=hf_token, timeout=HF_TIMEOUT)

	        # The image travels inline as a data URL built from the base64 payload.
	        data_url = f"data:{image_data['mime_type']};base64,{image_data['data']}"
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": question},
	                    {"type": "image_url", "image_url": {"url": data_url}},
	                ],
	            }
	        ]

	        # Call chat completion
	        response = client.chat_completion(
	            model=HF_VISION_MODEL,
	            messages=messages,
	            max_tokens=1024,
	        )

	        # Guard against a missing content value so the caller gets a clear
	        # message instead of an AttributeError from .strip().
	        text = response.choices[0].message.content
	        if text is None:
	            raise RuntimeError("HF model returned no text content")
	        answer = text.strip()

	        logger.info(f"HF vision successful: {len(answer)} chars")

	        return {
	            "answer": answer,
	            "model": HF_VISION_MODEL,
	            "image_path": image_path,
	            "question": question,
	        }

	    except ValueError as e:
	        logger.error(f"HF configuration/input error: {e}")
	        raise
	    except (ConnectionError, TimeoutError) as e:
	        logger.warning(f"HF connection error (will retry): {e}")
	        raise
	    except Exception as e:
	        logger.error(f"HF vision error: {e}")
	        # Chain the cause so the original traceback is preserved.
	        raise Exception(f"HF vision failed: {str(e)}") from e


	# ============================================================================
	# Unified Vision Analysis
	# ============================================================================

	def analyze_image(image_path: str, question: Optional[str] = None) -> Dict:
	    """
	    Analyze image using provider specified by LLM_PROVIDER environment variable.

	    Respects LLM_PROVIDER setting (case-insensitive, surrounding whitespace
	    ignored, default "gemini"):
	    - "huggingface" -> Uses HF Inference API
	    - "gemini" -> Uses Gemini 2.0 Flash
	    - "claude" -> Uses Claude Sonnet 4.5
	    - "groq" -> Not yet implemented

	    Exactly one provider is tried; there is no fallback to another provider.

	    Args:
	        image_path: Path to image file
	        question: Optional question about the image

	    Returns:
	        Dict with analysis results from selected provider

	    Raises:
	        ValueError: If LLM_PROVIDER is unknown or the selected provider's
	            credential is not configured
	        NotImplementedError: If LLM_PROVIDER is "groq"
	        Exception: If the selected provider fails; the message starts with
	            "<Provider> vision failed" and the original error is attached as
	            __cause__
	    """
	    provider = os.getenv("LLM_PROVIDER", "gemini").strip().lower()

	    logger.info(f"Vision analysis with provider: {provider}")

	    if provider == "groq":
	        raise NotImplementedError("Groq vision not yet implemented (Phase 5)")

	    # provider -> (log/error label, Settings attribute holding the credential,
	    #              env var name, display name, handler)
	    routes = {
	        "huggingface": ("HF", "hf_token", "HF_TOKEN", "HuggingFace", analyze_image_hf),
	        "gemini": ("Gemini", "google_api_key", "GOOGLE_API_KEY", "Gemini", analyze_image_gemini),
	        "claude": ("Claude", "anthropic_api_key", "ANTHROPIC_API_KEY", "Claude", analyze_image_claude),
	    }

	    if provider not in routes:
	        raise ValueError(f"Unknown LLM_PROVIDER: {provider}. Valid: huggingface, gemini, claude, groq")

	    label, key_attr, env_name, display_name, handler = routes[provider]

	    # Settings is only built once a real provider has been selected; the
	    # credential pre-check is applied uniformly to all three providers.
	    if not getattr(Settings(), key_attr):
	        raise ValueError(f"{env_name} not configured for {display_name} provider")

	    # Route to selected provider (each fails independently - NO fallback chains)
	    prefix = f"{label} vision failed"
	    try:
	        return handler(image_path, question)
	    except Exception as e:
	        detail = str(e)
	        # Provider functions already prefix their generic failures; avoid
	        # producing "X vision failed: X vision failed: ...".
	        message = detail if detail.startswith(prefix) else f"{prefix}: {detail}"
	        logger.error(message)
	        raise Exception(message) from e