""" Vision Tool - Image analysis using multimodal LLMs Author: @mangubee Date: 2026-01-02 Provides image analysis functionality using: - HuggingFace Inference API (Gemini-3-27B, recommended) - Gemini 2.0 Flash (fallback) - Claude Sonnet 4.5 (fallback) Supports: - Image file loading and encoding - Question answering about images - Object detection/description - Text extraction (OCR) - Visual reasoning """ import os import base64 import logging from pathlib import Path from typing import Dict, Optional from tenacity import ( retry, stop_after_attempt, wait_exponential, retry_if_exception_type, ) from src.config.settings import Settings # ============================================================================ # CONFIG # ============================================================================ MAX_RETRIES = 3 RETRY_MIN_WAIT = 1 # seconds RETRY_MAX_WAIT = 10 # seconds MAX_IMAGE_SIZE_MB = 10 # Maximum image size in MB SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'} HF_VISION_MODEL = os.getenv("HF_VISION_MODEL", "google/gemma-3-27b-it:scaleway") HF_TIMEOUT = 120 # seconds for large images # ============================================================================ # Logging Setup # ============================================================================ logger = logging.getLogger(__name__) # ============================================================================ # Image Loading and Encoding # ============================================================================ def load_and_encode_image(image_path: str) -> Dict[str, str]: """ Load image file and encode as base64. Args: image_path: Path to image file Returns: Dict with structure: { "data": str, # Base64 encoded image "mime_type": str, # MIME type (e.g., "image/jpeg") "size_mb": float, # File size in MB } Raises: FileNotFoundError: If image doesn't exist ValueError: If file is not a supported image format or too large """ path = Path(image_path) if not path.exists(): raise FileNotFoundError(f"Image file not found: {image_path}") # Check file extension extension = path.suffix.lower() if extension not in SUPPORTED_IMAGE_FORMATS: raise ValueError( f"Unsupported image format: {extension}. " f"Supported: {', '.join(SUPPORTED_IMAGE_FORMATS)}" ) # Check file size size_bytes = path.stat().st_size size_mb = size_bytes / (1024 * 1024) if size_mb > MAX_IMAGE_SIZE_MB: raise ValueError( f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB" ) # Read and encode image with open(path, 'rb') as f: image_data = f.read() encoded = base64.b64encode(image_data).decode('utf-8') # Determine MIME type mime_types = { '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png', '.gif': 'image/gif', '.webp': 'image/webp', '.bmp': 'image/bmp', } mime_type = mime_types.get(extension, 'image/jpeg') logger.info(f"Image loaded: {path.name} ({size_mb:.2f}MB, {mime_type})") return { "data": encoded, "mime_type": mime_type, "size_mb": size_mb, } # ============================================================================ # Gemini Vision # ============================================================================ @retry( stop=stop_after_attempt(MAX_RETRIES), wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT), retry=retry_if_exception_type((ConnectionError, TimeoutError)), reraise=True, ) def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict: """ Analyze image using Gemini 2.0 Flash. Args: image_path: Path to image file question: Optional question about the image (default: "Describe this image") Returns: Dict with structure: { "answer": str, # LLM's analysis/answer "model": "gemini-2.0-flash", "image_path": str, "question": str } Raises: ValueError: If API key not configured or image invalid ConnectionError: If API connection fails (triggers retry) """ try: import google.genai as genai settings = Settings() api_key = settings.google_api_key if not api_key: raise ValueError("GOOGLE_API_KEY not configured in settings") # Load and encode image image_data = load_and_encode_image(image_path) # Default question if not question: question = "Describe this image in detail." logger.info(f"Gemini vision analysis: {Path(image_path).name} - '{question}'") # Configure Gemini client client = genai.Client(api_key=api_key) # Create content with image and text response = client.models.generate_content( model='gemini-2.0-flash-exp', contents=[ question, { "mime_type": image_data["mime_type"], "data": image_data["data"] } ] ) answer = response.text.strip() logger.info(f"Gemini vision successful: {len(answer)} chars") return { "answer": answer, "model": "gemini-2.0-flash", "image_path": image_path, "question": question, } except ValueError as e: logger.error(f"Gemini configuration/input error: {e}") raise except (ConnectionError, TimeoutError) as e: logger.warning(f"Gemini connection error (will retry): {e}") raise except Exception as e: logger.error(f"Gemini vision error: {e}") raise Exception(f"Gemini vision failed: {str(e)}") # ============================================================================ # Claude Vision (Fallback) # ============================================================================ @retry( stop=stop_after_attempt(MAX_RETRIES), wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT), retry=retry_if_exception_type((ConnectionError, TimeoutError)), reraise=True, ) def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict: """ Analyze image using Claude Sonnet 4.5. Args: image_path: Path to image file question: Optional question about the image (default: "Describe this image") Returns: Dict with structure: { "answer": str, # LLM's analysis/answer "model": "claude-sonnet-4.5", "image_path": str, "question": str } Raises: ValueError: If API key not configured or image invalid ConnectionError: If API connection fails (triggers retry) """ try: from anthropic import Anthropic settings = Settings() api_key = settings.anthropic_api_key if not api_key: raise ValueError("ANTHROPIC_API_KEY not configured in settings") # Load and encode image image_data = load_and_encode_image(image_path) # Default question if not question: question = "Describe this image in detail." logger.info(f"Claude vision analysis: {Path(image_path).name} - '{question}'") # Configure Claude client client = Anthropic(api_key=api_key) # Create message with image response = client.messages.create( model="claude-sonnet-4-20250514", max_tokens=1024, messages=[ { "role": "user", "content": [ { "type": "image", "source": { "type": "base64", "media_type": image_data["mime_type"], "data": image_data["data"], }, }, { "type": "text", "text": question } ], } ], ) answer = response.content[0].text.strip() logger.info(f"Claude vision successful: {len(answer)} chars") return { "answer": answer, "model": "claude-sonnet-4.5", "image_path": image_path, "question": question, } except ValueError as e: logger.error(f"Claude configuration/input error: {e}") raise except (ConnectionError, TimeoutError) as e: logger.warning(f"Claude connection error (will retry): {e}") raise except Exception as e: logger.error(f"Claude vision error: {e}") raise Exception(f"Claude vision failed: {str(e)}") # ============================================================================ # HuggingFace Vision # ============================================================================ @retry( stop=stop_after_attempt(MAX_RETRIES), wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT), retry=retry_if_exception_type((ConnectionError, TimeoutError)), reraise=True, ) def analyze_image_hf(image_path: str, question: Optional[str] = None) -> Dict: """ Analyze image using HuggingFace Inference API. Validated models (Phase 0 testing): - google/gemma-3-27b-it:scaleway (recommended, ~6s) - CohereLabs/aya-vision-32b (~7s) - Qwen/Qwen3-VL-30B-A3B-Instruct:novita (~14s) Args: image_path: Path to image file question: Optional question about the image (default: "Describe this image") Returns: Dict with structure: { "answer": str, "model": str, "image_path": str, "question": str } Raises: ValueError: If HF_TOKEN not configured or image invalid ConnectionError: If API connection fails (triggers retry) """ try: from huggingface_hub import InferenceClient settings = Settings() hf_token = settings.hf_token if not hf_token: raise ValueError("HF_TOKEN not configured in settings") # Load and encode image image_data = load_and_encode_image(image_path) # Default question if not question: question = "Describe this image in detail." logger.info(f"HF vision analysis: {Path(image_path).name} - '{question}'") logger.info(f"Using model: {HF_VISION_MODEL}") # Configure HF client client = InferenceClient(token=hf_token) # Create messages with base64 image messages = [ { "role": "user", "content": [ {"type": "text", "text": question}, { "type": "image_url", "image_url": { "url": f"data:{image_data['mime_type']};base64,{image_data['data']}" } } ] } ] # Call chat completion response = client.chat_completion( model=HF_VISION_MODEL, messages=messages, max_tokens=1024, ) answer = response.choices[0].message.content.strip() logger.info(f"HF vision successful: {len(answer)} chars") return { "answer": answer, "model": HF_VISION_MODEL, "image_path": image_path, "question": question, } except ValueError as e: logger.error(f"HF configuration/input error: {e}") raise except (ConnectionError, TimeoutError) as e: logger.warning(f"HF connection error (will retry): {e}") raise except Exception as e: logger.error(f"HF vision error: {e}") raise Exception(f"HF vision failed: {str(e)}") # ============================================================================ # Unified Vision Analysis # ============================================================================ def analyze_image(image_path: str, question: Optional[str] = None) -> Dict: """ Analyze image using provider specified by LLM_PROVIDER environment variable. Respects LLM_PROVIDER setting: - "huggingface" -> Uses HF Inference API - "gemini" -> Uses Gemini 2.0 Flash - "claude" -> Uses Claude Sonnet 4.5 - "groq" -> Not yet implemented Args: image_path: Path to image file question: Optional question about the image Returns: Dict with analysis results from selected provider Raises: Exception: If selected provider fails or is not configured """ provider = os.getenv("LLM_PROVIDER", "gemini").lower() settings = Settings() logger.info(f"Vision analysis with provider: {provider}") # Route to selected provider (each fails independently - NO fallback chains) if provider == "huggingface": try: return analyze_image_hf(image_path, question) except Exception as e: logger.error(f"HF vision failed: {e}") raise Exception(f"HF vision failed: {str(e)}") elif provider == "gemini": if not settings.google_api_key: raise ValueError("GOOGLE_API_KEY not configured for Gemini provider") try: return analyze_image_gemini(image_path, question) except Exception as e: logger.error(f"Gemini vision failed: {e}") raise Exception(f"Gemini vision failed: {str(e)}") elif provider == "claude": if not settings.anthropic_api_key: raise ValueError("ANTHROPIC_API_KEY not configured for Claude provider") try: return analyze_image_claude(image_path, question) except Exception as e: logger.error(f"Claude vision failed: {e}") raise Exception(f"Claude vision failed: {str(e)}") elif provider == "groq": raise NotImplementedError("Groq vision not yet implemented (Phase 5)") else: raise ValueError(f"Unknown LLM_PROVIDER: {provider}. Valid: huggingface, gemini, claude, groq")