| | """ |
| | Vision Tool - Image analysis using multimodal LLMs |
| | Author: @mangobee |
| | Date: 2026-01-02 |
| | |
| | Provides image analysis functionality using: |
| | - Gemini 2.0 Flash (default, free tier) |
| | - Claude Sonnet 4.5 (fallback, if configured) |
| | |
| | Supports: |
| | - Image file loading and encoding |
| | - Question answering about images |
| | - Object detection/description |
| | - Text extraction (OCR) |
| | - Visual reasoning |
| | """ |
| |
|
| | import base64 |
| | import logging |
| | from pathlib import Path |
| | from typing import Dict, Optional |
| | from tenacity import ( |
| | retry, |
| | stop_after_attempt, |
| | wait_exponential, |
| | retry_if_exception_type, |
| | ) |
| |
|
| | from src.config.settings import Settings |
| |
|
| | |
| | |
| | |
| | MAX_RETRIES = 3 |
| | RETRY_MIN_WAIT = 1 |
| | RETRY_MAX_WAIT = 10 |
| | MAX_IMAGE_SIZE_MB = 10 |
| | SUPPORTED_IMAGE_FORMATS = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp'} |
| |
|
| | |
| | |
| | |
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
| | |
| | |
| | |
| |
|
def load_and_encode_image(image_path: str) -> Dict[str, str]:
    """
    Read an image from disk and return it as a base64-encoded payload.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        Dict with structure: {
            "data": str,       # Base64 encoded image
            "mime_type": str,  # MIME type (e.g., "image/jpeg")
            "size_mb": float,  # File size in MB
        }

    Raises:
        FileNotFoundError: If the image does not exist.
        ValueError: If the file is not a supported image format or too large.
    """
    # Extension -> MIME type table; the format check below guarantees the
    # lookup key is always one of these (the .get default is a safety net).
    extension_to_mime = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }

    path = Path(image_path)

    if not path.exists():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    extension = path.suffix.lower()
    if extension not in SUPPORTED_IMAGE_FORMATS:
        raise ValueError(
            f"Unsupported image format: {extension}. "
            f"Supported: {', '.join(SUPPORTED_IMAGE_FORMATS)}"
        )

    # Enforce the size cap before pulling the whole file into memory.
    size_mb = path.stat().st_size / (1024 * 1024)
    if size_mb > MAX_IMAGE_SIZE_MB:
        raise ValueError(
            f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB"
        )

    encoded = base64.b64encode(path.read_bytes()).decode('utf-8')
    mime_type = extension_to_mime.get(extension, 'image/jpeg')

    logger.info(f"Image loaded: {path.name} ({size_mb:.2f}MB, {mime_type})")

    return {
        "data": encoded,
        "mime_type": mime_type,
        "size_mb": size_mb,
    }
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Gemini 2.0 Flash.

    Args:
        image_path: Path to image file
        question: Optional question about the image (default: "Describe this image")

    Returns:
        Dict with structure: {
            "answer": str,        # LLM's analysis/answer
            "model": "gemini-2.0-flash",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
    """
    try:
        # Imported lazily so the module can load even when the SDK is absent.
        import google.genai as genai

        settings = Settings()
        api_key = settings.google_api_key

        if not api_key:
            raise ValueError("GOOGLE_API_KEY not configured in settings")

        # Validates existence, format and size; raises before any API call.
        image_data = load_and_encode_image(image_path)

        if not question:
            question = "Describe this image in detail."

        logger.info(f"Gemini vision analysis: {Path(image_path).name} - '{question}'")

        client = genai.Client(api_key=api_key)

        # NOTE(review): the image part is passed as a plain dict carrying a
        # base64 string — confirm this matches the installed google-genai
        # SDK's expected inline-data shape (e.g. types.Part.from_bytes).
        response = client.models.generate_content(
            model='gemini-2.0-flash-exp',
            contents=[
                question,
                {
                    "mime_type": image_data["mime_type"],
                    "data": image_data["data"]
                }
            ]
        )

        answer = response.text.strip()

        logger.info(f"Gemini vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "gemini-2.0-flash",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Gemini configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Gemini connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Gemini vision error: {e}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Gemini vision failed: {str(e)}") from e
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Claude Sonnet 4.5.

    Args:
        image_path: Path to image file
        question: Optional question about the image (default: "Describe this image")

    Returns:
        Dict with structure: {
            "answer": str,        # LLM's analysis/answer
            "model": "claude-sonnet-4.5",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
    """
    try:
        # Imported lazily so the module can load even when the SDK is absent.
        from anthropic import Anthropic

        settings = Settings()
        api_key = settings.anthropic_api_key

        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured in settings")

        # Validates existence, format and size; raises before any API call.
        image_data = load_and_encode_image(image_path)

        if not question:
            question = "Describe this image in detail."

        logger.info(f"Claude vision analysis: {Path(image_path).name} - '{question}'")

        client = Anthropic(api_key=api_key)

        # Anthropic Messages API: image block first, then the text question.
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_data["mime_type"],
                                "data": image_data["data"],
                            },
                        },
                        {
                            "type": "text",
                            "text": question
                        }
                    ],
                }
            ],
        )

        answer = response.content[0].text.strip()

        logger.info(f"Claude vision successful: {len(answer)} chars")

        return {
            "answer": answer,
            "model": "claude-sonnet-4.5",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error(f"Claude configuration/input error: {e}")
        raise
    except (ConnectionError, TimeoutError) as e:
        logger.warning(f"Claude connection error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Claude vision error: {e}")
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Claude vision failed: {str(e)}") from e
| |
|
| |
|
| | |
| | |
| | |
| |
|
def analyze_image(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using available multimodal LLM.

    Tries Gemini first (free tier), falls back to Claude if configured.

    Args:
        image_path: Path to image file
        question: Optional question about the image

    Returns:
        Dict with analysis results from either Gemini or Claude

    Raises:
        ValueError: If neither GOOGLE_API_KEY nor ANTHROPIC_API_KEY is configured
        Exception: If all configured providers fail
    """
    settings = Settings()

    # Remember the Gemini failure so we can surface it accurately if no
    # fallback is available (previously this path raised a misleading
    # "No vision API configured" error).
    gemini_error: Optional[Exception] = None

    if settings.google_api_key:
        try:
            return analyze_image_gemini(image_path, question)
        except Exception as e:
            gemini_error = e
            logger.warning(f"Gemini failed, trying Claude: {e}")

    if settings.anthropic_api_key:
        try:
            return analyze_image_claude(image_path, question)
        except Exception as e:
            logger.error(f"Claude also failed: {e}")
            raise Exception(
                "Vision analysis failed - Gemini and Claude both failed"
            ) from e

    if gemini_error is not None:
        # Gemini was configured and failed, and Claude is not configured.
        raise Exception(
            f"Vision analysis failed - Gemini failed and no Claude fallback "
            f"configured: {gemini_error}"
        ) from gemini_error

    raise ValueError(
        "No vision API configured. Please set GOOGLE_API_KEY or ANTHROPIC_API_KEY"
    )
| |
|