|
|
""" |
|
|
Vision Tool - Image analysis using multimodal LLMs |
|
|
Author: @mangubee |
|
|
Date: 2026-01-02 |
|
|
|
|
|
Provides image analysis functionality using: |
|
|
- HuggingFace Inference API (Gemma-3-27B, recommended) |
|
|
- Gemini 2.0 Flash (fallback) |
|
|
- Claude Sonnet 4.5 (fallback) |
|
|
|
|
|
Supports: |
|
|
- Image file loading and encoding |
|
|
- Question answering about images |
|
|
- Object detection/description |
|
|
- Text extraction (OCR) |
|
|
- Visual reasoning |
|
|
""" |
|
|
|
|
|
import base64
import logging
import os
from pathlib import Path
from typing import Any, Dict, Optional

from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from src.config.settings import Settings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Retry policy shared by every provider call ---
MAX_RETRIES = 3       # attempts before giving up
RETRY_MIN_WAIT = 1    # lower bound handed to wait_exponential
RETRY_MAX_WAIT = 10   # upper bound handed to wait_exponential

# --- Image validation limits ---
MAX_IMAGE_SIZE_MB = 10
SUPPORTED_IMAGE_FORMATS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".bmp",
}

# --- HuggingFace provider settings ---
# The model can be overridden through the HF_VISION_MODEL environment variable.
HF_VISION_MODEL = os.environ.get(
    "HF_VISION_MODEL",
    "google/gemma-3-27b-it:scaleway",
)
HF_TIMEOUT = 120

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_and_encode_image(image_path: str) -> Dict[str, Any]:
    """
    Load an image file from disk and encode it as base64.

    Args:
        image_path: Path to image file

    Returns:
        Dict with structure: {
            "data": str,       # Base64 encoded image
            "mime_type": str,  # MIME type (e.g., "image/jpeg")
            "size_mb": float,  # File size in MB
        }

    Raises:
        FileNotFoundError: If the path does not exist or is not a regular file
        ValueError: If file is not a supported image format or too large
    """
    path = Path(image_path)

    # is_file() instead of exists(): a directory (even one named "x.png")
    # must be rejected here rather than failing later when it is read.
    if not path.is_file():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    extension = path.suffix.lower()
    if extension not in SUPPORTED_IMAGE_FORMATS:
        # sorted() keeps the message stable; set iteration order is arbitrary.
        raise ValueError(
            f"Unsupported image format: {extension}. "
            f"Supported: {', '.join(sorted(SUPPORTED_IMAGE_FORMATS))}"
        )

    # Check the size from file metadata before reading the content.
    size_mb = path.stat().st_size / (1024 * 1024)
    if size_mb > MAX_IMAGE_SIZE_MB:
        raise ValueError(
            f"Image too large: {size_mb:.2f}MB. Maximum: {MAX_IMAGE_SIZE_MB}MB"
        )

    encoded = base64.b64encode(path.read_bytes()).decode("utf-8")

    mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }
    # Fallback only matters if SUPPORTED_IMAGE_FORMATS gains an extension
    # that is not listed in the table above.
    mime_type = mime_types.get(extension, "image/jpeg")

    logger.info("Image loaded: %s (%.2fMB, %s)", path.name, size_mb, mime_type)

    return {
        "data": encoded,
        "mime_type": mime_type,
        "size_mb": size_mb,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_gemini(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Gemini 2.0 Flash.

    Args:
        image_path: Path to image file
        question: Optional question about the image
            (default: "Describe this image in detail.")

    Returns:
        Dict with structure: {
            "answer": str,      # LLM's analysis/answer
            "model": "gemini-2.0-flash",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the call times out (triggers retry)
        Exception: Any other failure, re-raised as
            "Gemini vision failed: ..." with the original error chained
    """
    try:
        # Imported lazily so the module loads even without the SDK installed.
        import google.genai as genai
        from google.genai import types

        settings = Settings()
        api_key = settings.google_api_key
        if not api_key:
            raise ValueError("GOOGLE_API_KEY not configured in settings")

        image_data = load_and_encode_image(image_path)

        if not question:
            question = "Describe this image in detail."

        logger.info(
            "Gemini vision analysis: %s - '%s'", Path(image_path).name, question
        )

        client = genai.Client(api_key=api_key)

        # The loader returns base64 text; decode it back to raw bytes and let
        # the SDK helper build the inline image part.
        image_part = types.Part.from_bytes(
            data=base64.b64decode(image_data["data"]),
            mime_type=image_data["mime_type"],
        )

        # NOTE(review): the requested model is the "-exp" variant while the
        # result below reports "gemini-2.0-flash" - confirm which is intended.
        response = client.models.generate_content(
            model="gemini-2.0-flash-exp",
            contents=[question, image_part],
        )

        # Fail with a clear message instead of an AttributeError on None.
        text = response.text
        if text is None:
            raise RuntimeError("Gemini returned no text content")
        answer = text.strip()

        logger.info("Gemini vision successful: %d chars", len(answer))

        return {
            "answer": answer,
            "model": "gemini-2.0-flash",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error("Gemini configuration/input error: %s", e)
        raise
    except (ConnectionError, TimeoutError) as e:
        # Re-raised unchanged so the @retry decorator can see and retry it.
        logger.warning("Gemini connection error (will retry): %s", e)
        raise
    except Exception as e:
        logger.error("Gemini vision error: %s", e)
        raise Exception(f"Gemini vision failed: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_claude(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using Claude Sonnet 4.5.

    Args:
        image_path: Path to image file
        question: Optional question about the image
            (default: "Describe this image in detail.")

    Returns:
        Dict with structure: {
            "answer": str,      # LLM's analysis/answer
            "model": "claude-sonnet-4.5",
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If API key not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the call times out (triggers retry)
        Exception: Any other failure, re-raised as
            "Claude vision failed: ..." with the original error chained
    """
    try:
        # Imported lazily so the module loads even without the SDK installed.
        from anthropic import Anthropic

        settings = Settings()
        api_key = settings.anthropic_api_key
        if not api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured in settings")

        image_data = load_and_encode_image(image_path)

        if not question:
            question = "Describe this image in detail."

        logger.info(
            "Claude vision analysis: %s - '%s'", Path(image_path).name, question
        )

        client = Anthropic(api_key=api_key)

        image_block = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": image_data["mime_type"],
                "data": image_data["data"],
            },
        }
        text_block = {"type": "text", "text": question}

        # NOTE(review): this model ID and the "claude-sonnet-4.5" label reported
        # below do not obviously name the same model - confirm which is intended.
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[{"role": "user", "content": [image_block, text_block]}],
        )

        # Use the first text block; fail clearly if the reply has none instead
        # of raising IndexError/AttributeError on content[0].text.
        texts = [
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        ]
        if not texts:
            raise RuntimeError("Claude returned no text content")
        answer = texts[0].strip()

        logger.info("Claude vision successful: %d chars", len(answer))

        return {
            "answer": answer,
            "model": "claude-sonnet-4.5",
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error("Claude configuration/input error: %s", e)
        raise
    except (ConnectionError, TimeoutError) as e:
        # Re-raised unchanged so the @retry decorator can see and retry it.
        logger.warning("Claude connection error (will retry): %s", e)
        raise
    except Exception as e:
        logger.error("Claude vision error: %s", e)
        raise Exception(f"Claude vision failed: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    reraise=True,
)
def analyze_image_hf(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using HuggingFace Inference API.

    Validated models (Phase 0 testing):
    - google/gemma-3-27b-it:scaleway (recommended, ~6s)
    - CohereLabs/aya-vision-32b (~7s)
    - Qwen/Qwen3-VL-30B-A3B-Instruct:novita (~14s)

    Args:
        image_path: Path to image file
        question: Optional question about the image
            (default: "Describe this image in detail.")

    Returns:
        Dict with structure: {
            "answer": str,
            "model": str,       # value of HF_VISION_MODEL
            "image_path": str,
            "question": str
        }

    Raises:
        ValueError: If HF_TOKEN not configured or image invalid
        ConnectionError: If API connection fails (triggers retry)
        TimeoutError: If the call times out (triggers retry)
        Exception: Any other failure, re-raised as
            "HF vision failed: ..." with the original error chained
    """
    try:
        # Imported lazily so the module loads even without the SDK installed.
        from huggingface_hub import InferenceClient

        settings = Settings()
        hf_token = settings.hf_token
        if not hf_token:
            raise ValueError("HF_TOKEN not configured in settings")

        image_data = load_and_encode_image(image_path)

        if not question:
            question = "Describe this image in detail."

        logger.info("HF vision analysis: %s - '%s'", Path(image_path).name, question)
        logger.info("Using model: %s", HF_VISION_MODEL)

        # Apply the module's HF_TIMEOUT so a stalled request cannot hang forever.
        client = InferenceClient(token=hf_token, timeout=HF_TIMEOUT)

        # The image travels inline as a data URL built from the base64 payload.
        data_url = f"data:{image_data['mime_type']};base64,{image_data['data']}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ]

        response = client.chat_completion(
            model=HF_VISION_MODEL,
            messages=messages,
            max_tokens=1024,
        )

        # Fail with a clear message instead of an AttributeError on None.
        content = response.choices[0].message.content
        if content is None:
            raise RuntimeError("HF model returned no text content")
        answer = content.strip()

        logger.info("HF vision successful: %d chars", len(answer))

        return {
            "answer": answer,
            "model": HF_VISION_MODEL,
            "image_path": image_path,
            "question": question,
        }

    except ValueError as e:
        logger.error("HF configuration/input error: %s", e)
        raise
    except (ConnectionError, TimeoutError) as e:
        # Re-raised unchanged so the @retry decorator can see and retry it.
        logger.warning("HF connection error (will retry): %s", e)
        raise
    except Exception as e:
        logger.error("HF vision error: %s", e)
        raise Exception(f"HF vision failed: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_image(image_path: str, question: Optional[str] = None) -> Dict:
    """
    Analyze image using provider specified by LLM_PROVIDER environment variable.

    Respects LLM_PROVIDER setting (case-insensitive, surrounding whitespace
    ignored; defaults to "gemini" when unset):
    - "huggingface" -> Uses HF Inference API
    - "gemini" -> Uses Gemini 2.0 Flash
    - "claude" -> Uses Claude Sonnet 4.5
    - "groq" -> Not yet implemented

    Args:
        image_path: Path to image file
        question: Optional question about the image

    Returns:
        Dict with analysis results from selected provider

    Raises:
        ValueError: If the provider is unknown, or the API key for the
            gemini/claude provider is not configured
        NotImplementedError: If the provider is "groq"
        Exception: If the selected provider fails; the message is
            "<Provider> vision failed: ..." and the original error is chained
    """
    provider = os.getenv("LLM_PROVIDER", "gemini").strip().lower()
    settings = Settings()

    logger.info("Vision analysis with provider: %s", provider)

    # Resolve the provider to (label for messages, backend function). Key
    # checks for gemini/claude happen here, before any provider call is made.
    if provider == "huggingface":
        label, backend = "HF", analyze_image_hf
    elif provider == "gemini":
        if not settings.google_api_key:
            raise ValueError("GOOGLE_API_KEY not configured for Gemini provider")
        label, backend = "Gemini", analyze_image_gemini
    elif provider == "claude":
        if not settings.anthropic_api_key:
            raise ValueError("ANTHROPIC_API_KEY not configured for Claude provider")
        label, backend = "Claude", analyze_image_claude
    elif provider == "groq":
        raise NotImplementedError("Groq vision not yet implemented (Phase 5)")
    else:
        raise ValueError(
            f"Unknown LLM_PROVIDER: {provider}. Valid: huggingface, gemini, claude, groq"
        )

    try:
        return backend(image_path, question)
    except Exception as e:
        # The backends already prefix generic failures with
        # "<label> vision failed: "; avoid repeating that prefix.
        prefix = f"{label} vision failed: "
        detail = str(e)
        message = detail if detail.startswith(prefix) else prefix + detail
        logger.error("%s", message)
        raise Exception(message) from e
|
|
|