import base64 import io import os from typing import Optional import requests from PIL import Image from utils.logger import get_logger try: from utils.api.rotator import APIKeyRotator # available in full repo except Exception: # standalone fallback class APIKeyRotator: # type: ignore def __init__(self, prefix: str = "NVIDIA_API_", max_slots: int = 5): self.keys = [] for i in range(1, max_slots + 1): k = os.getenv(f"{prefix}{i}") if k: self.keys.append(k) if not self.keys: single = os.getenv(prefix.rstrip("_")) if single: self.keys.append(single) self._idx = 0 def get_key(self) -> Optional[str]: if not self.keys: return None k = self.keys[self._idx % len(self.keys)] self._idx += 1 return k logger = get_logger("MAVERICK_CAPTIONER", __name__) def _normalize_caption(text: str) -> str: if not text: return "" t = text.strip() # Remove common conversational/openers and meta phrases banned_prefixes = [ "sure,", "sure.", "sure", "here is", "here are", "this image", "the image", "image shows", "the picture", "the photo", "the text describes", "the text describe", "it shows", "it depicts", "caption:", "description:", "output:", "result:", "answer:", "analysis:", "observation:", ] t_lower = t.lower() for p in banned_prefixes: if t_lower.startswith(p): t = t[len(p):].lstrip(" :-\u2014\u2013") t_lower = t.lower() # Strip surrounding quotes and markdown artifacts t = t.strip().strip('"').strip("'").strip() # Collapse whitespace t = " ".join(t.split()) return t class NvidiaMaverickCaptioner: """Caption images using NVIDIA Integrate API (meta/llama-4-maverick-17b-128e-instruct).""" def __init__(self, rotator: Optional[APIKeyRotator] = None, model: Optional[str] = None): self.rotator = rotator or APIKeyRotator(prefix="NVIDIA_API_", max_slots=5) self.model = model or os.getenv("NVIDIA_MAVERICK_MODEL", "meta/llama-4-maverick-17b-128e-instruct") self.invoke_url = "https://integrate.api.nvidia.com/v1/chat/completions" def _encode_image_jpeg_b64(self, image: Image.Image) -> str: buf = io.BytesIO() # Convert to RGB to ensure JPEG-compatible image.convert("RGB").save(buf, format="JPEG", quality=90) return base64.b64encode(buf.getvalue()).decode("utf-8") def caption_image(self, image: Image.Image) -> str: try: key = self.rotator.get_key() if not key: logger.warning("NVIDIA API key not available; skipping image caption.") return "" img_b64 = self._encode_image_jpeg_b64(image) # Strict, non-conversational system prompt system_prompt = ( "You are an expert vision captioner. Produce a precise, information-dense caption of the image. " "Do not include conversational phrases, prefaces, meta commentary, or apologies. " "Avoid starting with phrases like 'The image/picture/photo shows' or 'Here is'. " "Write a single concise paragraph with concrete entities, text in the image, and notable details." ) user_prompt = ( "Caption this image at the finest level of detail. Include any visible text verbatim. " "Return only the caption text." ) # Multimodal content format for NVIDIA Integrate API messages = [ {"role": "system", "content": system_prompt}, { "role": "user", "content": [ {"type": "text", "text": user_prompt}, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{img_b64}" } }, ] }, ] payload = { "model": self.model, "messages": messages, "max_tokens": 512, "temperature": 0.2, "top_p": 0.9, "frequency_penalty": 0.0, "presence_penalty": 0.0, "stream": False, } headers = { "Authorization": f"Bearer {key}", "Accept": "application/json", "Content-Type": "application/json", } resp = requests.post(self.invoke_url, headers=headers, json=payload, timeout=60) if resp.status_code >= 400: logger.warning(f"Maverick caption API error {resp.status_code}: {resp.text[:200]}") return "" data = resp.json() text = data.get("choices", [{}])[0].get("message", {}).get("content", "") return _normalize_caption(text) except Exception as e: logger.warning(f"Maverick caption failed: {e}") return ""