EdSummariser / ingestion_python / services / maverick_captioner.py
LiamKhoaLe's picture
Merge commit 'e34edc7cd55f292dd0b192dc00b782c22208fde6' as 'ingestion_python'
ee39cc9
import base64
import io
import os
from typing import Optional
import requests
from PIL import Image
from utils.logger import get_logger
try:
    from utils.api.rotator import APIKeyRotator  # available in full repo
except Exception:  # standalone fallback
    class APIKeyRotator:  # type: ignore
        """Minimal round-robin key rotator used when the repo helper cannot be imported.

        Keys are read from the environment variables ``{prefix}1`` ..
        ``{prefix}{max_slots}``. If none of those are set, the variable named
        by ``prefix`` with its trailing underscores removed is tried instead.
        """

        def __init__(self, prefix: str = "NVIDIA_API_", max_slots: int = 5):
            # Collect every non-empty numbered slot, preserving slot order.
            numbered = (os.getenv(f"{prefix}{slot}") for slot in range(1, max_slots + 1))
            self.keys = [value for value in numbered if value]
            if not self.keys:
                # No numbered keys found: fall back to the un-numbered variable.
                fallback = os.getenv(prefix.rstrip("_"))
                if fallback:
                    self.keys.append(fallback)
            self._idx = 0

        def get_key(self) -> Optional[str]:
            """Return the next key in rotation, or ``None`` when no keys were found."""
            if not self.keys:
                return None
            position = self._idx % len(self.keys)
            self._idx += 1
            return self.keys[position]
# Module-level logger used by the captioner code below for warning messages.
logger = get_logger("MAVERICK_CAPTIONER", __name__)
def _normalize_caption(text: str) -> str:
if not text:
return ""
t = text.strip()
# Remove common conversational/openers and meta phrases
banned_prefixes = [
"sure,", "sure.", "sure", "here is", "here are", "this image", "the image", "image shows",
"the picture", "the photo", "the text describes", "the text describe", "it shows", "it depicts",
"caption:", "description:", "output:", "result:", "answer:", "analysis:", "observation:",
]
t_lower = t.lower()
for p in banned_prefixes:
if t_lower.startswith(p):
t = t[len(p):].lstrip(" :-\u2014\u2013")
t_lower = t.lower()
# Strip surrounding quotes and markdown artifacts
t = t.strip().strip('"').strip("'").strip()
# Collapse whitespace
t = " ".join(t.split())
return t
class NvidiaMaverickCaptioner:
    """Image captioner backed by the NVIDIA Integrate chat-completions endpoint.

    The model defaults to ``meta/llama-4-maverick-17b-128e-instruct`` and can
    be overridden through the ``model`` argument or the
    ``NVIDIA_MAVERICK_MODEL`` environment variable.
    """

    def __init__(self, rotator: Optional[APIKeyRotator] = None, model: Optional[str] = None):
        """Store the key rotator, model name and endpoint URL.

        Args:
            rotator: Source of API keys; when falsy, a rotator reading
                ``NVIDIA_API_1`` .. ``NVIDIA_API_5`` is created.
            model: Model identifier; when falsy, ``NVIDIA_MAVERICK_MODEL`` or
                the built-in default is used.
        """
        if not rotator:
            rotator = APIKeyRotator(prefix="NVIDIA_API_", max_slots=5)
        if not model:
            model = os.getenv("NVIDIA_MAVERICK_MODEL", "meta/llama-4-maverick-17b-128e-instruct")
        self.rotator = rotator
        self.model = model
        self.invoke_url = "https://integrate.api.nvidia.com/v1/chat/completions"

    def _encode_image_jpeg_b64(self, image: Image.Image) -> str:
        """Return ``image`` re-encoded as a quality-90 JPEG in base64 text form."""
        # JPEG cannot store every PIL mode, so convert to RGB first.
        rgb_image = image.convert("RGB")
        with io.BytesIO() as stream:
            rgb_image.save(stream, format="JPEG", quality=90)
            raw_bytes = stream.getvalue()
        return base64.b64encode(raw_bytes).decode("utf-8")

    @staticmethod
    def _build_messages(img_b64: str) -> list:
        """Build the system + multimodal user messages for one captioning request."""
        # Strict, non-conversational system prompt
        system_prompt = (
            "You are an expert vision captioner. Produce a precise, information-dense caption of the image. "
            "Do not include conversational phrases, prefaces, meta commentary, or apologies. "
            "Avoid starting with phrases like 'The image/picture/photo shows' or 'Here is'. "
            "Write a single concise paragraph with concrete entities, text in the image, and notable details."
        )
        user_prompt = (
            "Caption this image at the finest level of detail. Include any visible text verbatim. "
            "Return only the caption text."
        )
        # The image travels inline as a base64 data URL next to the text prompt.
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"},
        }
        text_part = {"type": "text", "text": user_prompt}
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [text_part, image_part]},
        ]

    def caption_image(self, image: Image.Image) -> str:
        """Request a caption for ``image`` and return it normalized.

        This is best-effort: an empty string is returned (and a warning
        logged) when no API key is available, when the API answers with an
        HTTP status of 400 or above, or when any exception is raised while
        encoding, sending or parsing.
        """
        try:
            api_key = self.rotator.get_key()
            if not api_key:
                logger.warning("NVIDIA API key not available; skipping image caption.")
                return ""

            encoded = self._encode_image_jpeg_b64(image)
            request_body = {
                "model": self.model,
                "messages": self._build_messages(encoded),
                "max_tokens": 512,
                "temperature": 0.2,
                "top_p": 0.9,
                "frequency_penalty": 0.0,
                "presence_penalty": 0.0,
                "stream": False,
            }
            request_headers = {
                "Authorization": f"Bearer {api_key}",
                "Accept": "application/json",
                "Content-Type": "application/json",
            }
            response = requests.post(
                self.invoke_url,
                headers=request_headers,
                json=request_body,
                timeout=60,
            )
            if response.status_code >= 400:
                logger.warning(f"Maverick caption API error {response.status_code}: {response.text[:200]}")
                return ""

            # Only the first choice's message content is used.
            choices = response.json().get("choices", [{}])
            message = choices[0].get("message", {})
            return _normalize_caption(message.get("content", ""))
        except Exception as e:
            logger.warning(f"Maverick caption failed: {e}")
            return ""