import base64
import mimetypes
from typing import Dict, List

import requests


class VisionAnalyzer:
    """Describe video frames by sending them to a chat-completion HTTP API.

    Each frame is read from disk, base64-encoded into a data URL and posted
    together with a fixed text prompt. Every failure mode (unreadable file,
    network error, non-200 status, unexpected response body) yields a
    placeholder description instead of an exception, so a single bad frame
    never aborts a batch.
    """

    def __init__(self, api_key: str, group_id: str, timeout: float = 60.0):
        """Store credentials and request settings.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            group_id: Stored on the instance; not used by any method of
                this class.
            timeout: Seconds to wait for the HTTP request before giving up.
        """
        self.api_key = api_key
        self.group_id = group_id
        self.timeout = timeout
        self.base_url = "https://api.minimaxi.chat/v1"
        # Built from adjacent literals so the prompt carries no source
        # indentation.
        self.prompt = (
            "Describe this video frame in one concise sentence.\n"
            "\n"
            "Focus on:\n"
            "- Who/what is shown (people, products, text overlays)\n"
            "- Setting/environment\n"
            "- Actions or emotions displayed\n"
            "- Any visible brand elements or text\n"
            "\n"
            "Be factual and specific. Do not interpret or add assumptions."
        )

    def _encode_image(self, image_path: str) -> str:
        """Return the file's bytes as a base64-encoded ASCII string.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    @staticmethod
    def _guess_mime_type(image_path: str) -> str:
        """Return the image MIME type for the path, defaulting to JPEG."""
        mime, _ = mimetypes.guess_type(image_path)
        if mime and mime.startswith("image/"):
            return mime
        # Unknown or non-image extension: keep the historical default.
        return "image/jpeg"

    @staticmethod
    def _unavailable(timestamp: float) -> str:
        """Return the placeholder used when no description can be produced."""
        return f"[Frame at {timestamp}s - description unavailable]"

    def describe_frame(self, image_path: str, timestamp: float) -> str:
        """Generate a description of a single frame.

        Args:
            image_path: Path of the image file to describe.
            timestamp: Position of the frame in seconds; only used to build
                the placeholder text on failure.

        Returns:
            The description string from the API, e.g.
            "Woman looking frustrated in messy kitchen", or the placeholder
            "[Frame at <timestamp>s - description unavailable]" when the
            frame cannot be read, the request fails, or the response has
            no usable content.
        """
        fallback = self._unavailable(timestamp)

        try:
            image_data = self._encode_image(image_path)
        except OSError as exc:
            print(f"Vision frame read error: {exc}")
            return fallback

        url = f"{self.base_url}/text/chatcompletion_v2"
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }
        mime_type = self._guess_mime_type(image_path)
        payload = {
            "model": "MiniMax-Text-01",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_data}"
                            },
                        },
                    ],
                }
            ],
        }

        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.post(
                url, headers=headers, json=payload, timeout=self.timeout
            )
        except requests.RequestException as exc:
            print(f"Vision API error: {exc}")
            return fallback

        if response.status_code != 200:
            print(f"Vision API error: {response.text}")
            return fallback

        try:
            content = response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError):
            # ValueError: body is not JSON. TypeError: an intermediate
            # value (e.g. "choices") is null or not subscriptable.
            return fallback

        # Honour the declared return type even if the API sends a non-string.
        return content if isinstance(content, str) else fallback

    def describe_frames_batch(self, frames: List[Dict]) -> List[Dict]:
        """Describe all frames, one request per frame, in input order.

        Args:
            frames: [{"timestamp": 0.0, "path": "/tmp/frame_001.jpg"}, ...]

        Returns:
            [{"timestamp": 0.0, "path": "...", "description": "Woman looking..."}, ...]
            Only these three keys are present in each result; an empty
            input yields an empty list.
        """
        return [
            {
                "timestamp": frame['timestamp'],
                "path": frame['path'],
                "description": self.describe_frame(
                    frame['path'], frame['timestamp']
                ),
            }
            for frame in frames
        ]