| import base64 | |
| import requests | |
| from typing import List, Dict | |
class VisionAnalyzer:
    """Generate one-sentence descriptions of video frames via a chat-completion API.

    Each frame is read from disk, base64-encoded into a ``data:`` URL and
    posted together with ``self.prompt`` to ``{base_url}/text/chatcompletion_v2``.
    Description failures never raise: a placeholder string is returned instead,
    so a single bad frame cannot abort a whole batch.
    """

    # Instruction sent alongside every frame; copied to ``self.prompt`` so that
    # callers can still override it per instance.
    DEFAULT_PROMPT = (
        "Describe this video frame in one concise sentence. Focus on:\n"
        "- Who/what is shown (people, products, text overlays)\n"
        "- Setting/environment\n"
        "- Actions or emotions displayed\n"
        "- Any visible brand elements or text\n"
        "Be factual and specific. Do not interpret or add assumptions."
    )

    def __init__(self, api_key: str, group_id: str, timeout: float = 60.0):
        """Store credentials and request settings.

        Args:
            api_key: Bearer token placed in the ``Authorization`` header.
            group_id: Account group identifier. It is stored on the instance
                but is not sent by any request made in this class.
            timeout: Seconds to wait for the API before giving up on a frame.
                Passed straight to ``requests.post``.
        """
        self.api_key = api_key
        self.group_id = group_id
        self.timeout = timeout
        self.base_url = "https://api.minimaxi.chat/v1"
        self.prompt = self.DEFAULT_PROMPT

    @staticmethod
    def _unavailable(timestamp: float) -> str:
        """Return the placeholder used when a frame cannot be described."""
        return f"[Frame at {timestamp}s - description unavailable]"

    def _encode_image(self, image_path: str) -> str:
        """Return the contents of ``image_path`` as a base64 ASCII string.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def describe_frame(self, image_path: str, timestamp: float) -> str:
        """Generate a description of a single frame.

        Args:
            image_path: Path of the frame image on disk. The image is always
                labelled ``image/jpeg`` in the request.
            timestamp: Position of the frame in seconds; only used to build
                the placeholder text.

        Returns:
            Description string, e.g., "Woman looking frustrated in messy kitchen",
            or "[Frame at <timestamp>s - description unavailable]" when the
            file cannot be read, the request fails, the API answers with a
            non-200 status, or the response body has an unexpected shape.
        """
        fallback = self._unavailable(timestamp)

        # Read the frame first: without image data there is nothing to send.
        try:
            image_data = self._encode_image(image_path)
        except OSError as exc:
            print(f"Vision frame read error: {exc}")
            return fallback

        url = f"{self.base_url}/text/chatcompletion_v2"
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }
        payload = {
            "model": "MiniMax-Text-01",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
                        },
                    ],
                }
            ],
        }

        # A timeout is mandatory: without one a stalled connection would block
        # the caller (and every remaining frame of a batch) indefinitely.
        try:
            response = requests.post(
                url, headers=headers, json=payload, timeout=self.timeout
            )
        except requests.exceptions.RequestException as exc:
            print(f"Vision API request failed: {exc}")
            return fallback

        if response.status_code != 200:
            print(f"Vision API error: {response.text}")
            return fallback

        # ValueError covers a body that is not JSON; TypeError covers a
        # null/non-container value anywhere along the lookup path.
        try:
            content = response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            print(f"Vision API unexpected response: {exc!r}")
            return fallback

        # Honour the declared ``str`` return type even if the API sends null.
        if not isinstance(content, str):
            return fallback
        return content

    def describe_frames_batch(self, frames: List[Dict]) -> List[Dict]:
        """Describe all frames, one API call per frame, in input order.

        Args:
            frames: [{"timestamp": 0.0, "path": "/tmp/frame_001.jpg"}, ...]

        Returns:
            [{"timestamp": 0.0, "path": "...", "description": "Woman looking..."}, ...]
            Only these three keys are emitted; any other keys on the input
            dicts are not copied to the output.
        """
        return [
            {
                "timestamp": frame['timestamp'],
                "path": frame['path'],
                "description": self.describe_frame(frame['path'], frame['timestamp']),
            }
            for frame in frames
        ]