StoryLens / vision_analyzer.py
Marek4321's picture
Upload 13 files
6bdfadc verified
import base64
import requests
from typing import List, Dict
class VisionAnalyzer:
    """Describe video frames through the MiniMax chat-completion HTTP API.

    Each frame image is base64-encoded and sent together with a fixed text
    prompt; the content of the first returned choice is used as the
    description. The describe methods are best-effort: a frame that cannot
    be read or described yields a placeholder string instead of raising.
    """

    # Root of the HTTP API; copied onto each instance as ``base_url``.
    BASE_URL = "https://api.minimaxi.chat/v1"

    # Model name sent in every request payload.
    MODEL = "MiniMax-Text-01"

    # Instruction text sent alongside every frame; copied onto each instance
    # as ``prompt``.
    DEFAULT_PROMPT = (
        "Describe this video frame in one concise sentence. Focus on:\n"
        "- Who/what is shown (people, products, text overlays)\n"
        "- Setting/environment\n"
        "- Actions or emotions displayed\n"
        "- Any visible brand elements or text\n"
        "Be factual and specific. Do not interpret or add assumptions."
    )

    def __init__(self, api_key: str, group_id: str, *, timeout: float = 60.0):
        """Store credentials and request settings.

        Args:
            api_key: Token sent as ``Bearer`` in the ``Authorization`` header.
            group_id: Account group identifier. It is stored on the instance
                but not used by any method of this class.
            timeout: Seconds to wait for the API before giving up on a
                frame. Without it a stalled connection would block forever.
        """
        self.api_key = api_key
        self.group_id = group_id
        self.timeout = timeout
        self.base_url = self.BASE_URL
        self.prompt = self.DEFAULT_PROMPT

    @staticmethod
    def _unavailable(timestamp: float) -> str:
        """Return the placeholder used when a frame cannot be described."""
        return f"[Frame at {timestamp}s - description unavailable]"

    def _encode_image(self, image_path: str) -> str:
        """Return the contents of *image_path* as a base64 ASCII string.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def _parse_description(self, result, timestamp: float) -> str:
        """Pull the description text out of a decoded API response.

        Args:
            result: The decoded JSON body of the API response.
            timestamp: Frame position in seconds, used only for the
                placeholder text.

        Returns:
            ``result['choices'][0]['message']['content']`` when it is a
            string, otherwise the placeholder string.
        """
        try:
            content = result['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            # TypeError covers a null/non-container value anywhere on the
            # path (e.g. ``"choices": null``), not only a missing key.
            content = None

        if not isinstance(content, str):
            print(f"Vision API error: no description in response: {result}")
            return self._unavailable(timestamp)
        return content

    def describe_frame(self, image_path: str, timestamp: float) -> str:
        """Generate a description of a single frame.

        Args:
            image_path: Path of the frame image on disk.
            timestamp: Frame position in seconds, used only for the
                placeholder text.

        Returns:
            Description string, e.g., "Woman looking frustrated in messy
            kitchen", or "[Frame at <timestamp>s - description unavailable]"
            when the file cannot be read, the request fails, the API does
            not answer with HTTP 200, or the response has no usable content.
        """
        # Read the frame first so an unreadable file never costs a request.
        try:
            image_data = self._encode_image(image_path)
        except OSError as exc:
            print(f"Vision API error: cannot read frame {image_path}: {exc}")
            return self._unavailable(timestamp)

        url = f"{self.base_url}/text/chatcompletion_v2"
        headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json',
        }
        payload = {
            "model": self.MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image_data}"
                            },
                        },
                    ],
                }
            ],
        }

        try:
            response = requests.post(
                url, headers=headers, json=payload, timeout=self.timeout
            )
        except requests.RequestException as exc:
            # Connection errors and timeouts must not abort a whole batch.
            print(f"Vision API error: {exc}")
            return self._unavailable(timestamp)

        if response.status_code != 200:
            print(f"Vision API error: {response.text}")
            return self._unavailable(timestamp)

        try:
            result = response.json()
        except ValueError:
            # requests raises a ValueError subclass for a non-JSON body.
            print("Vision API error: response body is not valid JSON")
            return self._unavailable(timestamp)

        return self._parse_description(result, timestamp)

    def describe_frames_batch(self, frames: List[Dict]) -> List[Dict]:
        """Describe all frames, one request per frame, in input order.

        Args:
            frames: [{"timestamp": 0.0, "path": "/tmp/frame_001.jpg"}, ...]

        Returns:
            [{"timestamp": 0.0, "path": "...", "description": "Woman looking..."}, ...]
            Only these three keys are emitted; any other keys on the input
            dicts are not copied.
        """
        return [
            {
                "timestamp": frame['timestamp'],
                "path": frame['path'],
                "description": self.describe_frame(
                    frame['path'], frame['timestamp']
                ),
            }
            for frame in frames
        ]