Spaces:

Marek4321
/

StoryLens

Running

App Files Files Community

StoryLens / vision_analyzer.py

Marek4321

Upload 13 files

6bdfadc verified 18 days ago

raw

history blame contribute delete

2.84 kB

	import base64
	import requests
	from typing import List, Dict


	class VisionAnalyzer:
	    """Produce one-sentence descriptions of video frames via a chat-completion API.

	    Each frame image is base64-encoded, embedded in a data URL and posted,
	    together with a fixed instruction prompt, to the
	    ``/text/chatcompletion_v2`` endpoint under ``base_url``.

	    Description failures (network error, non-200 status, unparsable or
	    unexpected response body) never raise: a placeholder string is returned
	    instead so that one bad frame does not abort a whole batch. Errors while
	    reading the image file itself are NOT swallowed and propagate to the
	    caller.
	    """

	    # Seconds to wait for the API before giving up on a frame.
	    DEFAULT_TIMEOUT = 60.0

	    def __init__(self, api_key: str, group_id: str, timeout: float = DEFAULT_TIMEOUT):
	        """
	        Args:
	        api_key: Token sent as ``Authorization: Bearer <api_key>``.
	        group_id: Stored on the instance; no request built by this class
	        currently sends it.
	        timeout: Seconds passed to ``requests.post`` as its timeout.
	        """
	        self.api_key = api_key
	        self.group_id = group_id
	        self.base_url = "https://api.minimaxi.chat/v1"
	        self.timeout = timeout

	        self.prompt = """Describe this video frame in one concise sentence. Focus on:
	- Who/what is shown (people, products, text overlays)
	- Setting/environment
	- Actions or emotions displayed
	- Any visible brand elements or text

	Be factual and specific. Do not interpret or add assumptions."""

	    @staticmethod
	    def _unavailable(timestamp: float) -> str:
	        """Return the placeholder used when a frame cannot be described."""
	        return f"[Frame at {timestamp}s - description unavailable]"

	    @classmethod
	    def _extract_description(cls, result, timestamp: float) -> str:
	        """Pull ``choices[0].message.content`` out of a decoded API response.

	        Returns the placeholder when the expected path is missing, when an
	        intermediate value is null / not indexable, or when the content is
	        not a string.
	        """
	        try:
	            content = result['choices'][0]['message']['content']
	        except (KeyError, IndexError, TypeError):
	            # TypeError covers bodies such as {"choices": null}, which would
	            # otherwise escape as an unhandled exception.
	            return cls._unavailable(timestamp)

	        if not isinstance(content, str):
	            return cls._unavailable(timestamp)
	        return content

	    def _encode_image(self, image_path: str) -> str:
	        """Read the file at ``image_path`` and return its bytes as base64 text."""
	        with open(image_path, "rb") as f:
	            return base64.b64encode(f.read()).decode('utf-8')

	    def describe_frame(self, image_path: str, timestamp: float) -> str:
	        """
	        Generate description of a single frame.

	        Args:
	        image_path: Path of the image file to describe. The image is always
	        sent with an ``image/jpeg`` data-URL prefix.
	        timestamp: Position of the frame; only used in the placeholder text.

	        Returns:
	        Description string, e.g., "Woman looking frustrated in messy kitchen",
	        or "[Frame at <timestamp>s - description unavailable]" on any API failure.

	        Raises:
	        OSError: If the image file cannot be opened or read.
	        """
	        url = f"{self.base_url}/text/chatcompletion_v2"

	        headers = {
	            'Authorization': f'Bearer {self.api_key}',
	            'Content-Type': 'application/json'
	        }

	        image_data = self._encode_image(image_path)

	        payload = {
	            "model": "MiniMax-Text-01",
	            "messages": [
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "text": self.prompt},
	                        {
	                            "type": "image_url",
	                            "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}
	                        }
	                    ]
	                }
	            ]
	        }

	        try:
	            # Without a timeout requests waits forever on a stalled connection,
	            # which would freeze the entire batch on a single frame.
	            response = requests.post(
	                url, headers=headers, json=payload, timeout=self.timeout
	            )
	        except requests.RequestException as exc:
	            print(f"Vision API error: {exc}")
	            return self._unavailable(timestamp)

	        if response.status_code != 200:
	            print(f"Vision API error: {response.text}")
	            return self._unavailable(timestamp)

	        try:
	            result = response.json()
	        except ValueError:
	            # A 200 response whose body is not valid JSON.
	            print(f"Vision API error: {response.text}")
	            return self._unavailable(timestamp)

	        return self._extract_description(result, timestamp)

	    def describe_frames_batch(self, frames: List[Dict]) -> List[Dict]:
	        """
	        Describe all frames, one API call per frame, in input order.

	        Args:
	        frames: [{"timestamp": 0.0, "path": "/tmp/frame_001.jpg"}, ...]

	        Returns:
	        [{"timestamp": 0.0, "path": "...", "description": "Woman looking..."}, ...]
	        Only these three keys are emitted; other keys on the input dicts
	        are not copied over.
	        """
	        return [
	            {
	                "timestamp": frame['timestamp'],
	                "path": frame['path'],
	                "description": self.describe_frame(frame['path'], frame['timestamp']),
	            }
	            for frame in frames
	        ]