Spaces:

lablab-ai-amd-developer-hackathon
/

ElevenClip-AI

Runtime error

App Files Files Community

ElevenClip-AI / backend /src /analysis /vision.py

jakgritb

fix: keep faces centered in HRE crops

eb1790e verified about 2 months ago

Raw

History Blame Contribute Delete

14.4 kB

	"""Qwen2.5-VL multimodal scene analysis via vLLM OpenAI-compatible API.

	Sends video frames + transcript text together (true multimodal fusion).
	Outputs: excitement_score, face_bbox, action_type, humor_level, emotion.
	All scenes analyzed concurrently — vLLM handles GPU batching internally.
	"""
	import asyncio
	import base64
	import json
	import os
	from pathlib import Path
	from typing import Optional
	from loguru import logger

	# Connection settings for the vLLM OpenAI-compatible server. Each value is
	# read from the environment once at import time and falls back to the
	# default shown when the variable is unset.
	VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1")
	VLLM_MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
	VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "EMPTY")

	# Prompt template for scene-level highlight scoring. It is filled with
	# str.format(channel_description=..., clip_style=...); doubled braces are the
	# literal braces of the JSON schema. Enum alternatives are separated by a
	# plain "|" (a backslash-escaped pipe is an invalid escape sequence and would
	# leak a literal backslash into the text sent to the model).
	ANALYSIS_PROMPT = """You are a TikTok content expert analyzing a livestream segment for highlight potential.
	Analyze the provided video frames and transcript text together as a unified multimodal signal.

	Respond ONLY with valid JSON matching this exact schema — no markdown, no explanation:
	{{
	"excitement_score": <0.0-1.0>,
	"humor_level": <0.0-1.0>,
	"emotion": "<neutral|happy|surprised|angry|sad|excited|funny>",
	"action_type": "<talking|gaming|reaction|tutorial|entertainment|sports|other>",
	"has_face": <true|false>,
	"face_bbox": [<x1_pct>, <y1_pct>, <x2_pct>, <y2_pct>] or null,
	"highlight_reason": "<one sentence: why this IS or isn't a good TikTok highlight>",
	"tiktok_potential": <0.0-1.0>
	}}

	Channel context: {channel_description}
	Requested clip style: {clip_style}

	Rules:
	- If a visible human is present, prioritize the speaker/person over products, screens, logos, or background objects.
	- Set has_face=true only for a real visible human face, and make face_bbox cover the visible face/head area.
	- If there is no visible human face, set has_face=false and face_bbox=null.
	"""


	def _encode_image(image_path: str) -> str:
	    """Return the contents of the file at ``image_path`` as base64 text."""
	    with open(image_path, mode="rb") as handle:
	        payload = handle.read()
	    encoded = base64.b64encode(payload)
	    return encoded.decode("utf-8")


	def analyze_scene(
	    scene: dict,
	    transcript_text: str = "",
	    channel_description: str = "",
	    clip_style: str = "entertaining",
	) -> dict:
	    """Analyze a single scene using Qwen2.5-VL (vision + text multimodal fusion).

	    Sends up to 3 existing frames from ``scene["frame_paths"]`` together with
	    the formatted ``ANALYSIS_PROMPT`` (plus the transcript, when non-blank) to
	    the vLLM OpenAI-compatible endpoint and parses the JSON object in the reply.

	    Args:
	        scene: Scene record. ``frame_paths`` is read for the images;
	            ``start`` / ``end`` are used for log messages only.
	        transcript_text: Transcript for this segment; appended to the prompt
	            when it is not blank.
	        channel_description: Channel context for the prompt; replaced by
	            "General content creator" when empty.
	        clip_style: Requested clip style inserted into the prompt.

	    Returns:
	        The analysis dict parsed from the model reply, or
	        ``_default_analysis()`` when the scene has no usable frames or the
	        request / parsing fails (the failure is logged as a warning).
	    """
	    try:
	        from openai import OpenAI

	        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
	        frame_paths = scene.get("frame_paths", [])
	        if not frame_paths:
	            return _default_analysis()

	        # Add up to 3 frames as base64 images, skipping paths missing on disk.
	        content = [
	            {
	                "type": "image_url",
	                "image_url": {"url": f"data:image/jpeg;base64,{_encode_image(frame_path)}"},
	            }
	            for frame_path in frame_paths[:3]
	            if Path(frame_path).exists()
	        ]
	        if not content:
	            return _default_analysis()

	        prompt = ANALYSIS_PROMPT.format(
	            channel_description=channel_description or "General content creator",
	            clip_style=clip_style,
	        )
	        if transcript_text.strip():
	            prompt += f"\n\nTranscript for this segment:\n\"{transcript_text.strip()}\""

	        content.append({"type": "text", "text": prompt})

	        response = client.chat.completions.create(
	            model=VLLM_MODEL,
	            messages=[{"role": "user", "content": content}],
	            max_tokens=300,
	            temperature=0.1,
	        )

	        # The message content can be None; treat that as an empty reply so the
	        # failure surfaces as a JSON decoding error rather than AttributeError.
	        raw = (response.choices[0].message.content or "").strip()
	        # Strip markdown code fences if present
	        if raw.startswith("```"):
	            parts = raw.split("```")
	            raw = parts[1] if len(parts) > 1 else raw
	        raw = raw.strip()
	        if raw.startswith("json"):
	            raw = raw[4:]

	        analysis = json.loads(raw.strip())
	        if not isinstance(analysis, dict):
	            raise ValueError(f"expected a JSON object, got {type(analysis).__name__}")

	        # The model may return null/strings for numeric fields or omit keys;
	        # a log-formatting problem must not discard an otherwise valid analysis.
	        try:
	            logger.debug(
	                f"Scene [{scene['start']:.1f}s-{scene['end']:.1f}s]: "
	                f"excitement={analysis.get('excitement_score', 0):.2f} "
	                f"tiktok={analysis.get('tiktok_potential', 0):.2f} | "
	                f"{analysis.get('highlight_reason', '')[:60]}"
	            )
	        except (KeyError, TypeError, ValueError) as log_err:
	            logger.debug(f"Scene analysis log formatting skipped: {log_err}")

	        # Best-effort call into the vLLM manager; its absence or failure must
	        # never affect the analysis result.
	        try:
	            from src.gpu.vllm_manager import vllm_touch
	            vllm_touch()
	        except Exception as touch_err:
	            logger.debug(f"vllm_touch skipped: {touch_err}")
	        return analysis

	    except Exception as e:
	        logger.warning(f"Vision analysis failed at {scene.get('start', 0):.1f}s: {e}")
	        return _default_analysis()


	async def analyze_scenes_batch_async(
	    scenes_with_frames: list[dict],
	    transcript_segments: list[dict],
	    channel_description: str = "",
	    clip_style: str = "entertaining",
	) -> list[dict]:
	    """Run ``analyze_scene`` for every scene concurrently.

	    Each scene is paired with the text of the transcript segments that
	    overlap it in time, and the blocking ``analyze_scene`` call is handed to
	    the running loop's default executor. All calls are awaited together.

	    Returns:
	        One dict per input scene, in input order: a copy of the scene with
	        ``vision_analysis`` and ``transcript_text`` added.
	    """
	    event_loop = asyncio.get_running_loop()

	    def _overlapping_text(scene: dict) -> str:
	        # A segment overlaps when it starts before the scene ends and ends
	        # after the scene starts.
	        pieces = []
	        for segment in transcript_segments:
	            if segment["start"] < scene["end"] and segment["end"] > scene["start"]:
	                pieces.append(segment["text"])
	        return " ".join(pieces)

	    async def _run_scene(scene: dict) -> dict:
	        text = _overlapping_text(scene)

	        def _blocking_call() -> dict:
	            return analyze_scene(scene, text, channel_description, clip_style)

	        vision = await event_loop.run_in_executor(None, _blocking_call)
	        enriched = dict(scene)
	        enriched["vision_analysis"] = vision
	        enriched["transcript_text"] = text
	        return enriched

	    pending = [_run_scene(scene) for scene in scenes_with_frames]
	    results = await asyncio.gather(*pending)
	    logger.info(f"Vision analysis complete: {len(results)} scenes")
	    return list(results)


	def _default_analysis() -> dict:
	    """Build the neutral stand-in analysis used when no model result exists.

	    A fresh dict is returned on every call so callers may mutate it freely.
	    """
	    fallback = dict(
	        excitement_score=0.5,
	        humor_level=0.3,
	        emotion="neutral",
	        action_type="talking",
	        has_face=False,
	        face_bbox=None,
	        highlight_reason="Vision model unavailable — using audio+text signals only",
	        tiktok_potential=0.4,
	    )
	    return fallback


	# Prompt template for per-segment high-retention-editing (HRE) decisions.
	# It is filled with str.format(seg_idx=..., n_total=..., context=...);
	# doubled braces are the literal braces of the JSON schema. Enum alternatives
	# use a plain "|" (a backslash-escaped pipe is an invalid escape sequence and
	# would leak a literal backslash into the text sent to the model).
	HRE_SEGMENT_PROMPT = """Analyze these video frames for high-retention TikTok editing decisions.

	Segment {seg_idx} of {n_total}. Transcript: "{context}"

	Respond ONLY with valid JSON — no markdown:
	{{
	"zoom_direction": "<in|out|hold>",
	"zoom_speed": "<fast|slow>",
	"face_detected": <true|false>,
	"face_cx": <0.0-1.0>,
	"face_cy": <0.0-1.0>,
	"subject_bbox": [<x1>, <y1>, <x2>, <y2>] or null,
	"zoom_anchor_x": <0.0-1.0>,
	"zoom_anchor_y": <0.0-1.0>,
	"subtitle_position": "<top|bottom|left|right|center>",
	"caption_x": <0.10-0.90>,
	"caption_y": <0.12-0.88>,
	"caption_anchor": <1-9>,
	"caption_max_width_pct": <0.35-0.82>,
	"subtitle_mode": "<word|phrase|sentence>",
	"subtitle_emphasis": "<pop|punch|calm>",
	"subtitle_color": "<white|yellow|cyan|orange|green>",
	"energy_level": "<high|medium|low>",
	"moment_type": "<hook|punchline|context|reaction|transition>"
	}}

	Rules:
	- Primary priority: keep the speaker/person visible. If a person exists, subject_bbox and zoom_anchor must target the person/face before products/screens.
	- If there is no person, target the product/object being discussed.
	- For key statements, punchlines, surprising claims, numbers, product names, or memorable highlight words:
	zoom_direction=in, zoom_speed=fast or slow, subtitle_mode=word, subtitle_emphasis=punch, energy_level=high.
	- For normal explanatory speech:
	zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
	- Use zoom OUT only as breathing room after an intense/key moment.
	- Sentence captions should sit around center-bottom: caption_x about 0.50, caption_y about 0.68-0.74, caption_anchor=2.
	- Word highlight captions can sit center, mid-upper, mid-left, or mid-right with larger text, as long as they avoid the face/product.
	- subtitle WORD: short hooks, reactions, punchlines, important keywords
	- subtitle PHRASE: fast but understandable speech, 2-4 words at a time
	- subtitle SENTENCE: explanation, normal conversation, low/medium energy
	- subject_bbox: main face/person/product/object box in normalized frame coordinates, or null if unclear
	- zoom_anchor_x/y: center of the face/person/product to keep important content in frame; never choose a blank wall/window
	- caption_x/y: choose an actually empty readable area in this frame, not just fixed top/bottom
	- caption_anchor: ASS anchor 1-9 matching caption_x/y (1 bottom-left, 5 center, 9 top-right)
	- caption_max_width_pct: smaller when the empty space is narrow; captions must stay fully inside the 9:16 frame
	- Keep captions away from face, product, hands, and important screen/object regions.
	- Avoid choosing the exact same caption_x/y and subtitle_mode for every segment.
	- face_cx/face_cy: face center as 0.0-1.0 fraction of frame
	"""


	def analyze_frames_for_hre(
	    frame_paths: list["Path"],
	    context: str = "",
	    seg_idx: int = 0,
	    n_total: int = 1,
	) -> dict:
	    """Per-segment HRE: zoom, caption placement, caption mode, and color.

	    Sends up to 3 existing frames plus ``HRE_SEGMENT_PROMPT`` (with the
	    transcript context truncated to 320 characters) to the vLLM
	    OpenAI-compatible endpoint and parses the JSON object in the reply.

	    Args:
	        frame_paths: Candidate frame files; only the first 3 are considered
	            and paths that do not exist are skipped.
	        context: Transcript text for the segment.
	        seg_idx: Index of this segment, passed to the prompt and to the
	            fallback generator.
	        n_total: Total number of segments, used the same way.

	    Returns:
	        ``_default_hre_analysis(seg_idx, n_total)`` overlaid with the fields
	        returned by the model (model values win), or the plain defaults when
	        there are no usable frames or the request / parsing fails.
	    """
	    try:
	        from openai import OpenAI

	        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
	        valid_frames = [Path(p) for p in frame_paths[:3] if Path(p).exists()]
	        if not valid_frames:
	            return _default_hre_analysis(seg_idx, n_total)

	        content = [
	            {
	                "type": "image_url",
	                "image_url": {"url": f"data:image/jpeg;base64,{_encode_image(str(frame_path))}"},
	            }
	            for frame_path in valid_frames
	        ]

	        prompt = HRE_SEGMENT_PROMPT.format(
	            seg_idx=seg_idx, n_total=n_total, context=context[:320]
	        )
	        content.append({"type": "text", "text": prompt})
	        response = client.chat.completions.create(
	            model=VLLM_MODEL,
	            messages=[{
	                "role": "user",
	                "content": content,
	            }],
	            max_tokens=380,
	            temperature=0.1,
	        )
	        # The message content can be None; treat that as an empty reply so the
	        # failure surfaces as a JSON decoding error rather than AttributeError.
	        raw = (response.choices[0].message.content or "").strip()
	        # Strip markdown code fences if present
	        if raw.startswith("```"):
	            parts = raw.split("```")
	            raw = parts[1] if len(parts) > 1 else raw
	        raw = raw.strip()
	        if raw.startswith("json"):
	            raw = raw[4:]

	        model_fields = json.loads(raw.strip())
	        if not isinstance(model_fields, dict):
	            raise ValueError(f"expected a JSON object, got {type(model_fields).__name__}")

	        # Overlay the model's fields on the position-based defaults so every
	        # expected key is present even when the model omits some. This must be
	        # a dict merge: a literal like {a, b} builds a *set*, and dicts are
	        # unhashable, which would raise TypeError on every call.
	        analysis = {**_default_hre_analysis(seg_idx, n_total), **model_fields}
	        logger.debug(
	            f"HRE seg {seg_idx}/{n_total}: "
	            f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
	            f"caption=({analysis.get('caption_x')},{analysis.get('caption_y')}) "
	            f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
	            f"{analysis.get('subtitle_color')} "
	            f"type={analysis.get('moment_type')}"
	        )
	        # Best-effort call into the vLLM manager; its absence or failure must
	        # never affect the analysis result.
	        try:
	            from src.gpu.vllm_manager import vllm_touch
	            vllm_touch()
	        except Exception as touch_err:
	            logger.debug(f"vllm_touch skipped: {touch_err}")
	        return analysis

	    except Exception as e:
	        logger.warning(f"HRE frame analysis failed (seg {seg_idx}): {e}")
	        return _default_hre_analysis(seg_idx, n_total)


	def analyze_frame_for_hre(
	    frame_path: "Path",
	    context: str = "",
	    seg_idx: int = 0,
	    n_total: int = 1,
	) -> dict:
	    """Single-frame convenience entry point kept for older callers.

	    Wraps ``frame_path`` in a one-element list and delegates to
	    ``analyze_frames_for_hre`` with the remaining arguments unchanged.
	    """
	    single_frame = [frame_path]
	    return analyze_frames_for_hre(
	        single_frame,
	        context,
	        seg_idx,
	        n_total,
	    )


	def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
	    """Produce fallback HRE decisions that vary with the segment's position.

	    The zoom/moment choice depends on whether the segment is first, last, or
	    in between; caption styling cycles through six presets by ``seg_idx``.
	    """
	    # Zoom behaviour: (direction, speed, moment type).
	    if seg_idx == 0:
	        zoom_choice = ("in", "fast", "hook")
	    elif seg_idx == n_total - 1:
	        zoom_choice = ("out", "slow", "transition")
	    elif seg_idx % 3 == 1:
	        zoom_choice = ("hold", "slow", "context")
	    else:
	        zoom_choice = ("in", "slow", "reaction")
	    direction, speed, moment_type = zoom_choice

	    # Caption presets: (color, position, (x, y), ASS anchor, mode, emphasis).
	    presets = (
	        ("yellow", "bottom", (0.50, 0.76), 2, "word", "punch"),
	        ("white", "top", (0.50, 0.18), 8, "sentence", "calm"),
	        ("cyan", "left", (0.28, 0.56), 4, "phrase", "pop"),
	        ("orange", "bottom", (0.50, 0.72), 2, "word", "punch"),
	        ("white", "right", (0.72, 0.56), 6, "sentence", "calm"),
	        ("yellow", "top", (0.50, 0.20), 8, "phrase", "pop"),
	    )
	    color, position, (cap_x, cap_y), anchor, mode, emphasis = presets[seg_idx % len(presets)]

	    return {
	        "zoom_direction": direction,
	        "zoom_speed": speed,
	        "face_detected": False,
	        "face_cx": 0.5,
	        "face_cy": 0.38,
	        "subject_bbox": None,
	        "zoom_anchor_x": 0.5,
	        "zoom_anchor_y": 0.38,
	        "subtitle_position": position,
	        "caption_x": cap_x,
	        "caption_y": cap_y,
	        "caption_anchor": anchor,
	        "caption_max_width_pct": 0.62,
	        "subtitle_mode": mode,
	        "subtitle_emphasis": emphasis,
	        "subtitle_color": color,
	        "energy_level": "medium",
	        "moment_type": moment_type,
	    }


	def get_emoji_for_scene(scene_text: str, emotion: str, action_type: str) -> str:
	    """Use the configured Qwen2.5-VL model as a text prompt to select an emoji.

	    Args:
	        scene_text: Transcript text for the moment; only the first 200
	            characters are sent to the model.
	        emotion: Emotion label, sent to the model and used as the first
	            fallback lookup key.
	        action_type: Action label, sent to the model and used as the second
	            fallback lookup key.

	    Returns:
	        The model's reply when it is non-empty, at most 4 characters long and
	        not plain ASCII text; otherwise an emoji from a static map keyed by
	        ``emotion``, then ``action_type``, then "⚡".
	    """
	    try:
	        from openai import OpenAI
	        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)

	        response = client.chat.completions.create(
	            model=VLLM_MODEL,
	            messages=[{"role": "user", "content": (
	                f"Select ONE emoji for this TikTok moment.\n"
	                f"Emotion: {emotion}\nAction: {action_type}\n"
	                f"Text: \"{scene_text[:200]}\"\n"
	                f"Reply with ONLY the emoji character, nothing else."
	            )}],
	            max_tokens=5,
	            temperature=0.3,
	        )
	        emoji = (response.choices[0].message.content or "").strip()
	        # Reject empty replies and short plain-text replies such as "OK" or
	        # "Sure": a length check alone would return them as the "emoji".
	        if emoji and len(emoji) <= 4 and not emoji.isascii():
	            return emoji
	    except Exception:
	        # Deliberate best-effort: any client, network or response problem
	        # falls through to the static mapping below.
	        pass

	    emoji_map = {
	        "happy": "😄", "excited": "🔥", "funny": "😂",
	        "surprised": "😲", "angry": "😤", "sad": "😢",
	        "neutral": "💡", "gaming": "🎮", "tutorial": "📚",
	        "entertainment": "✨", "reaction": "😱",
	    }
	    return emoji_map.get(emotion) or emoji_map.get(action_type, "⚡")