jakgritb's picture
fix: keep faces centered in HRE crops
eb1790e verified
Raw
History Blame Contribute Delete
14.4 kB
"""Qwen2.5-VL multimodal scene analysis via vLLM OpenAI-compatible API.
Sends video frames + transcript text together (true multimodal fusion).
Outputs: excitement_score, face_bbox, action_type, humor_level, emotion.
All scenes analyzed concurrently — vLLM handles GPU batching internally.
"""
import asyncio
import base64
import json
import os
from pathlib import Path
from typing import Optional
from loguru import logger
# Connection settings for the vLLM OpenAI-compatible server. Each value is read
# from the environment variable of the same name, falling back to the default
# shown here when that variable is unset.
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "http://localhost:8000/v1")
VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
# Falls back to the literal string "EMPTY" when no key is configured.
VLLM_API_KEY = os.getenv("VLLM_API_KEY", "EMPTY")
# Prompt template for single-scene analysis. It is filled with str.format(), so
# the literal JSON braces are doubled ({{ }}); the only placeholders are
# {channel_description} and {clip_style}.
# NOTE(review): face_bbox is described with "_pct" names but no explicit range
# (0-1 vs 0-100) is given to the model — confirm what downstream code expects.
ANALYSIS_PROMPT = """You are a TikTok content expert analyzing a livestream segment for highlight potential.
Analyze the provided video frames and transcript text together as a unified multimodal signal.
Respond ONLY with valid JSON matching this exact schema — no markdown, no explanation:
{{
"excitement_score": <0.0-1.0>,
"humor_level": <0.0-1.0>,
"emotion": "<neutral|happy|surprised|angry|sad|excited|funny>",
"action_type": "<talking|gaming|reaction|tutorial|entertainment|sports|other>",
"has_face": <true|false>,
"face_bbox": [<x1_pct>, <y1_pct>, <x2_pct>, <y2_pct>] or null,
"highlight_reason": "<one sentence: why this IS or isn't a good TikTok highlight>",
"tiktok_potential": <0.0-1.0>
}}
Channel context: {channel_description}
Requested clip style: {clip_style}
Rules:
- If a visible human is present, prioritize the speaker/person over products, screens, logos, or background objects.
- Set has_face=true only for a real visible human face, and make face_bbox cover the visible face/head area.
- If there is no visible human face, set has_face=false and face_bbox=null.
"""
def _encode_image(image_path: str) -> str:
    """Return the raw bytes of the file at ``image_path`` as base64 text."""
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def analyze_scene(
    scene: dict,
    transcript_text: str = "",
    channel_description: str = "",
    clip_style: str = "entertaining",
) -> dict:
    """Analyze a single scene using Qwen2.5-VL (vision + text multimodal fusion).

    Only the first three entries of ``scene["frame_paths"]`` are considered;
    entries whose file does not exist are skipped. The surviving frames are
    sent as base64 data URLs together with the formatted ``ANALYSIS_PROMPT``
    (and the transcript, when non-blank) in a single user message.

    Args:
        scene: Scene description. Reads ``frame_paths`` and, for logging only,
            ``start`` and ``end``.
        transcript_text: Transcript for the scene; appended to the prompt when
            it contains non-whitespace characters.
        channel_description: Free-text channel context; "General content
            creator" is used when empty.
        clip_style: Requested clip style inserted into the prompt.

    Returns:
        The JSON object returned by the model, as a dict. Keys are whatever
        the model produced (no defaults are merged in). On any failure — no
        usable frames, request error, unparsable or non-object reply — the
        result of ``_default_analysis()`` is returned instead.
    """

    def _as_float(value) -> float:
        # Log-only coercion: the model may return null or strings for numeric
        # fields, and a formatting error must never discard a valid analysis.
        try:
            return float(value or 0)
        except (TypeError, ValueError):
            return 0.0

    try:
        from openai import OpenAI

        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
        frame_paths = scene.get("frame_paths", [])
        if not frame_paths:
            return _default_analysis()

        content = []
        # Add up to 3 frames as base64 images
        for frame_path in frame_paths[:3]:
            if Path(frame_path).exists():
                b64 = _encode_image(frame_path)
                content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                })
        if not content:
            return _default_analysis()

        prompt = ANALYSIS_PROMPT.format(
            channel_description=channel_description or "General content creator",
            clip_style=clip_style,
        )
        if transcript_text.strip():
            prompt += f"\n\nTranscript for this segment:\n\"{transcript_text.strip()}\""
        content.append({"type": "text", "text": prompt})

        response = client.chat.completions.create(
            model=VLLM_MODEL,
            messages=[{"role": "user", "content": content}],
            max_tokens=300,
            temperature=0.1,
        )
        # message.content can be None; treat that as an empty reply so the
        # failure surfaces as a JSON error rather than an AttributeError.
        raw = (response.choices[0].message.content or "").strip()

        # Keep only the outermost {...} span. This drops markdown code fences,
        # a leading "json" tag, and any prose the model wrapped around the
        # object despite the instructions.
        obj_start, obj_end = raw.find("{"), raw.rfind("}")
        if obj_start != -1 and obj_end > obj_start:
            raw = raw[obj_start:obj_end + 1]
        analysis = json.loads(raw)
        if not isinstance(analysis, dict):
            raise ValueError(
                f"expected a JSON object, got {type(analysis).__name__}"
            )

        logger.debug(
            f"Scene [{_as_float(scene.get('start')):.1f}s-{_as_float(scene.get('end')):.1f}s]: "
            f"excitement={_as_float(analysis.get('excitement_score')):.2f} "
            f"tiktok={_as_float(analysis.get('tiktok_potential')):.2f} | "
            f"{str(analysis.get('highlight_reason') or '')[:60]}"
        )

        # Best-effort notification to the vLLM manager; the module may be
        # absent, and a failure here must not affect the analysis result.
        try:
            from src.gpu.vllm_manager import vllm_touch
            vllm_touch()
        except Exception:
            pass
        return analysis
    except Exception as e:
        logger.warning(
            f"Vision analysis failed at {_as_float(scene.get('start', 0)):.1f}s: {e}"
        )
        return _default_analysis()
async def analyze_scenes_batch_async(
    scenes_with_frames: list[dict],
    transcript_segments: list[dict],
    channel_description: str = "",
    clip_style: str = "entertaining",
) -> list[dict]:
    """Run ``analyze_scene`` for every scene concurrently.

    Each blocking ``analyze_scene`` call is handed to the event loop's default
    executor, and all calls are awaited together with ``asyncio.gather``.

    Args:
        scenes_with_frames: Scene dicts; ``start`` and ``end`` are read to
            match transcript segments, and each dict is passed to
            ``analyze_scene``.
        transcript_segments: Dicts with ``start``, ``end`` and ``text``.
        channel_description: Forwarded to ``analyze_scene``.
        clip_style: Forwarded to ``analyze_scene``.

    Returns:
        One dict per input scene, in input order: the original scene keys plus
        ``vision_analysis`` and ``transcript_text`` (the space-joined text of
        every segment overlapping the scene's time range).
    """
    event_loop = asyncio.get_running_loop()

    def _overlapping_text(scene: dict) -> str:
        # Collect the text of each segment whose interval intersects the scene.
        pieces = []
        for segment in transcript_segments:
            if not (segment["start"] < scene["end"] and segment["end"] > scene["start"]):
                continue
            pieces.append(segment["text"])
        return " ".join(pieces)

    async def _process(scene: dict) -> dict:
        text = _overlapping_text(scene)

        def _blocking_call() -> dict:
            return analyze_scene(scene, text, channel_description, clip_style)

        vision = await event_loop.run_in_executor(None, _blocking_call)
        enriched = {**scene, "vision_analysis": vision, "transcript_text": text}
        return enriched

    pending = [_process(scene) for scene in scenes_with_frames]
    results = await asyncio.gather(*pending)
    logger.info(f"Vision analysis complete: {len(results)} scenes")
    return list(results)
def _default_analysis() -> dict:
    """Build the neutral placeholder analysis used when no model result exists.

    A new dict is created on every call, so callers may mutate the result.
    """
    fallback = dict(
        excitement_score=0.5,
        humor_level=0.3,
        emotion="neutral",
        action_type="talking",
        has_face=False,
        face_bbox=None,
        highlight_reason="Vision model unavailable — using audio+text signals only",
        tiktok_potential=0.4,
    )
    return fallback
# Prompt template for per-segment HRE (high-retention editing) decisions. It is
# filled with str.format(), so the literal JSON braces are doubled ({{ }}); the
# placeholders are {seg_idx}, {n_total} and {context}.
HRE_SEGMENT_PROMPT = """Analyze these video frames for high-retention TikTok editing decisions.
Segment {seg_idx} of {n_total}. Transcript: "{context}"
Respond ONLY with valid JSON — no markdown:
{{
"zoom_direction": "<in|out|hold>",
"zoom_speed": "<fast|slow>",
"face_detected": <true|false>,
"face_cx": <0.0-1.0>,
"face_cy": <0.0-1.0>,
"subject_bbox": [<x1>, <y1>, <x2>, <y2>] or null,
"zoom_anchor_x": <0.0-1.0>,
"zoom_anchor_y": <0.0-1.0>,
"subtitle_position": "<top|bottom|left|right|center>",
"caption_x": <0.10-0.90>,
"caption_y": <0.12-0.88>,
"caption_anchor": <1-9>,
"caption_max_width_pct": <0.35-0.82>,
"subtitle_mode": "<word|phrase|sentence>",
"subtitle_emphasis": "<pop|punch|calm>",
"subtitle_color": "<white|yellow|cyan|orange|green>",
"energy_level": "<high|medium|low>",
"moment_type": "<hook|punchline|context|reaction|transition>"
}}
Rules:
- Primary priority: keep the speaker/person visible. If a person exists, subject_bbox and zoom_anchor must target the person/face before products/screens.
- If there is no person, target the product/object being discussed.
- For key statements, punchlines, surprising claims, numbers, product names, or memorable highlight words:
zoom_direction=in, zoom_speed=fast or slow, subtitle_mode=word, subtitle_emphasis=punch, energy_level=high.
- For normal explanatory speech:
zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
- Use zoom OUT only as breathing room after an intense/key moment.
- Sentence captions should sit around center-bottom: caption_x about 0.50, caption_y about 0.68-0.74, caption_anchor=2.
- Word highlight captions can sit center, mid-upper, mid-left, or mid-right with larger text, as long as they avoid the face/product.
- subtitle WORD: short hooks, reactions, punchlines, important keywords
- subtitle PHRASE: fast but understandable speech, 2-4 words at a time
- subtitle SENTENCE: explanation, normal conversation, low/medium energy
- subject_bbox: main face/person/product/object box in normalized frame coordinates, or null if unclear
- zoom_anchor_x/y: center of the face/person/product to keep important content in frame; never choose a blank wall/window
- caption_x/y: choose an actually empty readable area in this frame, not just fixed top/bottom
- caption_anchor: ASS anchor 1-9 matching caption_x/y (1 bottom-left, 5 center, 9 top-right)
- caption_max_width_pct: smaller when the empty space is narrow; captions must stay fully inside the 9:16 frame
- Keep captions away from face, product, hands, and important screen/object regions.
- Avoid choosing the exact same caption_x/y and subtitle_mode for every segment.
- face_cx/face_cy: face center as 0.0-1.0 fraction of frame
"""
def analyze_frames_for_hre(
    frame_paths: list["Path"],
    context: str = "",
    seg_idx: int = 0,
    n_total: int = 1,
) -> dict:
    """Per-segment HRE: zoom, caption placement, caption mode, and color.

    Only the first three entries of ``frame_paths`` are considered; entries
    whose file does not exist are skipped. The frames are sent as base64 data
    URLs together with the formatted ``HRE_SEGMENT_PROMPT``.

    Args:
        frame_paths: Paths of frames belonging to the segment.
        context: Transcript text for the segment; truncated to 320 characters
            before being inserted into the prompt.
        seg_idx: Index of this segment, passed to the prompt and to the
            fallback generator.
        n_total: Total number of segments, used the same way.

    Returns:
        ``_default_hre_analysis(seg_idx, n_total)`` overlaid with the model's
        JSON object. Keys the model set to null are ignored so that the
        numeric defaults (face centre, zoom anchor, caption position, ...)
        are never replaced by ``None``. On any failure the plain defaults are
        returned.
    """
    try:
        from openai import OpenAI

        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
        valid_frames = [Path(p) for p in frame_paths[:3] if Path(p).exists()]
        if not valid_frames:
            return _default_hre_analysis(seg_idx, n_total)

        content = []
        for frame_path in valid_frames:
            b64 = _encode_image(str(frame_path))
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
            })
        prompt = HRE_SEGMENT_PROMPT.format(
            seg_idx=seg_idx, n_total=n_total, context=context[:320]
        )
        content.append({"type": "text", "text": prompt})

        response = client.chat.completions.create(
            model=VLLM_MODEL,
            messages=[{
                "role": "user",
                "content": content,
            }],
            max_tokens=380,
            temperature=0.1,
        )
        # message.content can be None; treat that as an empty reply so the
        # failure surfaces as a JSON error rather than an AttributeError.
        raw = (response.choices[0].message.content or "").strip()

        # Keep only the outermost {...} span. This drops markdown code fences,
        # a leading "json" tag, and any prose wrapped around the object.
        obj_start, obj_end = raw.find("{"), raw.rfind("}")
        if obj_start != -1 and obj_end > obj_start:
            raw = raw[obj_start:obj_end + 1]
        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )

        # A JSON null carries no decision, so let the default stand for that
        # key. subject_bbox already defaults to None, so an explicit null for
        # it still results in None.
        overrides = {key: value for key, value in parsed.items() if value is not None}
        analysis = {**_default_hre_analysis(seg_idx, n_total), **overrides}

        logger.debug(
            f"HRE seg {seg_idx}/{n_total}: "
            f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
            f"caption=({analysis.get('caption_x')},{analysis.get('caption_y')}) "
            f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
            f"{analysis.get('subtitle_color')} "
            f"type={analysis.get('moment_type')}"
        )

        # Best-effort notification to the vLLM manager; the module may be
        # absent, and a failure here must not affect the analysis result.
        try:
            from src.gpu.vllm_manager import vllm_touch
            vllm_touch()
        except Exception:
            pass
        return analysis
    except Exception as e:
        logger.warning(f"HRE frame analysis failed (seg {seg_idx}): {e}")
        return _default_hre_analysis(seg_idx, n_total)
def analyze_frame_for_hre(
    frame_path: "Path",
    context: str = "",
    seg_idx: int = 0,
    n_total: int = 1,
) -> dict:
    """Single-frame convenience entry point kept for older callers.

    Wraps ``frame_path`` in a one-element list and delegates to
    ``analyze_frames_for_hre`` with the remaining arguments unchanged.
    """
    single_frame_list = [frame_path]
    return analyze_frames_for_hre(
        frame_paths=single_frame_list,
        context=context,
        seg_idx=seg_idx,
        n_total=n_total,
    )
def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
    """Produce fallback HRE decisions that vary with the segment's position.

    Zoom and moment type depend on whether the segment is first, last, or in
    between; caption styling cycles through six fixed presets by ``seg_idx``.
    Face and anchor fields are fixed at (0.5, 0.38) with no face detected.
    """
    # One row per preset:
    # (subtitle_position, (caption_x, caption_y), caption_anchor,
    #  subtitle_mode, subtitle_emphasis, subtitle_color)
    presets = (
        ("bottom", (0.50, 0.76), 2, "word", "punch", "yellow"),
        ("top", (0.50, 0.18), 8, "sentence", "calm", "white"),
        ("left", (0.28, 0.56), 4, "phrase", "pop", "cyan"),
        ("bottom", (0.50, 0.72), 2, "word", "punch", "orange"),
        ("right", (0.72, 0.56), 6, "sentence", "calm", "white"),
        ("top", (0.50, 0.20), 8, "phrase", "pop", "yellow"),
    )

    # The first-segment check comes first, so a single-segment clip is a hook.
    if seg_idx == 0:
        zoom = ("in", "fast", "hook")
    elif seg_idx == n_total - 1:
        zoom = ("out", "slow", "transition")
    elif seg_idx % 3 == 1:
        zoom = ("hold", "slow", "context")
    else:
        zoom = ("in", "slow", "reaction")
    direction, speed, moment_type = zoom

    position, (cap_x, cap_y), anchor, mode, emphasis, color = presets[seg_idx % len(presets)]

    return {
        "zoom_direction": direction,
        "zoom_speed": speed,
        "face_detected": False,
        "face_cx": 0.5,
        "face_cy": 0.38,
        "subject_bbox": None,
        "zoom_anchor_x": 0.5,
        "zoom_anchor_y": 0.38,
        "subtitle_position": position,
        "caption_x": cap_x,
        "caption_y": cap_y,
        "caption_anchor": anchor,
        "caption_max_width_pct": 0.62,
        "subtitle_mode": mode,
        "subtitle_emphasis": emphasis,
        "subtitle_color": color,
        "energy_level": "medium",
        "moment_type": moment_type,
    }
def get_emoji_for_scene(scene_text: str, emotion: str, action_type: str) -> str:
    """Use the configured Qwen2.5-VL model as a text prompt to select an emoji.

    Args:
        scene_text: Transcript text for the moment; only the first 200
            characters are sent to the model.
        emotion: Emotion label; also the first key tried in the static
            fallback table.
        action_type: Action label; the second key tried in the fallback table.

    Returns:
        The model's reply when it looks like an emoji (non-empty, at most four
        code points, and not pure ASCII). Otherwise the fallback table entry
        for ``emotion``, then for ``action_type``, then "⚡".
    """
    try:
        from openai import OpenAI

        client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
        response = client.chat.completions.create(
            model=VLLM_MODEL,
            messages=[{"role": "user", "content": (
                f"Select ONE emoji for this TikTok moment.\n"
                f"Emotion: {emotion}\nAction: {action_type}\n"
                f"Text: \"{scene_text[:200]}\"\n"
                f"Reply with ONLY the emoji character, nothing else."
            )}],
            max_tokens=5,
            temperature=0.3,
        )
        emoji = (response.choices[0].message.content or "").strip()
        # A length check alone would accept an empty reply or short plain text
        # such as "OK". str.isascii() is True for both, so requiring a
        # non-ASCII reply filters them out while still allowing emoji that
        # contain an ASCII code point (e.g. keycap sequences).
        if emoji and len(emoji) <= 4 and not emoji.isascii():
            return emoji
    except Exception:
        # Deliberate best-effort: any model/network problem falls through to
        # the static table below.
        pass

    emoji_map = {
        "happy": "😄", "excited": "🔥", "funny": "😂",
        "surprised": "😲", "angry": "😤", "sad": "😢",
        "neutral": "💡", "gaming": "🎮", "tutorial": "📚",
        "entertainment": "✨", "reaction": "😱",
    }
    return emoji_map.get(emotion) or emoji_map.get(action_type, "⚡")