Spaces:

lablab-ai-amd-developer-hackathon
/

ElevenClip-AI

Runtime error

App Files Files Community

ElevenClip-AI / backend /src /processing /high_retention.py

jakgritb

fix: keep faces centered in HRE crops

eb1790e verified about 2 months ago

Raw

History Blame Contribute Delete

38.6 kB

	"""High-Retention Editing pipeline — per-segment AI decisions.

	Each short segment (roughly 1.5-4.5s) gets its own zoom direction, subtitle
	position, subtitle mode, and caption color driven by Qwen2.5-VL analyzing a
	few sampled frames plus the local transcript for that segment.

	Pipeline per clip:
	1. Segment clip at speech pauses (chunks of at most 4.5s)
	2. Extract representative frames (25% / 50% / 75% marks) from each segment
	3. Qwen2.5-VL analyzes the frames → zoom + subtitle decisions
	4. ffmpeg filter_complex: per-segment zoompan + concat
	5. ASS subtitles with per-segment alignment/color/mode override tags
	"""
	import json
	import subprocess
	import tempfile
	from pathlib import Path
	from loguru import logger


	# ─── Video metadata ────────────────────────────────────────────────────────────

	def _probe_dimensions(video_path: Path) -> tuple[int, int]:
	probe = subprocess.run(
	["ffprobe", "-v", "error", "-select_streams", "v:0",
	"-show_entries", "stream=width,height", "-of", "csv=p=0",
	str(video_path)],
	capture_output=True, text=True,
	)
	try:
	w, h = map(int, probe.stdout.strip().split(","))
	return w, h
	except Exception:
	return 1080, 1920


	def _probe_duration(video_path: Path) -> float:
	probe = subprocess.run(
	["ffprobe", "-v", "error", "-show_entries", "format=duration",
	"-of", "csv=p=0", str(video_path)],
	capture_output=True, text=True,
	)
	try:
	return float(probe.stdout.strip())
	except Exception:
	return 0.0


	def _has_audio_stream(video_path: Path) -> bool:
	probe = subprocess.run(
	["ffprobe", "-v", "error", "-select_streams", "a",
	"-show_entries", "stream=codec_type", "-of", "csv=p=0",
	str(video_path)],
	capture_output=True, text=True,
	)
	return bool(probe.stdout.strip())


	# ─── Segmentation ─────────────────────────────────────────────────────────────

	def _segment_clip(
	duration: float,
	transcript: dict,
	clip_start: float,
	max_seg: float = 4.5,
	) -> list[dict]:
	"""Divide clip into segments at speech pauses, max_seg seconds each."""
	words: list[dict] = []
	for seg in transcript.get("segments", []):
	words.extend(seg.get("words", []))

	if clip_start > 0:
	words = [
	{**w, "start": max(0.0, w["start"] - clip_start),
	"end": max(0.0, w["end"] - clip_start)}
	for w in words
	]
	words = [w for w in words if w["end"] > 0 and w["start"] < duration]

	# Collect pause midpoints as candidate cut times
	cuts = [0.0]
	for i in range(len(words) - 1):
	gap = words[i + 1]["start"] - words[i]["end"]
	if gap > 0.2:
	cuts.append((words[i]["end"] + words[i + 1]["start"]) / 2.0)
	cuts.append(duration)
	cuts = sorted(set(cuts))

	# Merge short intervals, split long ones
	segs: list[dict] = []
	start = 0.0
	for cut in cuts[1:]:
	seg_len = cut - start
	if seg_len < 1.5 and cut < duration:
	continue # too short — extend to next cut
	if seg_len > max_seg:
	t = start
	while t + max_seg < cut:
	segs.append({"start": t, "end": t + max_seg})
	t += max_seg
	if cut - t > 0.5:
	segs.append({"start": t, "end": cut})
	start = cut
	else:
	segs.append({"start": start, "end": cut})
	start = cut

	# Fallback: split evenly if not enough segments
	if len(segs) < 2:
	n = max(2, round(duration / 4.0))
	d = duration / n
	segs = [{"start": i * d, "end": min((i + 1) * d, duration)} for i in range(n)]

	return segs


	# ─── Frame extraction ─────────────────────────────────────────────────────────

	def _extract_frame(video_path: Path, t: float, out_path: Path) -> bool:
	cmd = [
	"ffmpeg", "-y", "-ss", f"{t:.3f}", "-i", str(video_path),
	"-vframes", "1", "-q:v", "3", str(out_path),
	]
	result = subprocess.run(cmd, capture_output=True, timeout=30)
	return result.returncode == 0 and out_path.exists()


	def _extract_segment_frames(video_path: Path, seg: dict, seg_idx: int, tmp_dir: Path) -> list[Path]:
	    """Grab stills at the 25%, 50% and 75% marks of a segment.

	    Each timestamp is clamped to ``[start, end - 0.05]``. Only frames that
	    were successfully written are returned, in time order.
	    """
	    seg_start = float(seg["start"])
	    seg_end = float(seg["end"])
	    span = max(0.1, seg_end - seg_start)
	    latest = max(seg_start, seg_end - 0.05)

	    captured: list[Path] = []
	    for slot, fraction in enumerate((0.25, 0.50, 0.75)):
	        moment = min(max(seg_start, seg_start + span * fraction), latest)
	        target = tmp_dir / f"seg_{seg_idx:03d}_{slot}.jpg"
	        if _extract_frame(video_path, moment, target):
	            captured.append(target)
	    return captured


	def _detect_face_bbox_in_image(image_path: Path) -> list[float] \| None:
	"""Detect a human face in one frame and return a normalized padded bbox."""
	try:
	import cv2
	except Exception:
	return None

	image = cv2.imread(str(image_path))
	if image is None:
	return None

	fh, fw = image.shape[:2]
	if fw <= 0 or fh <= 0:
	return None

	cascade_paths = [
	Path(cv2.data.haarcascades) / "haarcascade_frontalface_default.xml",
	Path(cv2.data.haarcascades) / "haarcascade_profileface.xml",
	]
	cascades = [cv2.CascadeClassifier(str(p)) for p in cascade_paths if p.exists()]
	cascades = [c for c in cascades if not c.empty()]
	if not cascades:
	return None

	gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	gray = cv2.equalizeHist(gray)
	candidates: list[tuple[int, int, int, int]] = []
	min_size = (max(34, fw // 46), max(34, fh // 46))

	for cascade in cascades:
	faces = cascade.detectMultiScale(
	gray,
	scaleFactor=1.08,
	minNeighbors=4,
	minSize=min_size,
	)
	candidates.extend(tuple(map(int, face)) for face in faces)

	flipped = cv2.flip(gray, 1)
	flipped_faces = cascade.detectMultiScale(
	flipped,
	scaleFactor=1.08,
	minNeighbors=4,
	minSize=min_size,
	)
	for x, y, w, h in flipped_faces:
	candidates.append((fw - int(x) - int(w), int(y), int(w), int(h)))

	best: tuple[int, int, int, int] \| None = None
	best_score = 0.0
	for x, y, w, h in candidates:
	area = w * h
	if area <= 0:
	continue
	face_cy = (y + h / 2) / fh
	centrality = 1.0 - min(0.55, abs(face_cy - 0.38))
	score = area * centrality
	if score > best_score:
	best = (x, y, w, h)
	best_score = score

	if not best:
	return None

	x, y, w, h = best
	pad_x = w * 0.34
	pad_y_top = h * 0.46
	pad_y_bottom = h * 0.70
	return [
	max(0.0, (x - pad_x) / fw),
	max(0.0, (y - pad_y_top) / fh),
	min(1.0, (x + w + pad_x) / fw),
	min(1.0, (y + h + pad_y_bottom) / fh),
	]


	def _detect_segment_face_bbox(frame_paths: list[Path]) -> list[float] \| None:
	"""Pick the strongest face box across the sampled frames for a segment."""
	best_bbox: list[float] \| None = None
	best_area = 0.0
	for frame_path in frame_paths:
	bbox = _detect_face_bbox_in_image(frame_path)
	if not bbox:
	continue
	area = max(0.0, bbox[2] - bbox[0]) * max(0.0, bbox[3] - bbox[1])
	if area > best_area:
	best_bbox = bbox
	best_area = area

	if best_bbox:
	logger.info(
	"HRE face zoom target: "
	f"x={((best_bbox[0] + best_bbox[2]) / 2):.2f} "
	f"y={((best_bbox[1] + best_bbox[3]) / 2):.2f}"
	)
	return best_bbox


	def _apply_detected_face_override(analysis: dict, face_bbox: list[float] \| None) -> dict:
	if not face_bbox:
	return analysis
	x1, y1, x2, y2 = face_bbox
	face_cx = (x1 + x2) / 2.0
	face_cy = (y1 + y2) / 2.0
	return {
	**analysis,
	"face_detected": True,
	"subject_bbox": face_bbox,
	"face_cx": face_cx,
	"face_cy": face_cy,
	"zoom_anchor_x": face_cx,
	"zoom_anchor_y": face_cy,
	}


	# ─── Per-segment AI analysis ──────────────────────────────────────────────────

	def _analyze_segment(
	    video_path: Path,
	    seg: dict,
	    seg_idx: int,
	    n_total: int,
	    transcript: dict,
	    clip_start: float,
	    tmp_dir: Path,
	) -> dict:
	    """Produce the HRE analysis dict for one segment.

	    Samples frames from the segment, gathers the transcript words that
	    overlap it (segment times are shifted by ``clip_start`` before being
	    compared with word times), asks the vision model for decisions, then
	    lets a detected face override the zoom target. When no frame could be
	    extracted, the vision module's default analysis is returned.
	    """
	    from src.analysis.vision import analyze_frames_for_hre, _default_hre_analysis

	    frames = _extract_segment_frames(video_path, seg, seg_idx, tmp_dir)
	    if not frames:
	        return _default_hre_analysis(seg_idx, n_total)

	    window_start = seg["start"] + clip_start
	    window_end = seg["end"] + clip_start

	    spoken: list[str] = []
	    for block in transcript.get("segments", []):
	        for w in block.get("words", []):
	            if w.get("start", 0) < window_end and w.get("end", 0) > window_start:
	                spoken.append(w.get("word", w.get("text", "")))
	    context = " ".join(spoken).strip()

	    analysis = analyze_frames_for_hre(frames, context, seg_idx, n_total)
	    face_bbox = _detect_segment_face_bbox(frames)
	    return _apply_detected_face_override(analysis, face_bbox)


	# ─── Zoom expression builders ─────────────────────────────────────────────────

	def _build_zoom_exprs(
	    analysis: dict,
	    w: int,
	    h: int,
	) -> tuple[str, str, str]:
	    """Return (z_expr, x_expr, y_expr) for ffmpeg zoompan from HRE analysis.

	    Zoom-in segments pan towards ``zoom_anchor_x/y`` (falling back to the
	    face centre, then to (0.5, 0.38)), clamped so the crop window stays
	    inside the frame. Zoom-out and hold segments use a centred window.
	    ``w`` and ``h`` are accepted for interface compatibility but unused.
	    """
	    # The backslash-comma sequences below escape commas inside ffmpeg
	    # filter expressions.
	    direction = analysis.get("zoom_direction", "in")
	    fast = analysis.get("zoom_speed", "slow") == "fast"
	    zoom_anchor_x = _clamp_float(analysis.get("zoom_anchor_x"), _clamp_float(analysis.get("face_cx"), 0.5))
	    zoom_anchor_y = _clamp_float(analysis.get("zoom_anchor_y"), _clamp_float(analysis.get("face_cy"), 0.38))

	    if direction == "in":
	        z_expr = "min(1.0+on*0.0100\\,1.45)" if fast else "min(1.0+on*0.0035\\,1.28)"
	        x_expr = f"max(0\\,min(iw-iw/zoom\\,iw*{zoom_anchor_x:.3f}-iw/zoom/2))"
	        y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{zoom_anchor_y:.3f}-ih/zoom/2))"
	        return z_expr, x_expr, y_expr

	    if direction == "out":
	        z_expr = "max(1.45-on*0.0100\\,1.0)" if fast else "max(1.28-on*0.0040\\,1.0)"
	    else:  # hold
	        z_expr = "1.08"

	    # NOTE(review): out/hold crops ignore the face anchor and stay centred,
	    # exactly as before; the old vertically-biased "in" fallback here could
	    # never execute and has been removed.
	    return z_expr, "iw/2-(iw/zoom/2)", "ih/2-(ih/zoom/2)"


	# ─── Per-segment zoom via filter_complex ──────────────────────────────────────

	def _apply_per_segment_zoom(
	input_path: Path,
	segments: list[dict],
	analyses: list[dict],
	w: int,
	h: int,
	output_path: Path,
	has_audio: bool = True,
	) -> Path:
	"""Apply different zoompan to each segment, concat into single stream."""
	filter_parts: list[str] = []
	v_labels: list[str] = []
	a_labels: list[str] = []

	for i, (seg, analysis) in enumerate(zip(segments, analyses)):
	s = f"{seg['start']:.3f}"
	e = f"{seg['end']:.3f}"
	z, x, y = _build_zoom_exprs(analysis, w, h)
	zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30"
	filter_parts.append(
	f"[0:v]trim=start={s}:end={e},setpts=PTS-STARTPTS,fps=30,{zp},setpts=PTS-STARTPTS[v{i}]"
	)
	v_labels.append(f"[v{i}]")
	if has_audio:
	filter_parts.append(f"[0:a]atrim=start={s}:end={e},asetpts=PTS-STARTPTS[a{i}]")
	a_labels.append(f"[a{i}]")

	n = len(segments)
	filter_parts.append("".join(v_labels) + f"concat=n={n}:v=1:a=0[vout]")
	if has_audio:
	filter_parts.append("".join(a_labels) + f"concat=n={n}:v=0:a=1[aout]")

	cmd = [
	"ffmpeg", "-y", "-i", str(input_path),
	"-filter_complex", ";".join(filter_parts),
	"-map", "[vout]",
	]
	if has_audio:
	cmd += ["-map", "[aout]", "-c:a", "aac"]
	cmd += ["-c:v", "libx264", "-movflags", "+faststart", str(output_path)]

	result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
	if result.returncode == 0 and output_path.exists():
	logger.info(f"Per-segment zoom: {n} segments, {w}x{h}")
	return output_path
	logger.warning(f"Per-segment zoom failed: {result.stderr[-800:]}")
	return input_path


	# ─── Per-segment ASS subtitles ────────────────────────────────────────────────

	_ASS_COLORS = {
	"white": "&H00FFFFFF",
	"yellow": "&H0000FFFF",
	"cyan": "&H00FFFF00",
	"orange": "&H000066FF",
	"green": "&H0000FF00",
	"red": "&H000000FF",
	}

	_POSITIONS = {"top", "bottom", "left", "right", "center", "free"}
	_MODES = {"word", "phrase", "sentence"}
	_EMPHASIS = {"pop", "punch", "calm"}
	_ANCHORS = set(range(1, 10))


	def _ts(t: float) -> str:
	total_cs = max(0, int(round(t * 100)))
	h = total_cs // 360000
	total_cs %= 360000
	m = total_cs // 6000
	total_cs %= 6000
	s = total_cs // 100
	cs = total_cs % 100
	return f"{h}:{m:02d}:{s:02d}.{cs:02d}"


	def _pick(value: object, allowed: set[str], fallback: str) -> str:
	v = str(value or "").strip().lower()
	return v if v in allowed else fallback


	def _clamp_float(value: object, fallback: float, low: float = 0.0, high: float = 1.0) -> float:
	try:
	return min(high, max(low, float(value)))
	except Exception:
	return fallback


	def _clamp_int(value: object, fallback: int, allowed: set[int]) -> int:
	try:
	v = int(value)
	except Exception:
	return fallback
	return v if v in allowed else fallback


	def _normalise_bbox(value: object) -> list[float] \| None:
	if not isinstance(value, (list, tuple)) or len(value) != 4:
	return None
	try:
	coords = [float(v) for v in value]
	except Exception:
	return None
	if max(abs(v) for v in coords) > 1.5:
	return None
	x1, y1, x2, y2 = coords
	x1, x2 = sorted((min(1.0, max(0.0, x1)), min(1.0, max(0.0, x2))))
	y1, y2 = sorted((min(1.0, max(0.0, y1)), min(1.0, max(0.0, y2))))
	if x2 - x1 < 0.02 or y2 - y1 < 0.02:
	return None
	return [x1, y1, x2, y2]


	def _caption_anchor_for(x: float, y: float) -> int:
	if y < 0.34:
	return 8 if 0.30 <= x <= 0.70 else 7 if x < 0.5 else 9
	if y > 0.66:
	return 2 if 0.30 <= x <= 0.70 else 1 if x < 0.5 else 3
	return 5 if 0.34 <= x <= 0.66 else 4 if x < 0.5 else 6


	def _safe_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tuple[float, float, int]:
	    """Choose a caption point away from the subject, rotating by segment index.

	    Returns ``(x, y, anchor)``, where the anchor is derived from the chosen
	    point. Four candidate zones are cycled through with ``seg_idx``.
	    """
	    subject_on_left = subject_x < 0.50

	    # Zone 0: opposite side horizontally, vertical slot away from the subject.
	    if subject_y < 0.42:
	        opposite_y = 0.72
	    elif subject_y > 0.62:
	        opposite_y = 0.24
	    else:
	        opposite_y = 0.76

	    zones = (
	        (0.68 if subject_on_left else 0.32, opposite_y),
	        (0.72 if subject_on_left else 0.28, 0.50),
	        (0.50, 0.18 if subject_y > 0.45 else 0.82),
	        (0.50, 0.72),
	    )
	    x, y = zones[seg_idx % len(zones)]
	    return x, y, _caption_anchor_for(x, y)


	def _word_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tuple[float, float, int]:
	    """Choose a mid-frame point for single-word captions, rotating by segment index.

	    If the chosen point lies within 0.18 of the subject on both axes, it is
	    pushed sideways to the half of the frame opposite the subject. Returns
	    ``(x, y, anchor)``.
	    """
	    zones = (
	        (0.50, 0.42),
	        (0.50, 0.26),
	        (0.28 if subject_x > 0.55 else 0.72, 0.46),
	        (0.30 if subject_x > 0.50 else 0.70, 0.58),
	    )
	    x, y = zones[seg_idx % len(zones)]

	    near_horizontally = abs(x - subject_x) < 0.18
	    near_vertically = abs(y - subject_y) < 0.18
	    if near_horizontally and near_vertically:
	        x = 0.25 if subject_x > 0.5 else 0.75
	    return x, y, _caption_anchor_for(x, y)


	def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
	    """Validate model output and fill every HRE field the renderer reads.

	    Unknown or malformed values are replaced by fallbacks derived from the
	    segment's energy level, moment type and subject position. Caption
	    placement is then adjusted per subtitle mode and moved when it would
	    sit on the subject box, and the zoom treatment is finally forced by
	    mode and moment. Keys not handled here are passed through unchanged.
	    ``n_total`` is accepted but not used.
	    """
	    raw = dict(analysis or {})
	    punchy_moments = {"hook", "punchline", "reaction"}
	    quiet_moments = {"context", "transition"}

	    bbox = _normalise_bbox(raw.get("subject_bbox"))
	    energy = _pick(raw.get("energy_level"), {"high", "medium", "low"}, "medium")
	    moment = _pick(raw.get("moment_type"), punchy_moments | quiet_moments, "context")

	    # Default subtitle mode when the model gives none (or an invalid one).
	    if energy == "medium" and moment not in quiet_moments:
	        default_mode = "phrase"
	    elif energy == "high" or moment in punchy_moments:
	        default_mode = "word"
	    else:
	        default_mode = "sentence"

	    # Subject centre: box centre when a valid box exists, else face hints.
	    if bbox:
	        subject_x = (bbox[0] + bbox[2]) / 2.0
	        subject_y = (bbox[1] + bbox[3]) / 2.0
	    else:
	        subject_x = _clamp_float(raw.get("face_cx"), 0.5)
	        subject_y = _clamp_float(raw.get("face_cy"), 0.38)

	    pos = _pick(raw.get("subtitle_position"), _POSITIONS, "free")
	    mode = _pick(raw.get("subtitle_mode"), _MODES, default_mode)
	    default_emphasis = "punch" if mode == "word" else "calm"
	    emphasis = _pick(raw.get("subtitle_emphasis"), _EMPHASIS, default_emphasis)
	    color = _pick(raw.get("subtitle_color"), set(_ASS_COLORS), "white")
	    zoom_direction = _pick(raw.get("zoom_direction"), {"in", "out", "hold"}, "in")
	    zoom_speed = _pick(raw.get("zoom_speed"), {"fast", "slow"}, "slow")

	    face_cx = _clamp_float(raw.get("face_cx"), subject_x)
	    face_cy = _clamp_float(raw.get("face_cy"), subject_y)
	    zoom_anchor_x = _clamp_float(raw.get("zoom_anchor_x"), face_cx)
	    zoom_anchor_y = _clamp_float(raw.get("zoom_anchor_y"), face_cy)

	    safe_x, safe_y, safe_anchor = _safe_caption_point(subject_x, subject_y, seg_idx)
	    caption_x = _clamp_float(raw.get("caption_x"), safe_x, 0.10, 0.90)
	    caption_y = _clamp_float(raw.get("caption_y"), safe_y, 0.12, 0.88)
	    caption_anchor = _clamp_int(raw.get("caption_anchor"), safe_anchor, _ANCHORS)
	    default_width = 0.72 if mode == "sentence" else 0.58
	    caption_max_width_pct = _clamp_float(
	        raw.get("caption_max_width_pct"), default_width, 0.35, 0.82
	    )

	    if mode == "sentence":
	        # Sentences are pinned to a bottom-centre band and kept wide.
	        caption_x = 0.50
	        caption_y = _clamp_float(raw.get("caption_y"), 0.70, 0.64, 0.74)
	        caption_anchor = 2
	        caption_max_width_pct = max(caption_max_width_pct, 0.68)
	    elif mode == "word":
	        word_point = _word_caption_point(subject_x, subject_y, seg_idx)
	        sits_low = caption_y > 0.66
	        sits_on_subject = (
	            abs(caption_x - subject_x) < 0.14 and abs(caption_y - subject_y) < 0.14
	        )
	        if sits_low or sits_on_subject:
	            caption_x, caption_y, caption_anchor = word_point
	        # NOTE(review): indentation was lost in the source; the width cap is
	        # assumed to apply to every word-mode caption, not only relocated ones.
	        caption_max_width_pct = min(caption_max_width_pct, 0.56)

	    if bbox:
	        x1, y1, x2, y2 = bbox
	        inside_x = (x1 - 0.08) <= caption_x <= (x2 + 0.08)
	        inside_y = (y1 - 0.08) <= caption_y <= (y2 + 0.08)
	        if inside_x and inside_y:
	            if mode == "sentence":
	                caption_x, caption_y, caption_anchor = 0.50, 0.70, 2
	            elif mode == "word":
	                caption_x, caption_y, caption_anchor = _word_caption_point(
	                    subject_x, subject_y, seg_idx + 1
	                )
	            else:
	                caption_x, caption_y, caption_anchor = safe_x, safe_y, safe_anchor

	    if seg_idx == 0:
	        # The first segment gets a fast zoom-in and punchy word captions.
	        # NOTE(review): all three statements are assumed to belong to this
	        # branch, and the two inner checks to be siblings.
	        zoom_direction, zoom_speed = "in", "fast"
	        if mode == "sentence":
	            mode = "word"
	        if emphasis == "calm":
	            emphasis = "punch"

	    if mode == "word" or moment in punchy_moments:
	        zoom_direction = "in"
	        zoom_speed = "fast" if energy == "high" else "slow"
	        if emphasis == "calm":
	            emphasis = "punch"
	    elif mode == "sentence" and moment in quiet_moments:
	        zoom_direction = "hold"
	        zoom_speed = "slow"
	        emphasis = "calm"

	    return {
	        **raw,
	        "zoom_direction": zoom_direction,
	        "zoom_speed": zoom_speed,
	        "face_detected": bool(raw.get("face_detected", False)),
	        "face_cx": face_cx,
	        "face_cy": face_cy,
	        "subject_bbox": bbox,
	        "zoom_anchor_x": zoom_anchor_x,
	        "zoom_anchor_y": zoom_anchor_y,
	        "subtitle_position": pos,
	        "caption_x": caption_x,
	        "caption_y": caption_y,
	        "caption_anchor": caption_anchor,
	        "caption_max_width_pct": caption_max_width_pct,
	        "subtitle_mode": mode,
	        "subtitle_emphasis": emphasis,
	        "subtitle_color": color,
	        "energy_level": energy,
	        "moment_type": moment,
	    }


	def _build_hre_plan(segments: list[dict], analyses: list[dict]) -> list[dict]:
	    """Normalise each analysis and tag it with its segment index and timing.

	    When every entry ends up with the same caption position and subtitle
	    mode, a fixed rotation of positions and modes is applied so the
	    treatment visibly changes across the clip.
	    """
	    total = len(segments)
	    plan = [
	        {
	            **_normalise_analysis(analysis, idx, total),
	            "segment_index": idx,
	            "start": seg["start"],
	            "end": seg["end"],
	        }
	        for idx, (seg, analysis) in enumerate(zip(segments, analyses))
	    ]

	    def _treatment(entry: dict) -> tuple:
	        # Signature used to detect a plan that never varies its captions.
	        return (round(entry["caption_x"], 2), round(entry["caption_y"], 2), entry["subtitle_mode"])

	    if len(plan) > 1 and len({_treatment(entry) for entry in plan}) == 1:
	        rotation = (
	            ((0.50, 0.76), "word"),
	            ((0.50, 0.18), "sentence"),
	            ((0.28, 0.56), "phrase"),
	            ((0.72, 0.52), "word"),
	            ((0.50, 0.82), "sentence"),
	            ((0.50, 0.22), "phrase"),
	        )
	        for idx, entry in enumerate(plan):
	            (cap_x, cap_y), cap_mode = rotation[idx % len(rotation)]
	            entry["subtitle_position"] = "free"
	            entry["caption_x"], entry["caption_y"] = cap_x, cap_y
	            entry["caption_anchor"] = _caption_anchor_for(cap_x, cap_y)
	            entry["subtitle_mode"] = cap_mode
	            if cap_mode == "word":
	                entry["subtitle_emphasis"] = "punch"

	    return plan


	def _ass_escape(text: str) -> str:
	return (
	text.replace("{", "(")
	.replace("}", ")")
	.replace("\r", " ")
	.replace("\n", " ")
	.strip()
	)


	def _wrap_text(text: str, max_chars: int) -> str:
	    r"""Escape ``text`` and break it into ASS lines joined with ``\N``.

	    Text that fits in ``max_chars`` is returned as-is. A single long word is
	    hard-split every ``max_chars`` characters. Otherwise words are packed
	    greedily, and everything after the first line is merged into one second
	    line, so the result has at most two lines.
	    """
	    line_break = r"\N"
	    text = _ass_escape(text)
	    if len(text) <= max_chars:
	        return text

	    tokens = text.split()
	    if len(tokens) <= 1:
	        pieces = [text[pos:pos + max_chars] for pos in range(0, len(text), max_chars)]
	        return line_break.join(pieces)

	    packed: list[str] = []
	    current = ""
	    for token in tokens:
	        extended = f"{current} {token}".strip()
	        if current and len(extended) > max_chars:
	            packed.append(current)
	            current = token
	        else:
	            current = extended
	    if current:
	        packed.append(current)

	    head, *rest = packed
	    if not rest:
	        return head
	    return head + line_break + " ".join(rest)


	def _collect_clip_words(transcript: dict, clip_start: float, duration: float) -> list[dict]:
	words: list[dict] = []
	for seg in transcript.get("segments", []):
	seg_start = float(seg.get("start", clip_start)) - clip_start
	seg_end = float(seg.get("end", clip_start)) - clip_start
	for word in seg.get("words", []):
	text = str(word.get("word", word.get("text", ""))).strip()
	if not text:
	continue
	start = float(word.get("start", seg_start + clip_start)) - clip_start
	end = float(word.get("end", word.get("start", seg_end + clip_start))) - clip_start
	if end <= start:
	end = start + 0.24
	if end <= 0 or start >= duration:
	continue
	words.append({
	"start": max(0.0, start),
	"end": min(duration, end),
	"text": text,
	})
	return sorted(words, key=lambda w: (w["start"], w["end"]))


	def _segment_text(transcript: dict, clip_start: float, seg: dict) -> str:
	parts: list[str] = []
	for item in transcript.get("segments", []):
	start = float(item.get("start", clip_start)) - clip_start
	end = float(item.get("end", clip_start)) - clip_start
	if start < seg["end"] and end > seg["start"]:
	text = str(item.get("text", "")).strip()
	if text:
	parts.append(text)
	return " ".join(parts).strip()


	def _words_in_segment(words: list[dict], seg: dict) -> list[dict]:
	return [
	w for w in words
	if w["start"] < seg["end"] and w["end"] > seg["start"]
	]


	def _display_text(text: str, mode: str, emphasis: str) -> str:
	text = text.strip()
	if mode == "sentence" and emphasis == "calm":
	return text
	return text.upper()


	def _append_event(events: list[dict], start: float, end: float, text: str, plan: dict) -> None:
	start = max(float(plan["start"]), start)
	end = min(float(plan["end"]), end)
	if end - start < 0.08 or not text.strip():
	return
	events.append({
	"start": start,
	"end": end,
	"text": text.strip(),
	"plan": plan,
	})


	def _word_events(words: list[dict], seg: dict, plan: dict) -> list[dict]:
	    """Build one caption event per word, without overlaps inside the segment.

	    Each word is shown for at least ``min_d`` and at most ``max_d`` seconds
	    (both shorter for high-energy segments), and is cut just before the next
	    word begins when that still leaves the minimum duration.
	    """
	    seg_start, seg_end = seg["start"], seg["end"]
	    high_energy = plan["energy_level"] == "high"
	    min_d = 0.14 if high_energy else 0.18
	    max_d = 0.72 if high_energy else 0.95

	    # Start time of the following word; the last word runs up to segment end.
	    following_starts = [w["start"] for w in words[1:]] + [seg_end]

	    events: list[dict] = []
	    cursor = seg_start
	    for word, next_start in zip(words, following_starts):
	        shown_from = max(seg_start, word["start"], cursor)
	        shown_to = min(seg_end, max(word["end"], shown_from + min_d), shown_from + max_d)
	        if next_start > shown_from:
	            shown_to = min(shown_to, max(shown_from + min_d, next_start - 0.015))
	        if shown_to <= shown_from:
	            shown_to = min(seg_end, shown_from + min_d)

	        _append_event(events, shown_from, shown_to, word["text"], plan)
	        cursor = shown_to + 0.015  # small gap so consecutive words never overlap
	        if cursor >= seg_end:
	            break

	    return events


	def _line_events(
	    words: list[dict],
	    seg: dict,
	    plan: dict,
	    max_words: int,
	    max_duration: float,
	    max_chars: int,
	) -> list[dict]:
	    """Group consecutive words into multi-word caption events.

	    A line is closed before a word that would exceed ``max_words``,
	    ``max_duration`` or ``max_chars``; the first word of a line is always
	    accepted. Every line is shown for at least 0.55s (limited by the segment
	    end) and lines are separated by a 0.04s gap.
	    """
	    seg_start, seg_end = seg["start"], seg["end"]
	    total = len(words)
	    events: list[dict] = []
	    idx = 0
	    cursor = seg_start

	    while idx < total and cursor < seg_end - 0.08:
	        line_start = max(seg_start, words[idx]["start"], cursor)
	        line_end = line_start
	        line_words: list[dict] = []
	        line_chars = 0

	        while idx < total:
	            word = words[idx]
	            word_end = min(seg_end, max(word["end"], word["start"] + 0.2))
	            grown_chars = line_chars + len(word["text"]) + (1 if line_words else 0)
	            if line_words and (
	                len(line_words) >= max_words
	                or word_end - line_start > max_duration
	                or grown_chars > max_chars
	            ):
	                break
	            line_words.append(word)
	            line_chars = grown_chars
	            line_end = max(line_end, word_end)
	            idx += 1

	        # The inner loop always consumes at least one word, so the line is
	        # never empty here.
	        line_end = min(seg_end, max(line_end, line_start + 0.55))
	        caption = " ".join(w["text"] for w in line_words)
	        _append_event(events, line_start, line_end, caption, plan)
	        cursor = line_end + 0.04

	    return events


	def _fallback_text_events(text: str, seg: dict, plan: dict) -> list[dict]:
	    """Spread segment text evenly over the segment when word timings are unusable.

	    The text is chunked by word count according to the subtitle mode (1, 3
	    or 7 words). Text with no spaces that is longer than 20 characters is
	    first sliced by character count. Each chunk gets an equal time share.
	    """
	    if not text:
	        return []

	    mode = plan["subtitle_mode"]
	    if mode == "word":
	        words_per_chunk, slice_len = 1, 10
	    elif mode == "phrase":
	        words_per_chunk, slice_len = 3, 24
	    else:
	        words_per_chunk, slice_len = 7, 36

	    units = text.split()
	    if len(units) <= 1 and len(text) > 20:
	        units = [text[pos:pos + slice_len] for pos in range(0, len(text), slice_len)]

	    chunks: list[str] = []
	    for pos in range(0, len(units), words_per_chunk):
	        joined = " ".join(units[pos:pos + words_per_chunk])
	        if joined.strip():
	            chunks.append(joined)
	    if not chunks:
	        return []

	    events: list[dict] = []
	    seg_d = max(0.1, seg["end"] - seg["start"])
	    share = seg_d / len(chunks)
	    for slot, chunk in enumerate(chunks):
	        slot_start = seg["start"] + slot * share
	        slot_end = seg["start"] + (slot + 1) * share
	        _append_event(events, slot_start, slot_end, chunk, plan)
	    return events


	def _build_subtitle_events(
	    transcript: dict,
	    clip_start: float,
	    duration: float,
	    segments: list[dict],
	    plan: list[dict],
	) -> list[dict]:
	    """Create the ordered, non-overlapping caption events for the whole clip.

	    Each segment is rendered in its planned mode from word timings. When
	    that yields nothing, the segment's transcript text is spread evenly over
	    the segment. Finally events are sorted and trimmed so that only one is
	    visible at any moment.
	    """
	    clip_words = _collect_clip_words(transcript, clip_start, duration)
	    gathered: list[dict] = []

	    for seg, seg_plan in zip(segments, plan):
	        seg_words = _words_in_segment(clip_words, seg)
	        mode = seg_plan["subtitle_mode"]

	        if not seg_words:
	            seg_events = []
	        elif mode == "word":
	            seg_events = _word_events(seg_words, seg, seg_plan)
	        elif mode == "phrase":
	            seg_events = _line_events(seg_words, seg, seg_plan, max_words=3, max_duration=1.7, max_chars=28)
	        else:
	            seg_events = _line_events(seg_words, seg, seg_plan, max_words=7, max_duration=2.8, max_chars=44)

	        if not seg_events:
	            fallback_text = _segment_text(transcript, clip_start, seg)
	            seg_events = _fallback_text_events(fallback_text, seg, seg_plan)
	        gathered.extend(seg_events)

	    gathered.sort(key=lambda ev: (ev["start"], ev["end"]))

	    # ASS draws all active events at once; keep one visible caption event at
	    # a time so word/phrase/sentence modes never stack on top of each other.
	    cleaned: list[dict] = []
	    cursor = 0.0
	    for ev in gathered:
	        shown_from = max(ev["start"], cursor)
	        shown_to = min(duration, ev["end"])
	        if shown_to - shown_from < 0.08:
	            continue
	        cleaned.append({**ev, "start": shown_from, "end": shown_to})
	        cursor = shown_to + 0.01
	    return cleaned


	def _subtitle_tag(plan: dict) -> tuple[str, int]:
	    """Build the ASS override prefix for a plan entry.

	    Returns ``(tag, max_chars)``: ``tag`` holds alignment, position, colour,
	    font size and an optional entrance animation; ``max_chars`` is the
	    per-line wrap budget (8-34) derived from the caption width and font
	    size. Positions are computed on the 1080x1920 script canvas.
	    """
	    mode = plan["subtitle_mode"]
	    energy = plan["energy_level"]
	    emphasis = plan["subtitle_emphasis"]
	    color = _ASS_COLORS.get(plan["subtitle_color"], "&H00FFFFFF")
	    alignment = int(plan.get("caption_anchor", 5))
	    x = round(_clamp_float(plan.get("caption_x"), 0.5, 0.08, 0.92) * 1080)
	    y = round(_clamp_float(plan.get("caption_y"), 0.75, 0.10, 0.90) * 1920)
	    width_pct = _clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82)
	    max_width_px = max(360, min(886, int(width_pct * 1080)))

	    if mode == "sentence":
	        font_size = 60 if energy == "high" else 54
	    elif mode == "phrase":
	        font_size = 62 if energy == "low" else 68
	    else:
	        font_size = 96 if energy == "high" else 84

	    # Middle-row anchors get a slightly smaller font (never below 54).
	    if alignment in {4, 5, 6}:
	        font_size = max(54, font_size - 4)

	    # 0.58 * font size is the per-character width estimate used for wrapping.
	    max_chars = max(8, min(34, int(max_width_px / (font_size * 0.58))))

	    style_tags = (
	        f"\\an{alignment}\\pos({x},{y})\\1c{color}&\\fs{font_size}"
	        "\\b1\\bord5\\shad1\\q2\\fad(30,70)"
	    )
	    tag = "{" + style_tags + "}"
	    if emphasis == "punch" or mode == "word":
	        tag += "{\\fscx132\\fscy132\\frz-2\\t(0,140,\\fscx100\\fscy100\\frz0)}"
	    elif emphasis == "pop":
	        tag += "{\\fscx118\\fscy118\\t(0,120,\\fscx100\\fscy100)}"
	    return tag, max_chars


	def _generate_per_segment_subtitles(
	    transcript: dict,
	    ass_path: Path,
	    clip_start: float,
	    segments: list[dict],
	    analyses: list[dict],
	) -> None:
	    """Write one ASS file (plus a JSON dump of the plan) from the HRE plan.

	    The important rule is that HRE can change style every segment, but it
	    must never emit simultaneous caption events at the same timestamp.

	    ``analyses`` may be raw per-segment analyses or an already-built HRE
	    plan. A plan is used as-is: normalising it a second time would move
	    captions and change zoom fields, so the subtitles and the saved plan
	    could disagree with the zoom that was already rendered from it.
	    """
	    # Keys every plan entry carries after normalisation and tagging.
	    plan_keys = {
	        "segment_index", "start", "end", "caption_x", "caption_y",
	        "caption_anchor", "caption_max_width_pct", "subtitle_mode",
	        "subtitle_emphasis", "subtitle_color", "energy_level",
	    }
	    already_planned = (
	        bool(analyses)
	        and len(analyses) == len(segments)
	        and all(
	            isinstance(entry, dict)
	            and plan_keys <= entry.keys()
	            and entry["start"] == seg["start"]
	            and entry["end"] == seg["end"]
	            for seg, entry in zip(segments, analyses)
	        )
	    )

	    duration = max((float(seg["end"]) for seg in segments), default=0.0)
	    plan = list(analyses) if already_planned else _build_hre_plan(segments, analyses)
	    events = _build_subtitle_events(transcript, clip_start, duration, segments, plan)

	    lines = [
	        "[Script Info]",
	        "ScriptType: v4.00+",
	        "PlayResX: 1080",
	        "PlayResY: 1920",
	        "ScaledBorderAndShadow: yes",
	        "",
	        "[V4+ Styles]",
	        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
	        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
	        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
	        "Alignment, MarginL, MarginR, MarginV, Encoding",
	        "Style: Default,Noto Sans,82,&H00FFFFFF,&H0000FFFF,&H00000000,&H80000000,"
	        "-1,0,0,0,100,100,0,0,1,5,1,2,40,40,200,1",
	        "",
	        "[Events]",
	        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
	    ]

	    for ev in events:
	        seg_plan = ev["plan"]
	        tag, max_chars = _subtitle_tag(seg_plan)
	        text = _display_text(ev["text"], seg_plan["subtitle_mode"], seg_plan["subtitle_emphasis"])
	        text = _wrap_text(text, max_chars)

	        lines.append(
	            f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])},"
	            f"Default,,0,0,0,,{tag}{text}"
	        )

	    ass_path.write_text("\n".join(lines), encoding="utf-8")
	    # The plan is saved next to the subtitles as <name>.hre_plan.json.
	    plan_path = ass_path.with_suffix(".hre_plan.json")
	    plan_path.write_text(json.dumps(plan, ensure_ascii=False, indent=2), encoding="utf-8")
	    logger.debug(f"ASS: {len(events)} events across {len(segments)} HRE segments")


	# ─── Emoji ─────────────────────────────────────────────────────────────────────

	def _get_emoji(clip_data: dict, analyses: list[dict] \| None = None) -> str:
	if analyses:
	energy_rank = {"high": 3, "medium": 2, "low": 1}
	best = max(analyses, key=lambda a: energy_rank.get(a.get("energy_level", "low"), 1))
	moment_emoji = {
	"hook": "🔥", "punchline": "😂", "reaction": "😲",
	"context": "💡", "transition": "✨",
	}
	if emoji := moment_emoji.get(best.get("moment_type", "")):
	return emoji

	a = clip_data.get("vision_analysis", {})
	emotion = a.get("emotion", "excited")
	action = a.get("action_type", "entertainment")
	transcript_text = clip_data.get("transcript_text", "")
	if transcript_text:
	try:
	from src.analysis.vision import get_emoji_for_scene
	return get_emoji_for_scene(transcript_text, emotion, action)
	except Exception:
	pass

	fb = {"happy": "😄", "excited": "🔥", "funny": "😂", "surprised": "😲",
	"gaming": "🎮", "tutorial": "📚", "angry": "😤", "sad": "😢"}
	return fb.get(emotion, fb.get(action, "⚡"))


	# ─── Final render ─────────────────────────────────────────────────────────────

	def _render_final(
	    video_path: Path,
	    ass_path: Path,
	    emoji: str,
	    output_path: Path,
	) -> None:
	    """Burn the ASS subtitles (and a 3-second emoji overlay) into the video.

	    A first pass renders subtitles plus the emoji drawtext. If that pass
	    fails, times out or cannot be launched, a second pass renders the
	    subtitles only. When both fail an error is logged and nothing is
	    raised; ``output_path`` may then not exist.
	    """
	    # Forward slashes and an escaped drive colon keep the path readable by
	    # the ffmpeg filter option parser.
	    ass_str = str(ass_path).replace("\\", "/").replace(":", "\\:")
	    emoji_filter = (
	        f"drawtext=text='{emoji}':fontsize=80:x=w-100:y=50"
	        f":enable='between(t\\,0\\,3)'"
	    )

	    def _encode(vf: str, output_flags: list[str]) -> tuple[bool, str]:
	        """Run one ffmpeg pass; return (succeeded, error detail)."""
	        cmd = [
	            "ffmpeg", "-y", "-i", str(video_path),
	            "-vf", vf, "-c:v", "libx264", "-c:a", "copy",
	            *output_flags, str(output_path),
	        ]
	        try:
	            result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
	        except (OSError, subprocess.TimeoutExpired) as exc:
	            return False, str(exc)
	        return result.returncode == 0, result.stderr

	    ok, _ = _encode(f"ass='{ass_str}',{emoji_filter}", ["-movflags", "+faststart"])
	    if not ok:
	        # Retry without the emoji overlay (subtitles only).
	        ok, detail = _encode(f"ass='{ass_str}'", [])
	        if not ok:
	            logger.error(f"HRE render failed: {detail[-300:]}")
	            return
	    logger.info(f"HRE render complete → {output_path.name}")


	# ─── Main pipeline ────────────────────────────────────────────────────────────

	def apply_hre(
	    clip_path: Path,
	    clip_data: dict,
	    transcript: dict,
	    output_path: Path,
	) -> Path:
	    """Apply per-segment AI-driven HRE with varied zoom and caption plans.

	    Steps: probe the clip, segment it at speech pauses, analyse every
	    segment, build the HRE plan, render per-segment zoom, write the ASS
	    subtitles next to ``output_path``, pick an emoji and render the final
	    video. Intermediate files live in a temporary directory. Always returns
	    ``output_path``.
	    """
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    clip_start = clip_data.get("start", 0.0)

	    with tempfile.TemporaryDirectory() as scratch:
	        tmp_dir = Path(scratch)
	        zoom_target = tmp_dir / "zoomed.mp4"

	        w, h = _probe_dimensions(clip_path)
	        duration = _probe_duration(clip_path)
	        if duration <= 0:
	            # Probe failed: derive the length from clip metadata (30s default).
	            duration = float(clip_data.get("end", clip_start + 30)) - clip_start
	        has_audio = _has_audio_stream(clip_path)

	        # 1. Segment at speech pauses
	        segments = _segment_clip(duration, transcript, clip_start)
	        n = len(segments)
	        logger.info(
	            f"HRE clip {clip_data.get('index', '?')}: "
	            f"{duration:.1f}s → {n} segments (AI analyzing each)"
	        )

	        # 2. Qwen2.5-VL analyzes each segment
	        analyses = []
	        for idx, seg in enumerate(segments):
	            analyses.append(
	                _analyze_segment(clip_path, seg, idx, n, transcript, clip_start, tmp_dir)
	            )
	        plan = _build_hre_plan(segments, analyses)

	        for seg, entry in zip(segments, plan):
	            logger.info(
	                f" [{seg['start']:.1f}s-{seg['end']:.1f}s] "
	                f"zoom={entry.get('zoom_direction')}({entry.get('zoom_speed')}) "
	                f"sub={entry.get('subtitle_position')}/{entry.get('subtitle_mode')}/"
	                f"{entry.get('subtitle_color')} "
	                f"type={entry.get('moment_type')} energy={entry.get('energy_level')}"
	            )

	        # 3. Per-segment zoom via filter_complex
	        zoomed = _apply_per_segment_zoom(
	            clip_path, segments, plan, w, h, zoom_target, has_audio=has_audio
	        )

	        # 4. Per-segment ASS subtitles
	        ass_path = output_path.with_suffix(".ass")
	        _generate_per_segment_subtitles(transcript, ass_path, clip_start, segments, plan)

	        # 5. Emoji from highest-energy segment
	        emoji = _get_emoji(clip_data, plan)

	        # 6. Render
	        _render_final(zoomed, ass_path, emoji, output_path)

	    return output_path