Spaces:

lablab-ai-amd-developer-hackathon
/

ElevenClip-AI

Runtime error

App Files Files Community

jakgritb committed 15 days ago

Commit

e82c4ed

verified ·

1 Parent(s): 936060a

fix: improve HRE spatial editing

Browse files

Files changed (3) hide show

backend/src/analysis/vision.py +52 -15
backend/src/processing/clip_extractor.py +23 -4
backend/src/processing/high_retention.py +140 -42

backend/src/analysis/vision.py CHANGED Viewed

@@ -161,7 +161,7 @@ def _default_analysis() -> dict:
     }
-HRE_SEGMENT_PROMPT = """Analyze this video frame for high-retention TikTok editing decisions.
 Segment {seg_idx} of {n_total}. Transcript: "{context}"
@@ -172,7 +172,14 @@ Respond ONLY with valid JSON — no markdown:
   "face_detected": <true|false>,
   "face_cx": <0.0-1.0>,
   "face_cy": <0.0-1.0>,
   "subtitle_position": "<top|bottom|left|right|center>",
   "subtitle_mode": "<word|phrase|sentence>",
   "subtitle_emphasis": "<pop|punch|calm>",
   "subtitle_color": "<white|yellow|cyan|orange|green>",
@@ -189,16 +196,19 @@ Rules:
 - subtitle WORD: short hooks, reactions, punchlines, important keywords
 - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
 - subtitle SENTENCE: explanation, normal conversation, low/medium energy
-- subtitle TOP: face is in bottom half
-- subtitle BOTTOM: face is in top half
-- subtitle LEFT/RIGHT: face or main object is on the opposite side
-- Avoid choosing the exact same subtitle_position and subtitle_mode for every segment.
 - face_cx/face_cy: face center as 0.0-1.0 fraction of frame
 """
-def analyze_frame_for_hre(
-    frame_path: "Path",
     context: str = "",
     seg_idx: int = 0,
     n_total: int = 1,
@@ -208,23 +218,29 @@ def analyze_frame_for_hre(
         from openai import OpenAI
         client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
-        if not Path(frame_path).exists():
             return _default_hre_analysis(seg_idx, n_total)
-        b64 = _encode_image(str(frame_path))
         prompt = HRE_SEGMENT_PROMPT.format(
-            seg_idx=seg_idx, n_total=n_total, context=context[:200]
         )
         response = client.chat.completions.create(
             model=VLLM_MODEL,
             messages=[{
                 "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
-                    {"type": "text", "text": prompt},
-                ],
             }],
-            max_tokens=200,
             temperature=0.1,
         )
         raw = response.choices[0].message.content.strip()
@@ -238,6 +254,7 @@ def analyze_frame_for_hre(
         logger.debug(
             f"HRE seg {seg_idx}/{n_total}: "
             f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
             f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
             f"{analysis.get('subtitle_color')} "
             f"type={analysis.get('moment_type')}"
@@ -254,6 +271,16 @@ def analyze_frame_for_hre(
         return _default_hre_analysis(seg_idx, n_total)
 def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
     """Fallback with varied decisions based on position in clip."""
     if seg_idx == 0:
@@ -267,8 +294,11 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
     _colors    = ["yellow", "white", "cyan", "orange", "white", "yellow"]
     _positions = ["bottom", "top", "left", "bottom", "right", "top"]
     _modes     = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
     _emphasis  = ["punch", "calm", "pop", "punch", "calm", "pop"]
     return {
         "zoom_direction":    zoom_dir,
@@ -276,7 +306,14 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
         "face_detected":     False,
         "face_cx":           0.5,
         "face_cy":           0.38,
         "subtitle_position": _positions[seg_idx % len(_positions)],
         "subtitle_mode":     _modes[seg_idx % len(_modes)],
         "subtitle_emphasis": _emphasis[seg_idx % len(_emphasis)],
         "subtitle_color":    _colors[seg_idx % len(_colors)],

     }
+HRE_SEGMENT_PROMPT = """Analyze these video frames for high-retention TikTok editing decisions.
 Segment {seg_idx} of {n_total}. Transcript: "{context}"
   "face_detected": <true|false>,
   "face_cx": <0.0-1.0>,
   "face_cy": <0.0-1.0>,
+  "subject_bbox": [<x1>, <y1>, <x2>, <y2>] or null,
+  "zoom_anchor_x": <0.0-1.0>,
+  "zoom_anchor_y": <0.0-1.0>,
   "subtitle_position": "<top|bottom|left|right|center>",
+  "caption_x": <0.10-0.90>,
+  "caption_y": <0.12-0.88>,
+  "caption_anchor": <1-9>,
+  "caption_max_width_pct": <0.35-0.82>,
   "subtitle_mode": "<word|phrase|sentence>",
   "subtitle_emphasis": "<pop|punch|calm>",
   "subtitle_color": "<white|yellow|cyan|orange|green>",
 - subtitle WORD: short hooks, reactions, punchlines, important keywords
 - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
 - subtitle SENTENCE: explanation, normal conversation, low/medium energy
+- subject_bbox: main face/person/product/object box in normalized frame coordinates, or null if unclear
+- zoom_anchor_x/y: center of the face/person/product to keep important content in frame; never choose a blank wall/window
+- caption_x/y: choose an actually empty readable area in this frame, not just fixed top/bottom
+- caption_anchor: ASS anchor 1-9 matching caption_x/y (1 bottom-left, 5 center, 9 top-right)
+- caption_max_width_pct: smaller when the empty space is narrow; captions must stay fully inside the 9:16 frame
+- Keep captions away from face, product, hands, and important screen/object regions.
+- Avoid choosing the exact same caption_x/y and subtitle_mode for every segment.
 - face_cx/face_cy: face center as 0.0-1.0 fraction of frame
 """
+def analyze_frames_for_hre(
+    frame_paths: list["Path"],
     context: str = "",
     seg_idx: int = 0,
     n_total: int = 1,
         from openai import OpenAI
         client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
+        valid_frames = [Path(p) for p in frame_paths[:3] if Path(p).exists()]
+        if not valid_frames:
             return _default_hre_analysis(seg_idx, n_total)
+        content = []
+        for frame_path in valid_frames:
+            b64 = _encode_image(str(frame_path))
+            content.append({
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
+            })
         prompt = HRE_SEGMENT_PROMPT.format(
+            seg_idx=seg_idx, n_total=n_total, context=context[:320]
         )
+        content.append({"type": "text", "text": prompt})
         response = client.chat.completions.create(
             model=VLLM_MODEL,
             messages=[{
                 "role": "user",
+                "content": content,
             }],
+            max_tokens=380,
             temperature=0.1,
         )
         raw = response.choices[0].message.content.strip()
         logger.debug(
             f"HRE seg {seg_idx}/{n_total}: "
             f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
+            f"caption=({analysis.get('caption_x')},{analysis.get('caption_y')}) "
             f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
             f"{analysis.get('subtitle_color')} "
             f"type={analysis.get('moment_type')}"
         return _default_hre_analysis(seg_idx, n_total)
+def analyze_frame_for_hre(
+    frame_path: "Path",
+    context: str = "",
+    seg_idx: int = 0,
+    n_total: int = 1,
+) -> dict:
+    """Single-frame entry point kept for callers that pass one frame.
+
+    Wraps ``frame_path`` in a one-element list and delegates to
+    ``analyze_frames_for_hre``, returning its result unchanged.
+    """
+    single_frame = [frame_path]
+    return analyze_frames_for_hre(
+        single_frame,
+        context=context,
+        seg_idx=seg_idx,
+        n_total=n_total,
+    )
 def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
     """Fallback with varied decisions based on position in clip."""
     if seg_idx == 0:
     _colors    = ["yellow", "white", "cyan", "orange", "white", "yellow"]
     _positions = ["bottom", "top", "left", "bottom", "right", "top"]
+    _coords    = [(0.50, 0.76), (0.50, 0.18), (0.28, 0.56), (0.50, 0.72), (0.72, 0.56), (0.50, 0.20)]
+    _anchors   = [2, 8, 4, 2, 6, 8]
     _modes     = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
     _emphasis  = ["punch", "calm", "pop", "punch", "calm", "pop"]
+    caption_x, caption_y = _coords[seg_idx % len(_coords)]
     return {
         "zoom_direction":    zoom_dir,
         "face_detected":     False,
         "face_cx":           0.5,
         "face_cy":           0.38,
+        "subject_bbox":      None,
+        "zoom_anchor_x":     0.5,
+        "zoom_anchor_y":     0.38,
         "subtitle_position": _positions[seg_idx % len(_positions)],
+        "caption_x":         caption_x,
+        "caption_y":         caption_y,
+        "caption_anchor":    _anchors[seg_idx % len(_anchors)],
+        "caption_max_width_pct": 0.62,
         "subtitle_mode":     _modes[seg_idx % len(_modes)],
         "subtitle_emphasis": _emphasis[seg_idx % len(_emphasis)],
         "subtitle_color":    _colors[seg_idx % len(_colors)],

backend/src/processing/clip_extractor.py CHANGED Viewed

@@ -5,6 +5,26 @@ from pathlib import Path
 from loguru import logger
 def extract_clip(
     video_path: Path,
     start: float,
@@ -37,10 +57,9 @@ def extract_clip(
         else:
             # Crop: scale to 1920 height first, then center-crop to 1080 wide
             # Optionally center on face_bbox x when available
-            if face_bbox and len(face_bbox) == 4:
-                x1, _, x2, _ = face_bbox
-                face_cx = int((x1 + x2) / 2)
-                crop = f"scale=-1:1920,crop=1080:1920:max(0\\,min(iw-1080\\,{face_cx}*iw/in_w-540)):0"
             else:
                 crop = "scale=-1:1920,crop=1080:1920:(iw-1080)/2:0"
             vf_filters.append(crop)

 from loguru import logger
+def _face_center_expr(face_bbox: list | None) -> str | None:
+    """Return an ffmpeg crop x-offset expression centred on the face, or None.
+
+    ``face_bbox`` is expected as ``[x1, y1, x2, y2]``; only the x values are used.
+    The returned string is the *unclamped* left edge of a 1080-wide window
+    (``<centre>*iw-540``); the caller is responsible for clamping it to the frame.
+
+    Returns None when the box is missing, malformed, contains non-numeric or
+    non-finite values, or looks like pixel data outside the 0-1080 range, so the
+    caller can fall back to a plain centre crop.
+    """
+    import math
+    if not face_bbox or len(face_bbox) != 4:
+        return None
+    try:
+        x1, _, x2, _ = [float(v) for v in face_bbox]
+    except (TypeError, ValueError, OverflowError):
+        return None
+    # NaN/inf would otherwise slip through the comparisons below and be clamped
+    # to 0.0, silently pinning the crop to the left edge of the frame.
+    if not (math.isfinite(x1) and math.isfinite(x2)):
+        return None
+    # Qwen prompt asks for normalized percentages. Older comments said pixels,
+    # so keep a conservative pixel fallback, but prefer normalized handling.
+    face_cx = (x1 + x2) / 2.0
+    if max(abs(x1), abs(x2)) <= 1.5:
+        face_cx = min(1.0, max(0.0, face_cx))
+        return f"{face_cx:.4f}*iw-540"
+    # NOTE(review): the pixel branch assumes coordinates are in a 1080-wide
+    # space; confirm against whatever produces pixel bboxes.
+    if 0 <= face_cx <= 1080:
+        return f"({face_cx:.1f}/1080)*iw-540"
+    return None
 def extract_clip(
     video_path: Path,
     start: float,
         else:
             # Crop: scale to 1920 height first, then center-crop to 1080 wide
             # Optionally center on face_bbox x when available
+            face_expr = _face_center_expr(face_bbox)
+            if face_expr:
+                crop = f"scale=-1:1920,crop=1080:1920:max(0\\,min(iw-1080\\,{face_expr})):0"
             else:
                 crop = "scale=-1:1920,crop=1080:1920:(iw-1080)/2:0"
             vf_filters.append(crop)

backend/src/processing/high_retention.py CHANGED Viewed

@@ -125,6 +125,24 @@ def _extract_frame(video_path: Path, t: float, out_path: Path) -> bool:
     return result.returncode == 0 and out_path.exists()
 # ─── Per-segment AI analysis ──────────────────────────────────────────────────
 def _analyze_segment(
@@ -136,12 +154,10 @@ def _analyze_segment(
     clip_start: float,
     tmp_dir: Path,
 ) -> dict:
-    from src.analysis.vision import analyze_frame_for_hre, _default_hre_analysis
-    mid_t = (seg["start"] + seg["end"]) / 2.0
-    frame_path = tmp_dir / f"seg_{seg_idx:03d}.jpg"
-    if not _extract_frame(video_path, mid_t, frame_path):
         return _default_hre_analysis(seg_idx, n_total)
     words_all: list[dict] = []
@@ -156,7 +172,7 @@ def _analyze_segment(
         if w.get("start", 0) < abs_end and w.get("end", 0) > abs_start
     ).strip()
-    return analyze_frame_for_hre(frame_path, context, seg_idx, n_total)
 # ─── Zoom expression builders ─────────────────────────────────────────────────
@@ -171,9 +187,8 @@ def _build_zoom_exprs(
     """
     direction     = analysis.get("zoom_direction", "in")
     speed         = analysis.get("zoom_speed", "slow")
-    face_detected = bool(analysis.get("face_detected", False))
-    face_cx       = float(analysis.get("face_cx") or 0.5)
-    face_cy       = float(analysis.get("face_cy") or 0.38)
     if direction == "in":
         if speed == "fast":
@@ -188,13 +203,13 @@ def _build_zoom_exprs(
     else:  # hold
         z_expr, max_zoom = "1.08", 1.08
-    if face_detected and direction == "in" and max_zoom > 1.05:
-        x_expr = f"max(0\\,min(iw-iw/zoom\\,iw*{face_cx:.3f}-iw/zoom/2))"
-        y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{face_cy:.3f}-ih/zoom/2))"
     else:
         x_expr = "iw/2-(iw/zoom/2)"
         if direction == "in":
-            y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
             y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{y_bias:.2f}-(ih/zoom/2)))"
         else:
             y_expr = "ih/2-(ih/zoom/2)"
@@ -264,9 +279,10 @@ _ASS_COLORS = {
     "red":    "&H000000FF",
 }
-_POSITIONS = {"top", "bottom", "left", "right", "center"}
 _MODES = {"word", "phrase", "sentence"}
 _EMPHASIS = {"pop", "punch", "calm"}
 def _ts(t: float) -> str:
@@ -285,9 +301,66 @@ def _pick(value: object, allowed: set[str], fallback: str) -> str:
     return v if v in allowed else fallback
 def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
     """Validate model output and fill HRE fields used by the renderer."""
     an = dict(analysis or {})
     energy = _pick(an.get("energy_level"), {"high", "medium", "low"}, "medium")
     moment = _pick(
         an.get("moment_type"),
@@ -299,18 +372,41 @@ def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
     if energy == "medium" and moment not in {"context", "transition"}:
         fallback_mode = "phrase"
-    pos = _pick(an.get("subtitle_position"), _POSITIONS, "bottom")
     mode = _pick(an.get("subtitle_mode"), _MODES, fallback_mode)
     emphasis = _pick(an.get("subtitle_emphasis"), _EMPHASIS, "punch" if mode == "word" else "calm")
     color = _pick(an.get("subtitle_color"), set(_ASS_COLORS), "white")
     zoom_direction = _pick(an.get("zoom_direction"), {"in", "out", "hold"}, "in")
     zoom_speed = _pick(an.get("zoom_speed"), {"fast", "slow"}, "slow")
-    try:
-        face_cx = min(1.0, max(0.0, float(an.get("face_cx", 0.5))))
-        face_cy = min(1.0, max(0.0, float(an.get("face_cy", 0.38))))
-    except Exception:
-        face_cx, face_cy = 0.5, 0.38
     if seg_idx == 0:
         zoom_direction, zoom_speed = "in", "fast"
@@ -326,7 +422,14 @@ def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
         "face_detected": bool(an.get("face_detected", False)),
         "face_cx": face_cx,
         "face_cy": face_cy,
         "subtitle_position": pos,
         "subtitle_mode": mode,
         "subtitle_emphasis": emphasis,
         "subtitle_color": color,
@@ -344,11 +447,14 @@ def _build_hre_plan(segments: list[dict], analyses: list[dict]) -> list[dict]:
     # If the model repeats the same caption treatment for every segment, rotate
     # through safe defaults so HRE visibly changes across the clip.
-    if len(plan) > 1 and len({(p["subtitle_position"], p["subtitle_mode"]) for p in plan}) == 1:
-        positions = ["bottom", "top", "left", "bottom", "right", "top"]
         modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
         for i, p in enumerate(plan):
             p["subtitle_position"] = positions[i % len(positions)]
             p["subtitle_mode"] = modes[i % len(modes)]
             if p["subtitle_mode"] == "word":
                 p["subtitle_emphasis"] = "punch"
@@ -599,38 +705,30 @@ def _build_subtitle_events(
 def _subtitle_tag(plan: dict) -> tuple[str, int]:
-    pos = plan["subtitle_position"]
     mode = plan["subtitle_mode"]
     energy = plan["energy_level"]
     emphasis = plan["subtitle_emphasis"]
     color = _ASS_COLORS.get(plan["subtitle_color"], "&H00FFFFFF")
-    anchors = {
-        "top": (8, 540, 230),
-        "bottom": (2, 540, 1660),
-        "left": (4, 95, 960),
-        "right": (6, 985, 960),
-        "center": (5, 540, 960),
-    }
-    alignment, x, y = anchors.get(pos, anchors["bottom"])
     if mode == "sentence":
-        font_size = 66 if energy != "high" else 74
-        max_chars = 34
     elif mode == "phrase":
-        font_size = 82 if energy != "low" else 76
-        max_chars = 24
     else:
-        font_size = 102 if energy == "high" else 92
-        max_chars = 18
-    if pos in {"left", "right"}:
-        font_size -= 8
-        max_chars = min(max_chars, 22)
     base = (
         f"{{\\an{alignment}\\pos({x},{y})\\1c{color}&\\fs{font_size}"
-        "\\b1\\bord5\\shad1\\q2}}"
     )
     if emphasis in {"pop", "punch"} or mode == "word":
         base += "{\\fscx125\\fscy125\\t(0,120,\\fscx100\\fscy100)}"

     return result.returncode == 0 and out_path.exists()
+def _extract_segment_frames(video_path: Path, seg: dict, seg_idx: int, tmp_dir: Path) -> list[Path]:
+    """Grab up to three stills (25%, 50%, 75% through the segment) for HRE analysis.
+
+    Returns the paths of the frames that ``_extract_frame`` reported as written,
+    in time order; the list is empty when every extraction failed.
+    """
+    seg_start = float(seg["start"])
+    seg_end = float(seg["end"])
+    span = max(0.1, seg_end - seg_start)
+    # Sample times are kept no later than end - 0.05 and never before start.
+    latest = max(seg_start, seg_end - 0.05)
+    extracted: list[Path] = []
+    for slot, fraction in enumerate((0.25, 0.50, 0.75)):
+        moment = seg_start + span * fraction
+        target = tmp_dir / f"seg_{seg_idx:03d}_{slot}.jpg"
+        if _extract_frame(video_path, min(max(seg_start, moment), latest), target):
+            extracted.append(target)
+    return extracted
 # ─── Per-segment AI analysis ──────────────────────────────────────────────────
 def _analyze_segment(
     clip_start: float,
     tmp_dir: Path,
 ) -> dict:
+    from src.analysis.vision import analyze_frames_for_hre, _default_hre_analysis
+    frame_paths = _extract_segment_frames(video_path, seg, seg_idx, tmp_dir)
+    if not frame_paths:
         return _default_hre_analysis(seg_idx, n_total)
     words_all: list[dict] = []
         if w.get("start", 0) < abs_end and w.get("end", 0) > abs_start
     ).strip()
+    return analyze_frames_for_hre(frame_paths, context, seg_idx, n_total)
 # ─── Zoom expression builders ─────────────────────────────────────────────────
     """
     direction     = analysis.get("zoom_direction", "in")
     speed         = analysis.get("zoom_speed", "slow")
+    zoom_anchor_x = _clamp_float(analysis.get("zoom_anchor_x"), _clamp_float(analysis.get("face_cx"), 0.5))
+    zoom_anchor_y = _clamp_float(analysis.get("zoom_anchor_y"), _clamp_float(analysis.get("face_cy"), 0.38))
     if direction == "in":
         if speed == "fast":
     else:  # hold
         z_expr, max_zoom = "1.08", 1.08
+    if direction == "in" and max_zoom > 1.05:
+        x_expr = f"max(0\\,min(iw-iw/zoom\\,iw*{zoom_anchor_x:.3f}-iw/zoom/2))"
+        y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{zoom_anchor_y:.3f}-ih/zoom/2))"
     else:
         x_expr = "iw/2-(iw/zoom/2)"
         if direction == "in":
+            y_bias = min(zoom_anchor_y, 0.5) if zoom_anchor_y < 0.55 else 0.38
             y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{y_bias:.2f}-(ih/zoom/2)))"
         else:
             y_expr = "ih/2-(ih/zoom/2)"
     "red":    "&H000000FF",
 }
+_POSITIONS = {"top", "bottom", "left", "right", "center", "free"}
 _MODES = {"word", "phrase", "sentence"}
 _EMPHASIS = {"pop", "punch", "calm"}
+_ANCHORS = set(range(1, 10))
 def _ts(t: float) -> str:
     return v if v in allowed else fallback
+def _clamp_float(value: object, fallback: float, low: float = 0.0, high: float = 1.0) -> float:
+    """Coerce ``value`` to a float clamped to ``[low, high]``.
+
+    Returns ``fallback`` (unclamped) when ``value`` cannot be converted to a
+    float or converts to NaN. Infinite values are clamped to the nearest bound.
+    """
+    import math
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        return fallback
+    # max(low, nan) returns low, so without this check a NaN from the model
+    # would be reported as a valid coordinate at the lower bound.
+    if math.isnan(number):
+        return fallback
+    return min(high, max(low, number))
+def _clamp_int(value: object, fallback: int, allowed: set[int]) -> int:
+    """Return ``int(value)`` when it is a member of ``allowed``, else ``fallback``.
+
+    ``fallback`` is also returned when ``value`` cannot be converted with ``int()``.
+    """
+    try:
+        parsed = int(value)
+    except Exception:
+        return fallback
+    if parsed in allowed:
+        return parsed
+    return fallback
+def _normalise_bbox(value: object) -> list[float] | None:
+    """Validate a normalised ``[x1, y1, x2, y2]`` box and return it cleaned up.
+
+    The result has each coordinate clamped to ``[0.0, 1.0]`` and each axis
+    ordered so that ``x1 <= x2`` and ``y1 <= y2``.
+
+    Returns None when ``value`` is not a 4-item list/tuple, holds non-numeric or
+    non-finite entries, has any magnitude above 1.5 (treated as not normalised),
+    or is thinner than 0.02 on either axis after clamping.
+    """
+    import math
+    if not isinstance(value, (list, tuple)) or len(value) != 4:
+        return None
+    try:
+        coords = [float(v) for v in value]
+    except (TypeError, ValueError, OverflowError):
+        return None
+    # NaN compares false against everything, so it would pass the magnitude
+    # guard below and then be clamped to 0.0, fabricating a box edge.
+    if not all(math.isfinite(v) for v in coords):
+        return None
+    if max(abs(v) for v in coords) > 1.5:
+        return None
+    x1, y1, x2, y2 = coords
+    x1, x2 = sorted((min(1.0, max(0.0, x1)), min(1.0, max(0.0, x2))))
+    y1, y2 = sorted((min(1.0, max(0.0, y1)), min(1.0, max(0.0, y2))))
+    if x2 - x1 < 0.02 or y2 - y1 < 0.02:
+        return None
+    return [x1, y1, x2, y2]
+def _caption_anchor_for(x: float, y: float) -> int:
+    """Map a normalised caption point to a numpad-style anchor number (1-9).
+
+    The row comes from ``y`` (below 0.34 -> 7/8/9, above 0.66 -> 1/2/3,
+    otherwise 4/5/6). Within a row the middle anchor is used when ``x`` lies in
+    the row's centre band; otherwise the left or right anchor is chosen by
+    whether ``x`` is below 0.5.
+    """
+    if y < 0.34:
+        left, middle, right = 7, 8, 9
+        band_lo, band_hi = 0.30, 0.70
+    elif y > 0.66:
+        left, middle, right = 1, 2, 3
+        band_lo, band_hi = 0.30, 0.70
+    else:
+        # The middle row uses a narrower centre band than the top and bottom rows.
+        left, middle, right = 4, 5, 6
+        band_lo, band_hi = 0.34, 0.66
+    if band_lo <= x <= band_hi:
+        return middle
+    if x < 0.5:
+        return left
+    return right
+def _safe_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tuple[float, float, int]:
+    """Choose a caption spot away from the subject, rotating by segment index.
+
+    Four candidate points are built from the subject centre and one is selected
+    with ``seg_idx % 4``. Returns ``(x, y, anchor)`` where ``anchor`` comes from
+    ``_caption_anchor_for(x, y)``.
+    """
+    subject_on_left = subject_x < 0.50
+    # Slot 0: opposite side horizontally, height chosen from the subject's vertical band.
+    if subject_y < 0.42:
+        diagonal_y = 0.72
+    elif subject_y > 0.62:
+        diagonal_y = 0.24
+    else:
+        diagonal_y = 0.76
+    diagonal_x = 0.68 if subject_on_left else 0.32
+    # Slot 1: mid-height, on the side opposite the subject.
+    side_x = 0.72 if subject_on_left else 0.28
+    # Slot 2: horizontally centred, near the top when the subject sits lower, else near the bottom.
+    band_y = 0.18 if subject_y > 0.45 else 0.82
+    options = (
+        (diagonal_x, diagonal_y),
+        (side_x, 0.50),
+        (0.50, band_y),
+        # Slot 3: fixed lower-centre point, independent of the subject.
+        (0.50, 0.72),
+    )
+    chosen_x, chosen_y = options[seg_idx % len(options)]
+    return chosen_x, chosen_y, _caption_anchor_for(chosen_x, chosen_y)
 def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
     """Validate model output and fill HRE fields used by the renderer."""
     an = dict(analysis or {})
+    subject_bbox = _normalise_bbox(an.get("subject_bbox"))
     energy = _pick(an.get("energy_level"), {"high", "medium", "low"}, "medium")
     moment = _pick(
         an.get("moment_type"),
     if energy == "medium" and moment not in {"context", "transition"}:
         fallback_mode = "phrase"
+    if subject_bbox:
+        subject_x = (subject_bbox[0] + subject_bbox[2]) / 2.0
+        subject_y = (subject_bbox[1] + subject_bbox[3]) / 2.0
+    else:
+        subject_x = _clamp_float(an.get("face_cx"), 0.5)
+        subject_y = _clamp_float(an.get("face_cy"), 0.38)
+    pos = _pick(an.get("subtitle_position"), _POSITIONS, "free")
     mode = _pick(an.get("subtitle_mode"), _MODES, fallback_mode)
     emphasis = _pick(an.get("subtitle_emphasis"), _EMPHASIS, "punch" if mode == "word" else "calm")
     color = _pick(an.get("subtitle_color"), set(_ASS_COLORS), "white")
     zoom_direction = _pick(an.get("zoom_direction"), {"in", "out", "hold"}, "in")
     zoom_speed = _pick(an.get("zoom_speed"), {"fast", "slow"}, "slow")
+    face_cx = _clamp_float(an.get("face_cx"), subject_x)
+    face_cy = _clamp_float(an.get("face_cy"), subject_y)
+    zoom_anchor_x = _clamp_float(an.get("zoom_anchor_x"), face_cx)
+    zoom_anchor_y = _clamp_float(an.get("zoom_anchor_y"), face_cy)
+    fallback_x, fallback_y, fallback_anchor = _safe_caption_point(subject_x, subject_y, seg_idx)
+    caption_x = _clamp_float(an.get("caption_x"), fallback_x, 0.10, 0.90)
+    caption_y = _clamp_float(an.get("caption_y"), fallback_y, 0.12, 0.88)
+    caption_anchor = _clamp_int(an.get("caption_anchor"), fallback_anchor, _ANCHORS)
+    caption_max_width_pct = _clamp_float(
+        an.get("caption_max_width_pct"),
+        0.58 if mode != "sentence" else 0.72,
+        0.35,
+        0.82,
+    )
+    if subject_bbox:
+        x1, y1, x2, y2 = subject_bbox
+        overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
+        if overlaps_subject:
+            caption_x, caption_y, caption_anchor = fallback_x, fallback_y, fallback_anchor
     if seg_idx == 0:
         zoom_direction, zoom_speed = "in", "fast"
         "face_detected": bool(an.get("face_detected", False)),
         "face_cx": face_cx,
         "face_cy": face_cy,
+        "subject_bbox": subject_bbox,
+        "zoom_anchor_x": zoom_anchor_x,
+        "zoom_anchor_y": zoom_anchor_y,
         "subtitle_position": pos,
+        "caption_x": caption_x,
+        "caption_y": caption_y,
+        "caption_anchor": caption_anchor,
+        "caption_max_width_pct": caption_max_width_pct,
         "subtitle_mode": mode,
         "subtitle_emphasis": emphasis,
         "subtitle_color": color,
     # If the model repeats the same caption treatment for every segment, rotate
     # through safe defaults so HRE visibly changes across the clip.
+    if len(plan) > 1 and len({(round(p["caption_x"], 2), round(p["caption_y"], 2), p["subtitle_mode"]) for p in plan}) == 1:
+        positions = ["free", "free", "free", "free", "free", "free"]
+        coords = [(0.50, 0.76), (0.50, 0.18), (0.28, 0.56), (0.72, 0.52), (0.50, 0.82), (0.50, 0.22)]
         modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
         for i, p in enumerate(plan):
             p["subtitle_position"] = positions[i % len(positions)]
+            p["caption_x"], p["caption_y"] = coords[i % len(coords)]
+            p["caption_anchor"] = _caption_anchor_for(p["caption_x"], p["caption_y"])
             p["subtitle_mode"] = modes[i % len(modes)]
             if p["subtitle_mode"] == "word":
                 p["subtitle_emphasis"] = "punch"
 def _subtitle_tag(plan: dict) -> tuple[str, int]:
     mode = plan["subtitle_mode"]
     energy = plan["energy_level"]
     emphasis = plan["subtitle_emphasis"]
     color = _ASS_COLORS.get(plan["subtitle_color"], "&H00FFFFFF")
+    alignment = int(plan.get("caption_anchor", 5))
+    x = round(_clamp_float(plan.get("caption_x"), 0.5, 0.08, 0.92) * 1080)
+    y = round(_clamp_float(plan.get("caption_y"), 0.75, 0.10, 0.90) * 1920)
+    max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
     if mode == "sentence":
+        font_size = 56 if energy != "high" else 62
     elif mode == "phrase":
+        font_size = 68 if energy != "low" else 62
     else:
+        font_size = 80 if energy == "high" else 72
+    if alignment in {4, 5, 6}:
+        font_size = max(54, font_size - 4)
+    max_chars = max(8, min(34, int(max_width_px / (font_size * 0.58))))
     base = (
         f"{{\\an{alignment}\\pos({x},{y})\\1c{color}&\\fs{font_size}"
+        "\\b1\\bord5\\shad1\\q2}"
     )
     if emphasis in {"pop", "punch"} or mode == "word":
         base += "{\\fscx125\\fscy125\\t(0,120,\\fscx100\\fscy100)}"