Spaces:

lablab-ai-amd-developer-hackathon
/

ElevenClip-AI

Runtime error

App Files Files Community

jakgritb committed 28 days ago

Commit

5543229

verified ·

1 Parent(s): 65a5f35

fix: tune HRE crop and caption timing

Browse files

Files changed (3) hide show

backend/main.py +3 -1
backend/src/analysis/vision.py +2 -0
backend/src/processing/high_retention.py +37 -7

backend/main.py CHANGED Viewed

@@ -354,7 +354,9 @@ async def _run_pipeline(
         # ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
         await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
-        extract_aspect_mode = "safe_fit" if settings.mode == "hre" and settings.aspect_mode == "crop" else settings.aspect_mode
         clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=extract_aspect_mode)
         # ── 8. Subtitles / HRE (all clips in parallel) ─────────────────

         # ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
         await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
+        # HRE needs a true TikTok crop, not a shrunken fit/letterbox frame.
+        # The crop extractor centers on Qwen's face/person bbox when available.
+        extract_aspect_mode = "crop" if settings.mode == "hre" else settings.aspect_mode
         clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=extract_aspect_mode)
         # ── 8. Subtitles / HRE (all clips in parallel) ─────────────────

backend/src/analysis/vision.py CHANGED Viewed

@@ -195,6 +195,8 @@ Rules:
 - For normal explanatory speech:
   zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
 - Use zoom OUT only as breathing room after an intense/key moment.
 - subtitle WORD: short hooks, reactions, punchlines, important keywords
 - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
 - subtitle SENTENCE: explanation, normal conversation, low/medium energy

 - For normal explanatory speech:
   zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
 - Use zoom OUT only as breathing room after an intense/key moment.
+- Sentence captions should sit around center-bottom: caption_x about 0.50, caption_y about 0.68-0.74, caption_anchor=2.
+- Word highlight captions can sit center, mid-upper, mid-left, or mid-right with larger text, as long as they avoid the face/product.
 - subtitle WORD: short hooks, reactions, punchlines, important keywords
 - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
 - subtitle SENTENCE: explanation, normal conversation, low/medium energy

backend/src/processing/high_retention.py CHANGED Viewed

@@ -192,14 +192,14 @@ def _build_zoom_exprs(
     if direction == "in":
         if speed == "fast":
-            z_expr, max_zoom = "min(1.12+on*0.0018\\,1.55)", 1.55
         else:
-            z_expr, max_zoom = "min(1.04+on*0.0009\\,1.32)", 1.32
     elif direction == "out":
         if speed == "fast":
-            z_expr, max_zoom = "max(1.48-on*0.0018\\,1.0)", 1.48
         else:
-            z_expr, max_zoom = "max(1.28-on*0.0009\\,1.0)", 1.28
     else:  # hold
         z_expr, max_zoom = "1.08", 1.08
@@ -357,6 +357,20 @@ def _safe_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tup
     return x, y, _caption_anchor_for(x, y)
 def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
     """Validate model output and fill HRE fields used by the renderer."""
     an = dict(analysis or {})
@@ -402,11 +416,27 @@ def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
         0.82,
     )
     if subject_bbox:
         x1, y1, x2, y2 = subject_bbox
         overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
         if overlaps_subject:
-            caption_x, caption_y, caption_anchor = fallback_x, fallback_y, fallback_anchor
     if seg_idx == 0:
         zoom_direction, zoom_speed = "in", "fast"
@@ -724,11 +754,11 @@ def _subtitle_tag(plan: dict) -> tuple[str, int]:
     max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
     if mode == "sentence":
-        font_size = 56 if energy != "high" else 62
     elif mode == "phrase":
         font_size = 68 if energy != "low" else 62
     else:
-        font_size = 80 if energy == "high" else 72
     if alignment in {4, 5, 6}:
         font_size = max(54, font_size - 4)

     if direction == "in":
         if speed == "fast":
+            z_expr, max_zoom = "min(1.0+on*0.0100\\,1.45)", 1.45
         else:
+            z_expr, max_zoom = "min(1.0+on*0.0035\\,1.28)", 1.28
     elif direction == "out":
         if speed == "fast":
+            z_expr, max_zoom = "max(1.45-on*0.0100\\,1.0)", 1.45
         else:
+            z_expr, max_zoom = "max(1.28-on*0.0040\\,1.0)", 1.28
     else:  # hold
         z_expr, max_zoom = "1.08", 1.08
     return x, y, _caption_anchor_for(x, y)
+def _word_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tuple[float, float, int]:  # returns (x, y, anchor)
+    """Put highlight words in punchy mid-frame zones instead of ordinary subtitle zones."""
+    candidates = [  # four (x, y) slots; the last two pick their x from the subject's horizontal position
+        (0.50, 0.42),  # horizontally centred, slightly above mid-height (presumably y grows downward - confirm against renderer)
+        (0.50, 0.26),  # horizontally centred, smaller y than the slot above
+        (0.28 if subject_x > 0.55 else 0.72, 0.46),  # x=0.28 when the subject is right of 0.55, otherwise x=0.72
+        (0.30 if subject_x > 0.50 else 0.70, 0.58),  # same idea with a 0.50 threshold and a larger y
+    ]
+    x, y = candidates[seg_idx % len(candidates)]  # modulo wraps seg_idx onto the four slots
+    if abs(x - subject_x) < 0.18 and abs(y - subject_y) < 0.18:  # chosen slot is within 0.18 of the subject point on both axes
+        x = 0.25 if subject_x > 0.5 else 0.75  # move horizontally away from the subject; y is left unchanged
+    return x, y, _caption_anchor_for(x, y)  # anchor is computed from the final point by _caption_anchor_for (defined outside this view)
 def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
     """Validate model output and fill HRE fields used by the renderer."""
     an = dict(analysis or {})
         0.82,
     )
+    if mode == "sentence":
+        caption_x = 0.50
+        caption_y = _clamp_float(an.get("caption_y"), 0.70, 0.64, 0.74)
+        caption_anchor = 2
+        caption_max_width_pct = max(caption_max_width_pct, 0.68)
+    elif mode == "word":
+        word_x, word_y, word_anchor = _word_caption_point(subject_x, subject_y, seg_idx)
+        if caption_y > 0.66 or (abs(caption_x - subject_x) < 0.14 and abs(caption_y - subject_y) < 0.14):
+            caption_x, caption_y, caption_anchor = word_x, word_y, word_anchor
+        caption_max_width_pct = min(caption_max_width_pct, 0.56)
     if subject_bbox:
         x1, y1, x2, y2 = subject_bbox
         overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
         if overlaps_subject:
+            if mode == "sentence":
+                caption_x, caption_y, caption_anchor = 0.50, 0.70, 2
+            elif mode == "word":
+                caption_x, caption_y, caption_anchor = _word_caption_point(subject_x, subject_y, seg_idx + 1)
+            else:
+                caption_x, caption_y, caption_anchor = fallback_x, fallback_y, fallback_anchor
     if seg_idx == 0:
         zoom_direction, zoom_speed = "in", "fast"
     max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
     if mode == "sentence":
+        font_size = 54 if energy != "high" else 60
     elif mode == "phrase":
         font_size = 68 if energy != "low" else 62
     else:
+        font_size = 96 if energy == "high" else 84
     if alignment in {4, 5, 6}:
         font_size = max(54, font_size - 4)