Spaces:

lablab-ai-amd-developer-hackathon
/

ElevenClip-AI

Runtime error

App Files Files Community

jakgritb committed 15 days ago

Commit

c511df7

verified ·

1 Parent(s): e896716

fix: add per-segment HRE edit plans

Browse files

Files changed (8) hide show

README.md +10 -9
backend/src/analysis/vision.py +20 -8
backend/src/processing/high_retention.py +410 -85
frontend/components/ClipSettings.tsx +6 -6
frontend/messages/en.json +1 -1
frontend/messages/th.json +1 -1
frontend/messages/zh.json +1 -1
frontend/next-env.d.ts +6 -0

README.md CHANGED Viewed

@@ -96,11 +96,11 @@ where:
 │                                                                   │
 │  Normal Mode              HRE (High-Retention Editing)           │
 │  ─────────────            ──────────────────────────────         │
-│  • pysubs2 ASS            • Silence removal (ffmpeg)             │
-│  • User style config      • Auto-zoom to face (zoompan)          │
-│  • Font/color/animation   • Jump cuts at boundaries              │
-│  • Karaoke/pop/fade       • Qwen2.5-VL emoji selection          │
-│  • AMD AMF encode         • Impact bold captions                 │
 └──────────────────────────────────────────────────────────────────┘
            │
            ▼
@@ -139,11 +139,12 @@ Full creative control over:
 ### High-Retention Editing (HRE)
 AI chooses everything:
-- Silence removal (`ffmpeg silenceremove`)
-- Auto-zoom to face region (`ffmpeg zoompan` using Qwen2.5-VL face_bbox)
-- Jump cuts at scene boundaries
 - Qwen2.5-VL selects contextually-appropriate emoji overlay
-- Impact 64px bold white captions, word-by-word, pop animation
 ---

 │                                                                   │
 │  Normal Mode              HRE (High-Retention Editing)           │
 │  ─────────────            ──────────────────────────────         │
+│  • pysubs2 ASS            • Per-segment AI edit plan             │
+│  • User style config      • Auto-zoom per segment (zoompan)      │
+│  • Font/color/animation   • Word / phrase / sentence captions    │
+│  • Karaoke/pop/fade       • Top / bottom / left / right captions │
+│  • AMD AMF encode         • Qwen2.5-VL emoji selection           │
 └──────────────────────────────────────────────────────────────────┘
            │
            ▼
 ### High-Retention Editing (HRE)
 AI chooses everything:
+- A per-segment edit plan with timestamps
+- Auto-zoom direction and speed per segment (`ffmpeg zoompan`)
+- Caption mode per segment: word, phrase, or sentence
+- Caption placement per segment: top, bottom, left, right, or center
+- Caption color, size, and pop emphasis based on segment energy
 - Qwen2.5-VL selects contextually-appropriate emoji overlay
 ---

backend/src/analysis/vision.py CHANGED Viewed

@@ -172,7 +172,9 @@ Respond ONLY with valid JSON — no markdown:
   "face_detected": <true|false>,
   "face_cx": <0.0-1.0>,
   "face_cy": <0.0-1.0>,
-  "subtitle_position": "<top|bottom>",
   "subtitle_color": "<white|yellow|cyan|orange|green>",
   "energy_level": "<high|medium|low>",
   "moment_type": "<hook|punchline|context|reaction|transition>"
@@ -184,8 +186,13 @@ Rules:
 - zoom IN slow: context, buildup, moderate energy
 - zoom OUT: reveals, breathing room after intensity
 - HOLD: stable content, text-heavy moments
-- subtitle TOP: face is in bottom half → put text at top
-- subtitle BOTTOM: face is in top half → text at bottom
 - face_cx/face_cy: face center as 0.0-1.0 fraction of frame
 """
@@ -196,7 +203,7 @@ def analyze_frame_for_hre(
     seg_idx: int = 0,
     n_total: int = 1,
 ) -> dict:
-    """Per-segment HRE: zoom direction, subtitle position+color for this moment."""
     try:
         from openai import OpenAI
@@ -227,11 +234,12 @@ def analyze_frame_for_hre(
             if raw.startswith("json"):
                 raw = raw[4:]
-        analysis = json.loads(raw.strip())
         logger.debug(
             f"HRE seg {seg_idx}/{n_total}: "
             f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
-            f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_color')} "
             f"type={analysis.get('moment_type')}"
         )
         try:
@@ -257,8 +265,10 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
     else:
         zoom_dir, zoom_speed, moment = "in", "slow", "reaction"
-    _colors    = ["yellow", "white",  "cyan",   "orange", "white",  "yellow"]
-    _positions = ["bottom", "top",    "bottom", "top",    "bottom", "top"]
     return {
         "zoom_direction":    zoom_dir,
@@ -267,6 +277,8 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
         "face_cx":           0.5,
         "face_cy":           0.38,
         "subtitle_position": _positions[seg_idx % len(_positions)],
         "subtitle_color":    _colors[seg_idx % len(_colors)],
         "energy_level":      "medium",
         "moment_type":       moment,

   "face_detected": <true|false>,
   "face_cx": <0.0-1.0>,
   "face_cy": <0.0-1.0>,
+  "subtitle_position": "<top|bottom|left|right|center>",
+  "subtitle_mode": "<word|phrase|sentence>",
+  "subtitle_emphasis": "<pop|punch|calm>",
   "subtitle_color": "<white|yellow|cyan|orange|green>",
   "energy_level": "<high|medium|low>",
   "moment_type": "<hook|punchline|context|reaction|transition>"
 - zoom IN slow: context, buildup, moderate energy
 - zoom OUT: reveals, breathing room after intensity
 - HOLD: stable content, text-heavy moments
+- subtitle WORD: short hooks, reactions, punchlines, important keywords
+- subtitle PHRASE: fast but understandable speech, 2-4 words at a time
+- subtitle SENTENCE: explanation, normal conversation, low/medium energy
+- subtitle TOP: face is in bottom half
+- subtitle BOTTOM: face is in top half
+- subtitle LEFT/RIGHT: face or main object is on the opposite side
+- Avoid choosing the exact same subtitle_position and subtitle_mode for every segment.
 - face_cx/face_cy: face center as 0.0-1.0 fraction of frame
 """
     seg_idx: int = 0,
     n_total: int = 1,
 ) -> dict:
+    """Per-segment HRE: zoom, caption placement, caption mode, and color."""
     try:
         from openai import OpenAI
             if raw.startswith("json"):
                 raw = raw[4:]
+        analysis = {**_default_hre_analysis(seg_idx, n_total), **json.loads(raw.strip())}
         logger.debug(
             f"HRE seg {seg_idx}/{n_total}: "
             f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
+            f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
+            f"{analysis.get('subtitle_color')} "
             f"type={analysis.get('moment_type')}"
         )
         try:
     else:
         zoom_dir, zoom_speed, moment = "in", "slow", "reaction"
+    _colors    = ["yellow", "white", "cyan", "orange", "white", "yellow"]
+    _positions = ["bottom", "top", "left", "bottom", "right", "top"]
+    _modes     = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
+    _emphasis  = ["punch", "calm", "pop", "punch", "calm", "pop"]
     return {
         "zoom_direction":    zoom_dir,
         "face_cx":           0.5,
         "face_cy":           0.38,
         "subtitle_position": _positions[seg_idx % len(_positions)],
+        "subtitle_mode":     _modes[seg_idx % len(_modes)],
+        "subtitle_emphasis": _emphasis[seg_idx % len(_emphasis)],
         "subtitle_color":    _colors[seg_idx % len(_colors)],
         "energy_level":      "medium",
         "moment_type":       moment,

backend/src/processing/high_retention.py CHANGED Viewed

@@ -1,15 +1,17 @@
 """High-Retention Editing pipeline — per-segment AI decisions.
 Each 3-5s segment gets its own zoom direction, subtitle position,
-and caption color driven by Qwen2.5-VL analyzing one frame per segment.
 Pipeline per clip:
   1. Segment clip at speech pauses (3-5s chunks)
   2. Extract midpoint frame from each segment
   3. Qwen2.5-VL analyzes each frame → zoom + subtitle decisions
   4. ffmpeg filter_complex: per-segment zoompan + concat
-  5. ASS subtitles with per-segment alignment/color/size override tags
 """
 import subprocess
 import tempfile
 from pathlib import Path
@@ -175,35 +177,25 @@ def _build_zoom_exprs(
     if direction == "in":
         if speed == "fast":
-            z_expr, max_zoom = "min(1.2+n*0.0014\\,1.6)", 1.6
         else:
-            z_expr, max_zoom = "min(1.05+n*0.0006\\,1.35)", 1.35
     elif direction == "out":
         if speed == "fast":
-            z_expr, max_zoom = "max(1.6-n*0.0016\\,1.0)", 1.6
         else:
-            z_expr, max_zoom = "max(1.4-n*0.0010\\,1.0)", 1.4
     else:  # hold
-        z_expr, max_zoom = "1.1", 1.1
     if face_detected and direction == "in" and max_zoom > 1.05:
-        raw_cx = int(face_cx * w - w / (max_zoom * 2))
-        raw_cy = int(face_cy * h - h / (max_zoom * 2))
-        safe_cx = max(0, min(w - int(w / max_zoom), raw_cx))
-        safe_cy = max(0, min(h - int(h / max_zoom), raw_cy))
-        ctr_x = w / 2 - w / (max_zoom * 2)
-        ctr_y = h / 2 - h / (max_zoom * 2)
-        x_expr = (
-            f"(iw/2-(iw/zoom/2))+({safe_cx}-{ctr_x:.1f})*(zoom-1)/({max_zoom}-1)"
-        )
-        y_expr = (
-            f"(ih/2-(ih/zoom/2))+({safe_cy}-{ctr_y:.1f})*(zoom-1)/({max_zoom}-1)"
-        )
     else:
         x_expr = "iw/2-(iw/zoom/2)"
         if direction == "in":
             y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
-            y_expr = f"ih*{y_bias:.2f}-(ih/zoom/2)"
         else:
             y_expr = "ih/2-(ih/zoom/2)"
@@ -231,10 +223,12 @@ def _apply_per_segment_zoom(
         e = f"{seg['end']:.3f}"
         z, x, y = _build_zoom_exprs(analysis, w, h)
         zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30"
-        filter_parts.append(f"[0:v]trim={s}:{e},setpts=PTS-STARTPTS,{zp}[v{i}]")
         v_labels.append(f"[v{i}]")
         if has_audio:
-            filter_parts.append(f"[0:a]atrim={s}:{e},asetpts=PTS-STARTPTS[a{i}]")
             a_labels.append(f"[a{i}]")
     n = len(segments)
@@ -270,12 +264,377 @@ _ASS_COLORS = {
     "red":    "&H000000FF",
 }
 def _ts(t: float) -> str:
-    h = int(t // 3600)
-    m = int((t % 3600) // 60)
-    s = t % 60
-    return f"{h}:{m:02d}:{s:06.3f}"
 def _generate_per_segment_subtitles(
@@ -285,42 +644,14 @@ def _generate_per_segment_subtitles(
     segments: list[dict],
     analyses: list[dict],
 ) -> None:
-    """Write ASS with per-segment alignment, color, and font-size overrides."""
-    events: list[dict] = []
-    # Word-level events
-    for seg in transcript.get("segments", []):
-        for w in seg.get("words", []):
-            t0 = max(0.0, float(w.get("start", 0)) - clip_start)
-            t1 = max(0.0, float(w.get("end",   0)) - clip_start)
-            text = w.get("word", w.get("text", "")).strip()
-            if text and t1 > 0:
-                events.append({"start": t0, "end": max(t1, t0 + 0.08), "text": text})
-    # Sentence-level fallback (split into 3-word chunks)
-    if not events:
-        for seg in transcript.get("segments", []):
-            t0 = max(0.0, float(seg.get("start", 0)) - clip_start)
-            t1 = max(0.0, float(seg.get("end",   0)) - clip_start)
-            text = seg.get("text", "").strip()
-            if not text or t1 <= 0:
-                continue
-            wlist = text.split()
-            chunk = 3
-            n_ch = max(1, (len(wlist) + chunk - 1) // chunk)
-            dur = (t1 - t0) / n_ch
-            for j in range(n_ch):
-                events.append({
-                    "start": t0 + j * dur,
-                    "end":   t0 + (j + 1) * dur,
-                    "text":  " ".join(wlist[j * chunk:(j + 1) * chunk]),
-                })
-    def get_an(t: float) -> dict:
-        for seg, an in zip(segments, analyses):
-            if seg["start"] <= t < seg["end"]:
-                return an
-        return analyses[-1] if analyses else {}
     lines = [
         "[Script Info]",
@@ -334,36 +665,28 @@ def _generate_per_segment_subtitles(
         "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
         "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
         "Alignment, MarginL, MarginR, MarginV, Encoding",
-        "Style: Default,Impact,90,&H00FFFFFF,&H0000FFFF,&H00000000,&H80000000,"
-        "-1,0,0,0,100,100,0,0,1,4,0,2,40,40,200,1",
         "",
         "[Events]",
         "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
     ]
     for ev in events:
-        an      = get_an(ev["start"])
-        color   = _ASS_COLORS.get(an.get("subtitle_color", "white"), "&H00FFFFFF")
-        pos     = an.get("subtitle_position", "bottom")
-        energy  = an.get("energy_level", "medium")
-        moment  = an.get("moment_type", "context")
-        alignment = 8 if pos == "top" else 2
-        margin_v  = 120 if pos == "top" else 200
-        fs = (108 if energy == "high" or moment in ("hook", "punchline")
-              else 80 if energy == "low" else 92)
-        # Pop animation: start 130% scale, shrink to 100% in 120ms
-        pop = "{\\fscx130\\fscy130\\t(0,120,\\fscx100\\fscy100)}"
-        tag = f"{{\\an{alignment}\\1c{color}&\\fs{fs}\\b1}}{pop}"
         lines.append(
             f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])},"
-            f"Default,,0,0,{margin_v},,{tag}{ev['text'].upper()}"
         )
     ass_path.write_text("\n".join(lines), encoding="utf-8")
-    logger.debug(f"ASS: {len(events)} events across {len(segments)} segments")
 # ─── Emoji ─────────────────────────────────────────────────────────────────────
@@ -437,7 +760,7 @@ def apply_hre(
     transcript: dict,
     output_path: Path,
 ) -> Path:
-    """Apply per-segment AI-driven HRE: each 3-5s chunk gets its own zoom + subtitle style."""
     output_path.parent.mkdir(parents=True, exist_ok=True)
     clip_start = clip_data.get("start", 0.0)
@@ -464,26 +787,28 @@ def apply_hre(
             _analyze_segment(clip_path, seg, i, n, transcript, clip_start, tmp_dir)
             for i, seg in enumerate(segments)
         ]
-        for i, (seg, an) in enumerate(zip(segments, analyses)):
             logger.info(
                 f"  [{seg['start']:.1f}s-{seg['end']:.1f}s] "
                 f"zoom={an.get('zoom_direction')}({an.get('zoom_speed')}) "
-                f"sub={an.get('subtitle_position')}/{an.get('subtitle_color')} "
                 f"type={an.get('moment_type')} energy={an.get('energy_level')}"
             )
         # 3. Per-segment zoom via filter_complex
         zoomed = _apply_per_segment_zoom(
-            clip_path, segments, analyses, w, h, tmp_zoomed, has_audio=has_audio
         )
         # 4. Per-segment ASS subtitles
         ass_path = output_path.with_suffix(".ass")
-        _generate_per_segment_subtitles(transcript, ass_path, clip_start, segments, analyses)
         # 5. Emoji from highest-energy segment
-        emoji = _get_emoji(clip_data, analyses)
         # 6. Render
         _render_final(zoomed, ass_path, emoji, output_path)

 """High-Retention Editing pipeline — per-segment AI decisions.
 Each 3-5s segment gets its own zoom direction, subtitle position,
+subtitle mode, and caption color driven by Qwen2.5-VL analyzing one
+frame plus the local transcript for that segment.
 Pipeline per clip:
   1. Segment clip at speech pauses (3-5s chunks)
   2. Extract midpoint frame from each segment
   3. Qwen2.5-VL analyzes each frame → zoom + subtitle decisions
   4. ffmpeg filter_complex: per-segment zoompan + concat
+  5. ASS subtitles with per-segment alignment/color/mode override tags
 """
+import json
 import subprocess
 import tempfile
 from pathlib import Path
     if direction == "in":
         if speed == "fast":
+            z_expr, max_zoom = "min(1.12+on*0.0018\\,1.55)", 1.55
         else:
+            z_expr, max_zoom = "min(1.04+on*0.0009\\,1.32)", 1.32
     elif direction == "out":
         if speed == "fast":
+            z_expr, max_zoom = "max(1.48-on*0.0018\\,1.0)", 1.48
         else:
+            z_expr, max_zoom = "max(1.28-on*0.0009\\,1.0)", 1.28
     else:  # hold
+        z_expr, max_zoom = "1.08", 1.08
     if face_detected and direction == "in" and max_zoom > 1.05:
+        x_expr = f"max(0\\,min(iw-iw/zoom\\,iw*{face_cx:.3f}-iw/zoom/2))"
+        y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{face_cy:.3f}-ih/zoom/2))"
     else:
         x_expr = "iw/2-(iw/zoom/2)"
         if direction == "in":
             y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
+            y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{y_bias:.2f}-(ih/zoom/2)))"
         else:
             y_expr = "ih/2-(ih/zoom/2)"
         e = f"{seg['end']:.3f}"
         z, x, y = _build_zoom_exprs(analysis, w, h)
         zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30"
+        filter_parts.append(
+            f"[0:v]trim=start={s}:end={e},setpts=PTS-STARTPTS,fps=30,{zp},setpts=PTS-STARTPTS[v{i}]"
+        )
         v_labels.append(f"[v{i}]")
         if has_audio:
+            filter_parts.append(f"[0:a]atrim=start={s}:end={e},asetpts=PTS-STARTPTS[a{i}]")
             a_labels.append(f"[a{i}]")
     n = len(segments)
     "red":    "&H000000FF",
 }
+_POSITIONS = {"top", "bottom", "left", "right", "center"}
+_MODES = {"word", "phrase", "sentence"}
+_EMPHASIS = {"pop", "punch", "calm"}
 def _ts(t: float) -> str:
+    total_cs = max(0, int(round(t * 100)))
+    h = total_cs // 360000
+    total_cs %= 360000
+    m = total_cs // 6000
+    total_cs %= 6000
+    s = total_cs // 100
+    cs = total_cs % 100
+    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
+def _pick(value: object, allowed: set[str], fallback: str) -> str:
+    v = str(value or "").strip().lower()
+    return v if v in allowed else fallback
def _coerce_flag(value: object) -> bool:
    """Interpret a JSON-ish truth value.

    Strings are compared against common "true" spellings so that a model
    answering ``"false"`` (a non-empty string) is not read as True.
    """
    if isinstance(value, str):
        return value.strip().lower() in {"true", "1", "yes", "y"}
    return bool(value)


def _coerce_unit_interval(value: object, fallback: float) -> float:
    """Return *value* clamped to [0.0, 1.0], or *fallback* if it is not a number."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return fallback
    if number != number:  # NaN never compares equal to itself
        return fallback
    return min(1.0, max(0.0, number))


def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
    """Validate model output and fill HRE fields used by the renderer.

    Args:
        analysis: Raw per-segment analysis dict (may be ``None`` or empty).
        seg_idx: Zero-based index of the segment. Segment 0 is always forced
            to a fast zoom-in and never uses sentence mode or calm emphasis.
        n_total: Total number of segments. Accepted for signature
            compatibility; not used by this function.

    Returns:
        A copy of *analysis* in which every renderer field holds a validated
        value. Unknown keys in *analysis* are preserved.
    """
    an = dict(analysis or {})
    energy = _pick(an.get("energy_level"), {"high", "medium", "low"}, "medium")
    moment = _pick(
        an.get("moment_type"),
        {"hook", "punchline", "context", "reaction", "transition"},
        "context",
    )

    # Caption mode used when the model gave none (or an invalid one).
    if energy == "medium" and moment not in {"context", "transition"}:
        fallback_mode = "phrase"
    elif energy == "high" or moment in {"hook", "punchline", "reaction"}:
        fallback_mode = "word"
    else:
        fallback_mode = "sentence"

    pos = _pick(an.get("subtitle_position"), _POSITIONS, "bottom")
    mode = _pick(an.get("subtitle_mode"), _MODES, fallback_mode)
    emphasis = _pick(
        an.get("subtitle_emphasis"),
        _EMPHASIS,
        "punch" if mode == "word" else "calm",
    )
    color = _pick(an.get("subtitle_color"), set(_ASS_COLORS), "white")
    zoom_direction = _pick(an.get("zoom_direction"), {"in", "out", "hold"}, "in")
    zoom_speed = _pick(an.get("zoom_speed"), {"fast", "slow"}, "slow")

    # Each coordinate falls back on its own, so one bad value does not
    # discard the other.
    face_cx = _coerce_unit_interval(an.get("face_cx", 0.5), 0.5)
    face_cy = _coerce_unit_interval(an.get("face_cy", 0.38), 0.38)

    if seg_idx == 0:
        # The opening segment always gets the most aggressive treatment.
        zoom_direction, zoom_speed = "in", "fast"
        if mode == "sentence":
            mode = "word"
        if emphasis == "calm":
            emphasis = "punch"

    return {
        **an,
        "zoom_direction": zoom_direction,
        "zoom_speed": zoom_speed,
        "face_detected": _coerce_flag(an.get("face_detected", False)),
        "face_cx": face_cx,
        "face_cy": face_cy,
        "subtitle_position": pos,
        "subtitle_mode": mode,
        "subtitle_emphasis": emphasis,
        "subtitle_color": color,
        "energy_level": energy,
        "moment_type": moment,
    }
def _build_hre_plan(segments: list[dict], analyses: list[dict]) -> list[dict]:
    """Pair every segment with its normalised analysis to form the edit plan.

    Each plan entry is the normalised analysis plus ``segment_index``,
    ``start`` and ``end`` copied from the segment. Segments and analyses are
    paired positionally; pairing stops at the shorter of the two lists.

    When a multi-segment plan uses one identical (position, mode) pair
    everywhere, the entries are overwritten with a fixed six-step rotation so
    the caption treatment changes across the clip.
    """
    n_total = len(segments)
    plan = [
        {
            **_normalise_analysis(analysis, idx, n_total),
            "segment_index": idx,
            "start": seg["start"],
            "end": seg["end"],
        }
        for idx, (seg, analysis) in enumerate(zip(segments, analyses))
    ]

    treatments = {
        (entry["subtitle_position"], entry["subtitle_mode"]) for entry in plan
    }
    if len(plan) <= 1 or len(treatments) != 1:
        return plan

    # (position, mode) pairs applied round-robin by segment index.
    rotation = (
        ("bottom", "word"),
        ("top", "sentence"),
        ("left", "phrase"),
        ("bottom", "word"),
        ("right", "sentence"),
        ("top", "phrase"),
    )
    for idx, entry in enumerate(plan):
        position, mode = rotation[idx % len(rotation)]
        entry["subtitle_position"] = position
        entry["subtitle_mode"] = mode
        if mode == "word":
            entry["subtitle_emphasis"] = "punch"
    return plan
+def _ass_escape(text: str) -> str:
+    return (
+        text.replace("{", "(")
+        .replace("}", ")")
+        .replace("\r", " ")
+        .replace("\n", " ")
+        .strip()
+    )
def _wrap_text(text: str, max_chars: int) -> str:
    """Escape *text* and break it into ASS lines of roughly *max_chars*.

    Text that already fits is returned unchanged (after escaping). A single
    unbroken token is cut every *max_chars* characters. Otherwise words are
    packed greedily; the first packed line is kept and everything after it
    is merged onto a second line, so multi-word text yields at most two
    lines. Lines are joined with the ASS hard line break ``\\N``.
    """
    escaped = _ass_escape(text)
    if len(escaped) <= max_chars:
        return escaped

    tokens = escaped.split()
    if len(tokens) <= 1:
        # No whitespace to break on: slice the raw string instead.
        pieces = [
            escaped[offset:offset + max_chars]
            for offset in range(0, len(escaped), max_chars)
        ]
        return r"\N".join(pieces)

    packed: list[str] = []
    current = ""
    for token in tokens:
        extended = f"{current} {token}" if current else token
        if current and len(extended) > max_chars:
            packed.append(current)
            current = token
        else:
            current = extended
    if current:
        packed.append(current)

    head, *tail = packed
    if not tail:
        return head
    return head + r"\N" + " ".join(tail)
+def _collect_clip_words(transcript: dict, clip_start: float, duration: float) -> list[dict]:
+    words: list[dict] = []
+    for seg in transcript.get("segments", []):
+        seg_start = float(seg.get("start", clip_start)) - clip_start
+        seg_end = float(seg.get("end", clip_start)) - clip_start
+        for word in seg.get("words", []):
+            text = str(word.get("word", word.get("text", ""))).strip()
+            if not text:
+                continue
+            start = float(word.get("start", seg_start + clip_start)) - clip_start
+            end = float(word.get("end", word.get("start", seg_end + clip_start))) - clip_start
+            if end <= start:
+                end = start + 0.24
+            if end <= 0 or start >= duration:
+                continue
+            words.append({
+                "start": max(0.0, start),
+                "end": min(duration, end),
+                "text": text,
+            })
+    return sorted(words, key=lambda w: (w["start"], w["end"]))
+def _segment_text(transcript: dict, clip_start: float, seg: dict) -> str:
+    parts: list[str] = []
+    for item in transcript.get("segments", []):
+        start = float(item.get("start", clip_start)) - clip_start
+        end = float(item.get("end", clip_start)) - clip_start
+        if start < seg["end"] and end > seg["start"]:
+            text = str(item.get("text", "")).strip()
+            if text:
+                parts.append(text)
+    return " ".join(parts).strip()
+def _words_in_segment(words: list[dict], seg: dict) -> list[dict]:
+    return [
+        w for w in words
+        if w["start"] < seg["end"] and w["end"] > seg["start"]
+    ]
+def _display_text(text: str, mode: str, emphasis: str) -> str:
+    text = text.strip()
+    if mode == "sentence" and emphasis == "calm":
+        return text
+    return text.upper()
+def _append_event(events: list[dict], start: float, end: float, text: str, plan: dict) -> None:
+    start = max(float(plan["start"]), start)
+    end = min(float(plan["end"]), end)
+    if end - start < 0.08 or not text.strip():
+        return
+    events.append({
+        "start": start,
+        "end": end,
+        "text": text.strip(),
+        "plan": plan,
+    })
def _word_events(words: list[dict], seg: dict, plan: dict) -> list[dict]:
    """Build one caption event per word for a word-mode segment.

    Each word is shown from its own start (or from when the previous caption
    cleared, whichever is later) for at least a minimum and at most a maximum
    duration; both limits are shorter for high-energy segments. A caption is
    cut about 15 ms before the next word starts, but never below the minimum
    duration, and never runs past the segment end. Processing stops once the
    segment is full.
    """
    high_energy = plan["energy_level"] == "high"
    shortest = 0.14 if high_energy else 0.18
    longest = 0.72 if high_energy else 0.95
    seg_start, seg_end = seg["start"], seg["end"]

    # Start time of the word after each word; the segment end follows the last.
    following = [later["start"] for later in words[1:]] + [seg_end]

    shown: list[dict] = []
    free_at = seg_start
    for word, upcoming in zip(words, following):
        begin = max(seg_start, word["start"], free_at)
        finish = min(seg_end, max(word["end"], begin + shortest), begin + longest)
        if upcoming > begin:
            finish = min(finish, max(begin + shortest, upcoming - 0.015))
        if finish <= begin:
            finish = min(seg_end, begin + shortest)
        _append_event(shown, begin, finish, word["text"], plan)
        free_at = finish + 0.015
        if free_at >= seg_end:
            break
    return shown
def _line_events(
    words: list[dict],
    seg: dict,
    plan: dict,
    max_words: int,
    max_duration: float,
    max_chars: int,
) -> list[dict]:
    """Group consecutive words into multi-word caption events for one segment.

    Words are consumed in order. A line is closed as soon as adding the next
    word would exceed *max_words*, *max_duration* seconds, or *max_chars*
    characters; the first word of a line is always accepted. Every line is
    shown for at least 0.55 s (clipped to the segment end) and the next line
    starts no earlier than 0.04 s after the previous one ends. Grouping stops
    when less than 0.08 s of the segment remains.
    """
    captions: list[dict] = []
    seg_start, seg_end = seg["start"], seg["end"]
    total = len(words)
    idx = 0
    next_free = seg_start

    while idx < total and next_free < seg_end - 0.08:
        line_start = max(seg_start, words[idx]["start"], next_free)
        line_end = line_start
        line_words: list[dict] = []
        line_len = 0

        for candidate in words[idx:]:
            cand_end = min(seg_end, max(candidate["end"], candidate["start"] + 0.2))
            # One extra character for the joining space after the first word.
            cand_len = line_len + len(candidate["text"]) + (1 if line_words else 0)
            if line_words and (
                len(line_words) >= max_words
                or cand_end - line_start > max_duration
                or cand_len > max_chars
            ):
                break
            line_words.append(candidate)
            line_len = cand_len
            line_end = max(line_end, cand_end)

        # The first candidate is always accepted, so at least one word is consumed.
        idx += len(line_words)
        line_end = min(seg_end, max(line_end, line_start + 0.55))
        caption = " ".join(item["text"] for item in line_words)
        _append_event(captions, line_start, line_end, caption, plan)
        next_free = line_end + 0.04

    return captions
def _fallback_text_events(text: str, seg: dict, plan: dict) -> list[dict]:
    """Spread *text* evenly over *seg* when no usable word timings exist.

    The text is split into chunks whose size depends on
    ``plan["subtitle_mode"]`` (1, 3 or 7 words for word / phrase / sentence)
    and each chunk receives an equal share of the segment's duration.

    Text without whitespace that is longer than 20 characters is instead cut
    into fixed-width slices (10 / 24 / 36 characters). Each slice is one
    caption; slices are not regrouped by the word-count chunk size, which
    would multiply the caption length and insert spaces into the text.

    Returns:
        The caption events, possibly empty. Chunks whose time share is
        shorter than the minimum accepted by ``_append_event`` are dropped.
    """
    if not text:
        return []

    mode = plan["subtitle_mode"]
    if mode == "word":
        chunk_size = 1
    elif mode == "phrase":
        chunk_size = 3
    else:
        chunk_size = 7

    units = text.split()
    if len(units) <= 1 and len(text) > 20:
        step = 10 if mode == "word" else 24 if mode == "phrase" else 36
        units = [text[i:i + step] for i in range(0, len(text), step)]
        chunk_size = 1  # slices are already caption-sized

    chunks = [" ".join(units[i:i + chunk_size]) for i in range(0, len(units), chunk_size)]
    chunks = [c for c in chunks if c.strip()]
    if not chunks:
        return []

    events: list[dict] = []
    seg_d = max(0.1, seg["end"] - seg["start"])
    dur = seg_d / len(chunks)
    for i, chunk in enumerate(chunks):
        start = seg["start"] + i * dur
        end = seg["start"] + (i + 1) * dur
        _append_event(events, start, end, chunk, plan)
    return events
def _partition_words(
    words: list[dict],
    segments: list[dict],
) -> tuple[list[list[dict]], list[bool]]:
    """Give each word to the single segment it overlaps the most.

    Returns ``(owned, touched)``: ``owned[i]`` lists the words assigned to
    segment *i* (input order kept) and ``touched[i]`` says whether any word
    overlaps segment *i* at all. Ties go to the earlier segment.
    """
    owned: list[list[dict]] = [[] for _ in segments]
    touched = [False] * len(segments)
    for word in words:
        best_idx, best_overlap = -1, 0.0
        for idx, seg in enumerate(segments):
            if not (word["start"] < seg["end"] and word["end"] > seg["start"]):
                continue
            touched[idx] = True
            overlap = min(word["end"], seg["end"]) - max(word["start"], seg["start"])
            if best_idx < 0 or overlap > best_overlap:
                best_idx, best_overlap = idx, overlap
        if best_idx >= 0:
            owned[best_idx].append(word)
    return owned, touched


def _build_subtitle_events(
    transcript: dict,
    clip_start: float,
    duration: float,
    segments: list[dict],
    plan: list[dict],
) -> list[dict]:
    """Build the non-overlapping caption events for the whole clip.

    Word timings are collected once and each word is assigned to exactly one
    segment (the one it overlaps most), so a word that straddles a segment
    boundary is captioned once rather than in both segments. Each segment
    then builds its events according to its plan's ``subtitle_mode``. A
    segment that produces no events falls back to evenly spread segment
    text, unless its overlapping words are all captioned by a neighbour.

    Finally the events are sorted and trimmed so that at most one caption is
    visible at any time; events shorter than 0.08 s after trimming are
    dropped.
    """
    words = _collect_clip_words(transcript, clip_start, duration)
    paired = list(zip(segments, plan))
    owned, touched = _partition_words(words, [seg for seg, _ in paired])

    events: list[dict] = []
    for (seg, seg_plan), seg_words, has_overlap in zip(paired, owned, touched):
        mode = seg_plan["subtitle_mode"]
        if not seg_words:
            seg_events = []
        elif mode == "word":
            seg_events = _word_events(seg_words, seg, seg_plan)
        elif mode == "phrase":
            seg_events = _line_events(
                seg_words, seg, seg_plan, max_words=3, max_duration=1.7, max_chars=28
            )
        else:
            seg_events = _line_events(
                seg_words, seg, seg_plan, max_words=7, max_duration=2.8, max_chars=44
            )
        # Skip the text fallback when the only overlapping words belong to a
        # neighbouring segment: they are already captioned there.
        if not seg_events and (seg_words or not has_overlap):
            seg_events = _fallback_text_events(
                _segment_text(transcript, clip_start, seg), seg, seg_plan
            )
        events.extend(seg_events)

    events = sorted(events, key=lambda ev: (ev["start"], ev["end"]))

    # ASS draws all active events at once; keep one visible caption event at a
    # time so word/phrase/sentence modes never stack on top of each other.
    cleaned: list[dict] = []
    cursor = 0.0
    for ev in events:
        start = max(ev["start"], cursor)
        end = min(duration, ev["end"])
        if end - start < 0.08:
            continue
        cleaned.append({**ev, "start": start, "end": end})
        cursor = end + 0.01
    return cleaned
+def _subtitle_tag(plan: dict) -> tuple[str, int]:
+    pos = plan["subtitle_position"]
+    mode = plan["subtitle_mode"]
+    energy = plan["energy_level"]
+    emphasis = plan["subtitle_emphasis"]
+    color = _ASS_COLORS.get(plan["subtitle_color"], "&H00FFFFFF")
+    anchors = {
+        "top": (8, 540, 230),
+        "bottom": (2, 540, 1660),
+        "left": (4, 95, 960),
+        "right": (6, 985, 960),
+        "center": (5, 540, 960),
+    }
+    alignment, x, y = anchors.get(pos, anchors["bottom"])
+    if mode == "sentence":
+        font_size = 66 if energy != "high" else 74
+        max_chars = 34
+    elif mode == "phrase":
+        font_size = 82 if energy != "low" else 76
+        max_chars = 24
+    else:
+        font_size = 102 if energy == "high" else 92
+        max_chars = 18
+    if pos in {"left", "right"}:
+        font_size -= 8
+        max_chars = min(max_chars, 22)
+    base = (
+        f"{{\\an{alignment}\\pos({x},{y})\\1c{color}&\\fs{font_size}"
+        "\\b1\\bord5\\shad1\\q2}}"
+    )
+    if emphasis in {"pop", "punch"} or mode == "word":
+        base += "{\\fscx125\\fscy125\\t(0,120,\\fscx100\\fscy100)}"
+    return base, max_chars
 def _generate_per_segment_subtitles(
     segments: list[dict],
     analyses: list[dict],
 ) -> None:
+    """Write one ASS file from the HRE plan.
+    The important rule is that HRE can change style every segment, but it must
+    never emit simultaneous caption events at the same timestamp.
+    """
+    duration = max((float(seg["end"]) for seg in segments), default=0.0)
+    plan = _build_hre_plan(segments, analyses)
+    events = _build_subtitle_events(transcript, clip_start, duration, segments, plan)
     lines = [
         "[Script Info]",
         "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
         "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
         "Alignment, MarginL, MarginR, MarginV, Encoding",
+        "Style: Default,Noto Sans,82,&H00FFFFFF,&H0000FFFF,&H00000000,&H80000000,"
+        "-1,0,0,0,100,100,0,0,1,5,1,2,40,40,200,1",
         "",
         "[Events]",
         "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
     ]
     for ev in events:
+        seg_plan = ev["plan"]
+        tag, max_chars = _subtitle_tag(seg_plan)
+        text = _display_text(ev["text"], seg_plan["subtitle_mode"], seg_plan["subtitle_emphasis"])
+        text = _wrap_text(text, max_chars)
         lines.append(
             f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])},"
+            f"Default,,0,0,0,,{tag}{text}"
         )
     ass_path.write_text("\n".join(lines), encoding="utf-8")
+    plan_path = ass_path.with_suffix(".hre_plan.json")
+    plan_path.write_text(json.dumps(plan, ensure_ascii=False, indent=2), encoding="utf-8")
+    logger.debug(f"ASS: {len(events)} events across {len(segments)} HRE segments")
 # ─── Emoji ─────────────────────────────────────────────────────────────────────
     transcript: dict,
     output_path: Path,
 ) -> Path:
+    """Apply per-segment AI-driven HRE with varied zoom and caption plans."""
     output_path.parent.mkdir(parents=True, exist_ok=True)
     clip_start = clip_data.get("start", 0.0)
             _analyze_segment(clip_path, seg, i, n, transcript, clip_start, tmp_dir)
             for i, seg in enumerate(segments)
         ]
+        plan = _build_hre_plan(segments, analyses)
+        for i, (seg, an) in enumerate(zip(segments, plan)):
             logger.info(
                 f"  [{seg['start']:.1f}s-{seg['end']:.1f}s] "
                 f"zoom={an.get('zoom_direction')}({an.get('zoom_speed')}) "
+                f"sub={an.get('subtitle_position')}/{an.get('subtitle_mode')}/"
+                f"{an.get('subtitle_color')} "
                 f"type={an.get('moment_type')} energy={an.get('energy_level')}"
             )
         # 3. Per-segment zoom via filter_complex
         zoomed = _apply_per_segment_zoom(
+            clip_path, segments, plan, w, h, tmp_zoomed, has_audio=has_audio
         )
         # 4. Per-segment ASS subtitles
         ass_path = output_path.with_suffix(".ass")
+        _generate_per_segment_subtitles(transcript, ass_path, clip_start, segments, plan)
         # 5. Emoji from highest-energy segment
+        emoji = _get_emoji(clip_data, plan)
         # 6. Render
         _render_final(zoomed, ass_path, emoji, output_path)

frontend/components/ClipSettings.tsx CHANGED Viewed

@@ -40,8 +40,8 @@ const L = {
     normalTitle: "Normal Subtitles",
     normalDesc: "Customize font, colors, animations",
     hreTitle: "High-Retention",
-    hreDesc: "AI picks everything + auto-zoom + jump cuts",
-    hreInfo: "AI will auto-select font/colors/animation, remove silence, zoom on faces, and add emoji overlays.",
   },
   th: {
     style: "สไตล์คลิป",
@@ -53,8 +53,8 @@ const L = {
     normalTitle: "ซับปกติ",
     normalDesc: "เลือกรูปแบบซับได้เอง",
     hreTitle: "High-Retention",
-    hreDesc: "AI เลือกทุกอย่างให้ + auto-zoom + jump cuts",
-    hreInfo: "AI จะเลือก font/สี/animation + ตัด silence + zoom หน้าคน + ใส่ emoji ให้อัตโนมัติ",
   },
   zh: {
     style: "片段风格",
@@ -66,8 +66,8 @@ const L = {
     normalTitle: "普通字幕",
     normalDesc: "自定义字体、颜色、动画",
     hreTitle: "高留存",
-    hreDesc: "AI 自动处理 + 自动缩放 + 跳切",
-    hreInfo: "AI 将自动选择字体/颜色/动画，去除静音段，放大人脸，并添加表情覆盖。",
   },
 } as const;

     normalTitle: "Normal Subtitles",
     normalDesc: "Customize font, colors, animations",
     hreTitle: "High-Retention",
+    hreDesc: "AI picks timing, captions, and zoom",
+    hreInfo: "AI will create a per-segment edit plan, vary caption placement/mode, zoom on key moments, and add emoji overlays.",
   },
   th: {
     style: "สไตล์คลิป",
     normalTitle: "ซับปกติ",
     normalDesc: "เลือกรูปแบบซับได้เอง",
     hreTitle: "High-Retention",
+    hreDesc: "AI เลือกจังหวะ ซับ และซูมให้",
+    hreInfo: "AI จะสร้างแผนตัดต่อรายช่วง เลือกตำแหน่ง/รูปแบบซับ ซูมช่วงสำคัญ และใส่ emoji ให้อัตโนมัติ",
   },
   zh: {
     style: "片段风格",
     normalTitle: "普通字幕",
     normalDesc: "自定义字体、颜色、动画",
     hreTitle: "高留存",
+    hreDesc: "AI 自动选择节奏、字幕和缩放",
+    hreInfo: "AI 将生成分段剪辑计划，调整字幕位置/模式，缩放关键时刻，并添加表情覆盖。",
   },
 } as const;

frontend/messages/en.json CHANGED Viewed

@@ -20,7 +20,7 @@
     "mode_label": "Editing Mode",
     "normal_mode": "Normal Subtitles",
     "hre_mode": "High-Retention Editing (AI decides)",
-    "hre_hint": "AI will auto-select caption style, apply auto-zoom, silence removal, and TikTok-style cuts."
   },
   "step3": {
     "title": "Subtitle Designer",

     "mode_label": "Editing Mode",
     "normal_mode": "Normal Subtitles",
     "hre_mode": "High-Retention Editing (AI decides)",
+    "hre_hint": "AI will create a per-segment edit plan with varied captions, auto-zoom, and TikTok-style emphasis."
   },
   "step3": {
     "title": "Subtitle Designer",

frontend/messages/th.json CHANGED Viewed

@@ -23,7 +23,7 @@
     "mode_label": "โหมดการตัด",
     "normal_mode": "ซับปกติ",
     "hre_mode": "High-Retention Editing (AI เลือกให้)",
-    "hre_hint": "AI จะเลือกรูปแบบซับ จัด auto-zoom และตัดแบบ TikTok ให้อัตโนมัติ"
   },
   "step3": {
     "title": "ออกแบบซับไตเติ้ล",

     "mode_label": "โหมดการตัด",
     "normal_mode": "ซับปกติ",
     "hre_mode": "High-Retention Editing (AI เลือกให้)",
+    "hre_hint": "AI จะสร้างแผนตัดต่อรายช่วง เลือกซับหลายรูปแบบ จัด auto-zoom และเน้นจังหวะแบบ TikTok"
   },
   "step3": {
     "title": "ออกแบบซับไตเติ้ล",

frontend/messages/zh.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "nav": { "brand": "ElevenClip AI", "tagline": "AI智能剪辑精彩片段" },
   "step1": { "title": "添加视频", "upload_tab": "上传文件", "youtube_tab": "YouTube链接", "drop_hint": "拖放视频文件到此处，或点击选择", "youtube_placeholder": "粘贴YouTube链接...", "channel_label": "频道描述（可选）", "channel_placeholder": "例如：中文游戏频道，专注于搞笑时刻", "fetch_info": "获取信息" },
-  "step2": { "title": "剪辑设置", "style_label": "剪辑风格", "duration_label": "目标时长（秒）", "count_label": "剪辑数量", "clip_lang_label": "视频语言", "sub_lang_label": "字幕语言", "mode_label": "编辑模式", "normal_mode": "普通字幕", "hre_mode": "高留存率编辑（AI决定）", "hre_hint": "AI将自动选择字幕样式、应用自动缩放、去除静音并进行TikTok风格剪辑。" },
   "step3": { "title": "字幕设计", "font_label": "字体", "size_label": "字体大小", "primary_color": "主要颜色", "secondary_color": "卡拉OK颜色", "outline_color": "描边颜色", "shadow_color": "阴影颜色", "outline_size": "描边大小", "shadow_size": "阴影大小", "display_mode": "显示模式", "word_by_word": "逐字", "sentence": "句子", "animation": "动画", "alignment": "对齐", "preview": "预览" },
   "generate": { "button": "生成剪辑", "processing": "处理中..." },
   "styles": { "funny": "搞笑", "serious": "严肃", "educational": "教育", "gaming": "游戏", "entertainment": "娱乐" },

 {
   "nav": { "brand": "ElevenClip AI", "tagline": "AI智能剪辑精彩片段" },
   "step1": { "title": "添加视频", "upload_tab": "上传文件", "youtube_tab": "YouTube链接", "drop_hint": "拖放视频文件到此处，或点击选择", "youtube_placeholder": "粘贴YouTube链接...", "channel_label": "频道描述（可选）", "channel_placeholder": "例如：中文游戏频道，专注于搞笑时刻", "fetch_info": "获取信息" },
+  "step2": { "title": "剪辑设置", "style_label": "剪辑风格", "duration_label": "目标时长（秒）", "count_label": "剪辑数量", "clip_lang_label": "视频语言", "sub_lang_label": "字幕语言", "mode_label": "编辑模式", "normal_mode": "普通字幕", "hre_mode": "高留存率编辑（AI决定）", "hre_hint": "AI将生成分段剪辑计划，改变字幕样式，应用自动缩放，并突出TikTok节奏。" },
   "step3": { "title": "字幕设计", "font_label": "字体", "size_label": "字体大小", "primary_color": "主要颜色", "secondary_color": "卡拉OK颜色", "outline_color": "描边颜色", "shadow_color": "阴影颜色", "outline_size": "描边大小", "shadow_size": "阴影大小", "display_mode": "显示模式", "word_by_word": "逐字", "sentence": "句子", "animation": "动画", "alignment": "对齐", "preview": "预览" },
   "generate": { "button": "生成剪辑", "processing": "处理中..." },
   "styles": { "funny": "搞笑", "serious": "严肃", "educational": "教育", "gaming": "游戏", "entertainment": "娱乐" },

frontend/next-env.d.ts ADDED Viewed

	@@ -0,0 +1,6 @@

+/// <reference types="next" />
+/// <reference types="next/image-types/global" />
+import "./.next/types/routes.d.ts";
+// NOTE: This file should not be edited
+// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.