Spaces:
Runtime error
Runtime error
fix: add per-segment HRE edit plans
Browse files- README.md +10 -9
- backend/src/analysis/vision.py +20 -8
- backend/src/processing/high_retention.py +410 -85
- frontend/components/ClipSettings.tsx +6 -6
- frontend/messages/en.json +1 -1
- frontend/messages/th.json +1 -1
- frontend/messages/zh.json +1 -1
- frontend/next-env.d.ts +6 -0
README.md
CHANGED
|
@@ -96,11 +96,11 @@ where:
|
|
| 96 |
│ │
|
| 97 |
│ Normal Mode HRE (High-Retention Editing) │
|
| 98 |
│ ───────────── ────────────────────────────── │
|
| 99 |
-
│ • pysubs2 ASS •
|
| 100 |
-
│ • User style config • Auto-zoom
|
| 101 |
-
│ • Font/color/animation •
|
| 102 |
-
│ • Karaoke/pop/fade •
|
| 103 |
-
│ • AMD AMF encode •
|
| 104 |
└──────────────────────────────────────────────────────────────────┘
|
| 105 |
│
|
| 106 |
▼
|
|
@@ -139,11 +139,12 @@ Full creative control over:
|
|
| 139 |
|
| 140 |
### High-Retention Editing (HRE)
|
| 141 |
AI chooses everything:
|
| 142 |
-
-
|
| 143 |
-
- Auto-zoom
|
| 144 |
-
-
|
|
|
|
|
|
|
| 145 |
- Qwen2.5-VL selects contextually-appropriate emoji overlay
|
| 146 |
-
- Impact 64px bold white captions, word-by-word, pop animation
|
| 147 |
|
| 148 |
---
|
| 149 |
|
|
|
|
| 96 |
│ │
|
| 97 |
│ Normal Mode HRE (High-Retention Editing) │
|
| 98 |
│ ───────────── ────────────────────────────── │
|
| 99 |
+
│ • pysubs2 ASS • Per-segment AI edit plan │
|
| 100 |
+
│ • User style config • Auto-zoom per segment (zoompan) │
|
| 101 |
+
│ • Font/color/animation • Word / phrase / sentence captions │
|
| 102 |
+
│ • Karaoke/pop/fade • Top / bottom / left / right captions │
|
| 103 |
+
│ • AMD AMF encode • Qwen2.5-VL emoji selection │
|
| 104 |
└──────────────────────────────────────────────────────────────────┘
|
| 105 |
│
|
| 106 |
▼
|
|
|
|
| 139 |
|
| 140 |
### High-Retention Editing (HRE)
|
| 141 |
AI chooses everything:
|
| 142 |
+
- A per-segment edit plan with timestamps
|
| 143 |
+
- Auto-zoom direction and speed per segment (`ffmpeg zoompan`)
|
| 144 |
+
- Caption mode per segment: word, phrase, or sentence
|
| 145 |
+
- Caption placement per segment: top, bottom, left, right, or center
|
| 146 |
+
- Caption color, size, and pop emphasis based on segment energy
|
| 147 |
- Qwen2.5-VL selects contextually-appropriate emoji overlay
|
|
|
|
| 148 |
|
| 149 |
---
|
| 150 |
|
backend/src/analysis/vision.py
CHANGED
|
@@ -172,7 +172,9 @@ Respond ONLY with valid JSON — no markdown:
|
|
| 172 |
"face_detected": <true|false>,
|
| 173 |
"face_cx": <0.0-1.0>,
|
| 174 |
"face_cy": <0.0-1.0>,
|
| 175 |
-
"subtitle_position": "<top|bottom>",
|
|
|
|
|
|
|
| 176 |
"subtitle_color": "<white|yellow|cyan|orange|green>",
|
| 177 |
"energy_level": "<high|medium|low>",
|
| 178 |
"moment_type": "<hook|punchline|context|reaction|transition>"
|
|
@@ -184,8 +186,13 @@ Rules:
|
|
| 184 |
- zoom IN slow: context, buildup, moderate energy
|
| 185 |
- zoom OUT: reveals, breathing room after intensity
|
| 186 |
- HOLD: stable content, text-heavy moments
|
| 187 |
-
- subtitle
|
| 188 |
-
- subtitle
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
- face_cx/face_cy: face center as 0.0-1.0 fraction of frame
|
| 190 |
"""
|
| 191 |
|
|
@@ -196,7 +203,7 @@ def analyze_frame_for_hre(
|
|
| 196 |
seg_idx: int = 0,
|
| 197 |
n_total: int = 1,
|
| 198 |
) -> dict:
|
| 199 |
-
"""Per-segment HRE: zoom
|
| 200 |
try:
|
| 201 |
from openai import OpenAI
|
| 202 |
|
|
@@ -227,11 +234,12 @@ def analyze_frame_for_hre(
|
|
| 227 |
if raw.startswith("json"):
|
| 228 |
raw = raw[4:]
|
| 229 |
|
| 230 |
-
analysis = json.loads(raw.strip())
|
| 231 |
logger.debug(
|
| 232 |
f"HRE seg {seg_idx}/{n_total}: "
|
| 233 |
f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
|
| 234 |
-
f"sub={analysis.get('subtitle_position')}/{analysis.get('
|
|
|
|
| 235 |
f"type={analysis.get('moment_type')}"
|
| 236 |
)
|
| 237 |
try:
|
|
@@ -257,8 +265,10 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
|
|
| 257 |
else:
|
| 258 |
zoom_dir, zoom_speed, moment = "in", "slow", "reaction"
|
| 259 |
|
| 260 |
-
_colors = ["yellow", "white",
|
| 261 |
-
_positions = ["bottom", "top",
|
|
|
|
|
|
|
| 262 |
|
| 263 |
return {
|
| 264 |
"zoom_direction": zoom_dir,
|
|
@@ -267,6 +277,8 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
|
|
| 267 |
"face_cx": 0.5,
|
| 268 |
"face_cy": 0.38,
|
| 269 |
"subtitle_position": _positions[seg_idx % len(_positions)],
|
|
|
|
|
|
|
| 270 |
"subtitle_color": _colors[seg_idx % len(_colors)],
|
| 271 |
"energy_level": "medium",
|
| 272 |
"moment_type": moment,
|
|
|
|
| 172 |
"face_detected": <true|false>,
|
| 173 |
"face_cx": <0.0-1.0>,
|
| 174 |
"face_cy": <0.0-1.0>,
|
| 175 |
+
"subtitle_position": "<top|bottom|left|right|center>",
|
| 176 |
+
"subtitle_mode": "<word|phrase|sentence>",
|
| 177 |
+
"subtitle_emphasis": "<pop|punch|calm>",
|
| 178 |
"subtitle_color": "<white|yellow|cyan|orange|green>",
|
| 179 |
"energy_level": "<high|medium|low>",
|
| 180 |
"moment_type": "<hook|punchline|context|reaction|transition>"
|
|
|
|
| 186 |
- zoom IN slow: context, buildup, moderate energy
|
| 187 |
- zoom OUT: reveals, breathing room after intensity
|
| 188 |
- HOLD: stable content, text-heavy moments
|
| 189 |
+
- subtitle WORD: short hooks, reactions, punchlines, important keywords
|
| 190 |
+
- subtitle PHRASE: fast but understandable speech, 2-4 words at a time
|
| 191 |
+
- subtitle SENTENCE: explanation, normal conversation, low/medium energy
|
| 192 |
+
- subtitle TOP: face is in bottom half
|
| 193 |
+
- subtitle BOTTOM: face is in top half
|
| 194 |
+
- subtitle LEFT/RIGHT: face or main object is on the opposite side
|
| 195 |
+
- Avoid choosing the exact same subtitle_position and subtitle_mode for every segment.
|
| 196 |
- face_cx/face_cy: face center as 0.0-1.0 fraction of frame
|
| 197 |
"""
|
| 198 |
|
|
|
|
| 203 |
seg_idx: int = 0,
|
| 204 |
n_total: int = 1,
|
| 205 |
) -> dict:
|
| 206 |
+
"""Per-segment HRE: zoom, caption placement, caption mode, and color."""
|
| 207 |
try:
|
| 208 |
from openai import OpenAI
|
| 209 |
|
|
|
|
| 234 |
if raw.startswith("json"):
|
| 235 |
raw = raw[4:]
|
| 236 |
|
| 237 |
+
analysis = {**_default_hre_analysis(seg_idx, n_total), **json.loads(raw.strip())}
|
| 238 |
logger.debug(
|
| 239 |
f"HRE seg {seg_idx}/{n_total}: "
|
| 240 |
f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
|
| 241 |
+
f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
|
| 242 |
+
f"{analysis.get('subtitle_color')} "
|
| 243 |
f"type={analysis.get('moment_type')}"
|
| 244 |
)
|
| 245 |
try:
|
|
|
|
| 265 |
else:
|
| 266 |
zoom_dir, zoom_speed, moment = "in", "slow", "reaction"
|
| 267 |
|
| 268 |
+
_colors = ["yellow", "white", "cyan", "orange", "white", "yellow"]
|
| 269 |
+
_positions = ["bottom", "top", "left", "bottom", "right", "top"]
|
| 270 |
+
_modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
|
| 271 |
+
_emphasis = ["punch", "calm", "pop", "punch", "calm", "pop"]
|
| 272 |
|
| 273 |
return {
|
| 274 |
"zoom_direction": zoom_dir,
|
|
|
|
| 277 |
"face_cx": 0.5,
|
| 278 |
"face_cy": 0.38,
|
| 279 |
"subtitle_position": _positions[seg_idx % len(_positions)],
|
| 280 |
+
"subtitle_mode": _modes[seg_idx % len(_modes)],
|
| 281 |
+
"subtitle_emphasis": _emphasis[seg_idx % len(_emphasis)],
|
| 282 |
"subtitle_color": _colors[seg_idx % len(_colors)],
|
| 283 |
"energy_level": "medium",
|
| 284 |
"moment_type": moment,
|
backend/src/processing/high_retention.py
CHANGED
|
@@ -1,15 +1,17 @@
|
|
| 1 |
"""High-Retention Editing pipeline — per-segment AI decisions.
|
| 2 |
|
| 3 |
Each 3-5s segment gets its own zoom direction, subtitle position,
|
| 4 |
-
and caption color driven by Qwen2.5-VL analyzing one
|
|
|
|
| 5 |
|
| 6 |
Pipeline per clip:
|
| 7 |
1. Segment clip at speech pauses (3-5s chunks)
|
| 8 |
2. Extract midpoint frame from each segment
|
| 9 |
3. Qwen2.5-VL analyzes each frame → zoom + subtitle decisions
|
| 10 |
4. ffmpeg filter_complex: per-segment zoompan + concat
|
| 11 |
-
5. ASS subtitles with per-segment alignment/color/
|
| 12 |
"""
|
|
|
|
| 13 |
import subprocess
|
| 14 |
import tempfile
|
| 15 |
from pathlib import Path
|
|
@@ -175,35 +177,25 @@ def _build_zoom_exprs(
|
|
| 175 |
|
| 176 |
if direction == "in":
|
| 177 |
if speed == "fast":
|
| 178 |
-
z_expr, max_zoom = "min(1.
|
| 179 |
else:
|
| 180 |
-
z_expr, max_zoom = "min(1.
|
| 181 |
elif direction == "out":
|
| 182 |
if speed == "fast":
|
| 183 |
-
z_expr, max_zoom = "max(1.
|
| 184 |
else:
|
| 185 |
-
z_expr, max_zoom = "max(1.
|
| 186 |
else: # hold
|
| 187 |
-
z_expr, max_zoom = "1.
|
| 188 |
|
| 189 |
if face_detected and direction == "in" and max_zoom > 1.05:
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
safe_cx = max(0, min(w - int(w / max_zoom), raw_cx))
|
| 193 |
-
safe_cy = max(0, min(h - int(h / max_zoom), raw_cy))
|
| 194 |
-
ctr_x = w / 2 - w / (max_zoom * 2)
|
| 195 |
-
ctr_y = h / 2 - h / (max_zoom * 2)
|
| 196 |
-
x_expr = (
|
| 197 |
-
f"(iw/2-(iw/zoom/2))+({safe_cx}-{ctr_x:.1f})*(zoom-1)/({max_zoom}-1)"
|
| 198 |
-
)
|
| 199 |
-
y_expr = (
|
| 200 |
-
f"(ih/2-(ih/zoom/2))+({safe_cy}-{ctr_y:.1f})*(zoom-1)/({max_zoom}-1)"
|
| 201 |
-
)
|
| 202 |
else:
|
| 203 |
x_expr = "iw/2-(iw/zoom/2)"
|
| 204 |
if direction == "in":
|
| 205 |
y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
|
| 206 |
-
y_expr = f"ih*{y_bias:.2f}-(ih/zoom/2)"
|
| 207 |
else:
|
| 208 |
y_expr = "ih/2-(ih/zoom/2)"
|
| 209 |
|
|
@@ -231,10 +223,12 @@ def _apply_per_segment_zoom(
|
|
| 231 |
e = f"{seg['end']:.3f}"
|
| 232 |
z, x, y = _build_zoom_exprs(analysis, w, h)
|
| 233 |
zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30"
|
| 234 |
-
filter_parts.append(
|
|
|
|
|
|
|
| 235 |
v_labels.append(f"[v{i}]")
|
| 236 |
if has_audio:
|
| 237 |
-
filter_parts.append(f"[0:a]atrim={s}:{e},asetpts=PTS-STARTPTS[a{i}]")
|
| 238 |
a_labels.append(f"[a{i}]")
|
| 239 |
|
| 240 |
n = len(segments)
|
|
@@ -270,12 +264,377 @@ _ASS_COLORS = {
|
|
| 270 |
"red": "&H000000FF",
|
| 271 |
}
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
def _ts(t: float) -> str:
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
|
| 281 |
def _generate_per_segment_subtitles(
|
|
@@ -285,42 +644,14 @@ def _generate_per_segment_subtitles(
|
|
| 285 |
segments: list[dict],
|
| 286 |
analyses: list[dict],
|
| 287 |
) -> None:
|
| 288 |
-
"""Write ASS
|
| 289 |
-
events: list[dict] = []
|
| 290 |
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
if text and t1 > 0:
|
| 298 |
-
events.append({"start": t0, "end": max(t1, t0 + 0.08), "text": text})
|
| 299 |
-
|
| 300 |
-
# Sentence-level fallback (split into 3-word chunks)
|
| 301 |
-
if not events:
|
| 302 |
-
for seg in transcript.get("segments", []):
|
| 303 |
-
t0 = max(0.0, float(seg.get("start", 0)) - clip_start)
|
| 304 |
-
t1 = max(0.0, float(seg.get("end", 0)) - clip_start)
|
| 305 |
-
text = seg.get("text", "").strip()
|
| 306 |
-
if not text or t1 <= 0:
|
| 307 |
-
continue
|
| 308 |
-
wlist = text.split()
|
| 309 |
-
chunk = 3
|
| 310 |
-
n_ch = max(1, (len(wlist) + chunk - 1) // chunk)
|
| 311 |
-
dur = (t1 - t0) / n_ch
|
| 312 |
-
for j in range(n_ch):
|
| 313 |
-
events.append({
|
| 314 |
-
"start": t0 + j * dur,
|
| 315 |
-
"end": t0 + (j + 1) * dur,
|
| 316 |
-
"text": " ".join(wlist[j * chunk:(j + 1) * chunk]),
|
| 317 |
-
})
|
| 318 |
-
|
| 319 |
-
def get_an(t: float) -> dict:
|
| 320 |
-
for seg, an in zip(segments, analyses):
|
| 321 |
-
if seg["start"] <= t < seg["end"]:
|
| 322 |
-
return an
|
| 323 |
-
return analyses[-1] if analyses else {}
|
| 324 |
|
| 325 |
lines = [
|
| 326 |
"[Script Info]",
|
|
@@ -334,36 +665,28 @@ def _generate_per_segment_subtitles(
|
|
| 334 |
"OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
|
| 335 |
"ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
|
| 336 |
"Alignment, MarginL, MarginR, MarginV, Encoding",
|
| 337 |
-
"Style: Default,
|
| 338 |
-
"-1,0,0,0,100,100,0,0,1,
|
| 339 |
"",
|
| 340 |
"[Events]",
|
| 341 |
"Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
|
| 342 |
]
|
| 343 |
|
| 344 |
for ev in events:
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
moment = an.get("moment_type", "context")
|
| 350 |
-
|
| 351 |
-
alignment = 8 if pos == "top" else 2
|
| 352 |
-
margin_v = 120 if pos == "top" else 200
|
| 353 |
-
fs = (108 if energy == "high" or moment in ("hook", "punchline")
|
| 354 |
-
else 80 if energy == "low" else 92)
|
| 355 |
-
|
| 356 |
-
# Pop animation: start 130% scale, shrink to 100% in 120ms
|
| 357 |
-
pop = "{\\fscx130\\fscy130\\t(0,120,\\fscx100\\fscy100)}"
|
| 358 |
-
tag = f"{{\\an{alignment}\\1c{color}&\\fs{fs}\\b1}}{pop}"
|
| 359 |
|
| 360 |
lines.append(
|
| 361 |
f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])},"
|
| 362 |
-
f"Default,,0,0,
|
| 363 |
)
|
| 364 |
|
| 365 |
ass_path.write_text("\n".join(lines), encoding="utf-8")
|
| 366 |
-
|
|
|
|
|
|
|
| 367 |
|
| 368 |
|
| 369 |
# ─── Emoji ─────────────────────────────────────────────────────────────────────
|
|
@@ -437,7 +760,7 @@ def apply_hre(
|
|
| 437 |
transcript: dict,
|
| 438 |
output_path: Path,
|
| 439 |
) -> Path:
|
| 440 |
-
"""Apply per-segment AI-driven HRE
|
| 441 |
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 442 |
clip_start = clip_data.get("start", 0.0)
|
| 443 |
|
|
@@ -464,26 +787,28 @@ def apply_hre(
|
|
| 464 |
_analyze_segment(clip_path, seg, i, n, transcript, clip_start, tmp_dir)
|
| 465 |
for i, seg in enumerate(segments)
|
| 466 |
]
|
|
|
|
| 467 |
|
| 468 |
-
for i, (seg, an) in enumerate(zip(segments,
|
| 469 |
logger.info(
|
| 470 |
f" [{seg['start']:.1f}s-{seg['end']:.1f}s] "
|
| 471 |
f"zoom={an.get('zoom_direction')}({an.get('zoom_speed')}) "
|
| 472 |
-
f"sub={an.get('subtitle_position')}/{an.get('
|
|
|
|
| 473 |
f"type={an.get('moment_type')} energy={an.get('energy_level')}"
|
| 474 |
)
|
| 475 |
|
| 476 |
# 3. Per-segment zoom via filter_complex
|
| 477 |
zoomed = _apply_per_segment_zoom(
|
| 478 |
-
clip_path, segments,
|
| 479 |
)
|
| 480 |
|
| 481 |
# 4. Per-segment ASS subtitles
|
| 482 |
ass_path = output_path.with_suffix(".ass")
|
| 483 |
-
_generate_per_segment_subtitles(transcript, ass_path, clip_start, segments,
|
| 484 |
|
| 485 |
# 5. Emoji from highest-energy segment
|
| 486 |
-
emoji = _get_emoji(clip_data,
|
| 487 |
|
| 488 |
# 6. Render
|
| 489 |
_render_final(zoomed, ass_path, emoji, output_path)
|
|
|
|
| 1 |
"""High-Retention Editing pipeline — per-segment AI decisions.
|
| 2 |
|
| 3 |
Each 3-5s segment gets its own zoom direction, subtitle position,
|
| 4 |
+
subtitle mode, and caption color driven by Qwen2.5-VL analyzing one
|
| 5 |
+
frame plus the local transcript for that segment.
|
| 6 |
|
| 7 |
Pipeline per clip:
|
| 8 |
1. Segment clip at speech pauses (3-5s chunks)
|
| 9 |
2. Extract midpoint frame from each segment
|
| 10 |
3. Qwen2.5-VL analyzes each frame → zoom + subtitle decisions
|
| 11 |
4. ffmpeg filter_complex: per-segment zoompan + concat
|
| 12 |
+
5. ASS subtitles with per-segment alignment/color/mode override tags
|
| 13 |
"""
|
| 14 |
+
import json
|
| 15 |
import subprocess
|
| 16 |
import tempfile
|
| 17 |
from pathlib import Path
|
|
|
|
| 177 |
|
| 178 |
if direction == "in":
|
| 179 |
if speed == "fast":
|
| 180 |
+
z_expr, max_zoom = "min(1.12+on*0.0018\\,1.55)", 1.55
|
| 181 |
else:
|
| 182 |
+
z_expr, max_zoom = "min(1.04+on*0.0009\\,1.32)", 1.32
|
| 183 |
elif direction == "out":
|
| 184 |
if speed == "fast":
|
| 185 |
+
z_expr, max_zoom = "max(1.48-on*0.0018\\,1.0)", 1.48
|
| 186 |
else:
|
| 187 |
+
z_expr, max_zoom = "max(1.28-on*0.0009\\,1.0)", 1.28
|
| 188 |
else: # hold
|
| 189 |
+
z_expr, max_zoom = "1.08", 1.08
|
| 190 |
|
| 191 |
if face_detected and direction == "in" and max_zoom > 1.05:
|
| 192 |
+
x_expr = f"max(0\\,min(iw-iw/zoom\\,iw*{face_cx:.3f}-iw/zoom/2))"
|
| 193 |
+
y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{face_cy:.3f}-ih/zoom/2))"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
else:
|
| 195 |
x_expr = "iw/2-(iw/zoom/2)"
|
| 196 |
if direction == "in":
|
| 197 |
y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
|
| 198 |
+
y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{y_bias:.2f}-(ih/zoom/2)))"
|
| 199 |
else:
|
| 200 |
y_expr = "ih/2-(ih/zoom/2)"
|
| 201 |
|
|
|
|
| 223 |
e = f"{seg['end']:.3f}"
|
| 224 |
z, x, y = _build_zoom_exprs(analysis, w, h)
|
| 225 |
zp = f"zoompan=z='{z}':x='{x}':y='{y}':d=1:s={w}x{h}:fps=30"
|
| 226 |
+
filter_parts.append(
|
| 227 |
+
f"[0:v]trim=start={s}:end={e},setpts=PTS-STARTPTS,fps=30,{zp},setpts=PTS-STARTPTS[v{i}]"
|
| 228 |
+
)
|
| 229 |
v_labels.append(f"[v{i}]")
|
| 230 |
if has_audio:
|
| 231 |
+
filter_parts.append(f"[0:a]atrim=start={s}:end={e},asetpts=PTS-STARTPTS[a{i}]")
|
| 232 |
a_labels.append(f"[a{i}]")
|
| 233 |
|
| 234 |
n = len(segments)
|
|
|
|
| 264 |
"red": "&H000000FF",
|
| 265 |
}
|
| 266 |
|
| 267 |
+
_POSITIONS = {"top", "bottom", "left", "right", "center"}
|
| 268 |
+
_MODES = {"word", "phrase", "sentence"}
|
| 269 |
+
_EMPHASIS = {"pop", "punch", "calm"}
|
| 270 |
+
|
| 271 |
|
| 272 |
def _ts(t: float) -> str:
|
| 273 |
+
total_cs = max(0, int(round(t * 100)))
|
| 274 |
+
h = total_cs // 360000
|
| 275 |
+
total_cs %= 360000
|
| 276 |
+
m = total_cs // 6000
|
| 277 |
+
total_cs %= 6000
|
| 278 |
+
s = total_cs // 100
|
| 279 |
+
cs = total_cs % 100
|
| 280 |
+
return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def _pick(value: object, allowed: set[str], fallback: str) -> str:
|
| 284 |
+
v = str(value or "").strip().lower()
|
| 285 |
+
return v if v in allowed else fallback
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
    """Validate model output and fill HRE fields used by the renderer.

    Args:
        analysis: Raw per-segment analysis. Anything that is not a dict is
            treated as empty so that defaults apply.
        seg_idx: Zero-based index of the segment. Segment 0 is forced to a
            fast zoom-in and is never left on sentence mode or calm emphasis.
        n_total: Total number of segments. Currently unused; kept so the
            signature stays stable for callers.

    Returns:
        A copy of ``analysis`` in which the zoom, face, subtitle, energy and
        moment fields are guaranteed to hold values from their closed
        vocabularies (or clamped floats for the face centre).
    """
    an = dict(analysis) if isinstance(analysis, dict) else {}

    energy = _pick(an.get("energy_level"), {"high", "medium", "low"}, "medium")
    moment = _pick(
        an.get("moment_type"),
        {"hook", "punchline", "context", "reaction", "transition"},
        "context",
    )

    # Caption mode used only when the model gave none (or an invalid one).
    if energy == "high" or moment in {"hook", "punchline", "reaction"}:
        fallback_mode = "word"
    else:
        fallback_mode = "sentence"
    if energy == "medium" and moment not in {"context", "transition"}:
        fallback_mode = "phrase"

    pos = _pick(an.get("subtitle_position"), _POSITIONS, "bottom")
    mode = _pick(an.get("subtitle_mode"), _MODES, fallback_mode)
    emphasis = _pick(
        an.get("subtitle_emphasis"),
        _EMPHASIS,
        "punch" if mode == "word" else "calm",
    )
    color = _pick(an.get("subtitle_color"), set(_ASS_COLORS), "white")
    zoom_direction = _pick(an.get("zoom_direction"), {"in", "out", "hold"}, "in")
    zoom_speed = _pick(an.get("zoom_speed"), {"fast", "slow"}, "slow")

    def _unit_fraction(key: str, default: float) -> float:
        # Each coordinate is parsed on its own so one bad value does not
        # discard the other; NaN (value != value) also falls back.
        try:
            value = float(an.get(key, default))
        except (TypeError, ValueError):
            return default
        if value != value:
            return default
        return min(1.0, max(0.0, value))

    face_cx = _unit_fraction("face_cx", 0.5)
    face_cy = _unit_fraction("face_cy", 0.38)

    # Models sometimes emit booleans as strings; bool("false") would be True.
    raw_face = an.get("face_detected", False)
    if isinstance(raw_face, str):
        face_detected = raw_face.strip().lower() in {"true", "1", "yes"}
    else:
        face_detected = bool(raw_face)

    if seg_idx == 0:
        zoom_direction, zoom_speed = "in", "fast"
        if mode == "sentence":
            mode = "word"
        if emphasis == "calm":
            emphasis = "punch"

    return {
        **an,
        "zoom_direction": zoom_direction,
        "zoom_speed": zoom_speed,
        "face_detected": face_detected,
        "face_cx": face_cx,
        "face_cy": face_cy,
        "subtitle_position": pos,
        "subtitle_mode": mode,
        "subtitle_emphasis": emphasis,
        "subtitle_color": color,
        "energy_level": energy,
        "moment_type": moment,
    }
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def _build_hre_plan(segments: list[dict], analyses: list[dict]) -> list[dict]:
    """Combine segments and their analyses into one plan entry per segment.

    Every segment gets an entry even when ``analyses`` is shorter than
    ``segments``; the missing analyses are normalised from an empty dict so
    defaults apply. Surplus analyses are ignored.

    Returns:
        A list of normalised analysis dicts, each extended with
        ``segment_index``, ``start`` and ``end``.
    """
    n_total = len(segments)
    plan: list[dict] = []
    for i, seg in enumerate(segments):
        analysis = analyses[i] if i < len(analyses) else {}
        entry = _normalise_analysis(analysis, i, n_total)
        plan.append({**entry, "segment_index": i, "start": seg["start"], "end": seg["end"]})

    # If the model repeats the same caption treatment for every segment, rotate
    # through safe defaults so HRE visibly changes across the clip.
    treatments = {(p["subtitle_position"], p["subtitle_mode"]) for p in plan}
    if len(plan) > 1 and len(treatments) == 1:
        positions = ["bottom", "top", "left", "bottom", "right", "top"]
        modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
        for i, entry in enumerate(plan):
            entry["subtitle_position"] = positions[i % len(positions)]
            entry["subtitle_mode"] = modes[i % len(modes)]
            if entry["subtitle_mode"] == "word":
                entry["subtitle_emphasis"] = "punch"

    return plan
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def _ass_escape(text: str) -> str:
|
| 360 |
+
return (
|
| 361 |
+
text.replace("{", "(")
|
| 362 |
+
.replace("}", ")")
|
| 363 |
+
.replace("\r", " ")
|
| 364 |
+
.replace("\n", " ")
|
| 365 |
+
.strip()
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def _wrap_text(text: str, max_chars: int) -> str:
    """Escape ``text`` and break it into at most two ASS lines.

    Text that fits in ``max_chars`` is returned unchanged (after escaping).
    A single long token is cut every ``max_chars`` characters. Otherwise words
    are packed greedily; if that yields more than two lines, everything after
    the first line is joined back into the second line.

    ``max_chars`` below 1 is treated as 1 so the slicing step is never zero.
    """
    text = _ass_escape(text)
    max_chars = max(1, max_chars)
    if len(text) <= max_chars:
        return text

    words = text.split()
    if len(words) <= 1:
        pieces = (text[i:i + max_chars] for i in range(0, len(text), max_chars))
        return r"\N".join(pieces)

    lines: list[str] = []
    current = ""
    for word in words:
        candidate = f"{current} {word}".strip()
        if current and len(candidate) > max_chars:
            lines.append(current)
            current = word
        else:
            current = candidate
    if current:
        lines.append(current)

    if len(lines) <= 2:
        return r"\N".join(lines)
    return r"\N".join([lines[0], " ".join(lines[1:])])
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
def _collect_clip_words(transcript: dict, clip_start: float, duration: float) -> list[dict]:
|
| 396 |
+
words: list[dict] = []
|
| 397 |
+
for seg in transcript.get("segments", []):
|
| 398 |
+
seg_start = float(seg.get("start", clip_start)) - clip_start
|
| 399 |
+
seg_end = float(seg.get("end", clip_start)) - clip_start
|
| 400 |
+
for word in seg.get("words", []):
|
| 401 |
+
text = str(word.get("word", word.get("text", ""))).strip()
|
| 402 |
+
if not text:
|
| 403 |
+
continue
|
| 404 |
+
start = float(word.get("start", seg_start + clip_start)) - clip_start
|
| 405 |
+
end = float(word.get("end", word.get("start", seg_end + clip_start))) - clip_start
|
| 406 |
+
if end <= start:
|
| 407 |
+
end = start + 0.24
|
| 408 |
+
if end <= 0 or start >= duration:
|
| 409 |
+
continue
|
| 410 |
+
words.append({
|
| 411 |
+
"start": max(0.0, start),
|
| 412 |
+
"end": min(duration, end),
|
| 413 |
+
"text": text,
|
| 414 |
+
})
|
| 415 |
+
return sorted(words, key=lambda w: (w["start"], w["end"]))
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
def _segment_text(transcript: dict, clip_start: float, seg: dict) -> str:
|
| 419 |
+
parts: list[str] = []
|
| 420 |
+
for item in transcript.get("segments", []):
|
| 421 |
+
start = float(item.get("start", clip_start)) - clip_start
|
| 422 |
+
end = float(item.get("end", clip_start)) - clip_start
|
| 423 |
+
if start < seg["end"] and end > seg["start"]:
|
| 424 |
+
text = str(item.get("text", "")).strip()
|
| 425 |
+
if text:
|
| 426 |
+
parts.append(text)
|
| 427 |
+
return " ".join(parts).strip()
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
def _words_in_segment(words: list[dict], seg: dict) -> list[dict]:
|
| 431 |
+
return [
|
| 432 |
+
w for w in words
|
| 433 |
+
if w["start"] < seg["end"] and w["end"] > seg["start"]
|
| 434 |
+
]
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def _display_text(text: str, mode: str, emphasis: str) -> str:
|
| 438 |
+
text = text.strip()
|
| 439 |
+
if mode == "sentence" and emphasis == "calm":
|
| 440 |
+
return text
|
| 441 |
+
return text.upper()
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
def _append_event(events: list[dict], start: float, end: float, text: str, plan: dict) -> None:
|
| 445 |
+
start = max(float(plan["start"]), start)
|
| 446 |
+
end = min(float(plan["end"]), end)
|
| 447 |
+
if end - start < 0.08 or not text.strip():
|
| 448 |
+
return
|
| 449 |
+
events.append({
|
| 450 |
+
"start": start,
|
| 451 |
+
"end": end,
|
| 452 |
+
"text": text.strip(),
|
| 453 |
+
"plan": plan,
|
| 454 |
+
})
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def _word_events(words: list[dict], seg: dict, plan: dict) -> list[dict]:
    """Build one caption event per word for a word-mode segment.

    Each word is shown for between ``min_d`` and ``max_d`` seconds (both
    shorter when ``plan["energy_level"]`` is ``"high"``), never past the
    segment end. When the next word starts later than this one, the event is
    also cut 15 ms before that start, but not below ``min_d``. A 15 ms gap is
    kept after every event, and iteration stops once that gap reaches the
    segment end.
    """
    events: list[dict] = []
    seg_start, seg_end = seg["start"], seg["end"]
    high_energy = plan["energy_level"] == "high"
    min_d = 0.14 if high_energy else 0.18
    max_d = 0.72 if high_energy else 0.95
    gap = 0.015

    cursor = seg_start
    last_index = len(words) - 1
    for i, word in enumerate(words):
        # Never start before the previous caption has cleared.
        start = max(seg_start, word["start"], cursor)
        next_start = words[i + 1]["start"] if i < last_index else seg_end

        natural_end = max(word["end"], start + min_d)
        end = min(seg_end, natural_end, start + max_d)
        if next_start > start:
            end = min(end, max(start + min_d, next_start - gap))
        if end <= start:
            end = min(seg_end, start + min_d)

        _append_event(events, start, end, word["text"], plan)
        cursor = end + gap
        if cursor >= seg_end:
            break

    return events
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
def _line_events(
    words: list[dict],
    seg: dict,
    plan: dict,
    max_words: int,
    max_duration: float,
    max_chars: int,
) -> list[dict]:
    """Group consecutive words into multi-word caption events.

    Words are added to the current group until adding the next one would
    exceed ``max_words``, ``max_duration`` seconds, or ``max_chars``
    characters (counting one space between words). The first word of a group
    is always accepted, so every pass consumes at least one word. Each group
    is shown for at least 0.55 s, capped at the segment end, and a 40 ms gap
    separates consecutive groups.
    """
    events: list[dict] = []
    seg_start, seg_end = seg["start"], seg["end"]
    i = 0
    cursor = seg_start

    # Stop once less than the 0.08 s minimum event length remains.
    while i < len(words) and cursor < seg_end - 0.08:
        group: list[dict] = []
        start = max(seg_start, words[i]["start"], cursor)
        end = start
        chars = 0

        while i < len(words):
            word = words[i]
            proposed_end = min(seg_end, max(word["end"], word["start"] + 0.2))
            proposed_chars = chars + len(word["text"]) + (1 if group else 0)
            group_is_full = (
                len(group) >= max_words
                or proposed_end - start > max_duration
                or proposed_chars > max_chars
            )
            if group and group_is_full:
                break
            group.append(word)
            chars = proposed_chars
            end = max(end, proposed_end)
            i += 1

        end = min(seg_end, max(end, start + 0.55))
        text = " ".join(w["text"] for w in group)
        _append_event(events, start, end, text, plan)
        cursor = end + 0.04

    return events
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def _fallback_text_events(text: str, seg: dict, plan: dict) -> list[dict]:
    """Spread ``text`` evenly over ``seg`` when no word timings are usable.

    Whitespace-separated text is chunked by word count (1 / 3 / 7 words for
    word / phrase / sentence mode). Text longer than 20 characters with no
    usable whitespace is cut into fixed-width slices (10 / 24 / 36
    characters) and each slice is one chunk; slices are never re-joined with
    spaces. If there are more chunks than 0.1 s slots in the segment,
    neighbouring chunks are merged so that each event stays above the minimum
    event length instead of all being discarded.
    """
    if not text:
        return []

    mode = plan["subtitle_mode"]
    chunk_size = {"word": 1, "phrase": 3}.get(mode, 7)
    joiner = " "

    units = text.split()
    if len(units) <= 1 and len(text) > 20:
        step = {"word": 10, "phrase": 24}.get(mode, 36)
        units = [text[i:i + step] for i in range(0, len(text), step)]
        # The slices are already caption-sized; grouping them by the word
        # chunk size would rebuild an over-long line with spurious spaces.
        chunk_size = 1
        joiner = ""

    chunks = [
        joiner.join(units[i:i + chunk_size])
        for i in range(0, len(units), chunk_size)
    ]
    chunks = [c for c in chunks if c.strip()]
    if not chunks:
        return []

    seg_d = max(0.1, seg["end"] - seg["start"])

    # Events shorter than 0.08 s are dropped downstream; budget 0.1 s each.
    max_events = max(1, int(seg_d / 0.1))
    if len(chunks) > max_events:
        per_event = -(-len(chunks) // max_events)  # ceiling division
        chunks = [
            joiner.join(chunks[i:i + per_event])
            for i in range(0, len(chunks), per_event)
        ]

    events: list[dict] = []
    dur = seg_d / len(chunks)
    for i, chunk in enumerate(chunks):
        start = seg["start"] + i * dur
        end = seg["start"] + (i + 1) * dur
        _append_event(events, start, end, chunk, plan)
    return events
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
def _build_subtitle_events(
    transcript: dict,
    clip_start: float,
    duration: float,
    segments: list[dict],
    plan: list[dict],
) -> list[dict]:
    """Build the ordered, non-overlapping caption events for a clip.

    For each segment the plan's ``subtitle_mode`` selects per-word events or
    grouped line events (3 words / 1.7 s / 28 chars for phrase mode, 7 words /
    2.8 s / 44 chars otherwise). Segments that yield no events from word
    timings fall back to evenly spaced chunks of the segment's transcript
    text. The combined list is sorted and trimmed so that no two events are
    visible at once.
    """
    words = _collect_clip_words(transcript, clip_start, duration)
    events: list[dict] = []

    for seg, seg_plan in zip(segments, plan):
        seg_words = _words_in_segment(words, seg)
        mode = seg_plan["subtitle_mode"]

        seg_events: list[dict] = []
        if seg_words:
            if mode == "word":
                seg_events = _word_events(seg_words, seg, seg_plan)
            elif mode == "phrase":
                seg_events = _line_events(
                    seg_words, seg, seg_plan,
                    max_words=3, max_duration=1.7, max_chars=28,
                )
            else:
                seg_events = _line_events(
                    seg_words, seg, seg_plan,
                    max_words=7, max_duration=2.8, max_chars=44,
                )

        if not seg_events:
            fallback_text = _segment_text(transcript, clip_start, seg)
            seg_events = _fallback_text_events(fallback_text, seg, seg_plan)
        events.extend(seg_events)

    events.sort(key=lambda ev: (ev["start"], ev["end"]))

    # ASS draws all active events at once; keep one visible caption event at a
    # time so word/phrase/sentence modes never stack on top of each other.
    cleaned: list[dict] = []
    cursor = 0.0
    for ev in events:
        start = max(ev["start"], cursor)
        end = min(duration, ev["end"])
        if end - start < 0.08:
            continue
        cleaned.append({**ev, "start": start, "end": end})
        cursor = end + 0.01
    return cleaned
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
def _subtitle_tag(plan: dict) -> tuple[str, int]:
    """Build the ASS override-tag prefix and wrap width for one caption plan.

    Args:
        plan: Per-segment plan dict. Reads ``subtitle_position``,
            ``subtitle_mode``, ``energy_level``, ``subtitle_emphasis`` and
            ``subtitle_color``.

    Returns:
        ``(tag, max_chars)``: ``tag`` is the ``{...}`` override block (plus a
        second scale-animation block for pop/punch emphasis or word mode) to
        prepend to the caption text; ``max_chars`` is the per-line character
        budget that goes with the chosen font size.

    Raises:
        KeyError: If one of the plan keys listed above is missing.
    """
    pos = plan["subtitle_position"]
    mode = plan["subtitle_mode"]
    energy = plan["energy_level"]
    emphasis = plan["subtitle_emphasis"]
    # Colour names missing from the table fall back to &H00FFFFFF.
    color = _ASS_COLORS.get(plan["subtitle_color"], "&H00FFFFFF")

    # position -> (\an alignment code, \pos x, \pos y).
    # NOTE(review): coordinates look tuned for a 1080x1920 canvas - confirm
    # against the script's PlayRes settings.
    anchors = {
        "top": (8, 540, 230),
        "bottom": (2, 540, 1660),
        "left": (4, 95, 960),
        "right": (6, 985, 960),
        "center": (5, 540, 960),
    }
    # Unknown positions are treated as "bottom".
    alignment, x, y = anchors.get(pos, anchors["bottom"])

    # Longer caption modes get a smaller font and a wider line budget.
    if mode == "sentence":
        font_size = 66 if energy != "high" else 74
        max_chars = 34
    elif mode == "phrase":
        font_size = 82 if energy != "low" else 76
        max_chars = 24
    else:
        font_size = 102 if energy == "high" else 92
        max_chars = 18

    # Side-anchored captions are shrunk and capped at 22 characters per line.
    if pos in {"left", "right"}:
        font_size -= 8
        max_chars = min(max_chars, 22)

    # Only the first literal is an f-string, so only there must braces be
    # doubled. The second literal is a plain string: writing "}}" in it would
    # emit two closing braces and leave a stray "}" in the visible caption.
    base = (
        f"{{\\an{alignment}\\pos({x},{y})\\1c{color}&\\fs{font_size}"
        "\\b1\\bord5\\shad1\\q2}"
    )
    # Start at 125% scale and animate back to 100% over the first 120 ms.
    if emphasis in {"pop", "punch"} or mode == "word":
        base += "{\\fscx125\\fscy125\\t(0,120,\\fscx100\\fscy100)}"
    return base, max_chars
|
| 638 |
|
| 639 |
|
| 640 |
def _generate_per_segment_subtitles(
|
|
|
|
| 644 |
segments: list[dict],
|
| 645 |
analyses: list[dict],
|
| 646 |
) -> None:
|
| 647 |
+
"""Write one ASS file from the HRE plan.
|
|
|
|
| 648 |
|
| 649 |
+
The important rule is that HRE can change style every segment, but it must
|
| 650 |
+
never emit simultaneous caption events at the same timestamp.
|
| 651 |
+
"""
|
| 652 |
+
duration = max((float(seg["end"]) for seg in segments), default=0.0)
|
| 653 |
+
plan = _build_hre_plan(segments, analyses)
|
| 654 |
+
events = _build_subtitle_events(transcript, clip_start, duration, segments, plan)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
|
| 656 |
lines = [
|
| 657 |
"[Script Info]",
|
|
|
|
| 665 |
"OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
|
| 666 |
"ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
|
| 667 |
"Alignment, MarginL, MarginR, MarginV, Encoding",
|
| 668 |
+
"Style: Default,Noto Sans,82,&H00FFFFFF,&H0000FFFF,&H00000000,&H80000000,"
|
| 669 |
+
"-1,0,0,0,100,100,0,0,1,5,1,2,40,40,200,1",
|
| 670 |
"",
|
| 671 |
"[Events]",
|
| 672 |
"Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
|
| 673 |
]
|
| 674 |
|
| 675 |
for ev in events:
|
| 676 |
+
seg_plan = ev["plan"]
|
| 677 |
+
tag, max_chars = _subtitle_tag(seg_plan)
|
| 678 |
+
text = _display_text(ev["text"], seg_plan["subtitle_mode"], seg_plan["subtitle_emphasis"])
|
| 679 |
+
text = _wrap_text(text, max_chars)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 680 |
|
| 681 |
lines.append(
|
| 682 |
f"Dialogue: 0,{_ts(ev['start'])},{_ts(ev['end'])},"
|
| 683 |
+
f"Default,,0,0,0,,{tag}{text}"
|
| 684 |
)
|
| 685 |
|
| 686 |
ass_path.write_text("\n".join(lines), encoding="utf-8")
|
| 687 |
+
plan_path = ass_path.with_suffix(".hre_plan.json")
|
| 688 |
+
plan_path.write_text(json.dumps(plan, ensure_ascii=False, indent=2), encoding="utf-8")
|
| 689 |
+
logger.debug(f"ASS: {len(events)} events across {len(segments)} HRE segments")
|
| 690 |
|
| 691 |
|
| 692 |
# ─── Emoji ─────────────────────────────────────────────────────────────────────
|
|
|
|
| 760 |
transcript: dict,
|
| 761 |
output_path: Path,
|
| 762 |
) -> Path:
|
| 763 |
+
"""Apply per-segment AI-driven HRE with varied zoom and caption plans."""
|
| 764 |
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 765 |
clip_start = clip_data.get("start", 0.0)
|
| 766 |
|
|
|
|
| 787 |
_analyze_segment(clip_path, seg, i, n, transcript, clip_start, tmp_dir)
|
| 788 |
for i, seg in enumerate(segments)
|
| 789 |
]
|
| 790 |
+
plan = _build_hre_plan(segments, analyses)
|
| 791 |
|
| 792 |
+
for i, (seg, an) in enumerate(zip(segments, plan)):
|
| 793 |
logger.info(
|
| 794 |
f" [{seg['start']:.1f}s-{seg['end']:.1f}s] "
|
| 795 |
f"zoom={an.get('zoom_direction')}({an.get('zoom_speed')}) "
|
| 796 |
+
f"sub={an.get('subtitle_position')}/{an.get('subtitle_mode')}/"
|
| 797 |
+
f"{an.get('subtitle_color')} "
|
| 798 |
f"type={an.get('moment_type')} energy={an.get('energy_level')}"
|
| 799 |
)
|
| 800 |
|
| 801 |
# 3. Per-segment zoom via filter_complex
|
| 802 |
zoomed = _apply_per_segment_zoom(
|
| 803 |
+
clip_path, segments, plan, w, h, tmp_zoomed, has_audio=has_audio
|
| 804 |
)
|
| 805 |
|
| 806 |
# 4. Per-segment ASS subtitles
|
| 807 |
ass_path = output_path.with_suffix(".ass")
|
| 808 |
+
_generate_per_segment_subtitles(transcript, ass_path, clip_start, segments, plan)
|
| 809 |
|
| 810 |
# 5. Emoji from highest-energy segment
|
| 811 |
+
emoji = _get_emoji(clip_data, plan)
|
| 812 |
|
| 813 |
# 6. Render
|
| 814 |
_render_final(zoomed, ass_path, emoji, output_path)
|
frontend/components/ClipSettings.tsx
CHANGED
|
@@ -40,8 +40,8 @@ const L = {
|
|
| 40 |
normalTitle: "Normal Subtitles",
|
| 41 |
normalDesc: "Customize font, colors, animations",
|
| 42 |
hreTitle: "High-Retention",
|
| 43 |
-
hreDesc: "AI picks
|
| 44 |
-
hreInfo: "AI will
|
| 45 |
},
|
| 46 |
th: {
|
| 47 |
style: "สไตล์คลิป",
|
|
@@ -53,8 +53,8 @@ const L = {
|
|
| 53 |
normalTitle: "ซับปกติ",
|
| 54 |
normalDesc: "เลือกรูปแบบซับได้เอง",
|
| 55 |
hreTitle: "High-Retention",
|
| 56 |
-
hreDesc: "AI เลือก
|
| 57 |
-
hreInfo: "AI จะเลือก
|
| 58 |
},
|
| 59 |
zh: {
|
| 60 |
style: "片段风格",
|
|
@@ -66,8 +66,8 @@ const L = {
|
|
| 66 |
normalTitle: "普通字幕",
|
| 67 |
normalDesc: "自定义字体、颜色、动画",
|
| 68 |
hreTitle: "高留存",
|
| 69 |
-
hreDesc: "AI 自动
|
| 70 |
-
hreInfo: "AI 将
|
| 71 |
},
|
| 72 |
} as const;
|
| 73 |
|
|
|
|
| 40 |
normalTitle: "Normal Subtitles",
|
| 41 |
normalDesc: "Customize font, colors, animations",
|
| 42 |
hreTitle: "High-Retention",
|
| 43 |
+
hreDesc: "AI picks timing, captions, and zoom",
|
| 44 |
+
hreInfo: "AI will create a per-segment edit plan, vary caption placement/mode, zoom on key moments, and add emoji overlays.",
|
| 45 |
},
|
| 46 |
th: {
|
| 47 |
style: "สไตล์คลิป",
|
|
|
|
| 53 |
normalTitle: "ซับปกติ",
|
| 54 |
normalDesc: "เลือกรูปแบบซับได้เอง",
|
| 55 |
hreTitle: "High-Retention",
|
| 56 |
+
hreDesc: "AI เลือกจังหวะ ซับ และซูมให้",
|
| 57 |
+
hreInfo: "AI จะสร้างแผนตัดต่อรายช่วง เลือกตำแหน่ง/รูปแบบซับ ซูมช่วงสำคัญ และใส่ emoji ให้อัตโนมัติ",
|
| 58 |
},
|
| 59 |
zh: {
|
| 60 |
style: "片段风格",
|
|
|
|
| 66 |
normalTitle: "普通字幕",
|
| 67 |
normalDesc: "自定义字体、颜色、动画",
|
| 68 |
hreTitle: "高留存",
|
| 69 |
+
hreDesc: "AI 自动选择节奏、字幕和缩放",
|
| 70 |
+
hreInfo: "AI 将生成分段剪辑计划,调整字幕位置/模式,缩放关键时刻,并添加表情覆盖。",
|
| 71 |
},
|
| 72 |
} as const;
|
| 73 |
|
frontend/messages/en.json
CHANGED
|
@@ -20,7 +20,7 @@
|
|
| 20 |
"mode_label": "Editing Mode",
|
| 21 |
"normal_mode": "Normal Subtitles",
|
| 22 |
"hre_mode": "High-Retention Editing (AI decides)",
|
| 23 |
-
"hre_hint": "AI will
|
| 24 |
},
|
| 25 |
"step3": {
|
| 26 |
"title": "Subtitle Designer",
|
|
|
|
| 20 |
"mode_label": "Editing Mode",
|
| 21 |
"normal_mode": "Normal Subtitles",
|
| 22 |
"hre_mode": "High-Retention Editing (AI decides)",
|
| 23 |
+
"hre_hint": "AI will create a per-segment edit plan with varied captions, auto-zoom, and TikTok-style emphasis."
|
| 24 |
},
|
| 25 |
"step3": {
|
| 26 |
"title": "Subtitle Designer",
|
frontend/messages/th.json
CHANGED
|
@@ -23,7 +23,7 @@
|
|
| 23 |
"mode_label": "โหมดการตัด",
|
| 24 |
"normal_mode": "ซับปกติ",
|
| 25 |
"hre_mode": "High-Retention Editing (AI เลือกให้)",
|
| 26 |
-
"hre_hint": "AI จะเลือกรูปแบบ
|
| 27 |
},
|
| 28 |
"step3": {
|
| 29 |
"title": "ออกแบบซับไตเติ้ล",
|
|
|
|
| 23 |
"mode_label": "โหมดการตัด",
|
| 24 |
"normal_mode": "ซับปกติ",
|
| 25 |
"hre_mode": "High-Retention Editing (AI เลือกให้)",
|
| 26 |
+
"hre_hint": "AI จะสร้างแผนตัดต่อรายช่วง เลือกซับหลายรูปแบบ จัด auto-zoom และเน้นจังหวะแบบ TikTok"
|
| 27 |
},
|
| 28 |
"step3": {
|
| 29 |
"title": "ออกแบบซับไตเติ้ล",
|
frontend/messages/zh.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"nav": { "brand": "ElevenClip AI", "tagline": "AI智能剪辑精彩片段" },
|
| 3 |
"step1": { "title": "添加视频", "upload_tab": "上传文件", "youtube_tab": "YouTube链接", "drop_hint": "拖放视频文件到此处,或点击选择", "youtube_placeholder": "粘贴YouTube链接...", "channel_label": "频道描述(可选)", "channel_placeholder": "例如:中文游戏频道,专注于搞笑时刻", "fetch_info": "获取信息" },
|
| 4 |
-
"step2": { "title": "剪辑设置", "style_label": "剪辑风格", "duration_label": "目标时长(秒)", "count_label": "剪辑数量", "clip_lang_label": "视频语言", "sub_lang_label": "字幕语言", "mode_label": "编辑模式", "normal_mode": "普通字幕", "hre_mode": "高留存率编辑(AI决定)", "hre_hint": "AI将
|
| 5 |
"step3": { "title": "字幕设计", "font_label": "字体", "size_label": "字体大小", "primary_color": "主要颜色", "secondary_color": "卡拉OK颜色", "outline_color": "描边颜色", "shadow_color": "阴影颜色", "outline_size": "描边大小", "shadow_size": "阴影大小", "display_mode": "显示模式", "word_by_word": "逐字", "sentence": "句子", "animation": "动画", "alignment": "对齐", "preview": "预览" },
|
| 6 |
"generate": { "button": "生成剪辑", "processing": "处理中..." },
|
| 7 |
"styles": { "funny": "搞笑", "serious": "严肃", "educational": "教育", "gaming": "游戏", "entertainment": "娱乐" },
|
|
|
|
| 1 |
{
|
| 2 |
"nav": { "brand": "ElevenClip AI", "tagline": "AI智能剪辑精彩片段" },
|
| 3 |
"step1": { "title": "添加视频", "upload_tab": "上传文件", "youtube_tab": "YouTube链接", "drop_hint": "拖放视频文件到此处,或点击选择", "youtube_placeholder": "粘贴YouTube链接...", "channel_label": "频道描述(可选)", "channel_placeholder": "例如:中文游戏频道,专注于搞笑时刻", "fetch_info": "获取信息" },
|
| 4 |
+
"step2": { "title": "剪辑设置", "style_label": "剪辑风格", "duration_label": "目标时长(秒)", "count_label": "剪辑数量", "clip_lang_label": "视频语言", "sub_lang_label": "字幕语言", "mode_label": "编辑模式", "normal_mode": "普通字幕", "hre_mode": "高留存率编辑(AI决定)", "hre_hint": "AI将生成分段剪辑计划,改变字幕样式,应用自动缩放,并突出TikTok节奏。" },
|
| 5 |
"step3": { "title": "字幕设计", "font_label": "字体", "size_label": "字体大小", "primary_color": "主要颜色", "secondary_color": "卡拉OK颜色", "outline_color": "描边颜色", "shadow_color": "阴影颜色", "outline_size": "描边大小", "shadow_size": "阴影大小", "display_mode": "显示模式", "word_by_word": "逐字", "sentence": "句子", "animation": "动画", "alignment": "对齐", "preview": "预览" },
|
| 6 |
"generate": { "button": "生成剪辑", "processing": "处理中..." },
|
| 7 |
"styles": { "funny": "搞笑", "serious": "严肃", "educational": "教育", "gaming": "游戏", "entertainment": "娱乐" },
|
frontend/next-env.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/// <reference types="next" />
|
| 2 |
+
/// <reference types="next/image-types/global" />
|
| 3 |
+
import "./.next/types/routes.d.ts";
|
| 4 |
+
|
| 5 |
+
// NOTE: This file should not be edited
|
| 6 |
+
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
|