Spaces:
Runtime error
Runtime error
fix: tune HRE crop and caption timing
Browse files
backend/main.py
CHANGED
|
@@ -354,7 +354,9 @@ async def _run_pipeline(
|
|
| 354 |
|
| 355 |
# ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
|
| 356 |
await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
|
| 357 |
-
|
|
|
|
|
|
|
| 358 |
clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=extract_aspect_mode)
|
| 359 |
|
| 360 |
# ── 8. Subtitles / HRE (all clips in parallel) ─────────────────
|
|
|
|
| 354 |
|
| 355 |
# ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
|
| 356 |
await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
|
| 357 |
+
# HRE needs a true TikTok crop, not a shrunken fit/letterbox frame.
|
| 358 |
+
# The crop extractor centers on Qwen's face/person bbox when available.
|
| 359 |
+
extract_aspect_mode = "crop" if settings.mode == "hre" else settings.aspect_mode
|
| 360 |
clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=extract_aspect_mode)
|
| 361 |
|
| 362 |
# ── 8. Subtitles / HRE (all clips in parallel) ─────────────────
|
backend/src/analysis/vision.py
CHANGED
|
@@ -195,6 +195,8 @@ Rules:
|
|
| 195 |
- For normal explanatory speech:
|
| 196 |
zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
|
| 197 |
- Use zoom OUT only as breathing room after an intense/key moment.
|
|
|
|
|
|
|
| 198 |
- subtitle WORD: short hooks, reactions, punchlines, important keywords
|
| 199 |
- subtitle PHRASE: fast but understandable speech, 2-4 words at a time
|
| 200 |
- subtitle SENTENCE: explanation, normal conversation, low/medium energy
|
|
|
|
| 195 |
- For normal explanatory speech:
|
| 196 |
zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
|
| 197 |
- Use zoom OUT only as breathing room after an intense/key moment.
|
| 198 |
+
- Sentence captions should sit around center-bottom: caption_x about 0.50, caption_y about 0.68-0.74, caption_anchor=2.
|
| 199 |
+
- Word highlight captions can sit center, mid-upper, mid-left, or mid-right with larger text, as long as they avoid the face/product.
|
| 200 |
- subtitle WORD: short hooks, reactions, punchlines, important keywords
|
| 201 |
- subtitle PHRASE: fast but understandable speech, 2-4 words at a time
|
| 202 |
- subtitle SENTENCE: explanation, normal conversation, low/medium energy
|
backend/src/processing/high_retention.py
CHANGED
|
@@ -192,14 +192,14 @@ def _build_zoom_exprs(
|
|
| 192 |
|
| 193 |
if direction == "in":
|
| 194 |
if speed == "fast":
|
| 195 |
-
z_expr, max_zoom = "min(1.
|
| 196 |
else:
|
| 197 |
-
z_expr, max_zoom = "min(1.
|
| 198 |
elif direction == "out":
|
| 199 |
if speed == "fast":
|
| 200 |
-
z_expr, max_zoom = "max(1.
|
| 201 |
else:
|
| 202 |
-
z_expr, max_zoom = "max(1.28-on*0.
|
| 203 |
else: # hold
|
| 204 |
z_expr, max_zoom = "1.08", 1.08
|
| 205 |
|
|
@@ -357,6 +357,20 @@ def _safe_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tup
|
|
| 357 |
return x, y, _caption_anchor_for(x, y)
|
| 358 |
|
| 359 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
|
| 361 |
"""Validate model output and fill HRE fields used by the renderer."""
|
| 362 |
an = dict(analysis or {})
|
|
@@ -402,11 +416,27 @@ def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
|
|
| 402 |
0.82,
|
| 403 |
)
|
| 404 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
if subject_bbox:
|
| 406 |
x1, y1, x2, y2 = subject_bbox
|
| 407 |
overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
|
| 408 |
if overlaps_subject:
|
| 409 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
|
| 411 |
if seg_idx == 0:
|
| 412 |
zoom_direction, zoom_speed = "in", "fast"
|
|
@@ -724,11 +754,11 @@ def _subtitle_tag(plan: dict) -> tuple[str, int]:
|
|
| 724 |
max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
|
| 725 |
|
| 726 |
if mode == "sentence":
|
| 727 |
-
font_size =
|
| 728 |
elif mode == "phrase":
|
| 729 |
font_size = 68 if energy != "low" else 62
|
| 730 |
else:
|
| 731 |
-
font_size =
|
| 732 |
|
| 733 |
if alignment in {4, 5, 6}:
|
| 734 |
font_size = max(54, font_size - 4)
|
|
|
|
| 192 |
|
| 193 |
if direction == "in":
|
| 194 |
if speed == "fast":
|
| 195 |
+
z_expr, max_zoom = "min(1.0+on*0.0100\\,1.45)", 1.45
|
| 196 |
else:
|
| 197 |
+
z_expr, max_zoom = "min(1.0+on*0.0035\\,1.28)", 1.28
|
| 198 |
elif direction == "out":
|
| 199 |
if speed == "fast":
|
| 200 |
+
z_expr, max_zoom = "max(1.45-on*0.0100\\,1.0)", 1.45
|
| 201 |
else:
|
| 202 |
+
z_expr, max_zoom = "max(1.28-on*0.0040\\,1.0)", 1.28
|
| 203 |
else: # hold
|
| 204 |
z_expr, max_zoom = "1.08", 1.08
|
| 205 |
|
|
|
|
| 357 |
return x, y, _caption_anchor_for(x, y)
|
| 358 |
|
| 359 |
|
| 360 |
+
def _word_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tuple[float, float, int]:
    """Pick a mid-frame caption position for a highlighted word.

    Four candidate slots are rotated through by ``seg_idx`` so consecutive
    segments do not all land in the same place. Two slots are horizontally
    centred; the other two are placed on the side of the frame opposite
    the subject.

    Args:
        subject_x: Horizontal position of the subject
            (presumably a normalised 0-1 fraction of frame width -- TODO confirm).
        subject_y: Vertical position of the subject
            (presumably a normalised 0-1 fraction of frame height -- TODO confirm).
        seg_idx: Segment index; selects the slot via ``seg_idx % 4``.

    Returns:
        ``(x, y, anchor)`` where ``anchor`` is whatever
        ``_caption_anchor_for(x, y)`` yields for the chosen point.
    """
    # Side slots flip to the half of the frame the subject is not in.
    upper_side_x = 0.28 if subject_x > 0.55 else 0.72
    lower_side_x = 0.30 if subject_x > 0.50 else 0.70
    slots = (
        (0.50, 0.42),
        (0.50, 0.26),
        (upper_side_x, 0.46),
        (lower_side_x, 0.58),
    )
    slot_x, slot_y = slots[seg_idx % len(slots)]

    # If the slot lands within 0.18 of the subject on both axes, push it
    # sideways; only x is adjusted, y keeps the slot's value.
    near_subject = abs(slot_x - subject_x) < 0.18 and abs(slot_y - subject_y) < 0.18
    if near_subject:
        slot_x = 0.25 if subject_x > 0.5 else 0.75

    return slot_x, slot_y, _caption_anchor_for(slot_x, slot_y)
|
| 372 |
+
|
| 373 |
+
|
| 374 |
def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
|
| 375 |
"""Validate model output and fill HRE fields used by the renderer."""
|
| 376 |
an = dict(analysis or {})
|
|
|
|
| 416 |
0.82,
|
| 417 |
)
|
| 418 |
|
| 419 |
+
if mode == "sentence":
|
| 420 |
+
caption_x = 0.50
|
| 421 |
+
caption_y = _clamp_float(an.get("caption_y"), 0.70, 0.64, 0.74)
|
| 422 |
+
caption_anchor = 2
|
| 423 |
+
caption_max_width_pct = max(caption_max_width_pct, 0.68)
|
| 424 |
+
elif mode == "word":
|
| 425 |
+
word_x, word_y, word_anchor = _word_caption_point(subject_x, subject_y, seg_idx)
|
| 426 |
+
if caption_y > 0.66 or (abs(caption_x - subject_x) < 0.14 and abs(caption_y - subject_y) < 0.14):
|
| 427 |
+
caption_x, caption_y, caption_anchor = word_x, word_y, word_anchor
|
| 428 |
+
caption_max_width_pct = min(caption_max_width_pct, 0.56)
|
| 429 |
+
|
| 430 |
if subject_bbox:
|
| 431 |
x1, y1, x2, y2 = subject_bbox
|
| 432 |
overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
|
| 433 |
if overlaps_subject:
|
| 434 |
+
if mode == "sentence":
|
| 435 |
+
caption_x, caption_y, caption_anchor = 0.50, 0.70, 2
|
| 436 |
+
elif mode == "word":
|
| 437 |
+
caption_x, caption_y, caption_anchor = _word_caption_point(subject_x, subject_y, seg_idx + 1)
|
| 438 |
+
else:
|
| 439 |
+
caption_x, caption_y, caption_anchor = fallback_x, fallback_y, fallback_anchor
|
| 440 |
|
| 441 |
if seg_idx == 0:
|
| 442 |
zoom_direction, zoom_speed = "in", "fast"
|
|
|
|
| 754 |
max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
|
| 755 |
|
| 756 |
if mode == "sentence":
|
| 757 |
+
font_size = 54 if energy != "high" else 60
|
| 758 |
elif mode == "phrase":
|
| 759 |
font_size = 68 if energy != "low" else 62
|
| 760 |
else:
|
| 761 |
+
font_size = 96 if energy == "high" else 84
|
| 762 |
|
| 763 |
if alignment in {4, 5, 6}:
|
| 764 |
font_size = max(54, font_size - 4)
|