jakgritb committed on
Commit
5543229
·
verified ·
1 Parent(s): 65a5f35

fix: tune HRE crop and caption timing

Browse files
backend/main.py CHANGED
@@ -354,7 +354,9 @@ async def _run_pipeline(
354
 
355
  # ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
356
  await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
357
- extract_aspect_mode = "safe_fit" if settings.mode == "hre" and settings.aspect_mode == "crop" else settings.aspect_mode
 
 
358
  clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=extract_aspect_mode)
359
 
360
  # ── 8. Subtitles / HRE (all clips in parallel) ─────────────────
 
354
 
355
  # ── 7. Extract clips (AMD AMF hardware encoder) ─────────────────
356
  await send_progress(session_id, "cutting", 81, f"Cutting {len(selected)} clips (h264_amf)...")
357
+ # HRE needs a true TikTok crop, not a shrunken fit/letterbox frame.
358
+ # The crop extractor centers on Qwen's face/person bbox when available.
359
+ extract_aspect_mode = "crop" if settings.mode == "hre" else settings.aspect_mode
360
  clips = await extract_all_clips_async(video_path, selected, session_dir, session_id, aspect_mode=extract_aspect_mode)
361
 
362
  # ── 8. Subtitles / HRE (all clips in parallel) ─────────────────
backend/src/analysis/vision.py CHANGED
@@ -195,6 +195,8 @@ Rules:
195
  - For normal explanatory speech:
196
  zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
197
  - Use zoom OUT only as breathing room after an intense/key moment.
 
 
198
  - subtitle WORD: short hooks, reactions, punchlines, important keywords
199
  - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
200
  - subtitle SENTENCE: explanation, normal conversation, low/medium energy
 
195
  - For normal explanatory speech:
196
  zoom_direction=hold, zoom_speed=slow, subtitle_mode=sentence, subtitle_emphasis=calm, energy_level=medium or low.
197
  - Use zoom OUT only as breathing room after an intense/key moment.
198
+ - Sentence captions should sit around center-bottom: caption_x about 0.50, caption_y about 0.68-0.74, caption_anchor=2.
199
+ - Word highlight captions can sit center, mid-upper, mid-left, or mid-right with larger text, as long as they avoid the face/product.
200
  - subtitle WORD: short hooks, reactions, punchlines, important keywords
201
  - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
202
  - subtitle SENTENCE: explanation, normal conversation, low/medium energy
backend/src/processing/high_retention.py CHANGED
@@ -192,14 +192,14 @@ def _build_zoom_exprs(
192
 
193
  if direction == "in":
194
  if speed == "fast":
195
- z_expr, max_zoom = "min(1.12+on*0.0018\\,1.55)", 1.55
196
  else:
197
- z_expr, max_zoom = "min(1.04+on*0.0009\\,1.32)", 1.32
198
  elif direction == "out":
199
  if speed == "fast":
200
- z_expr, max_zoom = "max(1.48-on*0.0018\\,1.0)", 1.48
201
  else:
202
- z_expr, max_zoom = "max(1.28-on*0.0009\\,1.0)", 1.28
203
  else: # hold
204
  z_expr, max_zoom = "1.08", 1.08
205
 
@@ -357,6 +357,20 @@ def _safe_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tup
357
  return x, y, _caption_anchor_for(x, y)
358
 
359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
361
  """Validate model output and fill HRE fields used by the renderer."""
362
  an = dict(analysis or {})
@@ -402,11 +416,27 @@ def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
402
  0.82,
403
  )
404
 
 
 
 
 
 
 
 
 
 
 
 
405
  if subject_bbox:
406
  x1, y1, x2, y2 = subject_bbox
407
  overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
408
  if overlaps_subject:
409
- caption_x, caption_y, caption_anchor = fallback_x, fallback_y, fallback_anchor
 
 
 
 
 
410
 
411
  if seg_idx == 0:
412
  zoom_direction, zoom_speed = "in", "fast"
@@ -724,11 +754,11 @@ def _subtitle_tag(plan: dict) -> tuple[str, int]:
724
  max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
725
 
726
  if mode == "sentence":
727
- font_size = 56 if energy != "high" else 62
728
  elif mode == "phrase":
729
  font_size = 68 if energy != "low" else 62
730
  else:
731
- font_size = 80 if energy == "high" else 72
732
 
733
  if alignment in {4, 5, 6}:
734
  font_size = max(54, font_size - 4)
 
192
 
193
  if direction == "in":
194
  if speed == "fast":
195
+ z_expr, max_zoom = "min(1.0+on*0.0100\\,1.45)", 1.45
196
  else:
197
+ z_expr, max_zoom = "min(1.0+on*0.0035\\,1.28)", 1.28
198
  elif direction == "out":
199
  if speed == "fast":
200
+ z_expr, max_zoom = "max(1.45-on*0.0100\\,1.0)", 1.45
201
  else:
202
+ z_expr, max_zoom = "max(1.28-on*0.0040\\,1.0)", 1.28
203
  else: # hold
204
  z_expr, max_zoom = "1.08", 1.08
205
 
 
357
  return x, y, _caption_anchor_for(x, y)
358
 
359
 
360
+ def _word_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tuple[float, float, int]:
361
+ """Put highlight words in punchy mid-frame zones instead of ordinary subtitle zones."""
362
+ candidates = [
363
+ (0.50, 0.42),
364
+ (0.50, 0.26),
365
+ (0.28 if subject_x > 0.55 else 0.72, 0.46),
366
+ (0.30 if subject_x > 0.50 else 0.70, 0.58),
367
+ ]
368
+ x, y = candidates[seg_idx % len(candidates)]
369
+ if abs(x - subject_x) < 0.18 and abs(y - subject_y) < 0.18:
370
+ x = 0.25 if subject_x > 0.5 else 0.75
371
+ return x, y, _caption_anchor_for(x, y)
372
+
373
+
374
  def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
375
  """Validate model output and fill HRE fields used by the renderer."""
376
  an = dict(analysis or {})
 
416
  0.82,
417
  )
418
 
419
+ if mode == "sentence":
420
+ caption_x = 0.50
421
+ caption_y = _clamp_float(an.get("caption_y"), 0.70, 0.64, 0.74)
422
+ caption_anchor = 2
423
+ caption_max_width_pct = max(caption_max_width_pct, 0.68)
424
+ elif mode == "word":
425
+ word_x, word_y, word_anchor = _word_caption_point(subject_x, subject_y, seg_idx)
426
+ if caption_y > 0.66 or (abs(caption_x - subject_x) < 0.14 and abs(caption_y - subject_y) < 0.14):
427
+ caption_x, caption_y, caption_anchor = word_x, word_y, word_anchor
428
+ caption_max_width_pct = min(caption_max_width_pct, 0.56)
429
+
430
  if subject_bbox:
431
  x1, y1, x2, y2 = subject_bbox
432
  overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
433
  if overlaps_subject:
434
+ if mode == "sentence":
435
+ caption_x, caption_y, caption_anchor = 0.50, 0.70, 2
436
+ elif mode == "word":
437
+ caption_x, caption_y, caption_anchor = _word_caption_point(subject_x, subject_y, seg_idx + 1)
438
+ else:
439
+ caption_x, caption_y, caption_anchor = fallback_x, fallback_y, fallback_anchor
440
 
441
  if seg_idx == 0:
442
  zoom_direction, zoom_speed = "in", "fast"
 
754
  max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
755
 
756
  if mode == "sentence":
757
+ font_size = 54 if energy != "high" else 60
758
  elif mode == "phrase":
759
  font_size = 68 if energy != "low" else 62
760
  else:
761
+ font_size = 96 if energy == "high" else 84
762
 
763
  if alignment in {4, 5, 6}:
764
  font_size = max(54, font_size - 4)