jakgritb committed on
Commit
e82c4ed
·
verified ·
1 Parent(s): 936060a

fix: improve HRE spatial editing

Browse files
backend/src/analysis/vision.py CHANGED
@@ -161,7 +161,7 @@ def _default_analysis() -> dict:
161
  }
162
 
163
 
164
- HRE_SEGMENT_PROMPT = """Analyze this video frame for high-retention TikTok editing decisions.
165
 
166
  Segment {seg_idx} of {n_total}. Transcript: "{context}"
167
 
@@ -172,7 +172,14 @@ Respond ONLY with valid JSON — no markdown:
172
  "face_detected": <true|false>,
173
  "face_cx": <0.0-1.0>,
174
  "face_cy": <0.0-1.0>,
 
 
 
175
  "subtitle_position": "<top|bottom|left|right|center>",
 
 
 
 
176
  "subtitle_mode": "<word|phrase|sentence>",
177
  "subtitle_emphasis": "<pop|punch|calm>",
178
  "subtitle_color": "<white|yellow|cyan|orange|green>",
@@ -189,16 +196,19 @@ Rules:
189
  - subtitle WORD: short hooks, reactions, punchlines, important keywords
190
  - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
191
  - subtitle SENTENCE: explanation, normal conversation, low/medium energy
192
- - subtitle TOP: face is in bottom half
193
- - subtitle BOTTOM: face is in top half
194
- - subtitle LEFT/RIGHT: face or main object is on the opposite side
195
- - Avoid choosing the exact same subtitle_position and subtitle_mode for every segment.
 
 
 
196
  - face_cx/face_cy: face center as 0.0-1.0 fraction of frame
197
  """
198
 
199
 
200
- def analyze_frame_for_hre(
201
- frame_path: "Path",
202
  context: str = "",
203
  seg_idx: int = 0,
204
  n_total: int = 1,
@@ -208,23 +218,29 @@ def analyze_frame_for_hre(
208
  from openai import OpenAI
209
 
210
  client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
211
- if not Path(frame_path).exists():
 
212
  return _default_hre_analysis(seg_idx, n_total)
213
 
214
- b64 = _encode_image(str(frame_path))
 
 
 
 
 
 
 
215
  prompt = HRE_SEGMENT_PROMPT.format(
216
- seg_idx=seg_idx, n_total=n_total, context=context[:200]
217
  )
 
218
  response = client.chat.completions.create(
219
  model=VLLM_MODEL,
220
  messages=[{
221
  "role": "user",
222
- "content": [
223
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
224
- {"type": "text", "text": prompt},
225
- ],
226
  }],
227
- max_tokens=200,
228
  temperature=0.1,
229
  )
230
  raw = response.choices[0].message.content.strip()
@@ -238,6 +254,7 @@ def analyze_frame_for_hre(
238
  logger.debug(
239
  f"HRE seg {seg_idx}/{n_total}: "
240
  f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
 
241
  f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
242
  f"{analysis.get('subtitle_color')} "
243
  f"type={analysis.get('moment_type')}"
@@ -254,6 +271,16 @@ def analyze_frame_for_hre(
254
  return _default_hre_analysis(seg_idx, n_total)
255
 
256
 
 
 
 
 
 
 
 
 
 
 
257
  def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
258
  """Fallback with varied decisions based on position in clip."""
259
  if seg_idx == 0:
@@ -267,8 +294,11 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
267
 
268
  _colors = ["yellow", "white", "cyan", "orange", "white", "yellow"]
269
  _positions = ["bottom", "top", "left", "bottom", "right", "top"]
 
 
270
  _modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
271
  _emphasis = ["punch", "calm", "pop", "punch", "calm", "pop"]
 
272
 
273
  return {
274
  "zoom_direction": zoom_dir,
@@ -276,7 +306,14 @@ def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
276
  "face_detected": False,
277
  "face_cx": 0.5,
278
  "face_cy": 0.38,
 
 
 
279
  "subtitle_position": _positions[seg_idx % len(_positions)],
 
 
 
 
280
  "subtitle_mode": _modes[seg_idx % len(_modes)],
281
  "subtitle_emphasis": _emphasis[seg_idx % len(_emphasis)],
282
  "subtitle_color": _colors[seg_idx % len(_colors)],
 
161
  }
162
 
163
 
164
+ HRE_SEGMENT_PROMPT = """Analyze these video frames for high-retention TikTok editing decisions.
165
 
166
  Segment {seg_idx} of {n_total}. Transcript: "{context}"
167
 
 
172
  "face_detected": <true|false>,
173
  "face_cx": <0.0-1.0>,
174
  "face_cy": <0.0-1.0>,
175
+ "subject_bbox": [<x1>, <y1>, <x2>, <y2>] or null,
176
+ "zoom_anchor_x": <0.0-1.0>,
177
+ "zoom_anchor_y": <0.0-1.0>,
178
  "subtitle_position": "<top|bottom|left|right|center>",
179
+ "caption_x": <0.10-0.90>,
180
+ "caption_y": <0.12-0.88>,
181
+ "caption_anchor": <1-9>,
182
+ "caption_max_width_pct": <0.35-0.82>,
183
  "subtitle_mode": "<word|phrase|sentence>",
184
  "subtitle_emphasis": "<pop|punch|calm>",
185
  "subtitle_color": "<white|yellow|cyan|orange|green>",
 
196
  - subtitle WORD: short hooks, reactions, punchlines, important keywords
197
  - subtitle PHRASE: fast but understandable speech, 2-4 words at a time
198
  - subtitle SENTENCE: explanation, normal conversation, low/medium energy
199
+ - subject_bbox: main face/person/product/object box in normalized frame coordinates, or null if unclear
200
+ - zoom_anchor_x/y: center of the face/person/product to keep important content in frame; never choose a blank wall/window
201
+ - caption_x/y: choose an actually empty readable area in this frame, not just fixed top/bottom
202
+ - caption_anchor: ASS anchor 1-9 matching caption_x/y (1 bottom-left, 5 center, 9 top-right)
203
+ - caption_max_width_pct: smaller when the empty space is narrow; captions must stay fully inside the 9:16 frame
204
+ - Keep captions away from face, product, hands, and important screen/object regions.
205
+ - Avoid choosing the exact same caption_x/y and subtitle_mode for every segment.
206
  - face_cx/face_cy: face center as 0.0-1.0 fraction of frame
207
  """
208
 
209
 
210
+ def analyze_frames_for_hre(
211
+ frame_paths: list["Path"],
212
  context: str = "",
213
  seg_idx: int = 0,
214
  n_total: int = 1,
 
218
  from openai import OpenAI
219
 
220
  client = OpenAI(base_url=VLLM_BASE_URL, api_key=VLLM_API_KEY)
221
+ valid_frames = [Path(p) for p in frame_paths[:3] if Path(p).exists()]
222
+ if not valid_frames:
223
  return _default_hre_analysis(seg_idx, n_total)
224
 
225
+ content = []
226
+ for frame_path in valid_frames:
227
+ b64 = _encode_image(str(frame_path))
228
+ content.append({
229
+ "type": "image_url",
230
+ "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
231
+ })
232
+
233
  prompt = HRE_SEGMENT_PROMPT.format(
234
+ seg_idx=seg_idx, n_total=n_total, context=context[:320]
235
  )
236
+ content.append({"type": "text", "text": prompt})
237
  response = client.chat.completions.create(
238
  model=VLLM_MODEL,
239
  messages=[{
240
  "role": "user",
241
+ "content": content,
 
 
 
242
  }],
243
+ max_tokens=380,
244
  temperature=0.1,
245
  )
246
  raw = response.choices[0].message.content.strip()
 
254
  logger.debug(
255
  f"HRE seg {seg_idx}/{n_total}: "
256
  f"zoom={analysis.get('zoom_direction')}({analysis.get('zoom_speed')}) "
257
+ f"caption=({analysis.get('caption_x')},{analysis.get('caption_y')}) "
258
  f"sub={analysis.get('subtitle_position')}/{analysis.get('subtitle_mode')}/"
259
  f"{analysis.get('subtitle_color')} "
260
  f"type={analysis.get('moment_type')}"
 
271
  return _default_hre_analysis(seg_idx, n_total)
272
 
273
 
274
def analyze_frame_for_hre(
    frame_path: "Path",
    context: str = "",
    seg_idx: int = 0,
    n_total: int = 1,
) -> dict:
    """Single-frame entry point kept for existing callers.

    Wraps ``frame_path`` in a one-element list and delegates to
    ``analyze_frames_for_hre``, returning whatever that call returns.
    """
    single_frame = [frame_path]
    return analyze_frames_for_hre(
        frame_paths=single_frame,
        context=context,
        seg_idx=seg_idx,
        n_total=n_total,
    )
282
+
283
+
284
  def _default_hre_analysis(seg_idx: int = 0, n_total: int = 1) -> dict:
285
  """Fallback with varied decisions based on position in clip."""
286
  if seg_idx == 0:
 
294
 
295
  _colors = ["yellow", "white", "cyan", "orange", "white", "yellow"]
296
  _positions = ["bottom", "top", "left", "bottom", "right", "top"]
297
+ _coords = [(0.50, 0.76), (0.50, 0.18), (0.28, 0.56), (0.50, 0.72), (0.72, 0.56), (0.50, 0.20)]
298
+ _anchors = [2, 8, 4, 2, 6, 8]
299
  _modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
300
  _emphasis = ["punch", "calm", "pop", "punch", "calm", "pop"]
301
+ caption_x, caption_y = _coords[seg_idx % len(_coords)]
302
 
303
  return {
304
  "zoom_direction": zoom_dir,
 
306
  "face_detected": False,
307
  "face_cx": 0.5,
308
  "face_cy": 0.38,
309
+ "subject_bbox": None,
310
+ "zoom_anchor_x": 0.5,
311
+ "zoom_anchor_y": 0.38,
312
  "subtitle_position": _positions[seg_idx % len(_positions)],
313
+ "caption_x": caption_x,
314
+ "caption_y": caption_y,
315
+ "caption_anchor": _anchors[seg_idx % len(_anchors)],
316
+ "caption_max_width_pct": 0.62,
317
  "subtitle_mode": _modes[seg_idx % len(_modes)],
318
  "subtitle_emphasis": _emphasis[seg_idx % len(_emphasis)],
319
  "subtitle_color": _colors[seg_idx % len(_colors)],
backend/src/processing/clip_extractor.py CHANGED
@@ -5,6 +5,26 @@ from pathlib import Path
5
  from loguru import logger
6
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def extract_clip(
9
  video_path: Path,
10
  start: float,
@@ -37,10 +57,9 @@ def extract_clip(
37
  else:
38
  # Crop: scale to 1920 height first, then center-crop to 1080 wide
39
  # Optionally center on face_bbox x when available
40
- if face_bbox and len(face_bbox) == 4:
41
- x1, _, x2, _ = face_bbox
42
- face_cx = int((x1 + x2) / 2)
43
- crop = f"scale=-1:1920,crop=1080:1920:max(0\\,min(iw-1080\\,{face_cx}*iw/in_w-540)):0"
44
  else:
45
  crop = "scale=-1:1920,crop=1080:1920:(iw-1080)/2:0"
46
  vf_filters.append(crop)
 
5
  from loguru import logger
6
 
7
 
8
+ def _face_center_expr(face_bbox: list | None) -> str | None:
9
+ """Return a crop expression x-center from Qwen's normalized face bbox."""
10
+ if not face_bbox or len(face_bbox) != 4:
11
+ return None
12
+ try:
13
+ x1, _, x2, _ = [float(v) for v in face_bbox]
14
+ except Exception:
15
+ return None
16
+
17
+ # Qwen prompt asks for normalized percentages. Older comments said pixels,
18
+ # so keep a conservative pixel fallback, but prefer normalized handling.
19
+ face_cx = (x1 + x2) / 2.0
20
+ if max(abs(x1), abs(x2)) <= 1.5:
21
+ face_cx = min(1.0, max(0.0, face_cx))
22
+ return f"{face_cx:.4f}*iw-540"
23
+ if 0 <= face_cx <= 1080:
24
+ return f"({face_cx:.1f}/1080)*iw-540"
25
+ return None
26
+
27
+
28
  def extract_clip(
29
  video_path: Path,
30
  start: float,
 
57
  else:
58
  # Crop: scale to 1920 height first, then center-crop to 1080 wide
59
  # Optionally center on face_bbox x when available
60
+ face_expr = _face_center_expr(face_bbox)
61
+ if face_expr:
62
+ crop = f"scale=-1:1920,crop=1080:1920:max(0\\,min(iw-1080\\,{face_expr})):0"
 
63
  else:
64
  crop = "scale=-1:1920,crop=1080:1920:(iw-1080)/2:0"
65
  vf_filters.append(crop)
backend/src/processing/high_retention.py CHANGED
@@ -125,6 +125,24 @@ def _extract_frame(video_path: Path, t: float, out_path: Path) -> bool:
125
  return result.returncode == 0 and out_path.exists()
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  # ─── Per-segment AI analysis ──────────────────────────────────────────────────
129
 
130
  def _analyze_segment(
@@ -136,12 +154,10 @@ def _analyze_segment(
136
  clip_start: float,
137
  tmp_dir: Path,
138
  ) -> dict:
139
- from src.analysis.vision import analyze_frame_for_hre, _default_hre_analysis
140
-
141
- mid_t = (seg["start"] + seg["end"]) / 2.0
142
- frame_path = tmp_dir / f"seg_{seg_idx:03d}.jpg"
143
 
144
- if not _extract_frame(video_path, mid_t, frame_path):
 
145
  return _default_hre_analysis(seg_idx, n_total)
146
 
147
  words_all: list[dict] = []
@@ -156,7 +172,7 @@ def _analyze_segment(
156
  if w.get("start", 0) < abs_end and w.get("end", 0) > abs_start
157
  ).strip()
158
 
159
- return analyze_frame_for_hre(frame_path, context, seg_idx, n_total)
160
 
161
 
162
  # ─── Zoom expression builders ─────────────────────────────────────────────────
@@ -171,9 +187,8 @@ def _build_zoom_exprs(
171
  """
172
  direction = analysis.get("zoom_direction", "in")
173
  speed = analysis.get("zoom_speed", "slow")
174
- face_detected = bool(analysis.get("face_detected", False))
175
- face_cx = float(analysis.get("face_cx") or 0.5)
176
- face_cy = float(analysis.get("face_cy") or 0.38)
177
 
178
  if direction == "in":
179
  if speed == "fast":
@@ -188,13 +203,13 @@ def _build_zoom_exprs(
188
  else: # hold
189
  z_expr, max_zoom = "1.08", 1.08
190
 
191
- if face_detected and direction == "in" and max_zoom > 1.05:
192
- x_expr = f"max(0\\,min(iw-iw/zoom\\,iw*{face_cx:.3f}-iw/zoom/2))"
193
- y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{face_cy:.3f}-ih/zoom/2))"
194
  else:
195
  x_expr = "iw/2-(iw/zoom/2)"
196
  if direction == "in":
197
- y_bias = min(face_cy, 0.5) if face_cy < 0.55 else 0.38
198
  y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{y_bias:.2f}-(ih/zoom/2)))"
199
  else:
200
  y_expr = "ih/2-(ih/zoom/2)"
@@ -264,9 +279,10 @@ _ASS_COLORS = {
264
  "red": "&H000000FF",
265
  }
266
 
267
- _POSITIONS = {"top", "bottom", "left", "right", "center"}
268
  _MODES = {"word", "phrase", "sentence"}
269
  _EMPHASIS = {"pop", "punch", "calm"}
 
270
 
271
 
272
  def _ts(t: float) -> str:
@@ -285,9 +301,66 @@ def _pick(value: object, allowed: set[str], fallback: str) -> str:
285
  return v if v in allowed else fallback
286
 
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
289
  """Validate model output and fill HRE fields used by the renderer."""
290
  an = dict(analysis or {})
 
291
  energy = _pick(an.get("energy_level"), {"high", "medium", "low"}, "medium")
292
  moment = _pick(
293
  an.get("moment_type"),
@@ -299,18 +372,41 @@ def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
299
  if energy == "medium" and moment not in {"context", "transition"}:
300
  fallback_mode = "phrase"
301
 
302
- pos = _pick(an.get("subtitle_position"), _POSITIONS, "bottom")
 
 
 
 
 
 
 
303
  mode = _pick(an.get("subtitle_mode"), _MODES, fallback_mode)
304
  emphasis = _pick(an.get("subtitle_emphasis"), _EMPHASIS, "punch" if mode == "word" else "calm")
305
  color = _pick(an.get("subtitle_color"), set(_ASS_COLORS), "white")
306
  zoom_direction = _pick(an.get("zoom_direction"), {"in", "out", "hold"}, "in")
307
  zoom_speed = _pick(an.get("zoom_speed"), {"fast", "slow"}, "slow")
308
 
309
- try:
310
- face_cx = min(1.0, max(0.0, float(an.get("face_cx", 0.5))))
311
- face_cy = min(1.0, max(0.0, float(an.get("face_cy", 0.38))))
312
- except Exception:
313
- face_cx, face_cy = 0.5, 0.38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
  if seg_idx == 0:
316
  zoom_direction, zoom_speed = "in", "fast"
@@ -326,7 +422,14 @@ def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
326
  "face_detected": bool(an.get("face_detected", False)),
327
  "face_cx": face_cx,
328
  "face_cy": face_cy,
 
 
 
329
  "subtitle_position": pos,
 
 
 
 
330
  "subtitle_mode": mode,
331
  "subtitle_emphasis": emphasis,
332
  "subtitle_color": color,
@@ -344,11 +447,14 @@ def _build_hre_plan(segments: list[dict], analyses: list[dict]) -> list[dict]:
344
 
345
  # If the model repeats the same caption treatment for every segment, rotate
346
  # through safe defaults so HRE visibly changes across the clip.
347
- if len(plan) > 1 and len({(p["subtitle_position"], p["subtitle_mode"]) for p in plan}) == 1:
348
- positions = ["bottom", "top", "left", "bottom", "right", "top"]
 
349
  modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
350
  for i, p in enumerate(plan):
351
  p["subtitle_position"] = positions[i % len(positions)]
 
 
352
  p["subtitle_mode"] = modes[i % len(modes)]
353
  if p["subtitle_mode"] == "word":
354
  p["subtitle_emphasis"] = "punch"
@@ -599,38 +705,30 @@ def _build_subtitle_events(
599
 
600
 
601
  def _subtitle_tag(plan: dict) -> tuple[str, int]:
602
- pos = plan["subtitle_position"]
603
  mode = plan["subtitle_mode"]
604
  energy = plan["energy_level"]
605
  emphasis = plan["subtitle_emphasis"]
606
  color = _ASS_COLORS.get(plan["subtitle_color"], "&H00FFFFFF")
607
-
608
- anchors = {
609
- "top": (8, 540, 230),
610
- "bottom": (2, 540, 1660),
611
- "left": (4, 95, 960),
612
- "right": (6, 985, 960),
613
- "center": (5, 540, 960),
614
- }
615
- alignment, x, y = anchors.get(pos, anchors["bottom"])
616
 
617
  if mode == "sentence":
618
- font_size = 66 if energy != "high" else 74
619
- max_chars = 34
620
  elif mode == "phrase":
621
- font_size = 82 if energy != "low" else 76
622
- max_chars = 24
623
  else:
624
- font_size = 102 if energy == "high" else 92
625
- max_chars = 18
 
 
626
 
627
- if pos in {"left", "right"}:
628
- font_size -= 8
629
- max_chars = min(max_chars, 22)
630
 
631
  base = (
632
  f"{{\\an{alignment}\\pos({x},{y})\\1c{color}&\\fs{font_size}"
633
- "\\b1\\bord5\\shad1\\q2}}"
634
  )
635
  if emphasis in {"pop", "punch"} or mode == "word":
636
  base += "{\\fscx125\\fscy125\\t(0,120,\\fscx100\\fscy100)}"
 
125
  return result.returncode == 0 and out_path.exists()
126
 
127
 
128
def _extract_segment_frames(video_path: Path, seg: dict, seg_idx: int, tmp_dir: Path) -> list[Path]:
    """Grab up to three stills spread across a segment for HRE analysis.

    Frames are requested at 25%, 50% and 75% of the segment (using a minimum
    span of 0.1s), each timestamp being kept between the segment start and
    0.05s before its end. Files are written to ``tmp_dir`` as
    ``seg_<idx>_<n>.jpg``; only frames that ``_extract_frame`` reports as
    successfully written are returned, in time order.
    """
    seg_start = float(seg["start"])
    seg_end = float(seg["end"])
    span = max(0.1, seg_end - seg_start)
    # Latest timestamp we are willing to sample; never earlier than the start.
    latest = max(seg_start, seg_end - 0.05)

    extracted: list[Path] = []
    for slot, fraction in enumerate((0.25, 0.50, 0.75)):
        target = seg_start + span * fraction
        sample_t = min(max(seg_start, target), latest)
        out_path = tmp_dir / f"seg_{seg_idx:03d}_{slot}.jpg"
        if _extract_frame(video_path, sample_t, out_path):
            extracted.append(out_path)
    return extracted
144
+
145
+
146
  # ─── Per-segment AI analysis ──────────────────────────────────────────────────
147
 
148
  def _analyze_segment(
 
154
  clip_start: float,
155
  tmp_dir: Path,
156
  ) -> dict:
157
+ from src.analysis.vision import analyze_frames_for_hre, _default_hre_analysis
 
 
 
158
 
159
+ frame_paths = _extract_segment_frames(video_path, seg, seg_idx, tmp_dir)
160
+ if not frame_paths:
161
  return _default_hre_analysis(seg_idx, n_total)
162
 
163
  words_all: list[dict] = []
 
172
  if w.get("start", 0) < abs_end and w.get("end", 0) > abs_start
173
  ).strip()
174
 
175
+ return analyze_frames_for_hre(frame_paths, context, seg_idx, n_total)
176
 
177
 
178
  # ─── Zoom expression builders ─────────────────────────────────────────────────
 
187
  """
188
  direction = analysis.get("zoom_direction", "in")
189
  speed = analysis.get("zoom_speed", "slow")
190
+ zoom_anchor_x = _clamp_float(analysis.get("zoom_anchor_x"), _clamp_float(analysis.get("face_cx"), 0.5))
191
+ zoom_anchor_y = _clamp_float(analysis.get("zoom_anchor_y"), _clamp_float(analysis.get("face_cy"), 0.38))
 
192
 
193
  if direction == "in":
194
  if speed == "fast":
 
203
  else: # hold
204
  z_expr, max_zoom = "1.08", 1.08
205
 
206
+ if direction == "in" and max_zoom > 1.05:
207
+ x_expr = f"max(0\\,min(iw-iw/zoom\\,iw*{zoom_anchor_x:.3f}-iw/zoom/2))"
208
+ y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{zoom_anchor_y:.3f}-ih/zoom/2))"
209
  else:
210
  x_expr = "iw/2-(iw/zoom/2)"
211
  if direction == "in":
212
+ y_bias = min(zoom_anchor_y, 0.5) if zoom_anchor_y < 0.55 else 0.38
213
  y_expr = f"max(0\\,min(ih-ih/zoom\\,ih*{y_bias:.2f}-(ih/zoom/2)))"
214
  else:
215
  y_expr = "ih/2-(ih/zoom/2)"
 
279
  "red": "&H000000FF",
280
  }
281
 
282
+ _POSITIONS = {"top", "bottom", "left", "right", "center", "free"}
283
  _MODES = {"word", "phrase", "sentence"}
284
  _EMPHASIS = {"pop", "punch", "calm"}
285
+ _ANCHORS = set(range(1, 10))
286
 
287
 
288
  def _ts(t: float) -> str:
 
301
  return v if v in allowed else fallback
302
 
303
 
304
+ def _clamp_float(value: object, fallback: float, low: float = 0.0, high: float = 1.0) -> float:
305
+ try:
306
+ return min(high, max(low, float(value)))
307
+ except Exception:
308
+ return fallback
309
+
310
+
311
+ def _clamp_int(value: object, fallback: int, allowed: set[int]) -> int:
312
+ try:
313
+ v = int(value)
314
+ except Exception:
315
+ return fallback
316
+ return v if v in allowed else fallback
317
+
318
+
319
+ def _normalise_bbox(value: object) -> list[float] | None:
320
+ if not isinstance(value, (list, tuple)) or len(value) != 4:
321
+ return None
322
+ try:
323
+ coords = [float(v) for v in value]
324
+ except Exception:
325
+ return None
326
+ if max(abs(v) for v in coords) > 1.5:
327
+ return None
328
+ x1, y1, x2, y2 = coords
329
+ x1, x2 = sorted((min(1.0, max(0.0, x1)), min(1.0, max(0.0, x2))))
330
+ y1, y2 = sorted((min(1.0, max(0.0, y1)), min(1.0, max(0.0, y2))))
331
+ if x2 - x1 < 0.02 or y2 - y1 < 0.02:
332
+ return None
333
+ return [x1, y1, x2, y2]
334
+
335
+
336
+ def _caption_anchor_for(x: float, y: float) -> int:
337
+ if y < 0.34:
338
+ return 8 if 0.30 <= x <= 0.70 else 7 if x < 0.5 else 9
339
+ if y > 0.66:
340
+ return 2 if 0.30 <= x <= 0.70 else 1 if x < 0.5 else 3
341
+ return 5 if 0.34 <= x <= 0.66 else 4 if x < 0.5 else 6
342
+
343
+
344
def _safe_caption_point(subject_x: float, subject_y: float, seg_idx: int) -> tuple[float, float, int]:
    """Choose a fallback caption point that rotates with the segment index.

    Four placements are cycled by ``seg_idx``: a corner-ish spot on the side
    opposite the subject, a mid-height spot on the opposite side, a centred
    spot near the top or bottom depending on ``subject_y``, and a fixed
    centred lower spot. Returns ``(x, y, anchor)`` where the anchor is
    derived from the point via ``_caption_anchor_for``.
    """
    subject_on_left = subject_x < 0.50
    slot = seg_idx % 4

    if slot == 0:
        # Opposite side horizontally; vertical spot depends on subject height.
        cap_x = 0.68 if subject_on_left else 0.32
        if subject_y < 0.42:
            cap_y = 0.72
        elif subject_y > 0.62:
            cap_y = 0.24
        else:
            cap_y = 0.76
    elif slot == 1:
        cap_x = 0.72 if subject_on_left else 0.28
        cap_y = 0.50
    elif slot == 2:
        cap_x = 0.50
        cap_y = 0.18 if subject_y > 0.45 else 0.82
    else:
        cap_x, cap_y = 0.50, 0.72

    return cap_x, cap_y, _caption_anchor_for(cap_x, cap_y)
358
+
359
+
360
  def _normalise_analysis(analysis: dict, seg_idx: int, n_total: int) -> dict:
361
  """Validate model output and fill HRE fields used by the renderer."""
362
  an = dict(analysis or {})
363
+ subject_bbox = _normalise_bbox(an.get("subject_bbox"))
364
  energy = _pick(an.get("energy_level"), {"high", "medium", "low"}, "medium")
365
  moment = _pick(
366
  an.get("moment_type"),
 
372
  if energy == "medium" and moment not in {"context", "transition"}:
373
  fallback_mode = "phrase"
374
 
375
+ if subject_bbox:
376
+ subject_x = (subject_bbox[0] + subject_bbox[2]) / 2.0
377
+ subject_y = (subject_bbox[1] + subject_bbox[3]) / 2.0
378
+ else:
379
+ subject_x = _clamp_float(an.get("face_cx"), 0.5)
380
+ subject_y = _clamp_float(an.get("face_cy"), 0.38)
381
+
382
+ pos = _pick(an.get("subtitle_position"), _POSITIONS, "free")
383
  mode = _pick(an.get("subtitle_mode"), _MODES, fallback_mode)
384
  emphasis = _pick(an.get("subtitle_emphasis"), _EMPHASIS, "punch" if mode == "word" else "calm")
385
  color = _pick(an.get("subtitle_color"), set(_ASS_COLORS), "white")
386
  zoom_direction = _pick(an.get("zoom_direction"), {"in", "out", "hold"}, "in")
387
  zoom_speed = _pick(an.get("zoom_speed"), {"fast", "slow"}, "slow")
388
 
389
+ face_cx = _clamp_float(an.get("face_cx"), subject_x)
390
+ face_cy = _clamp_float(an.get("face_cy"), subject_y)
391
+ zoom_anchor_x = _clamp_float(an.get("zoom_anchor_x"), face_cx)
392
+ zoom_anchor_y = _clamp_float(an.get("zoom_anchor_y"), face_cy)
393
+
394
+ fallback_x, fallback_y, fallback_anchor = _safe_caption_point(subject_x, subject_y, seg_idx)
395
+ caption_x = _clamp_float(an.get("caption_x"), fallback_x, 0.10, 0.90)
396
+ caption_y = _clamp_float(an.get("caption_y"), fallback_y, 0.12, 0.88)
397
+ caption_anchor = _clamp_int(an.get("caption_anchor"), fallback_anchor, _ANCHORS)
398
+ caption_max_width_pct = _clamp_float(
399
+ an.get("caption_max_width_pct"),
400
+ 0.58 if mode != "sentence" else 0.72,
401
+ 0.35,
402
+ 0.82,
403
+ )
404
+
405
+ if subject_bbox:
406
+ x1, y1, x2, y2 = subject_bbox
407
+ overlaps_subject = (x1 - 0.08) <= caption_x <= (x2 + 0.08) and (y1 - 0.08) <= caption_y <= (y2 + 0.08)
408
+ if overlaps_subject:
409
+ caption_x, caption_y, caption_anchor = fallback_x, fallback_y, fallback_anchor
410
 
411
  if seg_idx == 0:
412
  zoom_direction, zoom_speed = "in", "fast"
 
422
  "face_detected": bool(an.get("face_detected", False)),
423
  "face_cx": face_cx,
424
  "face_cy": face_cy,
425
+ "subject_bbox": subject_bbox,
426
+ "zoom_anchor_x": zoom_anchor_x,
427
+ "zoom_anchor_y": zoom_anchor_y,
428
  "subtitle_position": pos,
429
+ "caption_x": caption_x,
430
+ "caption_y": caption_y,
431
+ "caption_anchor": caption_anchor,
432
+ "caption_max_width_pct": caption_max_width_pct,
433
  "subtitle_mode": mode,
434
  "subtitle_emphasis": emphasis,
435
  "subtitle_color": color,
 
447
 
448
  # If the model repeats the same caption treatment for every segment, rotate
449
  # through safe defaults so HRE visibly changes across the clip.
450
+ if len(plan) > 1 and len({(round(p["caption_x"], 2), round(p["caption_y"], 2), p["subtitle_mode"]) for p in plan}) == 1:
451
+ positions = ["free", "free", "free", "free", "free", "free"]
452
+ coords = [(0.50, 0.76), (0.50, 0.18), (0.28, 0.56), (0.72, 0.52), (0.50, 0.82), (0.50, 0.22)]
453
  modes = ["word", "sentence", "phrase", "word", "sentence", "phrase"]
454
  for i, p in enumerate(plan):
455
  p["subtitle_position"] = positions[i % len(positions)]
456
+ p["caption_x"], p["caption_y"] = coords[i % len(coords)]
457
+ p["caption_anchor"] = _caption_anchor_for(p["caption_x"], p["caption_y"])
458
  p["subtitle_mode"] = modes[i % len(modes)]
459
  if p["subtitle_mode"] == "word":
460
  p["subtitle_emphasis"] = "punch"
 
705
 
706
 
707
  def _subtitle_tag(plan: dict) -> tuple[str, int]:
 
708
  mode = plan["subtitle_mode"]
709
  energy = plan["energy_level"]
710
  emphasis = plan["subtitle_emphasis"]
711
  color = _ASS_COLORS.get(plan["subtitle_color"], "&H00FFFFFF")
712
+ alignment = int(plan.get("caption_anchor", 5))
713
+ x = round(_clamp_float(plan.get("caption_x"), 0.5, 0.08, 0.92) * 1080)
714
+ y = round(_clamp_float(plan.get("caption_y"), 0.75, 0.10, 0.90) * 1920)
715
+ max_width_px = max(360, min(886, int(_clamp_float(plan.get("caption_max_width_pct"), 0.62, 0.35, 0.82) * 1080)))
 
 
 
 
 
716
 
717
  if mode == "sentence":
718
+ font_size = 56 if energy != "high" else 62
 
719
  elif mode == "phrase":
720
+ font_size = 68 if energy != "low" else 62
 
721
  else:
722
+ font_size = 80 if energy == "high" else 72
723
+
724
+ if alignment in {4, 5, 6}:
725
+ font_size = max(54, font_size - 4)
726
 
727
+ max_chars = max(8, min(34, int(max_width_px / (font_size * 0.58))))
 
 
728
 
729
  base = (
730
  f"{{\\an{alignment}\\pos({x},{y})\\1c{color}&\\fs{font_size}"
731
+ "\\b1\\bord5\\shad1\\q2}"
732
  )
733
  if emphasis in {"pop", "punch"} or mode == "word":
734
  base += "{\\fscx125\\fscy125\\t(0,120,\\fscx100\\fscy100)}"