ex510 committed on
Commit
ea1bf8a
·
verified Β·
1 Parent(s): 3acae71

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +88 -30
processor.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
  VideoProcessor β€” Core pipeline for viral clip extraction.
3
-
4
  Fixes applied:
5
  - source_language (for Whisper) separated from target_language (for translation/captions)
6
  - Removed duplicate _clean_json_response (json_repair version kept)
@@ -13,6 +12,11 @@ Fixes applied:
13
  using SubtitleSegmenter._split_into_lines so line splits match translated content
14
  - βœ… FIX: translated word timestamps distributed proportional to word length
15
  (instead of uniform distribution) for better highlight sync
 
 
 
 
 
16
  """
17
  import os
18
  import gc
@@ -28,7 +32,6 @@ from core.stt import STT, SubtitleSegmenter
28
  from core.analyze import analyze_transcript
29
  from core.styles import StyleFactory
30
  from core.subtitle_manager import SubtitleManager
31
- # from core.free_translator import FreeTranslator
32
 
33
  logger = Logger.get_logger(__name__)
34
 
@@ -41,7 +44,6 @@ def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: flo
41
  βœ… FIX: Distribute word timestamps proportional to character length instead of
42
  uniform distribution. Longer words get more time, giving better sync in
43
  highlight_word mode after translation.
44
-
45
  words: list of str (translated words)
46
  Returns: list of { text, start, end }
47
  """
@@ -156,7 +158,6 @@ class VideoProcessor:
156
  progress_callback=None):
157
  """
158
  STT + AI viral-moment detection.
159
-
160
  source_language : passed directly to Whisper.
161
  None β†’ Whisper auto-detects (slower but safe).
162
  target_language : stored in data for process_clips to use for
@@ -180,6 +181,7 @@ class VideoProcessor:
180
 
181
  data = {
182
  "segments": full_segments,
 
183
  "detected_language": detected_lang,
184
  "target_language": target_language,
185
  "duration": duration,
@@ -262,14 +264,18 @@ class VideoProcessor:
262
  """
263
  Cuts, styles, captions, and exports each viral clip.
264
 
265
- βœ… FIX 1: After translation, _line1 and _line2 are re-computed from
266
- the translated text using SubtitleSegmenter._split_into_lines.
267
- Previously they were left as the original-language splits which
268
- caused wrong line breaks in the translated captions.
269
-
270
- βœ… FIX 2: Word timestamps after translation are distributed proportional
271
- to character length (via _distribute_timestamps_by_length) instead of
272
- uniform distribution, giving better sync in highlight_word mode.
 
 
 
 
273
  """
274
  logger.info("🎨 Phase 3: Style & Captions …")
275
  if progress_callback:
@@ -284,11 +290,8 @@ class VideoProcessor:
284
  logger.error(f"❌ Could not determine video duration: {e}")
285
 
286
  # ── Language resolution ───────────────────────────────────────────────
287
- detected_lang = data.get("detected_language", "en")
288
-
289
- # SIMPLIFIED: No separate translation step.
290
- # STT has already provided the correct text (English or Arabic).
291
- caption_lang = detected_lang
292
  logger.info(f"πŸ—£οΈ Captions language: {caption_lang}")
293
 
294
  # ── Normalise style string once ───────────────────────────────────────
@@ -297,11 +300,12 @@ class VideoProcessor:
297
  style_str = style_str.split(".")[-1]
298
 
299
  # ── Main loop ─────────────────────────────────────────────────────────
300
- output_files = []
 
301
 
302
  if not best_clips:
303
  logger.warning("⚠️ No clips to process.")
304
- return []
305
 
306
  logger.info(f"πŸ“Š Processing {len(best_clips)} clip(s) …")
307
 
@@ -344,18 +348,16 @@ class VideoProcessor:
344
  clip = current_video_clip.subclip(start, end)
345
 
346
  # ── Build segment_transcript ──────────────────────────────────
347
- segment_transcript = {"segments": []}
348
 
349
  for s in data["segments"]:
350
  if s["start"] >= end or s["end"] <= start:
351
  continue
352
 
353
- new_seg = s.copy()
354
  new_seg["start"] = max(0, s["start"] - start)
355
  new_seg["end"] = min(end - start, s["end"] - start)
356
 
357
- # SIMPLIFIED: No translation step here.
358
- # Just adjust timestamps relative to clip start.
359
  if "words" in s:
360
  new_seg["words"] = [
361
  {
@@ -366,8 +368,10 @@ class VideoProcessor:
366
  for w in s["words"]
367
  if w["start"] < end and w["end"] > start
368
  ]
369
-
370
- segment_transcript["segments"].append(new_seg)
 
 
371
 
372
  # ── Apply style + captions ────────────────────────────────────
373
  style_strategy = StyleFactory.get_style(style_str)
@@ -398,6 +402,23 @@ class VideoProcessor:
398
  output_files.append(final_output)
399
  logger.info(f"βœ… Saved: {final_output}")
400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  except Exception as e:
402
  logger.error(f"❌ Clip {i+1} error: {e}")
403
  logger.error(traceback.format_exc())
@@ -411,7 +432,7 @@ class VideoProcessor:
411
  pass
412
  gc.collect()
413
 
414
- return output_files
415
 
416
 
417
  # ─────────────────────────────────────────────────────────────────────────────
@@ -422,6 +443,15 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
422
  """
423
  End-to-end pipeline: STT β†’ AI analysis β†’ clip export.
424
 
 
 
 
 
 
 
 
 
 
425
  Important kwargs:
426
  source_language : language of the original video β†’ passed to Whisper.
427
  If not set β†’ Whisper auto-detects.
@@ -452,12 +482,18 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
452
 
453
  if not viral_segments:
454
  logger.warning("⚠️ No viral segments found.")
455
- return []
 
 
 
 
 
 
456
 
457
  best_clips = processor.get_best_segments(viral_segments, duration)
458
 
459
  # Phase 3: render
460
- return processor.process_clips(
461
  video_path,
462
  best_clips,
463
  stt_data,
@@ -465,13 +501,35 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
465
  **kwargs,
466
  )
467
 
 
 
 
 
 
 
 
 
468
  except Exception as e:
469
  logger.error(f"❌ Processing failed: {e}")
470
  logger.error(traceback.format_exc())
471
- return []
 
 
 
 
 
 
472
 
473
 
474
  if __name__ == "__main__":
475
  import sys
476
  if len(sys.argv) > 1:
477
- process_video(sys.argv[1])
 
 
 
 
 
 
 
 
 
1
  """
2
  VideoProcessor β€” Core pipeline for viral clip extraction.
 
3
  Fixes applied:
4
  - source_language (for Whisper) separated from target_language (for translation/captions)
5
  - Removed duplicate _clean_json_response (json_repair version kept)
 
12
  using SubtitleSegmenter._split_into_lines so line splits match translated content
13
  - βœ… FIX: translated word timestamps distributed proportional to word length
14
  (instead of uniform distribution) for better highlight sync
15
+ - βœ… NEW: process_clips now returns (output_files, transcripts_per_clip)
16
+ where transcripts_per_clip is a list of dicts:
17
+ { clip_index, start, end, segments, full_text }
18
+ - βœ… NEW: process_video returns a dict with keys:
19
+ output_files, transcripts, viral_segments, duration
20
  """
21
  import os
22
  import gc
 
32
  from core.analyze import analyze_transcript
33
  from core.styles import StyleFactory
34
  from core.subtitle_manager import SubtitleManager
 
35
 
36
  logger = Logger.get_logger(__name__)
37
 
 
44
  βœ… FIX: Distribute word timestamps proportional to character length instead of
45
  uniform distribution. Longer words get more time, giving better sync in
46
  highlight_word mode after translation.
 
47
  words: list of str (translated words)
48
  Returns: list of { text, start, end }
49
  """
 
158
  progress_callback=None):
159
  """
160
  STT + AI viral-moment detection.
 
161
  source_language : passed directly to Whisper.
162
  None β†’ Whisper auto-detects (slower but safe).
163
  target_language : stored in data for process_clips to use for
 
181
 
182
  data = {
183
  "segments": full_segments,
184
+ "full_text": full_text, # βœ… NEW: store full transcript text
185
  "detected_language": detected_lang,
186
  "target_language": target_language,
187
  "duration": duration,
 
264
  """
265
  Cuts, styles, captions, and exports each viral clip.
266
 
267
+ βœ… Returns: (output_files, transcripts_per_clip)
268
+ output_files : list of str β€” paths to rendered .mp4 files
269
+ transcripts_per_clip : list of dicts, one per successfully rendered clip:
270
+ {
271
+ "clip_index" : int, # 1-based
272
+ "filename" : str, # output filename (basename)
273
+ "start" : float, # clip start in original video (s)
274
+ "end" : float, # clip end in original video (s)
275
+ "language" : str, # detected/caption language
276
+ "segments" : [ ... ], # STT segments relative to clip start
277
+ "full_text" : str, # concatenated text of all segments
278
+ }
279
  """
280
  logger.info("🎨 Phase 3: Style & Captions …")
281
  if progress_callback:
 
290
  logger.error(f"❌ Could not determine video duration: {e}")
291
 
292
  # ── Language resolution ───────────────────────────────────────────────
293
+ detected_lang = data.get("detected_language", "en")
294
+ caption_lang = detected_lang
 
 
 
295
  logger.info(f"πŸ—£οΈ Captions language: {caption_lang}")
296
 
297
  # ── Normalise style string once ───────────────────────────────────────
 
300
  style_str = style_str.split(".")[-1]
301
 
302
  # ── Main loop ─────────────────────────────────────────────────────────
303
+ output_files = []
304
+ transcripts_per_clip = [] # βœ… NEW
305
 
306
  if not best_clips:
307
  logger.warning("⚠️ No clips to process.")
308
+ return [], []
309
 
310
  logger.info(f"πŸ“Š Processing {len(best_clips)} clip(s) …")
311
 
 
348
  clip = current_video_clip.subclip(start, end)
349
 
350
  # ── Build segment_transcript ──────────────────────────────────
351
+ clip_segments = []
352
 
353
  for s in data["segments"]:
354
  if s["start"] >= end or s["end"] <= start:
355
  continue
356
 
357
+ new_seg = s.copy()
358
  new_seg["start"] = max(0, s["start"] - start)
359
  new_seg["end"] = min(end - start, s["end"] - start)
360
 
 
 
361
  if "words" in s:
362
  new_seg["words"] = [
363
  {
 
368
  for w in s["words"]
369
  if w["start"] < end and w["end"] > start
370
  ]
371
+
372
+ clip_segments.append(new_seg)
373
+
374
+ segment_transcript = {"segments": clip_segments}
375
 
376
  # ── Apply style + captions ────────────────────────────────────
377
  style_strategy = StyleFactory.get_style(style_str)
 
402
  output_files.append(final_output)
403
  logger.info(f"βœ… Saved: {final_output}")
404
 
405
+ # βœ… NEW: Build transcript entry for this clip
406
+ clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
407
+ transcripts_per_clip.append({
408
+ "clip_index": i + 1,
409
+ "filename": out_name,
410
+ "start": start,
411
+ "end": end,
412
+ "language": caption_lang,
413
+ "segments": clip_segments,
414
+ "full_text": clip_full_text,
415
+ })
416
+ logger.info(
417
+ f"πŸ“ Transcript for clip {i+1}: "
418
+ f"{len(clip_segments)} segment(s), "
419
+ f"{len(clip_full_text)} chars"
420
+ )
421
+
422
  except Exception as e:
423
  logger.error(f"❌ Clip {i+1} error: {e}")
424
  logger.error(traceback.format_exc())
 
432
  pass
433
  gc.collect()
434
 
435
+ return output_files, transcripts_per_clip # βœ… tuple now
436
 
437
 
438
  # ─────────────────────────────────────────────────────────────────────────────
 
443
  """
444
  End-to-end pipeline: STT β†’ AI analysis β†’ clip export.
445
 
446
+ βœ… Returns a dict (instead of a plain list) with:
447
+ {
448
+ "output_files" : list[str], # paths to rendered clips
449
+ "transcripts" : list[dict], # per-clip transcripts (see process_clips)
450
+ "viral_segments" : list[dict], # raw AI viral segment detections
451
+ "full_transcript": str, # full video transcript text
452
+ "duration" : float, # video duration in seconds
453
+ }
454
+
455
  Important kwargs:
456
  source_language : language of the original video β†’ passed to Whisper.
457
  If not set β†’ Whisper auto-detects.
 
482
 
483
  if not viral_segments:
484
  logger.warning("⚠️ No viral segments found.")
485
+ return {
486
+ "output_files": [],
487
+ "transcripts": [],
488
+ "viral_segments": [],
489
+ "full_transcript": stt_data.get("full_text", ""),
490
+ "duration": duration,
491
+ }
492
 
493
  best_clips = processor.get_best_segments(viral_segments, duration)
494
 
495
  # Phase 3: render
496
+ output_files, transcripts = processor.process_clips(
497
  video_path,
498
  best_clips,
499
  stt_data,
 
501
  **kwargs,
502
  )
503
 
504
+ return {
505
+ "output_files": output_files,
506
+ "transcripts": transcripts,
507
+ "viral_segments": viral_segments,
508
+ "full_transcript": stt_data.get("full_text", ""),
509
+ "duration": duration,
510
+ }
511
+
512
  except Exception as e:
513
  logger.error(f"❌ Processing failed: {e}")
514
  logger.error(traceback.format_exc())
515
+ return {
516
+ "output_files": [],
517
+ "transcripts": [],
518
+ "viral_segments": [],
519
+ "full_transcript": "",
520
+ "duration": 0,
521
+ }
522
 
523
 
524
if __name__ == "__main__":
    # CLI entry point: render clips for the given video and print a JSON summary.
    import sys
    # NOTE(review): `json` is used below but the visible import hunks only show
    # os/gc/core imports — import locally so the script path cannot NameError.
    import json

    if len(sys.argv) > 1:
        result = process_video(sys.argv[1])
        print(json.dumps({
            "clips": result["output_files"],
            "full_transcript": result["full_transcript"],
            "clip_transcripts": [
                {"clip": t["clip_index"], "text": t["full_text"]}
                for t in result["transcripts"]
            ],
        }, indent=2, ensure_ascii=False))
    else:
        # Previously the script exited silently with no arguments; tell the
        # user how to invoke it instead.
        print("Usage: python processor.py <video_path>")