ex510 committed on
Commit
fb3f56f
·
verified ·
1 Parent(s): 6ea5d49

Update processor.py

Browse files
Files changed (1) hide show
  1. processor.py +85 -47
processor.py CHANGED
@@ -17,6 +17,8 @@ Fixes applied:
17
  { clip_index, start, end, segments, full_text }
18
  - โœ… NEW: process_video returns a dict with keys:
19
  output_files, transcripts, viral_segments, duration
 
 
20
  """
21
  import os
22
  import gc
@@ -31,13 +33,9 @@ from core.logger import Logger
31
  from core.stt import STT, SubtitleSegmenter
32
  from core.analyze import analyze_transcript
33
  from core.styles import StyleFactory
34
- from core.subtitle_manager import SubtitleManager
35
 
36
  logger = Logger.get_logger(__name__)
37
 
38
- # Max chars per line โ€” must match SubtitleSegmenter constant
39
- _MAX_CHARS_PER_LINE = 42
40
-
41
 
42
  def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float) -> list:
43
  """
@@ -52,22 +50,13 @@ def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: flo
52
 
53
  total_chars = sum(len(w) for w in words)
54
  seg_dur = seg_end - seg_start
55
-
56
- result = []
57
- cursor = seg_start
58
 
59
  for i, w in enumerate(words):
60
- if total_chars > 0:
61
- fraction = len(w) / total_chars
62
- else:
63
- fraction = 1.0 / len(words)
64
-
65
- w_dur = seg_dur * fraction
66
- w_end = cursor + w_dur
67
-
68
- # Clamp last word to seg_end to avoid float drift
69
- if i == len(words) - 1:
70
- w_end = seg_end
71
 
72
  result.append({
73
  "text": w,
@@ -86,6 +75,51 @@ class VideoProcessor:
86
  self.stt = STT(model_size=model_size)
87
  Config.setup_dirs()
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  # โ”€โ”€ JSON helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
90
 
91
  def _clean_json_response(self, content):
@@ -181,7 +215,7 @@ class VideoProcessor:
181
 
182
  data = {
183
  "segments": full_segments,
184
- "full_text": full_text, # โœ… NEW: store full transcript text
185
  "detected_language": detected_lang,
186
  "target_language": target_language,
187
  "duration": duration,
@@ -265,16 +299,16 @@ class VideoProcessor:
265
  Cuts, styles, captions, and exports each viral clip.
266
 
267
  โœ… Returns: (output_files, transcripts_per_clip)
268
- output_files : list of str โ€” paths to rendered .mp4 files
269
  transcripts_per_clip : list of dicts, one per successfully rendered clip:
270
  {
271
- "clip_index" : int, # 1-based
272
- "filename" : str, # output filename (basename)
273
- "start" : float, # clip start in original video (s)
274
- "end" : float, # clip end in original video (s)
275
- "language" : str, # detected/caption language
276
- "segments" : [ ... ], # STT segments relative to clip start
277
- "full_text" : str, # concatenated text of all segments
278
  }
279
  """
280
  logger.info("๐ŸŽจ Phase 3: Style & Captions โ€ฆ")
@@ -301,7 +335,7 @@ class VideoProcessor:
301
 
302
  # โ”€โ”€ Main loop โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
303
  output_files = []
304
- transcripts_per_clip = [] # โœ… NEW
305
 
306
  if not best_clips:
307
  logger.warning("โš ๏ธ No clips to process.")
@@ -337,9 +371,9 @@ class VideoProcessor:
337
  logger.info(f"\n๐ŸŽฌ Clip {i+1}/{len(best_clips)} ({start:.2f}s โ€“ {end:.2f}s)")
338
 
339
  # โ”€โ”€ Output path โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
340
- task_id = kwargs.get("task_id")
341
- prefix = f"viral_{task_id}_{i+1}" if task_id else f"viral_{i+1}"
342
- out_name = f"{prefix}_{style_str}.mp4"
343
  final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
344
  os.makedirs(os.path.dirname(final_output), exist_ok=True)
345
 
@@ -387,6 +421,14 @@ class VideoProcessor:
387
  playground_path = kwargs.get("playground_path"),
388
  )
389
 
 
 
 
 
 
 
 
 
390
  # โ”€โ”€ Export โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
391
  cpu_count = os.cpu_count() or 4
392
  logger.info(f"โš™๏ธ Rendering with {cpu_count} thread(s) โ€ฆ")
@@ -402,7 +444,7 @@ class VideoProcessor:
402
  output_files.append(final_output)
403
  logger.info(f"โœ… Saved: {final_output}")
404
 
405
- # โœ… NEW: Build transcript entry for this clip
406
  clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
407
  transcripts_per_clip.append({
408
  "clip_index": i + 1,
@@ -432,7 +474,7 @@ class VideoProcessor:
432
  pass
433
  gc.collect()
434
 
435
- return output_files, transcripts_per_clip # โœ… tuple now
436
 
437
 
438
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
@@ -443,36 +485,33 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
443
  """
444
  End-to-end pipeline: STT โ†’ AI analysis โ†’ clip export.
445
 
446
- โœ… Returns a dict (instead of a plain list) with:
447
  {
448
- "output_files" : list[str], # paths to rendered clips
449
- "transcripts" : list[dict], # per-clip transcripts (see process_clips)
450
- "viral_segments" : list[dict], # raw AI viral segment detections
451
- "full_transcript": str, # full video transcript text
452
- "duration" : float, # video duration in seconds
453
  }
454
 
455
  Important kwargs:
456
  source_language : language of the original video โ†’ passed to Whisper.
457
- If not set โ†’ Whisper auto-detects.
458
  language : desired output language (translation + captions).
459
- If same as source โ†’ no translation.
460
  caption_mode : sentence | word | highlight_word
461
  caption_style : classic | modern_glow | tiktok_bold | โ€ฆ
 
 
462
  """
463
  try:
464
- processor = VideoProcessor(model_size=model_size)
465
-
466
  caption_mode = kwargs.get("caption_mode", "sentence")
467
 
468
- # highlight_word and word modes both need word-level timestamps
469
  timestamp_mode = (
470
  "words"
471
  if caption_mode in ("word", "highlight_word")
472
  else "segments"
473
  )
474
 
475
- # Phase 1 + 2: STT + AI analysis
476
  viral_segments, duration, stt_data = processor.analyze_impact(
477
  video_path,
478
  source_language = kwargs.get("source_language"),
@@ -492,7 +531,6 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
492
 
493
  best_clips = processor.get_best_segments(viral_segments, duration)
494
 
495
- # Phase 3: render
496
  output_files, transcripts = processor.process_clips(
497
  video_path,
498
  best_clips,
@@ -526,7 +564,7 @@ if __name__ == "__main__":
526
  if len(sys.argv) > 1:
527
  result = process_video(sys.argv[1])
528
  print(json.dumps({
529
- "clips": result["output_files"],
530
  "full_transcript": result["full_transcript"],
531
  "clip_transcripts": [
532
  {"clip": t["clip_index"], "text": t["full_text"]}
 
17
  { clip_index, start, end, segments, full_text }
18
  - โœ… NEW: process_video returns a dict with keys:
19
  output_files, transcripts, viral_segments, duration
20
+ - โœ… NEW: mix_audio method added to VideoProcessor
21
+ blends background music with original video audio
22
  """
23
  import os
24
  import gc
 
33
  from core.stt import STT, SubtitleSegmenter
34
  from core.analyze import analyze_transcript
35
  from core.styles import StyleFactory
 
36
 
37
  logger = Logger.get_logger(__name__)
38
 
 
 
 
39
 
40
  def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float) -> list:
41
  """
 
50
 
51
  total_chars = sum(len(w) for w in words)
52
  seg_dur = seg_end - seg_start
53
+ result = []
54
+ cursor = seg_start
 
55
 
56
  for i, w in enumerate(words):
57
+ fraction = (len(w) / total_chars) if total_chars > 0 else (1.0 / len(words))
58
+ w_dur = seg_dur * fraction
59
+ w_end = seg_end if i == len(words) - 1 else cursor + w_dur
 
 
 
 
 
 
 
 
60
 
61
  result.append({
62
  "text": w,
 
75
  self.stt = STT(model_size=model_size)
76
  Config.setup_dirs()
77
 
78
+ # โ”€โ”€ Audio Mixing โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
79
+
80
+ def mix_audio(self, video_clip, audio_path=None, bg_music_volume=0.1, original_volume=1.0):
81
+ """
82
+ Blends background music with the original video audio.
83
+
84
+ video_clip : MoviePy VideoFileClip or CompositeVideoClip
85
+ audio_path : path to music file (mp3/m4a/...) โ€” None = skip
86
+ bg_music_volume : background music level (0.0 โ†’ 1.0)
87
+ original_volume : original video audio level (0.0 โ†’ 1.0)
88
+
89
+ Returns: video_clip with mixed audio
90
+ """
91
+ if not audio_path or not os.path.exists(audio_path):
92
+ return video_clip
93
+
94
+ clip_duration = video_clip.duration
95
+ logger.info(f"๐ŸŽต Mixing audio: {audio_path} | vol={bg_music_volume}")
96
+
97
+ music = mpe.AudioFileClip(audio_path)
98
+
99
+ # ู„ูˆ ุงู„ู…ูˆุณูŠู‚ู‰ ุฃู‚ุตุฑ ู…ู† ุงู„ูƒู„ูŠุจ โ†’ ู„ูˆู‘ุจ
100
+ if music.duration < clip_duration:
101
+ loops = int(clip_duration / music.duration) + 1
102
+ music = mpe.concatenate_audioclips([music] * loops)
103
+ logger.info(f"๐Ÿ” Music looped x{loops}")
104
+
105
+ # ู‚ุต ุงู„ู…ูˆุณูŠู‚ู‰ ุจู†ูุณ ุทูˆู„ ุงู„ูƒู„ูŠุจ
106
+ music = music.subclip(0, clip_duration).volumex(bg_music_volume)
107
+
108
+ original_audio = video_clip.audio
109
+
110
+ # ู„ูˆ ู…ููŠุด ุตูˆุช ุฃุตู„ูŠ โ†’ ุฎู„ูŠ ุงู„ู…ูˆุณูŠู‚ู‰ ุจุณ
111
+ if original_audio is None:
112
+ logger.info("โš ๏ธ No original audio โ€” using music only")
113
+ return video_clip.set_audio(music)
114
+
115
+ # ุฎู„ุท ุงู„ุตูˆุชูŠู†
116
+ mixed = mpe.CompositeAudioClip([
117
+ original_audio.volumex(original_volume),
118
+ music,
119
+ ])
120
+ logger.info("โœ… Audio mixed successfully")
121
+ return video_clip.set_audio(mixed)
122
+
123
  # โ”€โ”€ JSON helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
124
 
125
  def _clean_json_response(self, content):
 
215
 
216
  data = {
217
  "segments": full_segments,
218
+ "full_text": full_text,
219
  "detected_language": detected_lang,
220
  "target_language": target_language,
221
  "duration": duration,
 
299
  Cuts, styles, captions, and exports each viral clip.
300
 
301
  โœ… Returns: (output_files, transcripts_per_clip)
302
+ output_files : list of str โ€” paths to rendered .mp4 files
303
  transcripts_per_clip : list of dicts, one per successfully rendered clip:
304
  {
305
+ "clip_index" : int,
306
+ "filename" : str,
307
+ "start" : float,
308
+ "end" : float,
309
+ "language" : str,
310
+ "segments" : [ ... ],
311
+ "full_text" : str,
312
  }
313
  """
314
  logger.info("๐ŸŽจ Phase 3: Style & Captions โ€ฆ")
 
335
 
336
  # โ”€โ”€ Main loop โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
337
  output_files = []
338
+ transcripts_per_clip = []
339
 
340
  if not best_clips:
341
  logger.warning("โš ๏ธ No clips to process.")
 
371
  logger.info(f"\n๐ŸŽฌ Clip {i+1}/{len(best_clips)} ({start:.2f}s โ€“ {end:.2f}s)")
372
 
373
  # โ”€โ”€ Output path โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
374
+ task_id = kwargs.get("task_id")
375
+ prefix = f"viral_{task_id}_{i+1}" if task_id else f"viral_{i+1}"
376
+ out_name = f"{prefix}_{style_str}.mp4"
377
  final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
378
  os.makedirs(os.path.dirname(final_output), exist_ok=True)
379
 
 
421
  playground_path = kwargs.get("playground_path"),
422
  )
423
 
424
+ # โœ… Mix background music โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
425
+ final_clip = self.mix_audio(
426
+ final_clip,
427
+ audio_path = kwargs.get("audio_path"),
428
+ bg_music_volume = kwargs.get("bg_music_volume", 0.1),
429
+ original_volume = 1.0,
430
+ )
431
+
432
  # โ”€โ”€ Export โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
433
  cpu_count = os.cpu_count() or 4
434
  logger.info(f"โš™๏ธ Rendering with {cpu_count} thread(s) โ€ฆ")
 
444
  output_files.append(final_output)
445
  logger.info(f"โœ… Saved: {final_output}")
446
 
447
+ # โ”€โ”€ Build transcript entry โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
448
  clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
449
  transcripts_per_clip.append({
450
  "clip_index": i + 1,
 
474
  pass
475
  gc.collect()
476
 
477
+ return output_files, transcripts_per_clip
478
 
479
 
480
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
485
  """
486
  End-to-end pipeline: STT โ†’ AI analysis โ†’ clip export.
487
 
488
+ โœ… Returns a dict with:
489
  {
490
+ "output_files" : list[str],
491
+ "transcripts" : list[dict],
492
+ "viral_segments" : list[dict],
493
+ "full_transcript": str,
494
+ "duration" : float,
495
  }
496
 
497
  Important kwargs:
498
  source_language : language of the original video โ†’ passed to Whisper.
 
499
  language : desired output language (translation + captions).
 
500
  caption_mode : sentence | word | highlight_word
501
  caption_style : classic | modern_glow | tiktok_bold | โ€ฆ
502
+ audio_path : path to background music file
503
+ bg_music_volume : background music volume (0.0 โ†’ 1.0)
504
  """
505
  try:
506
+ processor = VideoProcessor(model_size=model_size)
 
507
  caption_mode = kwargs.get("caption_mode", "sentence")
508
 
 
509
  timestamp_mode = (
510
  "words"
511
  if caption_mode in ("word", "highlight_word")
512
  else "segments"
513
  )
514
 
 
515
  viral_segments, duration, stt_data = processor.analyze_impact(
516
  video_path,
517
  source_language = kwargs.get("source_language"),
 
531
 
532
  best_clips = processor.get_best_segments(viral_segments, duration)
533
 
 
534
  output_files, transcripts = processor.process_clips(
535
  video_path,
536
  best_clips,
 
564
  if len(sys.argv) > 1:
565
  result = process_video(sys.argv[1])
566
  print(json.dumps({
567
+ "clips": result["output_files"],
568
  "full_transcript": result["full_transcript"],
569
  "clip_transcripts": [
570
  {"clip": t["clip_index"], "text": t["full_text"]}