Spaces:

ex510
/

auto_cliper

Sleeping

App Files Files Community

ex510 committed on Feb 28

Commit

fb3f56f

verified ·

1 Parent(s): 6ea5d49

Update processor.py

Browse files

Files changed (1) hide show

processor.py +85 -47

processor.py CHANGED Viewed

@@ -17,6 +17,8 @@ Fixes applied:
              { clip_index, start, end, segments, full_text }
   - ✅ NEW: process_video returns a dict with keys:
              output_files, transcripts, viral_segments, duration
 """
 import os
 import gc
@@ -31,13 +33,9 @@ from core.logger import Logger
 from core.stt import STT, SubtitleSegmenter
 from core.analyze import analyze_transcript
 from core.styles import StyleFactory
-from core.subtitle_manager import SubtitleManager
 logger = Logger.get_logger(__name__)
-# Max chars per line — must match SubtitleSegmenter constant
-_MAX_CHARS_PER_LINE = 42
 def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float) -> list:
     """
@@ -52,22 +50,13 @@ def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: flo
     total_chars = sum(len(w) for w in words)
     seg_dur     = seg_end - seg_start
-    result  = []
-    cursor  = seg_start
     for i, w in enumerate(words):
-        if total_chars > 0:
-            fraction = len(w) / total_chars
-        else:
-            fraction = 1.0 / len(words)
-        w_dur = seg_dur * fraction
-        w_end = cursor + w_dur
-        # Clamp last word to seg_end to avoid float drift
-        if i == len(words) - 1:
-            w_end = seg_end
         result.append({
             "text":  w,
@@ -86,6 +75,51 @@ class VideoProcessor:
         self.stt = STT(model_size=model_size)
         Config.setup_dirs()
     # ── JSON helpers ──────────────────────────────────────────────────────────
     def _clean_json_response(self, content):
@@ -181,7 +215,7 @@ class VideoProcessor:
         data = {
             "segments":          full_segments,
-            "full_text":         full_text,           # ✅ NEW: store full transcript text
             "detected_language": detected_lang,
             "target_language":   target_language,
             "duration":          duration,
@@ -265,16 +299,16 @@ class VideoProcessor:
         Cuts, styles, captions, and exports each viral clip.
         ✅ Returns: (output_files, transcripts_per_clip)
-            output_files      : list of str — paths to rendered .mp4 files
             transcripts_per_clip : list of dicts, one per successfully rendered clip:
                 {
-                    "clip_index"  : int,           # 1-based
-                    "filename"    : str,            # output filename (basename)
-                    "start"       : float,          # clip start in original video (s)
-                    "end"         : float,          # clip end in original video (s)
-                    "language"    : str,            # detected/caption language
-                    "segments"    : [ ... ],        # STT segments relative to clip start
-                    "full_text"   : str,            # concatenated text of all segments
                 }
         """
         logger.info("🎨 Phase 3: Style & Captions …")
@@ -301,7 +335,7 @@ class VideoProcessor:
         # ── Main loop ─────────────────────────────────────────────────────────
         output_files         = []
-        transcripts_per_clip = []   # ✅ NEW
         if not best_clips:
             logger.warning("⚠️ No clips to process.")
@@ -337,9 +371,9 @@ class VideoProcessor:
                 logger.info(f"\n🎬 Clip {i+1}/{len(best_clips)} ({start:.2f}s – {end:.2f}s)")
                 # ── Output path ───────────────────────────────────────────────
-                task_id  = kwargs.get("task_id")
-                prefix   = f"viral_{task_id}_{i+1}" if task_id else f"viral_{i+1}"
-                out_name = f"{prefix}_{style_str}.mp4"
                 final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
                 os.makedirs(os.path.dirname(final_output), exist_ok=True)
@@ -387,6 +421,14 @@ class VideoProcessor:
                     playground_path = kwargs.get("playground_path"),
                 )
                 # ── Export ────────────────────────────────────────────────────
                 cpu_count = os.cpu_count() or 4
                 logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
@@ -402,7 +444,7 @@ class VideoProcessor:
                 output_files.append(final_output)
                 logger.info(f"✅ Saved: {final_output}")
-                # ✅ NEW: Build transcript entry for this clip
                 clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
                 transcripts_per_clip.append({
                     "clip_index": i + 1,
@@ -432,7 +474,7 @@ class VideoProcessor:
                             pass
                 gc.collect()
-        return output_files, transcripts_per_clip   # ✅ tuple now
 # ─────────────────────────────────────────────────────────────────────────────
@@ -443,36 +485,33 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
     """
     End-to-end pipeline: STT → AI analysis → clip export.
-    ✅ Returns a dict (instead of a plain list) with:
         {
-            "output_files"   : list[str],   # paths to rendered clips
-            "transcripts"    : list[dict],  # per-clip transcripts (see process_clips)
-            "viral_segments" : list[dict],  # raw AI viral segment detections
-            "full_transcript": str,         # full video transcript text
-            "duration"       : float,       # video duration in seconds
         }
     Important kwargs:
         source_language : language of the original video → passed to Whisper.
-                          If not set → Whisper auto-detects.
         language        : desired output language (translation + captions).
-                          If same as source → no translation.
         caption_mode    : sentence | word | highlight_word
         caption_style   : classic | modern_glow | tiktok_bold | …
     """
     try:
-        processor = VideoProcessor(model_size=model_size)
         caption_mode = kwargs.get("caption_mode", "sentence")
-        # highlight_word and word modes both need word-level timestamps
         timestamp_mode = (
             "words"
             if caption_mode in ("word", "highlight_word")
             else "segments"
         )
-        # Phase 1 + 2: STT + AI analysis
         viral_segments, duration, stt_data = processor.analyze_impact(
             video_path,
             source_language = kwargs.get("source_language"),
@@ -492,7 +531,6 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
         best_clips = processor.get_best_segments(viral_segments, duration)
-        # Phase 3: render
         output_files, transcripts = processor.process_clips(
             video_path,
             best_clips,
@@ -526,7 +564,7 @@ if __name__ == "__main__":
     if len(sys.argv) > 1:
         result = process_video(sys.argv[1])
         print(json.dumps({
-            "clips":          result["output_files"],
             "full_transcript": result["full_transcript"],
             "clip_transcripts": [
                 {"clip": t["clip_index"], "text": t["full_text"]}

              { clip_index, start, end, segments, full_text }
   - ✅ NEW: process_video returns a dict with keys:
              output_files, transcripts, viral_segments, duration
+  - ✅ NEW: mix_audio method added to VideoProcessor
+             blends background music with original video audio
 """
 import os
 import gc
 from core.stt import STT, SubtitleSegmenter
 from core.analyze import analyze_transcript
 from core.styles import StyleFactory
 logger = Logger.get_logger(__name__)
 def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float) -> list:
     """
     total_chars = sum(len(w) for w in words)
     seg_dur     = seg_end - seg_start
+    result      = []
+    cursor      = seg_start
     for i, w in enumerate(words):
+        fraction = (len(w) / total_chars) if total_chars > 0 else (1.0 / len(words))
+        w_dur    = seg_dur * fraction
+        w_end    = seg_end if i == len(words) - 1 else cursor + w_dur
         result.append({
             "text":  w,
         self.stt = STT(model_size=model_size)
         Config.setup_dirs()
+    # ── Audio Mixing ──────────────────────────────────────────────────────────
+    def mix_audio(self, video_clip, audio_path=None, bg_music_volume=0.1, original_volume=1.0):
+        """
+        Blends background music with the original video audio.
+        video_clip       : MoviePy VideoFileClip or CompositeVideoClip
+        audio_path       : path to music file (mp3/m4a/...) — None = skip
+        bg_music_volume  : background music level  (0.0 → 1.0; not validated here)
+        original_volume  : original video audio level (0.0 → 1.0; not validated here)
+        Returns: video_clip with mixed audio. The input clip is returned
+                 unchanged when audio_path is empty or does not exist on disk.
+                 When the clip has no audio track, the music becomes its only audio.
+        """
+        # Guard: nothing to mix when no path was given or the file is missing.
+        if not audio_path or not os.path.exists(audio_path):
+            return video_clip
+        clip_duration = video_clip.duration
+        logger.info(f"🎵 Mixing audio: {audio_path} | vol={bg_music_volume}")
+        # NOTE(review): `mpe` is presumably `moviepy.editor`, imported elsewhere
+        # in the file — the import is not visible in this view; confirm.
+        music = mpe.AudioFileClip(audio_path)
+        # If the music is shorter than the clip → loop it.
+        # NOTE(review): a music file reporting duration 0 would raise
+        # ZeroDivisionError in the division below — confirm whether that can occur.
+        if music.duration < clip_duration:
+            # The "+ 1" rounds up so the concatenated music is at least as long
+            # as the clip; the excess is trimmed right after.
+            loops = int(clip_duration / music.duration) + 1
+            music = mpe.concatenate_audioclips([music] * loops)
+            logger.info(f"🔁 Music looped x{loops}")
+        # Trim the music to the clip's length, then apply the music volume.
+        music = music.subclip(0, clip_duration).volumex(bg_music_volume)
+        original_audio = video_clip.audio
+        # If there is no original audio → use the music only.
+        if original_audio is None:
+            logger.info("⚠️ No original audio — using music only")
+            return video_clip.set_audio(music)
+        # Mix the two audio tracks (original audio scaled by original_volume).
+        mixed = mpe.CompositeAudioClip([
+            original_audio.volumex(original_volume),
+            music,
+        ])
+        logger.info("✅ Audio mixed successfully")
+        return video_clip.set_audio(mixed)
     # ── JSON helpers ──────────────────────────────────────────────────────────
     def _clean_json_response(self, content):
         data = {
             "segments":          full_segments,
+            "full_text":         full_text,
             "detected_language": detected_lang,
             "target_language":   target_language,
             "duration":          duration,
         Cuts, styles, captions, and exports each viral clip.
         ✅ Returns: (output_files, transcripts_per_clip)
+            output_files         : list of str — paths to rendered .mp4 files
             transcripts_per_clip : list of dicts, one per successfully rendered clip:
                 {
+                    "clip_index" : int,
+                    "filename"   : str,
+                    "start"      : float,
+                    "end"        : float,
+                    "language"   : str,
+                    "segments"   : [ ... ],
+                    "full_text"  : str,
                 }
         """
         logger.info("🎨 Phase 3: Style & Captions …")
         # ── Main loop ─────────────────────────────────────────────────────────
         output_files         = []
+        transcripts_per_clip = []
         if not best_clips:
             logger.warning("⚠️ No clips to process.")
                 logger.info(f"\n🎬 Clip {i+1}/{len(best_clips)} ({start:.2f}s – {end:.2f}s)")
                 # ── Output path ───────────────────────────────────────────────
+                task_id      = kwargs.get("task_id")
+                prefix       = f"viral_{task_id}_{i+1}" if task_id else f"viral_{i+1}"
+                out_name     = f"{prefix}_{style_str}.mp4"
                 final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
                 os.makedirs(os.path.dirname(final_output), exist_ok=True)
                     playground_path = kwargs.get("playground_path"),
                 )
+                # ✅ Mix background music ──────────────────────────────────────
+                final_clip = self.mix_audio(
+                    final_clip,
+                    audio_path      = kwargs.get("audio_path"),
+                    bg_music_volume = kwargs.get("bg_music_volume", 0.1),
+                    original_volume = 1.0,
+                )
                 # ── Export ────────────────────────────────────────────────────
                 cpu_count = os.cpu_count() or 4
                 logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
                 output_files.append(final_output)
                 logger.info(f"✅ Saved: {final_output}")
+                # ── Build transcript entry ────────────────────────────────────
                 clip_full_text = " ".join(s.get("text", "") for s in clip_segments).strip()
                 transcripts_per_clip.append({
                     "clip_index": i + 1,
                             pass
                 gc.collect()
+        return output_files, transcripts_per_clip
 # ─────────────────────────────────────────────────────────────────────────────
     """
     End-to-end pipeline: STT → AI analysis → clip export.
+    ✅ Returns a dict with:
         {
+            "output_files"   : list[str],
+            "transcripts"    : list[dict],
+            "viral_segments" : list[dict],
+            "full_transcript": str,
+            "duration"       : float,
         }
     Important kwargs:
         source_language : language of the original video → passed to Whisper.
         language        : desired output language (translation + captions).
         caption_mode    : sentence | word | highlight_word
         caption_style   : classic | modern_glow | tiktok_bold | …
+        audio_path      : path to background music file
+        bg_music_volume : background music volume (0.0 → 1.0)
     """
     try:
+        processor    = VideoProcessor(model_size=model_size)
         caption_mode = kwargs.get("caption_mode", "sentence")
         timestamp_mode = (
             "words"
             if caption_mode in ("word", "highlight_word")
             else "segments"
         )
         viral_segments, duration, stt_data = processor.analyze_impact(
             video_path,
             source_language = kwargs.get("source_language"),
         best_clips = processor.get_best_segments(viral_segments, duration)
         output_files, transcripts = processor.process_clips(
             video_path,
             best_clips,
     if len(sys.argv) > 1:
         result = process_video(sys.argv[1])
         print(json.dumps({
+            "clips":           result["output_files"],
             "full_transcript": result["full_transcript"],
             "clip_transcripts": [
                 {"clip": t["clip_index"], "text": t["full_text"]}