Spaces:

ex510
/

auto_cliper

Sleeping

App Files Files Community

ex510 committed on Mar 2

Commit

6ad2031

verified ·

1 Parent(s): c6c14f2

Update processor.py

Browse files

Files changed (1) hide show

processor.py +165 -23

processor.py CHANGED Viewed

@@ -17,12 +17,17 @@ Fixes applied:
              { clip_index, start, end, segments, full_text }
   - ✅ NEW: process_video returns a dict with keys:
              output_files, transcripts, viral_segments, duration
-  - ✅ NEW: mix_audio method added to VideoProcessor
-             blends background music with original video audio
 """
 import os
 import gc
 import json
 import traceback
 import moviepy.editor as mpe
 import json_repair
@@ -75,49 +80,152 @@ class VideoProcessor:
         self.stt = STT(model_size=model_size)
         Config.setup_dirs()
-    # ── Audio Mixing ──────────────────────────────────────────────────────────
     def mix_audio(self, video_clip, audio_path=None, bg_music_volume=0.1, original_volume=1.0):
         """
         Overlay a background music track on the clip's existing audio.

         video_clip       : MoviePy VideoFileClip or CompositeVideoClip
         audio_path       : path to music file (mp3/m4a/...) — None = skip
         bg_music_volume  : background music level  (0.0 → 1.0)
         original_volume  : original video audio level (0.0 → 1.0)

         Returns video_clip unchanged when audio_path is empty or does not
         exist on disk; otherwise the result of video_clip.set_audio(...)
         carrying either the music alone (clip had no audio) or the clip
         audio composited with the music.
         """
         music_available = bool(audio_path) and os.path.exists(audio_path)
         if not music_available:
             return video_clip

         target_length = video_clip.duration
         logger.info(f"🎵 Mixing audio: {audio_path} | vol={bg_music_volume}")

         track = mpe.AudioFileClip(audio_path)

         # Music shorter than the clip → repeat it until it covers the clip.
         if track.duration < target_length:
             repeat = int(target_length / track.duration) + 1
             track = mpe.concatenate_audioclips([track] * repeat)
             logger.info(f"🔁 Music looped x{repeat}")

         # Trim the music to the clip length, then apply the music level.
         track = track.subclip(0, target_length)
         track = track.volumex(bg_music_volume)

         source_audio = video_clip.audio

         # No original audio → the music becomes the only audio track.
         if source_audio is None:
             logger.info("⚠️ No original audio — using music only")
             return video_clip.set_audio(track)

         # Blend the two tracks: scaled original audio plus the music.
         layers = [source_audio.volumex(original_volume), track]
         blended = mpe.CompositeAudioClip(layers)
         logger.info("✅ Audio mixed successfully")
         return video_clip.set_audio(blended)
     # ── JSON helpers ──────────────────────────────────────────────────────────
@@ -298,6 +406,13 @@ class VideoProcessor:
         """
         Cuts, styles, captions, and exports each viral clip.
         ✅ Returns: (output_files, transcripts_per_clip)
             output_files         : list of str — paths to rendered .mp4 files
             transcripts_per_clip : list of dicts, one per successfully rendered clip:
@@ -333,6 +448,10 @@ class VideoProcessor:
         if "." in style_str:
             style_str = style_str.split(".")[-1]
         # ── Main loop ─────────────────────────────────────────────────────────
         output_files         = []
         transcripts_per_clip = []
@@ -421,15 +540,9 @@ class VideoProcessor:
                     playground_path = kwargs.get("playground_path"),
                 )
-                # ✅ Mix background music ──────────────────────────────────────
-                final_clip = self.mix_audio(
-                    final_clip,
-                    audio_path      = kwargs.get("audio_path"),
-                    bg_music_volume = kwargs.get("bg_music_volume", 0.1),
-                    original_volume = 1.0,
-                )
-                # ── Export ────────────────────────────────────────────────────
                 cpu_count = os.cpu_count() or 4
                 logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
@@ -441,6 +554,37 @@ class VideoProcessor:
                     logger      = None,
                 )
                 output_files.append(final_output)
                 logger.info(f"✅ Saved: {final_output}")
@@ -484,7 +628,6 @@ class VideoProcessor:
 def process_video(video_path, style="cinematic_blur", model_size="base", **kwargs):
     """
     End-to-end pipeline: STT → AI analysis → clip export.
     ✅ Returns a dict with:
         {
             "output_files"   : list[str],
@@ -493,7 +636,6 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
             "full_transcript": str,
             "duration"       : float,
         }
     Important kwargs:
         source_language : language of the original video → passed to Whisper.
         language        : desired output language (translation + captions).

              { clip_index, start, end, segments, full_text }
   - ✅ NEW: process_video returns a dict with keys:
              output_files, transcripts, viral_segments, duration
+  - ✅ NEW: mix_audio method — simple MoviePy blend (fallback / no-audio-path case)
+  - ✅ NEW: _apply_ducking_ffmpeg — FFmpeg sidechaincompress ducking (production)
+             Called as a post-process step after write_videofile to avoid
+             double-encoding. Falls back to simple mix_audio on FFmpeg failure.
 """
 import os
 import gc
 import json
+import shutil
+import subprocess
+import tempfile
 import traceback
 import moviepy.editor as mpe
 import json_repair
         self.stt = STT(model_size=model_size)
         Config.setup_dirs()
+    # ── Audio: FFmpeg Ducking (Production) ────────────────────────────────────
+    def _apply_ducking_ffmpeg(
+        self,
+        video_path: str,
+        audio_path: str,
+        bg_music_volume: float = 0.1,
+    ) -> bool:
+        """
+        ✅ Production-grade audio ducking via FFmpeg sidechaincompress.
+        Works as a POST-PROCESS step on an already-rendered .mp4 file,
+        so there is NO double-encoding of the video stream (codec=copy).
+        Ducking parameters (tuned for speech-over-music):
+          threshold : 0.02  → ducking kicks in when speech RMS > ~-34 dBFS
+          ratio     : 4     → music reduced to 1/4 of its level under speech
+          attack    : 200ms → smooth fade-down when speech starts
+          release   : 1000ms→ smooth fade-up when speech ends
+        Returns True on success, False on any FFmpeg error (caller falls back).
+        """
+        if not audio_path or not os.path.exists(audio_path):
+            return False
+        tmp_output = tempfile.mktemp(suffix=".mp4")
+        try:
+            logger.info(f"🎚️ FFmpeg ducking: {os.path.basename(audio_path)} | vol={bg_music_volume}")
+            # ── Build filter_complex ─────────────────────────────────────────
+            # [0:a] = original speech  (from rendered video)
+            # [1:a] = background music (from audio_path)
+            #
+            # Step 1 – split original audio: one copy for sidechain detection,
+            #          one copy for the final mix.
+            # Step 2 – apply volume to music.
+            # Step 3 – sidechaincompress: music ducks when speech is loud.
+            # Step 4 – amix: blend original speech + ducked music.
+            filter_complex = (
+                "[0:a]asplit=2[speech_sc][speech_mix];"
+                f"[1:a]volume={bg_music_volume},"
+                f"afade=t=in:ss=0:d=1.5,"
+                f"afade=t=out:st={{fade_start}}:d=2.0[music_in];"
+                "[music_in][speech_sc]"
+                "sidechaincompress="
+                "threshold=0.02:ratio=4:attack=200:release=1000"
+                "[music_ducked];"
+                "[speech_mix][music_ducked]amix=inputs=2:duration=first[aout]"
+            )
+            # Calculate fade-out start from video duration
+            try:
+                probe = subprocess.run(
+                    [
+                        "ffprobe", "-v", "error",
+                        "-show_entries", "format=duration",
+                        "-of", "default=noprint_wrappers=1:nokey=1",
+                        video_path,
+                    ],
+                    capture_output=True, text=True, check=True,
+                )
+                duration = float(probe.stdout.strip())
+                fade_start = max(0.0, duration - 2.0)
+            except Exception:
+                fade_start = 0.0  # fallback: no fade-out
+            filter_complex = filter_complex.format(fade_start=fade_start)
+            cmd = [
+                "ffmpeg", "-y",
+                "-i", video_path,       # input 0: rendered video (speech)
+                "-i", audio_path,       # input 1: background music
+                "-filter_complex", filter_complex,
+                "-map", "0:v",          # video stream: copy as-is (no re-encode)
+                "-map", "[aout]",       # mixed audio
+                "-c:v", "copy",         # ✅ NO video re-encoding
+                "-c:a", "aac",
+                "-b:a", "192k",
+                tmp_output,
+            ]
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode != 0:
+                logger.error(f"❌ FFmpeg ducking failed:\n{result.stderr[-1000:]}")
+                return False
+            # Replace original file with ducked version
+            shutil.move(tmp_output, video_path)
+            logger.info("✅ FFmpeg ducking applied successfully")
+            return True
+        except FileNotFoundError:
+            logger.error("❌ FFmpeg not found — install ffmpeg and add to PATH")
+            return False
+        except Exception as e:
+            logger.error(f"❌ FFmpeg ducking error: {e}")
+            logger.error(traceback.format_exc())
+            return False
+        finally:
+            if os.path.exists(tmp_output):
+                try:
+                    os.unlink(tmp_output)
+                except Exception:
+                    pass
+    # ── Audio: Simple MoviePy Mix (Fallback) ──────────────────────────────────
     def mix_audio(self, video_clip, audio_path=None, bg_music_volume=0.1, original_volume=1.0):
         """
         Plain MoviePy blend of a music file with the clip's own audio.

         video_clip       : MoviePy VideoFileClip or CompositeVideoClip
         audio_path       : path to music file (mp3/m4a/...) — None = skip
         bg_music_volume  : background music level  (0.0 → 1.0)
         original_volume  : original video audio level (0.0 → 1.0)

         Returns video_clip unchanged when audio_path is empty or missing on
         disk; otherwise the result of video_clip.set_audio(...) holding the
         music alone (clip without audio) or clip audio + music composited.
         """
         if not (audio_path and os.path.exists(audio_path)):
             return video_clip

         wanted_duration = video_clip.duration
         logger.info(f"🎵 Fallback mix: {audio_path} | vol={bg_music_volume}")

         bg_track = mpe.AudioFileClip(audio_path)

         # Repeat the music when it is shorter than the clip so it can be
         # trimmed to the full clip length below.
         if bg_track.duration < wanted_duration:
             copies = int(wanted_duration / bg_track.duration) + 1
             bg_track = mpe.concatenate_audioclips([bg_track] * copies)
             logger.info(f"🔁 Music looped x{copies}")

         trimmed = bg_track.subclip(0, wanted_duration)
         bg_track = trimmed.volumex(bg_music_volume)

         clip_audio = video_clip.audio
         if clip_audio is None:
             # Nothing to blend with: the music is the whole soundtrack.
             logger.info("⚠️ No original audio — using music only")
             return video_clip.set_audio(bg_track)

         scaled_original = clip_audio.volumex(original_volume)
         combined = mpe.CompositeAudioClip([scaled_original, bg_track])
         logger.info("✅ Fallback audio mixed successfully")
         return video_clip.set_audio(combined)
     # ── JSON helpers ──────────────────────────────────────────────────────────
         """
         Cuts, styles, captions, and exports each viral clip.
+        Audio strategy:
+          1. MoviePy renders the styled clip with original audio only.
+          2. _apply_ducking_ffmpeg() applies sidechaincompress as a post-process
+             on the written .mp4 (video stream copied, no re-encode).
+          3. If FFmpeg is unavailable or fails, mix_audio() is called as fallback
+             and the file is re-written with the simple blend.
         ✅ Returns: (output_files, transcripts_per_clip)
             output_files         : list of str — paths to rendered .mp4 files
             transcripts_per_clip : list of dicts, one per successfully rendered clip:
         if "." in style_str:
             style_str = style_str.split(".")[-1]
+        # ── kwargs ────────────────────────────────────────────────────────────
+        audio_path      = kwargs.get("audio_path")
+        bg_music_volume = float(kwargs.get("bg_music_volume", 0.1))
         # ── Main loop ─────────────────────────────────────────────────────────
         output_files         = []
         transcripts_per_clip = []
                     playground_path = kwargs.get("playground_path"),
                 )
+                # ── Step 1: Write clip with original audio only ───────────────
+                # Background music is NOT mixed here — FFmpeg handles it below
+                # as a post-process to avoid double video encoding.
                 cpu_count = os.cpu_count() or 4
                 logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
                     logger      = None,
                 )
+                # ── Step 2: Apply FFmpeg ducking as post-process ──────────────
+                if audio_path:
+                    ducking_ok = self._apply_ducking_ffmpeg(
+                        final_output,
+                        audio_path,
+                        bg_music_volume,
+                    )
+                    if not ducking_ok:
+                        # ── Fallback: MoviePy simple blend ───────────────────
+                        logger.warning("⚠️ Falling back to MoviePy simple audio blend")
+                        fallback_clip  = mpe.VideoFileClip(final_output)
+                        fallback_mixed = self.mix_audio(
+                            fallback_clip,
+                            audio_path      = audio_path,
+                            bg_music_volume = bg_music_volume,
+                            original_volume = 1.0,
+                        )
+                        fallback_mixed.write_videofile(
+                            final_output,
+                            codec       = "libx264",
+                            audio_codec = "aac",
+                            threads     = cpu_count,
+                            logger      = None,
+                        )
+                        try:
+                            fallback_mixed.close()
+                            fallback_clip.close()
+                        except Exception:
+                            pass
                 output_files.append(final_output)
                 logger.info(f"✅ Saved: {final_output}")
 def process_video(video_path, style="cinematic_blur", model_size="base", **kwargs):
     """
     End-to-end pipeline: STT → AI analysis → clip export.
     ✅ Returns a dict with:
         {
             "output_files"   : list[str],
             "full_transcript": str,
             "duration"       : float,
         }
     Important kwargs:
         source_language : language of the original video → passed to Whisper.
         language        : desired output language (translation + captions).