Spaces:
Sleeping
Sleeping
Update processor.py
Browse files- processor.py +165 -23
processor.py
CHANGED
|
@@ -17,12 +17,17 @@ Fixes applied:
|
|
| 17 |
{ clip_index, start, end, segments, full_text }
|
| 18 |
- ✅ NEW: process_video returns a dict with keys:
|
| 19 |
output_files, transcripts, viral_segments, duration
|
| 20 |
-
- ✅ NEW: mix_audio method
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
"""
|
| 23 |
import os
|
| 24 |
import gc
|
| 25 |
import json
|
|
|
|
|
|
|
|
|
|
| 26 |
import traceback
|
| 27 |
import moviepy.editor as mpe
|
| 28 |
import json_repair
|
|
@@ -75,49 +80,152 @@ class VideoProcessor:
|
|
| 75 |
self.stt = STT(model_size=model_size)
|
| 76 |
Config.setup_dirs()
|
| 77 |
|
| 78 |
-
# ── Audio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
def mix_audio(self, video_clip, audio_path=None, bg_music_volume=0.1, original_volume=1.0):
|
| 81 |
"""
|
| 82 |
-
|
|
|
|
| 83 |
|
| 84 |
video_clip : MoviePy VideoFileClip or CompositeVideoClip
|
| 85 |
audio_path : path to music file (mp3/m4a/...) — None = skip
|
| 86 |
bg_music_volume : background music level (0.0 → 1.0)
|
| 87 |
original_volume : original video audio level (0.0 → 1.0)
|
| 88 |
-
|
| 89 |
-
Returns: video_clip with mixed audio
|
| 90 |
"""
|
| 91 |
if not audio_path or not os.path.exists(audio_path):
|
| 92 |
return video_clip
|
| 93 |
|
| 94 |
clip_duration = video_clip.duration
|
| 95 |
-
logger.info(f"🎵
|
| 96 |
|
| 97 |
music = mpe.AudioFileClip(audio_path)
|
| 98 |
|
| 99 |
-
# لو الموسيقى أقصر من الكليب → لوّب
|
| 100 |
if music.duration < clip_duration:
|
| 101 |
loops = int(clip_duration / music.duration) + 1
|
| 102 |
music = mpe.concatenate_audioclips([music] * loops)
|
| 103 |
logger.info(f"🔁 Music looped x{loops}")
|
| 104 |
|
| 105 |
-
# قص الموسيقى بنفس طول الكليب
|
| 106 |
music = music.subclip(0, clip_duration).volumex(bg_music_volume)
|
| 107 |
|
| 108 |
original_audio = video_clip.audio
|
| 109 |
|
| 110 |
-
# لو مفيش صوت أصلي → خلي الموسيقى بس
|
| 111 |
if original_audio is None:
|
| 112 |
logger.info("⚠️ No original audio — using music only")
|
| 113 |
return video_clip.set_audio(music)
|
| 114 |
|
| 115 |
-
# خلط الصوتين
|
| 116 |
mixed = mpe.CompositeAudioClip([
|
| 117 |
original_audio.volumex(original_volume),
|
| 118 |
music,
|
| 119 |
])
|
| 120 |
-
logger.info("✅
|
| 121 |
return video_clip.set_audio(mixed)
|
| 122 |
|
| 123 |
# ── JSON helpers ──────────────────────────────────────────────────────────
|
|
@@ -298,6 +406,13 @@ class VideoProcessor:
|
|
| 298 |
"""
|
| 299 |
Cuts, styles, captions, and exports each viral clip.
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
✅ Returns: (output_files, transcripts_per_clip)
|
| 302 |
output_files : list of str — paths to rendered .mp4 files
|
| 303 |
transcripts_per_clip : list of dicts, one per successfully rendered clip:
|
|
@@ -333,6 +448,10 @@ class VideoProcessor:
|
|
| 333 |
if "." in style_str:
|
| 334 |
style_str = style_str.split(".")[-1]
|
| 335 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
# ── Main loop ─────────────────────────────────────────────────────────
|
| 337 |
output_files = []
|
| 338 |
transcripts_per_clip = []
|
|
@@ -421,15 +540,9 @@ class VideoProcessor:
|
|
| 421 |
playground_path = kwargs.get("playground_path"),
|
| 422 |
)
|
| 423 |
|
| 424 |
-
#
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
audio_path = kwargs.get("audio_path"),
|
| 428 |
-
bg_music_volume = kwargs.get("bg_music_volume", 0.1),
|
| 429 |
-
original_volume = 1.0,
|
| 430 |
-
)
|
| 431 |
-
|
| 432 |
-
# ── Export ────────────────────────────────────────────────────
|
| 433 |
cpu_count = os.cpu_count() or 4
|
| 434 |
logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
|
| 435 |
|
|
@@ -441,6 +554,37 @@ class VideoProcessor:
|
|
| 441 |
logger = None,
|
| 442 |
)
|
| 443 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
output_files.append(final_output)
|
| 445 |
logger.info(f"✅ Saved: {final_output}")
|
| 446 |
|
|
@@ -484,7 +628,6 @@ class VideoProcessor:
|
|
| 484 |
def process_video(video_path, style="cinematic_blur", model_size="base", **kwargs):
|
| 485 |
"""
|
| 486 |
End-to-end pipeline: STT → AI analysis → clip export.
|
| 487 |
-
|
| 488 |
✅ Returns a dict with:
|
| 489 |
{
|
| 490 |
"output_files" : list[str],
|
|
@@ -493,7 +636,6 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
|
|
| 493 |
"full_transcript": str,
|
| 494 |
"duration" : float,
|
| 495 |
}
|
| 496 |
-
|
| 497 |
Important kwargs:
|
| 498 |
source_language : language of the original video → passed to Whisper.
|
| 499 |
language : desired output language (translation + captions).
|
|
|
|
| 17 |
{ clip_index, start, end, segments, full_text }
|
| 18 |
- ✅ NEW: process_video returns a dict with keys:
|
| 19 |
output_files, transcripts, viral_segments, duration
|
| 20 |
+
- ✅ NEW: mix_audio method — simple MoviePy blend (fallback / no-audio-path case)
|
| 21 |
+
- ✅ NEW: _apply_ducking_ffmpeg — FFmpeg sidechaincompress ducking (production)
|
| 22 |
+
Called as a post-process step after write_videofile to avoid
|
| 23 |
+
double-encoding. Falls back to simple mix_audio on FFmpeg failure.
|
| 24 |
"""
|
| 25 |
import os
|
| 26 |
import gc
|
| 27 |
import json
|
| 28 |
+
import shutil
|
| 29 |
+
import subprocess
|
| 30 |
+
import tempfile
|
| 31 |
import traceback
|
| 32 |
import moviepy.editor as mpe
|
| 33 |
import json_repair
|
|
|
|
| 80 |
self.stt = STT(model_size=model_size)
|
| 81 |
Config.setup_dirs()
|
| 82 |
|
| 83 |
+
# ── Audio: FFmpeg Ducking (Production) ────────────────────────────────────
|
| 84 |
+
|
| 85 |
+
def _apply_ducking_ffmpeg(
    self,
    video_path: str,
    audio_path: str,
    bg_music_volume: float = 0.1,
) -> bool:
    """
    Duck background music under speech using FFmpeg's sidechaincompress.

    Runs as a post-process step on an already-rendered video file: the
    video stream is stream-copied (``-c:v copy``) and only the audio is
    re-encoded (AAC, 192k). On success the file at ``video_path`` is
    replaced by the ducked version.

    Args:
        video_path: Path of the rendered video; its audio track is used
            both as the sidechain signal and as the speech in the mix.
        audio_path: Path of the background-music file.
        bg_music_volume: Gain applied to the music before compression.

    Compressor settings passed to FFmpeg:
        threshold=0.02  (about -34 dBFS on the sidechain signal)
        ratio=4         (4:1 compression of the music above the threshold)
        attack=200      (milliseconds)
        release=1000    (milliseconds)

    Returns:
        True when FFmpeg succeeded and ``video_path`` was replaced.
        False when ``audio_path`` is missing, FFmpeg is not installed,
        or FFmpeg reported an error — the caller is expected to fall back.
    """
    if not audio_path or not os.path.exists(audio_path):
        return False

    # mkstemp creates the file atomically; mktemp only invents a name, which
    # another process could claim before FFmpeg writes to it.
    try:
        tmp_fd, tmp_output = tempfile.mkstemp(suffix=".mp4")
        os.close(tmp_fd)
    except OSError as e:
        logger.error(f"❌ FFmpeg ducking error: {e}")
        return False

    try:
        logger.info(f"🎚️ FFmpeg ducking: {os.path.basename(audio_path)} | vol={bg_music_volume}")

        # The fade-out must start 2 s before the end, so the video duration
        # is needed. None means "unknown" and disables the fade-out.
        fade_start = None
        try:
            probe = subprocess.run(
                [
                    "ffprobe", "-v", "error",
                    "-show_entries", "format=duration",
                    "-of", "default=noprint_wrappers=1:nokey=1",
                    video_path,
                ],
                capture_output=True, text=True, check=True,
            )
            fade_start = max(0.0, float(probe.stdout.strip()) - 2.0)
        except (OSError, subprocess.SubprocessError, ValueError):
            # ffprobe missing, failed, or printed a non-numeric duration.
            fade_start = None

        # Music chain: gain -> fade-in -> (optional) fade-out.
        # A fade-out starting at t=0 would silence the music for the whole
        # clip after 2 s, so the stage is skipped when duration is unknown.
        music_chain = [f"volume={bg_music_volume}", "afade=t=in:ss=0:d=1.5"]
        if fade_start is not None:
            music_chain.append(f"afade=t=out:st={fade_start}:d=2.0")

        # [0:a] = original speech (from rendered video)
        # [1:a] = background music (from audio_path)
        #   1. split the speech: one copy drives the compressor, one is mixed.
        #   2. apply gain/fades to the music.
        #   3. sidechaincompress: music level drops while speech is loud.
        #   4. amix: blend speech with the ducked music; output length
        #      follows the first input (the speech).
        filter_complex = (
            "[0:a]asplit=2[speech_sc][speech_mix];"
            f"[1:a]{','.join(music_chain)}[music_in];"
            "[music_in][speech_sc]"
            "sidechaincompress="
            "threshold=0.02:ratio=4:attack=200:release=1000"
            "[music_ducked];"
            "[speech_mix][music_ducked]amix=inputs=2:duration=first[aout]"
        )

        cmd = [
            "ffmpeg", "-y",
            "-i", video_path,                   # input 0: rendered video (speech)
            "-i", audio_path,                   # input 1: background music
            "-filter_complex", filter_complex,
            "-map", "0:v",                      # keep the video stream
            "-map", "[aout]",                   # mixed audio
            "-c:v", "copy",                     # no video re-encoding
            "-c:a", "aac",
            "-b:a", "192k",
            tmp_output,
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            # Only the tail of stderr is logged; FFmpeg prints the actual
            # error last, after a long banner.
            logger.error(f"❌ FFmpeg ducking failed:\n{result.stderr[-1000:]}")
            return False

        # Replace original file with ducked version
        shutil.move(tmp_output, video_path)
        logger.info("✅ FFmpeg ducking applied successfully")
        return True

    except FileNotFoundError:
        logger.error("❌ FFmpeg not found — install ffmpeg and add to PATH")
        return False
    except Exception as e:
        # Boundary handler: the caller relies on a boolean, never an exception.
        logger.error(f"❌ FFmpeg ducking error: {e}")
        logger.error(traceback.format_exc())
        return False
    finally:
        # Best-effort cleanup; after a successful move the temp file is gone.
        if os.path.exists(tmp_output):
            try:
                os.unlink(tmp_output)
            except OSError:
                pass
|
| 189 |
+
|
| 190 |
+
# ── Audio: Simple MoviePy Mix (Fallback) ──────────────────────────────────
|
| 191 |
|
| 192 |
def mix_audio(self, video_clip, audio_path=None, bg_music_volume=0.1, original_volume=1.0):
    """
    Blend a background-music file into a clip's audio with MoviePy (no ducking).

    Used as the fallback when FFmpeg ducking fails; returns the clip
    untouched when no usable music file is given.

    video_clip      : MoviePy VideoFileClip or CompositeVideoClip
    audio_path      : path to music file (mp3/m4a/...) — None = skip
    bg_music_volume : background music level (0.0 → 1.0)
    original_volume : original video audio level (0.0 → 1.0)

    Returns: video_clip with mixed audio, or the original clip unchanged
             when audio_path is missing or the music has no duration.
    """
    if not audio_path or not os.path.exists(audio_path):
        return video_clip

    clip_duration = video_clip.duration
    logger.info(f"🎵 Fallback mix: {audio_path} | vol={bg_music_volume}")

    music = mpe.AudioFileClip(audio_path)

    # A zero/unknown music duration would make the loop count below divide
    # by zero; there is nothing to mix in that case, so skip the music.
    if not music.duration or music.duration <= 0:
        logger.warning("⚠️ Music file has no audio duration — skipping mix")
        music.close()
        return video_clip

    # Music shorter than the clip: repeat it enough times to cover the clip.
    if music.duration < clip_duration:
        loops = int(clip_duration / music.duration) + 1
        music = mpe.concatenate_audioclips([music] * loops)
        logger.info(f"🔁 Music looped x{loops}")

    # Trim the music to the clip length, then apply the background level.
    music = music.subclip(0, clip_duration).volumex(bg_music_volume)

    original_audio = video_clip.audio

    # No original audio track: the music becomes the only audio.
    if original_audio is None:
        logger.info("⚠️ No original audio — using music only")
        return video_clip.set_audio(music)

    # Overlay the (scaled) original audio with the music.
    mixed = mpe.CompositeAudioClip([
        original_audio.volumex(original_volume),
        music,
    ])
    logger.info("✅ Fallback audio mixed successfully")
    return video_clip.set_audio(mixed)
|
| 230 |
|
| 231 |
# ── JSON helpers ──────────────────────────────────────────────────────────
|
|
|
|
| 406 |
"""
|
| 407 |
Cuts, styles, captions, and exports each viral clip.
|
| 408 |
|
| 409 |
+
Audio strategy:
|
| 410 |
+
1. MoviePy renders the styled clip with original audio only.
|
| 411 |
+
2. _apply_ducking_ffmpeg() applies sidechaincompress as a post-process
|
| 412 |
+
on the written .mp4 (video stream copied, no re-encode).
|
| 413 |
+
3. If FFmpeg is unavailable or fails, mix_audio() is called as fallback
|
| 414 |
+
and the file is re-written with the simple blend.
|
| 415 |
+
|
| 416 |
✅ Returns: (output_files, transcripts_per_clip)
|
| 417 |
output_files : list of str — paths to rendered .mp4 files
|
| 418 |
transcripts_per_clip : list of dicts, one per successfully rendered clip:
|
|
|
|
| 448 |
if "." in style_str:
|
| 449 |
style_str = style_str.split(".")[-1]
|
| 450 |
|
| 451 |
+
# ── kwargs ────────────────────────────────────────────────────────────
|
| 452 |
+
audio_path = kwargs.get("audio_path")
|
| 453 |
+
bg_music_volume = float(kwargs.get("bg_music_volume", 0.1))
|
| 454 |
+
|
| 455 |
# ── Main loop ─────────────────────────────────────────────────────────
|
| 456 |
output_files = []
|
| 457 |
transcripts_per_clip = []
|
|
|
|
| 540 |
playground_path = kwargs.get("playground_path"),
|
| 541 |
)
|
| 542 |
|
| 543 |
+
# ── Step 1: Write clip with original audio only ───────────────
|
| 544 |
+
# Background music is NOT mixed here — FFmpeg handles it below
|
| 545 |
+
# as a post-process to avoid double video encoding.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 546 |
cpu_count = os.cpu_count() or 4
|
| 547 |
logger.info(f"⚙️ Rendering with {cpu_count} thread(s) …")
|
| 548 |
|
|
|
|
| 554 |
logger = None,
|
| 555 |
)
|
| 556 |
|
| 557 |
+
# ── Step 2: Apply FFmpeg ducking as post-process ──────────────
|
| 558 |
+
if audio_path:
|
| 559 |
+
ducking_ok = self._apply_ducking_ffmpeg(
|
| 560 |
+
final_output,
|
| 561 |
+
audio_path,
|
| 562 |
+
bg_music_volume,
|
| 563 |
+
)
|
| 564 |
+
|
| 565 |
+
if not ducking_ok:
|
| 566 |
+
# ── Fallback: MoviePy simple blend ───────────────────
|
| 567 |
+
logger.warning("⚠️ Falling back to MoviePy simple audio blend")
|
| 568 |
+
fallback_clip = mpe.VideoFileClip(final_output)
|
| 569 |
+
fallback_mixed = self.mix_audio(
|
| 570 |
+
fallback_clip,
|
| 571 |
+
audio_path = audio_path,
|
| 572 |
+
bg_music_volume = bg_music_volume,
|
| 573 |
+
original_volume = 1.0,
|
| 574 |
+
)
|
| 575 |
+
fallback_mixed.write_videofile(
|
| 576 |
+
final_output,
|
| 577 |
+
codec = "libx264",
|
| 578 |
+
audio_codec = "aac",
|
| 579 |
+
threads = cpu_count,
|
| 580 |
+
logger = None,
|
| 581 |
+
)
|
| 582 |
+
try:
|
| 583 |
+
fallback_mixed.close()
|
| 584 |
+
fallback_clip.close()
|
| 585 |
+
except Exception:
|
| 586 |
+
pass
|
| 587 |
+
|
| 588 |
output_files.append(final_output)
|
| 589 |
logger.info(f"✅ Saved: {final_output}")
|
| 590 |
|
|
|
|
| 628 |
def process_video(video_path, style="cinematic_blur", model_size="base", **kwargs):
|
| 629 |
"""
|
| 630 |
End-to-end pipeline: STT → AI analysis → clip export.
|
|
|
|
| 631 |
✅ Returns a dict with:
|
| 632 |
{
|
| 633 |
"output_files" : list[str],
|
|
|
|
| 636 |
"full_transcript": str,
|
| 637 |
"duration" : float,
|
| 638 |
}
|
|
|
|
| 639 |
Important kwargs:
|
| 640 |
source_language : language of the original video → passed to Whisper.
|
| 641 |
language : desired output language (translation + captions).
|