video-dubbing-agent / services /vocal_separator.py
dashhdata's picture
Upload folder using huggingface_hub
4ec3855 verified
"""
Stage 2B — Vocal/Background Separation using Demucs
Separates vocals from background music/noise for cleaner dubbing.
Falls back to using raw audio if demucs is not available.
"""
import importlib.util
import logging
import shutil
import subprocess
import sys
from pathlib import Path
logger = logging.getLogger(__name__)
def separate_vocals(audio_path: Path, output_dir: Path) -> dict:
    """
    Use demucs to separate vocals from background audio.

    Demucs is launched as a subprocess. It is run with the current
    interpreter (``sys.executable -m demucs``) when the ``demucs`` package is
    importable, otherwise through a ``demucs`` executable found on PATH.
    Any failure (demucs missing, non-zero exit, timeout, missing output)
    is logged and results in the raw-audio fallback instead of an exception.

    Args:
        audio_path: Audio file to separate.
        output_dir: Working directory. Stems are written beneath
            ``output_dir / "separated"``, which is created if needed
            (including missing parent directories).

    Returns:
        dict with keys:
            'vocals': path of the vocal stem, or ``audio_path`` on fallback.
            'background': path of the non-vocal stem, or None when it is
                not available.
            'separated': True only when demucs produced a vocal stem.
    """
    audio_path = Path(audio_path)
    output_dir = Path(output_dir)
    # Used both on the command line and in the output directory layout.
    model_name = "htdemucs"

    vocals_dir = output_dir / "separated"
    # parents=True: output_dir itself may not have been created yet.
    vocals_dir.mkdir(parents=True, exist_ok=True)

    # Check if demucs is available
    demucs_cmd = _demucs_command()
    if demucs_cmd is None:
        logger.warning("Demucs not found. Using raw audio without separation.")
        return _fallback_no_separation(audio_path, output_dir)

    try:
        cmd = [
            *demucs_cmd,
            "--two-stems", "vocals",  # Only separate vocals vs rest
            "-n", model_name,         # Best free model
            "-o", str(vocals_dir),
            "--mp3",                  # Smaller output
            str(audio_path),
        ]
        logger.info("Running demucs vocal separation (this takes a while for long audio)...")
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=3600,  # 1 hour timeout for long videos
        )
        if result.returncode != 0:
            logger.warning(
                "Demucs failed: %s. Falling back to raw audio.", result.stderr
            )
            return _fallback_no_separation(audio_path, output_dir)

        # Demucs outputs to: separated/<model>/<filename>/vocals.<ext> and
        # no_vocals.<ext>. mp3 is what was requested; wav is checked as well
        # in case the mp3 files are not there.
        demucs_out = vocals_dir / model_name / audio_path.stem
        for ext in ("mp3", "wav"):
            vocals_path = demucs_out / f"vocals.{ext}"
            if vocals_path.is_file():
                break
        else:
            logger.warning("Demucs output not found. Falling back.")
            return _fallback_no_separation(audio_path, output_dir)

        background_path = demucs_out / f"no_vocals.{ext}"
        if not background_path.is_file():
            # Never hand callers a path that does not exist; None is the same
            # "no background track" value the fallback result uses.
            logger.warning(
                "Demucs background stem not found at %s; continuing without it.",
                background_path,
            )
            background_path = None

        logger.info(
            "Vocal separation complete: vocals=%s, bg=%s",
            vocals_path,
            background_path,
        )
        return {
            "vocals": vocals_path,
            "background": background_path,
            "separated": True,
        }
    except subprocess.TimeoutExpired:
        logger.warning("Demucs timed out. Falling back to raw audio.")
        return _fallback_no_separation(audio_path, output_dir)
    except Exception as e:
        # Deliberately broad: separation is best-effort and must never abort
        # the caller; every failure degrades to the raw audio.
        logger.warning("Demucs error: %s. Falling back to raw audio.", e)
        return _fallback_no_separation(audio_path, output_dir)


def _demucs_command() -> "list[str] | None":
    """Return the argv prefix that launches demucs, or None if it is not installed."""
    try:
        importable = importlib.util.find_spec("demucs") is not None
    except (ImportError, ValueError):
        # find_spec raises ValueError when an already-imported module has no spec.
        importable = False
    if importable and sys.executable:
        # Run with the interpreter that actually has the package installed,
        # rather than whichever "python" happens to be first on PATH.
        return [sys.executable, "-m", "demucs"]

    cli_path = shutil.which("demucs")
    if cli_path:
        return [cli_path]
    return None
def _fallback_no_separation(audio_path: Path, output_dir: Path) -> dict:
    """
    Build the "no separation" result.

    The untouched input audio stands in for the vocal track and there is no
    background track. ``output_dir`` is accepted but not used here.
    """
    logger.info("Using raw audio without vocal separation.")
    unseparated = dict(vocals=audio_path, background=None, separated=False)
    return unseparated