Add song generation backend: Demucs vocal separation + voice synthesis + audio mixing
e049981
"""Vocal separation using Demucs model."""
import torch
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
from typing import Tuple
import sys
try:
    from demucs.pretrained import get_model
    from demucs.apply import apply_model
    DEMUCS_AVAILABLE = True
except ImportError:
    DEMUCS_AVAILABLE = False
    print("[Warning] Demucs not available. Song conversion will not work.")
class VocalSeparator:
    """Separates vocals from instrumental music using Demucs."""

    def __init__(self, model_name: str = "htdemucs", device: str = None):
        """
        Initialize vocal separator.

        Args:
            model_name: Demucs model to use ('htdemucs', 'mdx_extra', etc.)
            device: 'cuda' or 'cpu'. Auto-detects if None.
        """
        if not DEMUCS_AVAILABLE:
            raise RuntimeError("Demucs not installed. Install with: pip install demucs")
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"[VocalSeparator] Loading {model_name} on {self.device}...")
        self.model = get_model(model_name)
        self.model = self.model.to(self.device)
        self.model.eval()
        print("[VocalSeparator] Model loaded successfully")

    def separate(self, audio_path: Path, sr: int = 16000) -> Tuple[np.ndarray, np.ndarray]:
        """
        Separate vocals and instrumental from an audio file.

        Args:
            audio_path: Path to audio file
            sr: Output sample rate (default 16000)

        Returns:
            Tuple of (vocals, instrumental) as mono numpy arrays at `sr`
        """
        if isinstance(audio_path, str):
            audio_path = Path(audio_path)
        print(f"[VocalSeparator] Loading audio: {audio_path}")

        # Demucs runs at its own sample rate (44100 Hz for htdemucs), so load the
        # audio at that rate and resample the separated stems to `sr` afterwards.
        model_sr = self.model.samplerate
        wav, _ = librosa.load(str(audio_path), sr=model_sr, mono=True)
        print(f"[VocalSeparator] Audio loaded: {len(wav)} samples at {model_sr}Hz")

        # Demucs expects shape [batch, channels, samples] with stereo input;
        # duplicate the mono channel.
        mix = torch.from_numpy(np.stack([wav, wav])).float().unsqueeze(0)

        print("[VocalSeparator] Separating vocals and instrumental...")
        sys.stdout.flush()

        # Perform separation. apply_model returns [batch, sources, channels, samples],
        # with the source order given by self.model.sources
        # (typically ['drums', 'bass', 'other', 'vocals']).
        with torch.no_grad():
            out = apply_model(self.model, mix, device=self.device)[0]

        # Downmix each stem to mono
        sources = {name: out[i].mean(dim=0).cpu().numpy()
                   for i, name in enumerate(self.model.sources)}
        vocals = sources.get('vocals', np.zeros_like(wav))

        # Combine all non-vocal stems into the instrumental
        instrumental = np.zeros_like(wav)
        for key, stem in sources.items():
            if key != 'vocals':
                instrumental += stem

        # Resample stems to the requested output rate
        if model_sr != sr:
            vocals = librosa.resample(vocals, orig_sr=model_sr, target_sr=sr)
            instrumental = librosa.resample(instrumental, orig_sr=model_sr, target_sr=sr)

        print("[VocalSeparator] Separation complete")
        print(f"[VocalSeparator] Vocals shape: {vocals.shape}")
        print(f"[VocalSeparator] Instrumental shape: {instrumental.shape}")
        return vocals, instrumental

    def separate_and_save(self, audio_path: Path, output_dir: Path, sr: int = 16000) -> Tuple[Path, Path]:
        """
        Separate vocals and save to files.

        Args:
            audio_path: Input audio file
            output_dir: Directory to save separated audio
            sr: Sample rate

        Returns:
            Tuple of (vocals_path, instrumental_path)
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        vocals, instrumental = self.separate(audio_path, sr)

        vocals_path = output_dir / "vocals.wav"
        instrumental_path = output_dir / "instrumental.wav"

        print(f"[VocalSeparator] Saving vocals to {vocals_path}")
        sf.write(vocals_path, vocals, sr)
        print(f"[VocalSeparator] Saving instrumental to {instrumental_path}")
        sf.write(instrumental_path, instrumental, sr)

        return vocals_path, instrumental_path
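
A minimal usage sketch (not part of the committed file) of how the song-conversion pipeline might drive this class. The input path, output directory, and 16 kHz rate are illustrative placeholders, and the snippet assumes VocalSeparator is importable from wherever this module lives in the backend:

# Illustrative only: values below are assumptions, not taken from this commit.
from pathlib import Path

separator = VocalSeparator(model_name="htdemucs")   # auto-selects cuda/cpu
vocals_path, instrumental_path = separator.separate_and_save(
    audio_path=Path("song.mp3"),     # placeholder input file
    output_dir=Path("separated"),    # placeholder output directory
    sr=16000,                        # assumed rate for the voice-synthesis stage
)
# vocals.wav would feed the voice-synthesis step; instrumental.wav is kept
# for the final audio mix described in the commit message.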