Add song generation backend: Demucs vocal separation + voice synthesis + audio mixing
e049981
"""Vocal separation using Demucs model."""
import torch
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
from typing import Tuple
import sys
try:
    from demucs.pretrained import get_model
    from demucs.apply import apply_model
    DEMUCS_AVAILABLE = True
except ImportError:
    DEMUCS_AVAILABLE = False
    print("[Warning] Demucs not available. Song conversion will not work.")
class VocalSeparator:
    """Separates vocals from instrumental music using Demucs."""

    def __init__(self, model_name: str = "htdemucs", device: str = None):
        """
        Initialize vocal separator.

        Args:
            model_name: Demucs model to use ('htdemucs', 'mdx_extra', etc.)
            device: 'cuda' or 'cpu'. Auto-detects if None.
        """
        if not DEMUCS_AVAILABLE:
            raise RuntimeError("Demucs not installed. Install with: pip install demucs")
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"[VocalSeparator] Loading {model_name} on {self.device}...")
        self.model = get_model(model_name)
        self.model = self.model.to(self.device)
        self.model.eval()
        print("[VocalSeparator] Model loaded successfully")

    def separate(self, audio_path: Path, sr: int = 16000) -> Tuple[np.ndarray, np.ndarray]:
        """
        Separate vocals and instrumental from an audio file.

        Args:
            audio_path: Path to audio file
            sr: Output sample rate (default 16000)

        Returns:
            Tuple of (vocals, instrumental) as mono numpy arrays at `sr`
        """
        if isinstance(audio_path, str):
            audio_path = Path(audio_path)
        print(f"[VocalSeparator] Loading audio: {audio_path}")

        # Demucs runs at its own sample rate (44100 Hz for htdemucs), so load the
        # audio at that rate and resample the separated stems to `sr` afterwards.
        model_sr = self.model.samplerate
        wav, _ = librosa.load(str(audio_path), sr=model_sr, mono=True)
        print(f"[VocalSeparator] Audio loaded: {len(wav)} samples at {model_sr}Hz")

        # Demucs expects shape [batch, channels, samples] with stereo input;
        # duplicate the mono channel.
        mix = torch.from_numpy(np.stack([wav, wav])).float().unsqueeze(0)

        print("[VocalSeparator] Separating vocals and instrumental...")
        sys.stdout.flush()

        # Perform separation. apply_model returns [batch, sources, channels, samples],
        # with the source order given by self.model.sources
        # (typically ['drums', 'bass', 'other', 'vocals']).
        with torch.no_grad():
            out = apply_model(self.model, mix, device=self.device)[0]

        # Downmix each stem to mono
        sources = {name: out[i].mean(dim=0).cpu().numpy()
                   for i, name in enumerate(self.model.sources)}
        vocals = sources.get('vocals', np.zeros_like(wav))

        # Combine all non-vocal stems into the instrumental
        instrumental = np.zeros_like(wav)
        for key, stem in sources.items():
            if key != 'vocals':
                instrumental += stem

        # Resample stems to the requested output rate
        if model_sr != sr:
            vocals = librosa.resample(vocals, orig_sr=model_sr, target_sr=sr)
            instrumental = librosa.resample(instrumental, orig_sr=model_sr, target_sr=sr)

        print("[VocalSeparator] Separation complete")
        print(f"[VocalSeparator] Vocals shape: {vocals.shape}")
        print(f"[VocalSeparator] Instrumental shape: {instrumental.shape}")
        return vocals, instrumental

    def separate_and_save(self, audio_path: Path, output_dir: Path, sr: int = 16000) -> Tuple[Path, Path]:
        """
        Separate vocals and save to files.

        Args:
            audio_path: Input audio file
            output_dir: Directory to save separated audio
            sr: Sample rate

        Returns:
            Tuple of (vocals_path, instrumental_path)
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        vocals, instrumental = self.separate(audio_path, sr)

        vocals_path = output_dir / "vocals.wav"
        instrumental_path = output_dir / "instrumental.wav"

        print(f"[VocalSeparator] Saving vocals to {vocals_path}")
        sf.write(vocals_path, vocals, sr)
        print(f"[VocalSeparator] Saving instrumental to {instrumental_path}")
        sf.write(instrumental_path, instrumental, sr)

        return vocals_path, instrumental_path
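
A minimal usage sketch (not part of the committed file) of how the song-conversion pipeline might drive this class. The input path, output directory, and 16 kHz rate are illustrative placeholders, and the snippet assumes VocalSeparator is importable from wherever this module lives in the backend:

# Illustrative only: values below are assumptions, not taken from this commit.
from pathlib import Path

separator = VocalSeparator(model_name="htdemucs")   # auto-selects cuda/cpu
vocals_path, instrumental_path = separator.separate_and_save(
    audio_path=Path("song.mp3"),     # placeholder input file
    output_dir=Path("separated"),    # placeholder output directory
    sr=16000,                        # assumed rate for the voice-synthesis stage
)
# vocals.wav would feed the voice-synthesis step; instrumental.wav is kept
# for the final audio mix described in the commit message.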