# Commit a02a0e8 (verified, neonwatty):
# Switch to Demucs for vocal separation (SAM Audio incompatible with ZeroGPU)
"""
Demucs Audio Source Separation - Gradio Backend
Runs on Hugging Face Spaces with ZeroGPU
Uses Meta's Demucs model for vocal separation
"""
import os
import spaces
import gradio as gr
import torch
import torchaudio
import tempfile
import warnings
import numpy as np
# Silence third-party warning spam (torch/torchaudio deprecations) so the Space logs stay readable.
warnings.filterwarnings("ignore")
# Demucs model - htdemucs is the best quality model
MODEL_NAME = "htdemucs"
# The model itself is loaded lazily inside the @spaces.GPU function below.
print(f"Model: {MODEL_NAME} (will load on first GPU request)")
@spaces.GPU(duration=120)
def run_separation_gpu(
    waveform_np: np.ndarray,
    sample_rate: int,
):
    """Separate audio into vocals and instrumental with Demucs on GPU.

    Args:
        waveform_np: Audio samples as delivered by Gradio's numpy audio
            component: shape (samples,) for mono or (samples, channels)
            otherwise. May be integer PCM (Gradio commonly delivers int16
            for uploaded files) or float.
        sample_rate: Sample rate of ``waveform_np`` in Hz.

    Returns:
        Tuple of (vocals_path, instrumental_path): WAV files written at the
        model's native sample rate in a fresh temporary directory.
    """
    # Import demucs inside GPU function to avoid CUDA issues
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    print(f"[GPU] run_separation_gpu called")
    print(f"[GPU] waveform shape: {waveform_np.shape}, sample_rate: {sample_rate}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[GPU] Using device: {device}")
    if torch.cuda.is_available():
        print(f"[GPU] CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"[GPU] CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Load Demucs model
    print(f"[GPU] Loading Demucs model: {MODEL_NAME}")
    model = get_model(MODEL_NAME)
    model.to(device)
    model.eval()
    print(f"[GPU] Model loaded and moved to {device}")

    # BUG FIX: Gradio's type="numpy" audio arrives as integer PCM (int16 for
    # uploaded files). Demucs expects float audio in [-1, 1]; feeding raw
    # int16 magnitudes (+/-32768) produces clipped/garbage separation.
    # Normalize by the dtype's max before the float conversion.
    if np.issubdtype(waveform_np.dtype, np.integer):
        waveform_np = waveform_np.astype(np.float32) / np.iinfo(waveform_np.dtype).max

    # Gradio passes audio as (samples, channels); Demucs wants
    # (batch, channels, samples).
    waveform = torch.from_numpy(waveform_np).float()
    if waveform.dim() == 1:
        # Mono: (samples,) -> (1, 1, samples)
        waveform = waveform.unsqueeze(0).unsqueeze(0)
    elif waveform.dim() == 2:
        # Stereo from Gradio: (samples, channels) -> (1, channels, samples)
        waveform = waveform.T.unsqueeze(0)
    # htdemucs is a stereo model; duplicate a mono channel so the channel
    # count matches model.audio_channels instead of crashing on mono input.
    if waveform.shape[1] == 1 and model.audio_channels > 1:
        waveform = waveform.repeat(1, model.audio_channels, 1)
    print(f"[GPU] Waveform tensor shape: {waveform.shape}")

    # Resample to model's expected sample rate (44100 Hz for Demucs)
    model_sr = model.samplerate
    if sample_rate != model_sr:
        print(f"[GPU] Resampling from {sample_rate} to {model_sr}")
        resampler = torchaudio.transforms.Resample(sample_rate, model_sr)
        waveform = resampler(waveform)

    # Move to device
    waveform = waveform.to(device)

    # Run separation
    print(f"[GPU] Running separation...")
    with torch.inference_mode():
        # sources shape: (batch, num_sources, channels, samples)
        sources = apply_model(model, waveform, device=device, progress=False)
    sources = sources.squeeze(0)  # Remove batch dimension
    print(f"[GPU] Sources shape: {sources.shape}")

    # FIX: look the vocals stem up by name instead of hard-coding index 3,
    # so a Demucs variant with a different stem order/count still works.
    # (For htdemucs: drums, bass, other, vocals.)
    vocals_idx = model.sources.index("vocals")
    vocals = sources[vocals_idx]
    # Instrumental = sum of every non-vocal stem (drums + bass + other).
    instrumental = sum(
        sources[i] for i in range(len(model.sources)) if i != vocals_idx
    )
    print(f"[GPU] Vocals shape: {vocals.shape}, Instrumental shape: {instrumental.shape}")

    # Save both stems as WAV at the model's sample rate.
    temp_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(temp_dir, "vocals.wav")
    instrumental_path = os.path.join(temp_dir, "instrumental.wav")
    torchaudio.save(vocals_path, vocals.cpu(), model_sr)
    torchaudio.save(instrumental_path, instrumental.cpu(), model_sr)
    print(f"[GPU] Saved outputs to {vocals_path} and {instrumental_path}")
    return vocals_path, instrumental_path
def separate_audio(audio_tuple):
    """Gradio-facing entry point for separation.

    Unpacks the numpy audio payload and delegates the heavy lifting to the
    ZeroGPU-decorated worker. Because the input component is declared with
    type="numpy", ``audio_tuple`` is ``(sample_rate, numpy_array)``.

    Raises:
        gr.Error: if no audio file was uploaded.
    """
    print(f"[Main] separate_audio called")
    # Guard clause: the component yields None when nothing was uploaded.
    if audio_tuple is None:
        raise gr.Error("Please upload an audio file")
    rate, samples = audio_tuple
    print(f"[Main] sample_rate: {rate}, audio_data shape: {samples.shape}")
    # Hand off to the GPU worker; it returns (vocals_path, instrumental_path).
    return run_separation_gpu(samples, rate)
# Create Gradio interface
# Gradio UI: one audio upload in, two separated stems out.
audio_input = gr.Audio(label="Upload Audio", type="numpy")
vocals_output = gr.Audio(label="Vocals")
instrumental_output = gr.Audio(label="Instrumental (Karaoke)")

demo = gr.Interface(
    fn=separate_audio,
    inputs=[audio_input],
    outputs=[vocals_output, instrumental_output],
    title="Forgot The Words - API Backend",
    description="Remove vocals from songs using [Meta Demucs](https://github.com/facebookresearch/demucs). Upload a song and get the vocals and instrumental tracks separated.",
    api_name="separate_audio",
    allow_flagging="never",
)

demo.launch()