# Hugging Face Spaces app (Space status at capture time: Sleeping)
"""
Demucs Audio Source Separation - Gradio Backend
Runs on Hugging Face Spaces with ZeroGPU
Uses Meta's Demucs model for vocal separation
"""
import os
import tempfile
import warnings

import gradio as gr
import numpy as np
import spaces  # must be imported before torch so ZeroGPU can patch CUDA init
import torch
import torchaudio

warnings.filterwarnings("ignore")

# Demucs model - htdemucs is the best quality model
MODEL_NAME = "htdemucs"
print(f"Model: {MODEL_NAME} (will load on first GPU request)")
@spaces.GPU
def run_separation_gpu(
    waveform_np: np.ndarray,
    sample_rate: int,
):
    """Separate audio into vocals and instrumental stems with Demucs.

    Args:
        waveform_np: Audio from Gradio, shaped (samples,) for mono or
            (samples, channels) otherwise. May be integer PCM (Gradio's
            default for type="numpy") or float.
        sample_rate: Sample rate of the input audio in Hz.

    Returns:
        Tuple ``(vocals_path, instrumental_path)`` of WAV files written at
        the model's sample rate into a fresh temporary directory.
    """
    # Import demucs inside the GPU function so CUDA-touching code only runs
    # once the ZeroGPU context is attached.
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    print(f"[GPU] run_separation_gpu called")
    print(f"[GPU] waveform shape: {waveform_np.shape}, sample_rate: {sample_rate}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[GPU] Using device: {device}")
    if torch.cuda.is_available():
        print(f"[GPU] CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"[GPU] CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Load Demucs model
    print(f"[GPU] Loading Demucs model: {MODEL_NAME}")
    model = get_model(MODEL_NAME)
    model.to(device)
    model.eval()
    print(f"[GPU] Model loaded and moved to {device}")

    # FIX: Gradio's type="numpy" delivers integer PCM (usually int16).
    # Demucs expects float audio in [-1, 1]; feeding raw int16 magnitudes
    # (up to +/-32767) produces garbage output. Normalize by the dtype max.
    if np.issubdtype(waveform_np.dtype, np.integer):
        waveform_np = waveform_np.astype(np.float32) / np.iinfo(waveform_np.dtype).max

    # Convert numpy to tensor.
    # Gradio passes audio as (samples, channels); we need (batch, channels, samples).
    waveform = torch.from_numpy(waveform_np).float()
    if waveform.dim() == 1:
        # Mono: (samples,) -> (1, 1, samples)
        waveform = waveform.unsqueeze(0).unsqueeze(0)
    elif waveform.dim() == 2:
        # Stereo from Gradio: (samples, channels) -> (1, channels, samples)
        waveform = waveform.T.unsqueeze(0)

    # FIX: htdemucs is trained on stereo input; duplicate the channel for
    # mono uploads so the model sees its expected channel count.
    if waveform.shape[1] < model.audio_channels:
        waveform = waveform.expand(-1, model.audio_channels, -1).contiguous()
    print(f"[GPU] Waveform tensor shape: {waveform.shape}")

    # Resample to model's expected sample rate (44100 Hz for Demucs)
    model_sr = model.samplerate
    if sample_rate != model_sr:
        print(f"[GPU] Resampling from {sample_rate} to {model_sr}")
        resampler = torchaudio.transforms.Resample(sample_rate, model_sr)
        waveform = resampler(waveform)

    # Move to device
    waveform = waveform.to(device)

    # Run separation
    print(f"[GPU] Running separation...")
    with torch.inference_mode():
        sources = apply_model(model, waveform, device=device, progress=False)
    # sources shape: (batch, num_sources, channels, samples)
    sources = sources.squeeze(0)  # Remove batch dimension
    print(f"[GPU] Sources shape: {sources.shape}")

    # FIX: look up the vocals stem by name rather than hard-coding index 3,
    # so this survives swapping MODEL_NAME for a model with a different
    # source ordering. Instrumental = sum of every non-vocal stem.
    vocals_idx = model.sources.index("vocals")
    vocals = sources[vocals_idx]
    instrumental = sum(
        sources[i] for i in range(len(model.sources)) if i != vocals_idx
    )
    print(f"[GPU] Vocals shape: {vocals.shape}, Instrumental shape: {instrumental.shape}")

    # Save outputs at the model's sample rate.
    temp_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(temp_dir, "vocals.wav")
    instrumental_path = os.path.join(temp_dir, "instrumental.wav")
    torchaudio.save(vocals_path, vocals.cpu(), model_sr)
    torchaudio.save(instrumental_path, instrumental.cpu(), model_sr)
    print(f"[GPU] Saved outputs to {vocals_path} and {instrumental_path}")
    return vocals_path, instrumental_path
def separate_audio(audio_tuple):
    """Gradio-facing wrapper: validate the upload, then dispatch to the GPU worker.

    With ``type="numpy"`` Gradio supplies ``audio_tuple`` as
    ``(sample_rate, numpy_array)``, or ``None`` when nothing was uploaded.
    """
    print(f"[Main] separate_audio called")
    # Guard clause: nothing uploaded.
    if audio_tuple is None:
        raise gr.Error("Please upload an audio file")
    rate, samples = audio_tuple
    print(f"[Main] sample_rate: {rate}, audio_data shape: {samples.shape}")
    # Heavy lifting happens in the ZeroGPU worker function.
    return run_separation_gpu(samples, rate)
# --- Gradio interface -------------------------------------------------------
# NOTE: HF Spaces looks for a module-level ``demo`` object; keep the name.
_inputs = [
    gr.Audio(label="Upload Audio", type="numpy"),
]
_outputs = [
    gr.Audio(label="Vocals"),
    gr.Audio(label="Instrumental (Karaoke)"),
]

demo = gr.Interface(
    fn=separate_audio,
    inputs=_inputs,
    outputs=_outputs,
    title="Forgot The Words - API Backend",
    description=(
        "Remove vocals from songs using "
        "[Meta Demucs](https://github.com/facebookresearch/demucs). "
        "Upload a song and get the vocals and instrumental tracks separated."
    ),
    api_name="separate_audio",
    allow_flagging="never",
)

demo.launch()