import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from demucs.apply import apply_model
from demucs.pretrained import get_model

# Load the pretrained Demucs model once at import time; inference runs on CPU.
model = get_model("htdemucs")
model.cpu()
model.eval()


def _match_channels(audio, channels):
    """Return *audio* as a (channels, samples) array with exactly *channels* rows.

    A 1-D (mono) signal is promoted to 2-D first. A single channel is
    duplicated to fill the requested count, extra channels are dropped, and a
    multi-channel signal is averaged when one channel is requested.

    Raises:
        ValueError: if the signal has more than one channel but fewer than
            requested, since there is no obvious way to synthesize the rest.
    """
    if audio.ndim == 1:
        audio = audio[np.newaxis, :]

    available = audio.shape[0]
    if available == channels:
        return audio
    if available == 1:
        return np.repeat(audio, channels, axis=0)
    if channels == 1:
        return audio.mean(axis=0, keepdims=True)
    if available > channels:
        return audio[:channels]
    raise ValueError(
        f"Cannot convert audio with {available} channels to {channels} channels."
    )


def remove_vocals(audio_file):
    """Separate *audio_file* with Demucs and write the mix without vocals.

    Args:
        audio_file: Path to the input audio file, as provided by the Gradio
            ``Audio`` component in ``filepath`` mode.

    Returns:
        Path to a temporary ``.wav`` file containing the sum of every
        separated stem except ``"vocals"``. The file is not deleted
        automatically.

    Raises:
        ValueError: if no file was provided, or the channel layout cannot be
            adapted to what the model expects.
    """
    if not audio_file:
        raise ValueError("No audio file was provided.")

    # Resample to the rate the model was trained on and keep all channels.
    sr = model.samplerate
    audio, _ = librosa.load(audio_file, sr=sr, mono=False)

    # The model requires a fixed channel count; feeding it a mono signal
    # unchanged fails inside the network, so adapt the input first.
    audio = _match_channels(audio, model.audio_channels)

    # apply_model expects a batch dimension: (batch, channels, samples).
    audio_tensor = torch.from_numpy(
        np.ascontiguousarray(audio, dtype=np.float32)
    ).unsqueeze(0)

    with torch.no_grad():
        sources = apply_model(model, audio_tensor)

    # sources[0] is indexed by stem in the order given by model.sources.
    # Select by name rather than by position so the vocals stem is excluded
    # whatever its index is.
    keep = [i for i, stem in enumerate(model.sources) if stem != "vocals"]
    background = sources[0][keep].sum(dim=0).cpu().numpy()

    # Reserve a path and close the handle before writing: leaving it open
    # leaks a descriptor and blocks the write on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        output_file = tmp.name

    # soundfile expects (samples, channels).
    sf.write(output_file, background.T, sr)

    return output_file


interface = gr.Interface(
    fn=remove_vocals,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Audio(label="Background Music + Expressions"),
    title="Vocal Remover (Keep Music + Expressions)",
)

interface.launch()