# Commit a02a0e8 (verified, neonwatty):
# Switch to Demucs for vocal separation (SAM Audio incompatible with ZeroGPU)
"""
Demucs Audio Source Separation - Gradio Backend
Runs on Hugging Face Spaces with ZeroGPU
Uses Meta's Demucs model for vocal separation
"""
import os
import spaces
import gradio as gr
import torch
import torchaudio
import tempfile
import warnings
import numpy as np
# Silence third-party warning spam (torch/torchaudio deprecations) so the Space logs stay readable.
warnings.filterwarnings("ignore")
# Demucs model - htdemucs is the best quality model
MODEL_NAME = "htdemucs"
# The model itself is loaded lazily inside the @spaces.GPU function below.
print(f"Model: {MODEL_NAME} (will load on first GPU request)")
@spaces.GPU(duration=120)
def run_separation_gpu(
    waveform_np: np.ndarray,
    sample_rate: int,
):
    """Separate audio into vocals and instrumental with Demucs on GPU.

    Args:
        waveform_np: Audio samples as delivered by Gradio's numpy audio
            component: shape (samples,) for mono or (samples, channels)
            otherwise. May be integer PCM (Gradio commonly delivers int16
            for uploaded files) or float.
        sample_rate: Sample rate of ``waveform_np`` in Hz.

    Returns:
        Tuple of (vocals_path, instrumental_path): WAV files written at the
        model's native sample rate in a fresh temporary directory.
    """
    # Import demucs inside GPU function to avoid CUDA issues
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    print(f"[GPU] run_separation_gpu called")
    print(f"[GPU] waveform shape: {waveform_np.shape}, sample_rate: {sample_rate}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[GPU] Using device: {device}")
    if torch.cuda.is_available():
        print(f"[GPU] CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"[GPU] CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Load Demucs model
    print(f"[GPU] Loading Demucs model: {MODEL_NAME}")
    model = get_model(MODEL_NAME)
    model.to(device)
    model.eval()
    print(f"[GPU] Model loaded and moved to {device}")

    # BUG FIX: Gradio's type="numpy" audio arrives as integer PCM (int16 for
    # uploaded files). Demucs expects float audio in [-1, 1]; feeding raw
    # int16 magnitudes (+/-32768) produces clipped/garbage separation.
    # Normalize by the dtype's max before the float conversion.
    if np.issubdtype(waveform_np.dtype, np.integer):
        waveform_np = waveform_np.astype(np.float32) / np.iinfo(waveform_np.dtype).max

    # Gradio passes audio as (samples, channels); Demucs wants
    # (batch, channels, samples).
    waveform = torch.from_numpy(waveform_np).float()
    if waveform.dim() == 1:
        # Mono: (samples,) -> (1, 1, samples)
        waveform = waveform.unsqueeze(0).unsqueeze(0)
    elif waveform.dim() == 2:
        # Stereo from Gradio: (samples, channels) -> (1, channels, samples)
        waveform = waveform.T.unsqueeze(0)
    # htdemucs is a stereo model; duplicate a mono channel so the channel
    # count matches model.audio_channels instead of crashing on mono input.
    if waveform.shape[1] == 1 and model.audio_channels > 1:
        waveform = waveform.repeat(1, model.audio_channels, 1)
    print(f"[GPU] Waveform tensor shape: {waveform.shape}")

    # Resample to model's expected sample rate (44100 Hz for Demucs)
    model_sr = model.samplerate
    if sample_rate != model_sr:
        print(f"[GPU] Resampling from {sample_rate} to {model_sr}")
        resampler = torchaudio.transforms.Resample(sample_rate, model_sr)
        waveform = resampler(waveform)

    # Move to device
    waveform = waveform.to(device)

    # Run separation
    print(f"[GPU] Running separation...")
    with torch.inference_mode():
        # sources shape: (batch, num_sources, channels, samples)
        sources = apply_model(model, waveform, device=device, progress=False)
    sources = sources.squeeze(0)  # Remove batch dimension
    print(f"[GPU] Sources shape: {sources.shape}")

    # FIX: look the vocals stem up by name instead of hard-coding index 3,
    # so a Demucs variant with a different stem order/count still works.
    # (For htdemucs: drums, bass, other, vocals.)
    vocals_idx = model.sources.index("vocals")
    vocals = sources[vocals_idx]
    # Instrumental = sum of every non-vocal stem (drums + bass + other).
    instrumental = sum(
        sources[i] for i in range(len(model.sources)) if i != vocals_idx
    )
    print(f"[GPU] Vocals shape: {vocals.shape}, Instrumental shape: {instrumental.shape}")

    # Save both stems as WAV at the model's sample rate.
    temp_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(temp_dir, "vocals.wav")
    instrumental_path = os.path.join(temp_dir, "instrumental.wav")
    torchaudio.save(vocals_path, vocals.cpu(), model_sr)
    torchaudio.save(instrumental_path, instrumental.cpu(), model_sr)
    print(f"[GPU] Saved outputs to {vocals_path} and {instrumental_path}")
    return vocals_path, instrumental_path
def separate_audio(audio_tuple):
    """Gradio-facing entry point for separation.

    Unpacks the numpy audio payload and delegates the heavy lifting to the
    ZeroGPU-decorated worker. Because the input component is declared with
    type="numpy", ``audio_tuple`` is ``(sample_rate, numpy_array)``.

    Raises:
        gr.Error: if no audio file was uploaded.
    """
    print(f"[Main] separate_audio called")
    # Guard clause: the component yields None when nothing was uploaded.
    if audio_tuple is None:
        raise gr.Error("Please upload an audio file")
    rate, samples = audio_tuple
    print(f"[Main] sample_rate: {rate}, audio_data shape: {samples.shape}")
    # Hand off to the GPU worker; it returns (vocals_path, instrumental_path).
    return run_separation_gpu(samples, rate)
# Create Gradio interface
# Gradio UI: one audio upload in, two separated stems out.
audio_input = gr.Audio(label="Upload Audio", type="numpy")
vocals_output = gr.Audio(label="Vocals")
instrumental_output = gr.Audio(label="Instrumental (Karaoke)")

demo = gr.Interface(
    fn=separate_audio,
    inputs=[audio_input],
    outputs=[vocals_output, instrumental_output],
    title="Forgot The Words - API Backend",
    description="Remove vocals from songs using [Meta Demucs](https://github.com/facebookresearch/demucs). Upload a song and get the vocals and instrumental tracks separated.",
    api_name="separate_audio",
    allow_flagging="never",
)

demo.launch()