Spaces:
Sleeping
Sleeping
Switch to Demucs for vocal separation (SAM Audio incompatible with ZeroGPU)
Browse files
app.py
CHANGED
|
@@ -1,12 +1,10 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
Runs on Hugging Face Spaces with ZeroGPU
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
| 7 |
-
# Set CUDA debugging before any torch imports
|
| 8 |
-
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
| 9 |
-
|
| 10 |
import spaces
|
| 11 |
import gradio as gr
|
| 12 |
import torch
|
|
@@ -14,34 +12,23 @@ import torchaudio
|
|
| 14 |
import tempfile
|
| 15 |
import warnings
|
| 16 |
import numpy as np
|
| 17 |
-
from huggingface_hub import login
|
| 18 |
|
| 19 |
warnings.filterwarnings("ignore")
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
login(token=hf_token)
|
| 25 |
-
print("Logged in to HuggingFace Hub")
|
| 26 |
-
|
| 27 |
-
# DO NOT import sam_audio here - it initializes CUDA
|
| 28 |
-
# Import inside GPU function to avoid ZeroGPU CUDA fork issues
|
| 29 |
-
|
| 30 |
-
MODEL_ID = "facebook/sam-audio-small"
|
| 31 |
-
print(f"Model ID: {MODEL_ID} (will load on first GPU request)")
|
| 32 |
|
| 33 |
|
| 34 |
@spaces.GPU(duration=120)
|
| 35 |
def run_separation_gpu(
|
| 36 |
waveform_np: np.ndarray,
|
| 37 |
sample_rate: int,
|
| 38 |
-
description: str,
|
| 39 |
-
predict_spans: bool,
|
| 40 |
-
reranking_candidates: int
|
| 41 |
):
|
| 42 |
-
"""Run separation on GPU
|
| 43 |
-
# Import
|
| 44 |
-
from
|
|
|
|
| 45 |
|
| 46 |
print(f"[GPU] run_separation_gpu called")
|
| 47 |
print(f"[GPU] waveform shape: {waveform_np.shape}, sample_rate: {sample_rate}")
|
|
@@ -49,81 +36,76 @@ def run_separation_gpu(
|
|
| 49 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 50 |
print(f"[GPU] Using device: {device}")
|
| 51 |
|
| 52 |
-
# Load model fresh each time (ZeroGPU workers don't persist state)
|
| 53 |
-
print(f"[GPU] Loading model to {device}...")
|
| 54 |
-
print(f"[GPU] CUDA available: {torch.cuda.is_available()}")
|
| 55 |
if torch.cuda.is_available():
|
| 56 |
print(f"[GPU] CUDA device: {torch.cuda.get_device_name(0)}")
|
| 57 |
print(f"[GPU] CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
|
| 58 |
|
| 59 |
-
|
| 60 |
-
print(f"[GPU]
|
| 61 |
-
|
| 62 |
-
model
|
| 63 |
-
print(f"[GPU] Model loaded to CPU")
|
| 64 |
-
|
| 65 |
-
# Clear CUDA cache before moving model
|
| 66 |
-
if torch.cuda.is_available():
|
| 67 |
-
torch.cuda.empty_cache()
|
| 68 |
-
print(f"[GPU] CUDA cache cleared")
|
| 69 |
-
|
| 70 |
-
model = model.to(device)
|
| 71 |
-
print(f"[GPU] Model moved to {device}")
|
| 72 |
-
|
| 73 |
model.eval()
|
| 74 |
-
print(f"[GPU] Model
|
| 75 |
|
| 76 |
-
# Convert numpy to tensor
|
| 77 |
-
# Gradio passes audio as (samples, channels),
|
| 78 |
waveform = torch.from_numpy(waveform_np).float()
|
| 79 |
-
if waveform.dim() == 2:
|
| 80 |
-
waveform = waveform.T # Transpose to (channels, samples)
|
| 81 |
-
elif waveform.dim() == 1:
|
| 82 |
-
waveform = waveform.unsqueeze(0) # Add channel dimension for mono
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
# Run separation
|
|
|
|
| 96 |
with torch.inference_mode():
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
# Save outputs
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
|
| 108 |
-
|
| 109 |
-
torchaudio.save(
|
|
|
|
| 110 |
|
| 111 |
-
print(f"[GPU] Saved outputs to {
|
| 112 |
-
return
|
| 113 |
|
| 114 |
|
| 115 |
-
def separate_audio(
|
| 116 |
-
audio_tuple,
|
| 117 |
-
description: str,
|
| 118 |
-
predict_spans: bool = True,
|
| 119 |
-
reranking_candidates: int = 1
|
| 120 |
-
):
|
| 121 |
"""
|
| 122 |
Wrapper that receives numpy audio from Gradio and calls GPU function.
|
| 123 |
audio_tuple is (sample_rate, numpy_array) when type="numpy"
|
| 124 |
"""
|
| 125 |
print(f"[Main] separate_audio called")
|
| 126 |
-
print(f"[Main] audio_tuple type: {type(audio_tuple)}")
|
| 127 |
|
| 128 |
if audio_tuple is None:
|
| 129 |
raise gr.Error("Please upload an audio file")
|
|
@@ -131,38 +113,22 @@ def separate_audio(
|
|
| 131 |
sample_rate, audio_data = audio_tuple
|
| 132 |
print(f"[Main] sample_rate: {sample_rate}, audio_data shape: {audio_data.shape}")
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
# Call the GPU function with numpy data
|
| 138 |
-
return run_separation_gpu(
|
| 139 |
-
audio_data,
|
| 140 |
-
sample_rate,
|
| 141 |
-
description,
|
| 142 |
-
predict_spans,
|
| 143 |
-
reranking_candidates
|
| 144 |
-
)
|
| 145 |
|
| 146 |
|
| 147 |
-
# Create Gradio interface
|
| 148 |
demo = gr.Interface(
|
| 149 |
fn=separate_audio,
|
| 150 |
inputs=[
|
| 151 |
gr.Audio(label="Upload Audio", type="numpy"),
|
| 152 |
-
gr.Textbox(
|
| 153 |
-
label="Sound to Isolate",
|
| 154 |
-
value="singing voice, vocals, human voice",
|
| 155 |
-
placeholder="e.g., 'singing voice, vocals, human voice'"
|
| 156 |
-
),
|
| 157 |
-
gr.Checkbox(label="Auto-detect timing", value=True),
|
| 158 |
-
gr.Slider(label="Quality", minimum=1, maximum=3, step=1, value=1)
|
| 159 |
],
|
| 160 |
outputs=[
|
| 161 |
-
gr.Audio(label="
|
| 162 |
-
gr.Audio(label="
|
| 163 |
],
|
| 164 |
title="Forgot The Words - API Backend",
|
| 165 |
-
description="Remove vocals from songs using [Meta
|
| 166 |
api_name="separate_audio",
|
| 167 |
allow_flagging="never"
|
| 168 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
Demucs Audio Source Separation - Gradio Backend
|
| 3 |
Runs on Hugging Face Spaces with ZeroGPU
|
| 4 |
+
Uses Meta's Demucs model for vocal separation
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
|
|
|
|
|
|
|
|
|
| 8 |
import spaces
|
| 9 |
import gradio as gr
|
| 10 |
import torch
|
|
|
|
| 12 |
import tempfile
|
| 13 |
import warnings
|
| 14 |
import numpy as np
|
|
|
|
| 15 |
|
| 16 |
warnings.filterwarnings("ignore")
|
| 17 |
|
| 18 |
+
# Demucs model - htdemucs is the best quality model
|
| 19 |
+
MODEL_NAME = "htdemucs"
|
| 20 |
+
print(f"Model: {MODEL_NAME} (will load on first GPU request)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
@spaces.GPU(duration=120)
def run_separation_gpu(
    waveform_np: np.ndarray,
    sample_rate: int,
):
    """Separate audio into vocals and instrumental using Demucs on GPU.

    Args:
        waveform_np: Audio samples as delivered by Gradio with ``type="numpy"``:
            ``(samples,)`` for mono or ``(samples, channels)`` otherwise. May be
            integer PCM (Gradio commonly yields int16) or float.
        sample_rate: Sample rate of ``waveform_np`` in Hz.

    Returns:
        Tuple ``(vocals_path, instrumental_path)`` — WAV files written to a
        fresh temporary directory at the model's native sample rate.
    """
    # Import demucs inside the GPU function: ZeroGPU forks workers, and CUDA
    # must not be initialized in the parent process.
    from demucs.pretrained import get_model
    from demucs.apply import apply_model

    print("[GPU] run_separation_gpu called")
    print(f"[GPU] waveform shape: {waveform_np.shape}, sample_rate: {sample_rate}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"[GPU] Using device: {device}")

    if torch.cuda.is_available():
        print(f"[GPU] CUDA device: {torch.cuda.get_device_name(0)}")
        print(f"[GPU] CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # Load the model fresh each call — ZeroGPU workers don't persist state.
    print(f"[GPU] Loading Demucs model: {MODEL_NAME}")
    model = get_model(MODEL_NAME)
    model.to(device)
    model.eval()
    print(f"[GPU] Model loaded and moved to {device}")

    waveform = torch.from_numpy(waveform_np)

    # BUGFIX: Gradio's type="numpy" usually delivers integer PCM (e.g. int16).
    # Demucs expects floats in [-1, 1]; feeding raw int16 magnitudes (±32768)
    # would wreck the separation, so normalize by the dtype's full scale.
    if np.issubdtype(waveform_np.dtype, np.integer):
        waveform = waveform.float() / float(np.iinfo(waveform_np.dtype).max)
    else:
        waveform = waveform.float()

    # Gradio passes audio as (samples, channels); Demucs wants (batch, channels, samples).
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0).unsqueeze(0)  # (samples,) -> (1, 1, samples)
    elif waveform.dim() == 2:
        waveform = waveform.T.unsqueeze(0)             # -> (1, channels, samples)

    # BUGFIX: htdemucs is a stereo model (model.audio_channels == 2); a mono
    # upload would crash it, so duplicate the single channel.
    if waveform.shape[1] == 1 and model.audio_channels > 1:
        waveform = waveform.expand(-1, model.audio_channels, -1).contiguous()

    print(f"[GPU] Waveform tensor shape: {waveform.shape}")

    # Resample to the model's expected sample rate (44100 Hz for Demucs).
    model_sr = model.samplerate
    if sample_rate != model_sr:
        print(f"[GPU] Resampling from {sample_rate} to {model_sr}")
        waveform = torchaudio.transforms.Resample(sample_rate, model_sr)(waveform)

    waveform = waveform.to(device)

    # Run separation
    print("[GPU] Running separation...")
    with torch.inference_mode():
        sources = apply_model(model, waveform, device=device, progress=False)

    # sources shape: (batch, num_sources, channels, samples)
    # htdemucs source order: 0=drums, 1=bass, 2=other, 3=vocals
    sources = sources.squeeze(0)  # drop batch dimension
    print(f"[GPU] Sources shape: {sources.shape}")

    vocals = sources[3]
    instrumental = sources[0] + sources[1] + sources[2]  # everything except vocals
    print(f"[GPU] Vocals shape: {vocals.shape}, Instrumental shape: {instrumental.shape}")

    # Save outputs at the model's sample rate.
    temp_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(temp_dir, "vocals.wav")
    instrumental_path = os.path.join(temp_dir, "instrumental.wav")
    torchaudio.save(vocals_path, vocals.cpu(), model_sr)
    torchaudio.save(instrumental_path, instrumental.cpu(), model_sr)

    print(f"[GPU] Saved outputs to {vocals_path} and {instrumental_path}")
    return vocals_path, instrumental_path
|
| 101 |
|
| 102 |
|
| 103 |
+
def separate_audio(audio_tuple):
    """Gradio-facing wrapper around the GPU separation function.

    With ``type="numpy"`` Gradio delivers audio as ``(sample_rate, np.ndarray)``,
    or ``None`` when nothing was uploaded. Unpacks the tuple and forwards it to
    :func:`run_separation_gpu`.
    """
    print(f"[Main] separate_audio called")

    # Guard clause: nothing uploaded yet.
    if audio_tuple is None:
        raise gr.Error("Please upload an audio file")

    sr, samples = audio_tuple
    print(f"[Main] sample_rate: {sr}, audio_data shape: {samples.shape}")

    # The heavy lifting happens inside the @spaces.GPU-decorated function.
    return run_separation_gpu(samples, sr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
+
# Create Gradio interface
# NOTE(review): no demo.launch() is visible in this chunk — confirm the Space's
# entry point calls it (or that a launch exists further down the file).
# NOTE(review): allow_flagging is deprecated in Gradio 4 (flagging_mode) —
# left as-is to stay compatible with the Gradio version pinned by the Space.
demo = gr.Interface(
    fn=separate_audio,
    inputs=[
        gr.Audio(label="Upload Audio", type="numpy"),
    ],
    outputs=[
        gr.Audio(label="Vocals"),
        gr.Audio(label="Instrumental (Karaoke)"),
    ],
    title="Forgot The Words - API Backend",
    description=(
        "Remove vocals from songs using "
        "[Meta Demucs](https://github.com/facebookresearch/demucs). "
        "Upload a song and get the vocals and instrumental tracks separated."
    ),
    api_name="separate_audio",
    allow_flagging="never",
)
|