Spaces:

neonwatty
/

forgot-the-words-api

Sleeping

App Files Files Community

neonwatty committed on Jan 3

Commit

8f3d55f

verified ·

1 Parent(s): 9e93271

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +41 -5
app.py +241 -0
requirements.txt +7 -0

README.md CHANGED Viewed

@@ -1,12 +1,48 @@
 ---
-title: Forgot The Words Api
-emoji: 👁
 colorFrom: purple
-colorTo: indigo
 sdk: gradio
-sdk_version: 6.2.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Forgot The Words API
+emoji: 🎤
 colorFrom: purple
+colorTo: pink
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
+license: mit
+hardware: zero-a10g
 ---
+# Forgot The Words - API Backend
+Backend API for "I Forgot The Words To This Song" - removes vocals from songs so you can sing your own version.
+Powered by [Meta SAM Audio](https://github.com/facebookresearch/sam-audio).
+## API Endpoints
+### `/separate_audio`
+Separates audio based on text description.
+**Parameters:**
+- `audio_path`: Audio file
+- `description`: What to isolate (e.g., "singing voice, vocals")
+- `predict_spans`: Auto-detect timing (default: true)
+- `reranking_candidates`: Quality setting (default: 1)
+**Returns:** `[target_audio, residual_audio]`
+## Usage
+```python
+from gradio_client import Client, handle_file
+client = Client("neonwatty/forgot-the-words-api")
+result = client.predict(
+    audio_path=handle_file("song.mp3"),
+    description="singing voice, vocals, human voice",
+    predict_spans=True,
+    reranking_candidates=1,
+    api_name="/separate_audio"
+)
+vocals, instrumentals = result
+```

app.py ADDED Viewed

	@@ -0,0 +1,241 @@

+"""
+SAM Audio Source Separation - Gradio Backend
+Runs on Hugging Face Spaces with ZeroGPU
+"""
+import gradio as gr
+import spaces
+import torch
+import torchaudio
+import tempfile
+import os
+from pathlib import Path
+# Global model references (loaded lazily)
+# Both start as None; load_model() assigns them on its first call and
+# returns the same two objects on every later call.
+model = None
+processor = None
+def load_model():
+    """Load SAM Audio model (called once, cached)"""
+    global model, processor
+    if model is None:
+        from sam_audio import SAMAudio, SAMAudioProcessor
+        print("Loading SAM Audio model...")
+        processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
+        model = SAMAudio.from_pretrained("facebook/sam-audio-large")
+        model = model.eval()
+        if torch.cuda.is_available():
+            model = model.cuda()
+            print("Model loaded on CUDA")
+        else:
+            print("Model loaded on CPU")
+    return model, processor
+@spaces.GPU(duration=120)  # Up to 2 minutes of GPU time per call
+def separate_audio(
+    audio_path: str,
+    description: str,
+    predict_spans: bool = True,
+    reranking_candidates: int = 1
+):
+    """
+    Separate audio based on text description.
+    Args:
+        audio_path: Path to input audio file
+        description: Text description of sound to isolate (e.g., "vocals", "drums", "dog barking")
+        predict_spans: Auto-detect sound timing (improves quality, adds latency)
+        reranking_candidates: Number of candidates for quality (1-3 recommended)
+    Returns:
+        tuple: (target_audio_path, residual_audio_path)
+    """
+    model, processor = load_model()
+    # Move model to GPU for this inference
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
+    # Prepare input batch
+    batch = processor(
+        audios=[audio_path],
+        descriptions=[description],
+    ).to(device)
+    # Run separation
+    with torch.inference_mode():
+        result = model.separate(
+            batch,
+            predict_spans=predict_spans,
+            reranking_candidates=reranking_candidates
+        )
+    # Save outputs to temporary files
+    sample_rate = processor.audio_sampling_rate
+    # Create temp directory for outputs
+    temp_dir = tempfile.mkdtemp()
+    target_path = os.path.join(temp_dir, "target.wav")
+    residual_path = os.path.join(temp_dir, "residual.wav")
+    torchaudio.save(target_path, result.target.cpu(), sample_rate)
+    torchaudio.save(residual_path, result.residual.cpu(), sample_rate)
+    return target_path, residual_path
+@spaces.GPU(duration=180)  # Up to 3 minutes for multi-stem
+def separate_music_stems(audio_path: str):
+    """
+    Separate music into standard stems: vocals, drums, bass, other.
+    Makes 4 separate calls to SAM Audio with different descriptions.
+    Args:
+        audio_path: Path to input audio file
+    Returns:
+        tuple: (vocals_path, drums_path, bass_path, other_path)
+    """
+    model, processor = load_model()
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
+    # Standard music stems with descriptions
+    stems = [
+        ("vocals", "singing voice, human vocals"),
+        ("drums", "drums, percussion, drum kit"),
+        ("bass", "bass guitar, bass instrument"),
+        ("other", "other instruments, melody, harmony"),
+    ]
+    temp_dir = tempfile.mkdtemp()
+    output_paths = []
+    for stem_name, description in stems:
+        # Prepare batch
+        batch = processor(
+            audios=[audio_path],
+            descriptions=[description],
+        ).to(device)
+        # Run separation
+        with torch.inference_mode():
+            result = model.separate(
+                batch,
+                predict_spans=True,
+                reranking_candidates=1
+            )
+        # Save stem
+        sample_rate = processor.audio_sampling_rate
+        stem_path = os.path.join(temp_dir, f"{stem_name}.wav")
+        torchaudio.save(stem_path, result.target.cpu(), sample_rate)
+        output_paths.append(stem_path)
+    return tuple(output_paths)
+# Create Gradio interface
+with gr.Blocks(
+    title="Audio Source Separation",
+    theme=gr.themes.Soft(
+        primary_hue="violet",
+        secondary_hue="slate",
+    ),
+    css="""
+        .gradio-container { max-width: 900px !important; }
+        .gr-button-primary { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; }
+    """
+) as demo:
+    gr.Markdown("""
+    # Audio Source Separation
+    Powered by [Meta SAM Audio](https://github.com/facebookresearch/sam-audio) - separate any sound from audio using text descriptions.
+    """)
+    with gr.Tabs():
+        # Tab 1: Custom separation
+        with gr.TabItem("Custom Separation"):
+            gr.Markdown("Describe the sound you want to isolate:")
+            with gr.Row():
+                with gr.Column():
+                    audio_input = gr.Audio(
+                        label="Upload Audio",
+                        type="filepath",
+                        sources=["upload", "microphone"]
+                    )
+                    description_input = gr.Textbox(
+                        label="Sound Description",
+                        placeholder="e.g., 'singing voice', 'dog barking', 'piano melody'",
+                        info="Use lowercase noun-phrase or verb-phrase format"
+                    )
+                    with gr.Accordion("Advanced Options", open=False):
+                        predict_spans = gr.Checkbox(
+                            label="Auto-detect timing",
+                            value=True,
+                            info="Improves quality but adds latency"
+                        )
+                        reranking = gr.Slider(
+                            label="Quality (reranking candidates)",
+                            minimum=1,
+                            maximum=3,
+                            step=1,
+                            value=1,
+                            info="Higher = better quality, more latency"
+                        )
+                    separate_btn = gr.Button("Separate Audio", variant="primary")
+                with gr.Column():
+                    target_output = gr.Audio(label="Isolated Sound (Target)")
+                    residual_output = gr.Audio(label="Everything Else (Residual)")
+            separate_btn.click(
+                fn=separate_audio,
+                inputs=[audio_input, description_input, predict_spans, reranking],
+                outputs=[target_output, residual_output]
+            )
+        # Tab 2: Music stem separation
+        with gr.TabItem("Music Stems"):
+            gr.Markdown("Separate music into vocals, drums, bass, and other instruments:")
+            with gr.Row():
+                with gr.Column():
+                    music_input = gr.Audio(
+                        label="Upload Music",
+                        type="filepath",
+                        sources=["upload"]
+                    )
+                    stems_btn = gr.Button("Separate into Stems", variant="primary")
+                with gr.Column():
+                    vocals_output = gr.Audio(label="Vocals")
+                    drums_output = gr.Audio(label="Drums")
+                    bass_output = gr.Audio(label="Bass")
+                    other_output = gr.Audio(label="Other")
+            stems_btn.click(
+                fn=separate_music_stems,
+                inputs=[music_input],
+                outputs=[vocals_output, drums_output, bass_output, other_output]
+            )
+    gr.Markdown("""
+    ---
+    **Tips:**
+    - For best results, use clear descriptions like "singing voice" rather than "the singer"
+    - Processing time depends on audio length (typically 30-60 seconds for a 3-minute song)
+    - GPU time is limited to 25 minutes/day on free tier, 5x more on Pro
+    """)
+# Launch with API enabled for frontend integration
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio>=4.0.0
+spaces
+torch>=2.0.0
+torchaudio>=2.0.0
+transformers>=4.35.0
+accelerate
+sam-audio @ git+https://github.com/facebookresearch/sam-audio.git