neonwatty committed on
Commit
8f3d55f
·
verified ·
1 Parent(s): 9e93271

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +41 -5
  2. app.py +241 -0
  3. requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,48 @@
1
  ---
2
- title: Forgot The Words Api
3
- emoji: 👁
4
  colorFrom: purple
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.2.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Forgot The Words API
3
+ emoji: 🎤
4
  colorFrom: purple
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ hardware: zero-a10g
12
  ---
13
 
14
+ # Forgot The Words - API Backend
15
+
16
+ Backend API for "I Forgot The Words To This Song" - removes vocals from songs so you can sing your own version.
17
+
18
+ Powered by [Meta SAM Audio](https://github.com/facebookresearch/sam-audio).
19
+
20
+ ## API Endpoints
21
+
22
+ ### `/separate_audio`
23
+ Separates audio based on text description.
24
+
25
+ **Parameters:**
26
+ - `audio_path`: Audio file
27
+ - `description`: What to isolate (e.g., "singing voice, vocals")
28
+ - `predict_spans`: Auto-detect timing (default: true)
29
+ - `reranking_candidates`: Quality setting (default: 1)
30
+
31
+ **Returns:** `[target_audio, residual_audio]`
32
+
33
+ ## Usage
34
+
35
+ ```python
36
+ from gradio_client import Client
37
+
38
+ client = Client("neonwatty/forgot-the-words-api")
39
+ result = client.predict(
40
+ audio_path="song.mp3",
41
+ description="singing voice, vocals, human voice",
42
+ predict_spans=True,
43
+ reranking_candidates=1,
44
+ api_name="/separate_audio"
45
+ )
46
+
47
+ vocals, instrumentals = result
48
+ ```
app.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SAM Audio Source Separation - Gradio Backend
3
+ Runs on Hugging Face Spaces with ZeroGPU
4
+ """
5
+
6
+ import gradio as gr
7
+ import spaces
8
+ import torch
9
+ import torchaudio
10
+ import tempfile
11
+ import os
12
+ from pathlib import Path
13
+
14
# Lazily-initialized global model handles; populated on first call to load_model().
model = None
processor = None

def load_model():
    """Load the SAM Audio model and processor once, caching them globally.

    Returns:
        tuple: (model, processor) — the shared SAMAudio model (in eval mode,
        on CUDA when available) and its matching SAMAudioProcessor.
    """
    global model, processor

    # Guard clause: already loaded, hand back the cached pair.
    if model is not None:
        return model, processor

    # Imported lazily so the app can start before the heavy model import.
    from sam_audio import SAMAudio, SAMAudioProcessor

    print("Loading SAM Audio model...")
    processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-large")
    model = SAMAudio.from_pretrained("facebook/sam-audio-large").eval()

    # NOTE(review): under ZeroGPU, CUDA is typically unavailable at load time
    # outside a @spaces.GPU call — the per-request .to(device) in the handlers
    # is what actually places the model on GPU. Confirm against Spaces docs.
    if torch.cuda.is_available():
        model = model.cuda()
        print("Model loaded on CUDA")
    else:
        print("Model loaded on CPU")

    return model, processor
36
+
37
+
38
@spaces.GPU(duration=120)  # Up to 2 minutes of GPU time per call
def separate_audio(
    audio_path: str,
    description: str,
    predict_spans: bool = True,
    reranking_candidates: int = 1,
):
    """Separate audio based on a text description.

    Args:
        audio_path: Path to input audio file.
        description: Text description of the sound to isolate
            (e.g., "vocals", "drums", "dog barking").
        predict_spans: Auto-detect sound timing (improves quality, adds latency).
        reranking_candidates: Number of candidates for quality (1-3 recommended).

    Returns:
        tuple: (target_audio_path, residual_audio_path) — WAV files containing
        the isolated sound and everything else, respectively.

    Raises:
        gr.Error: If no audio file or no description was provided.
    """
    # Gradio passes None when no file is uploaded; fail fast with a
    # user-facing error instead of crashing inside the processor.
    if not audio_path:
        raise gr.Error("Please upload an audio file.")
    if not description or not description.strip():
        raise gr.Error("Please describe the sound to isolate.")

    model, processor = load_model()

    # Move model to GPU for this inference (CUDA is available inside
    # a @spaces.GPU call on ZeroGPU hardware).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Prepare the single-item input batch.
    batch = processor(
        audios=[audio_path],
        descriptions=[description],
    ).to(device)

    # Run separation without autograd bookkeeping.
    with torch.inference_mode():
        result = model.separate(
            batch,
            predict_spans=predict_spans,
            reranking_candidates=reranking_candidates,
        )

    # Persist both outputs as WAVs in a fresh temp directory so Gradio
    # can serve them back to the client.
    sample_rate = processor.audio_sampling_rate
    temp_dir = tempfile.mkdtemp()
    target_path = os.path.join(temp_dir, "target.wav")
    residual_path = os.path.join(temp_dir, "residual.wav")

    torchaudio.save(target_path, result.target.cpu(), sample_rate)
    torchaudio.save(residual_path, result.residual.cpu(), sample_rate)

    return target_path, residual_path
89
+
90
+
91
@spaces.GPU(duration=180)  # Up to 3 minutes for multi-stem
def separate_music_stems(audio_path: str):
    """Separate music into standard stems: vocals, drums, bass, other.

    Makes 4 separate calls to SAM Audio with different descriptions.

    Args:
        audio_path: Path to input audio file.

    Returns:
        tuple: (vocals_path, drums_path, bass_path, other_path) — WAV file
        paths, one per stem, in that order.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Gradio passes None when no file is uploaded; fail fast with a
    # user-facing error instead of crashing inside the processor.
    if not audio_path:
        raise gr.Error("Please upload an audio file.")

    model, processor = load_model()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Standard music stems with the text descriptions used to isolate each.
    stems = [
        ("vocals", "singing voice, human vocals"),
        ("drums", "drums, percussion, drum kit"),
        ("bass", "bass guitar, bass instrument"),
        ("other", "other instruments, melody, harmony"),
    ]

    # Loop-invariant values hoisted out of the per-stem loop: the sampling
    # rate and the shared output directory do not change between stems.
    sample_rate = processor.audio_sampling_rate
    temp_dir = tempfile.mkdtemp()
    output_paths = []

    for stem_name, description in stems:
        # Prepare a single-item batch for this stem's description.
        batch = processor(
            audios=[audio_path],
            descriptions=[description],
        ).to(device)

        # Run separation without autograd bookkeeping.
        with torch.inference_mode():
            result = model.separate(
                batch,
                predict_spans=True,
                reranking_candidates=1,
            )

        # Save only the isolated target for this stem.
        stem_path = os.path.join(temp_dir, f"{stem_name}.wav")
        torchaudio.save(stem_path, result.target.cpu(), sample_rate)
        output_paths.append(stem_path)

    return tuple(output_paths)
141
+
142
+
143
# Create the Gradio interface: two tabs (free-text separation and fixed
# music-stem separation) wired to the @spaces.GPU handlers above.
# demo.launch() also exposes the handlers as API endpoints for gradio_client.
with gr.Blocks(
    title="Audio Source Separation",
    theme=gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="slate",
    ),
    css="""
    .gradio-container { max-width: 900px !important; }
    .gr-button-primary { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; }
    """
) as demo:
    gr.Markdown("""
    # Audio Source Separation

    Powered by [Meta SAM Audio](https://github.com/facebookresearch/sam-audio) - separate any sound from audio using text descriptions.
    """)

    with gr.Tabs():
        # Tab 1: Custom separation — isolate any sound described in free text.
        with gr.TabItem("Custom Separation"):
            gr.Markdown("Describe the sound you want to isolate:")

            with gr.Row():
                with gr.Column():
                    # type="filepath" makes Gradio hand separate_audio a path
                    # string rather than raw sample data.
                    audio_input = gr.Audio(
                        label="Upload Audio",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    description_input = gr.Textbox(
                        label="Sound Description",
                        placeholder="e.g., 'singing voice', 'dog barking', 'piano melody'",
                        info="Use lowercase noun-phrase or verb-phrase format"
                    )

                    # Advanced knobs map 1:1 onto separate_audio's
                    # predict_spans / reranking_candidates parameters.
                    with gr.Accordion("Advanced Options", open=False):
                        predict_spans = gr.Checkbox(
                            label="Auto-detect timing",
                            value=True,
                            info="Improves quality but adds latency"
                        )
                        reranking = gr.Slider(
                            label="Quality (reranking candidates)",
                            minimum=1,
                            maximum=3,
                            step=1,
                            value=1,
                            info="Higher = better quality, more latency"
                        )

                    separate_btn = gr.Button("Separate Audio", variant="primary")

                with gr.Column():
                    # Outputs: the isolated sound and the remainder.
                    target_output = gr.Audio(label="Isolated Sound (Target)")
                    residual_output = gr.Audio(label="Everything Else (Residual)")

            separate_btn.click(
                fn=separate_audio,
                inputs=[audio_input, description_input, predict_spans, reranking],
                outputs=[target_output, residual_output]
            )

        # Tab 2: Music stem separation — fixed four-stem split.
        with gr.TabItem("Music Stems"):
            gr.Markdown("Separate music into vocals, drums, bass, and other instruments:")

            with gr.Row():
                with gr.Column():
                    music_input = gr.Audio(
                        label="Upload Music",
                        type="filepath",
                        sources=["upload"]
                    )
                    stems_btn = gr.Button("Separate into Stems", variant="primary")

                with gr.Column():
                    # Order matches separate_music_stems' return tuple.
                    vocals_output = gr.Audio(label="Vocals")
                    drums_output = gr.Audio(label="Drums")
                    bass_output = gr.Audio(label="Bass")
                    other_output = gr.Audio(label="Other")

            stems_btn.click(
                fn=separate_music_stems,
                inputs=[music_input],
                outputs=[vocals_output, drums_output, bass_output, other_output]
            )

    gr.Markdown("""
    ---
    **Tips:**
    - For best results, use clear descriptions like "singing voice" rather than "the singer"
    - Processing time depends on audio length (typically 30-60 seconds for a 3-minute song)
    - GPU time is limited to 25 minutes/day on free tier, 5x more on Pro
    """)


# Launch with API enabled for frontend integration
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ spaces
3
+ torch>=2.0.0
4
+ torchaudio>=2.0.0
5
+ transformers>=4.35.0
6
+ accelerate
7
+ sam-audio @ git+https://github.com/facebookresearch/sam-audio.git