Spaces:
Running
on
Zero
Running
on
Zero
Peter Shi
committed on
Commit
·
f36ee58
1
Parent(s):
8c0dc30
feat: Added the SAM Audio audio segmentation Web UI based on Gradio and its dependencies.
Browse files- README.md +54 -4
- app.py +140 -0
- requirements.txt +11 -0
README.md
CHANGED
|
@@ -1,13 +1,63 @@
|
|
| 1 |
---
|
| 2 |
title: Sam Audio Webui
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.2.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Sam Audio Webui
|
| 3 |
+
emoji: 🎵
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 6.2.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
+
fullWidth: true
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# SAM Audio WebUI
|
| 15 |
+
|
| 16 |
+
This Space hosts a WebUI for the **SAM Audio** model by Meta (Facebook), designed to segment and isolate specific sounds from audio files using text prompts.
|
| 17 |
+
|
| 18 |
+
## Features
|
| 19 |
+
|
| 20 |
+
- **Model**: Uses `facebook/sam-audio-small` for a balance of performance and resource usage.
|
| 21 |
+
- **ZeroGPU Support**: Optimized to run on Hugging Face ZeroGPU (A100/A10G) with automatic GPU handling.
|
| 22 |
+
- **Dynamic Fallback**:
|
| 23 |
+
- Attempts to load the model in `float16` for best quality.
|
| 24 |
+
- Falls back to **8-bit quantization** (`bitsandbytes`) if VRAM is insufficient.
|
| 25 |
+
- **Audio Reconstruction**: Converts model masks to audio using STFT/ISTFT processing.
|
| 26 |
+
|
| 27 |
+
## Local Development
|
| 28 |
+
|
| 29 |
+
To run this application locally on your machine:
|
| 30 |
+
|
| 31 |
+
1. **Clone the repository:**
|
| 32 |
+
```bash
|
| 33 |
+
git clone https://huggingface.co/spaces/lpeterl/sam-audio-webui
|
| 34 |
+
cd sam-audio-webui
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
2. **Create a virtual environment (Recommended):**
|
| 38 |
+
```bash
|
| 39 |
+
python3 -m venv venv
|
| 40 |
+
source venv/bin/activate
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
3. **Install dependencies:**
|
| 44 |
+
```bash
|
| 45 |
+
pip install -r requirements.txt
|
| 46 |
+
pip install gradio
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
4. **Run the app:**
|
| 50 |
+
```bash
|
| 51 |
+
python3 app.py
|
| 52 |
+
```
|
| 53 |
+
*Note: `spaces` GPU decorators are mocked locally, so you don't need a ZeroGPU environment.*
|
| 54 |
+
|
| 55 |
+
## System Requirements
|
| 56 |
+
|
| 57 |
+
- **VRAM**: ~21.6 GB for standard loading. ~12 GB with 8-bit quantization.
|
| 58 |
+
- **Platform**: CUDA (NVIDIA GPU) required for quantization. Mac (MPS) supported for standard loading (requires high unified memory).
|
| 59 |
+
|
| 60 |
+
## Acknowledgements
|
| 61 |
+
|
| 62 |
+
- Model: [facebook/sam-audio](https://huggingface.co/facebook/sam-audio)
|
| 63 |
+
- Library: [Hugging Face Transformers](https://huggingface.co/docs/transformers/index)
|
app.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import torch
|
| 3 |
+
|
| 12 |
+
|
| 13 |
+
from transformers import AutoProcessor, AutoModelForAudioSegmentation
|
| 14 |
+
import numpy as np
|
| 15 |
+
import librosa
|
| 16 |
+
import tempfile
|
| 17 |
+
import soundfile as sf
|
| 18 |
+
|
| 19 |
+
# Model configuration
|
| 20 |
+
MODEL_ID = "facebook/sam-audio-small"
|
| 21 |
+
|
| 22 |
+
print(f"Loading model: {MODEL_ID}...")
|
| 23 |
+
try:
|
| 24 |
+
processor = AutoProcessor.from_pretrained(MODEL_ID)
|
| 25 |
+
model = AutoModelForAudioSegmentation.from_pretrained(
|
| 26 |
+
MODEL_ID,
|
| 27 |
+
device_map="auto",
|
| 28 |
+
torch_dtype=torch.float16
|
| 29 |
+
)
|
| 30 |
+
print("Model loaded successfully.")
|
| 31 |
+
except Exception as e:
|
| 32 |
+
print(f"Error loading model: {e}")
|
| 33 |
+
print("Attempting to load with 8-bit quantization...")
|
| 34 |
+
try:
|
| 35 |
+
from transformers import BitsAndBytesConfig
|
| 36 |
+
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
| 37 |
+
processor = AutoProcessor.from_pretrained(MODEL_ID)
|
| 38 |
+
model = AutoModelForAudioSegmentation.from_pretrained(
|
| 39 |
+
MODEL_ID,
|
| 40 |
+
quantization_config=quantization_config,
|
| 41 |
+
device_map="auto"
|
| 42 |
+
)
|
| 43 |
+
print("Model loaded with 8-bit quantization.")
|
| 44 |
+
except Exception as e2:
|
| 45 |
+
print(f"Critical error loading model: {e2}")
|
| 46 |
+
raise e2
|
| 47 |
+
|
| 48 |
+
@spaces.GPU(duration=120)
|
| 49 |
+
def infer(audio_path, prompt_text):
|
| 50 |
+
if not audio_path:
|
| 51 |
+
return None
|
| 52 |
+
|
| 53 |
+
print(f"Processing audio: {audio_path}, Prompt: {prompt_text}")
|
| 54 |
+
|
| 55 |
+
# Load audio with librosa (standardizes sample rate)
|
| 56 |
+
target_sr = 16000 # SAM Audio often works at 16k, or check processor.feature_extractor.sampling_rate
|
| 57 |
+
if hasattr(processor, "feature_extractor"):
|
| 58 |
+
target_sr = processor.feature_extractor.sampling_rate
|
| 59 |
+
|
| 60 |
+
audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
|
| 61 |
+
|
| 62 |
+
# Prepare inputs
|
| 63 |
+
inputs = processor(
|
| 64 |
+
audios=[audio],
|
| 65 |
+
sampling_rate=sr,
|
| 66 |
+
text=[[prompt_text]] if prompt_text else None,
|
| 67 |
+
return_tensors="pt"
|
| 68 |
+
).to(model.device)
|
| 69 |
+
|
| 70 |
+
with torch.no_grad():
|
| 71 |
+
outputs = model(**inputs)
|
| 72 |
+
|
| 73 |
+
# Post-process to get likelihoods or masks
|
| 74 |
+
# Note: transformers implementation details vary.
|
| 75 |
+
# Usually we get logits. sigmoid -> prob.
|
| 76 |
+
# pred_masks shape: (batch_size, num_masks, freq, time) or similar.
|
| 77 |
+
|
| 78 |
+
pred_masks = torch.sigmoid(outputs.pred_masks)
|
| 79 |
+
|
| 80 |
+
# For audio reconstruction, we need to apply this mask to the STFT of the original audio.
|
| 81 |
+
# We calculate STFT using the same parameters as the model training if possible.
|
| 82 |
+
# If parameters are unknown, we try standard values or rely on processor logic if available.
|
| 83 |
+
|
| 84 |
+
# Standard STFT for AudioLDM/MusicGen etc often use n_fft=1024, hop=160.
|
| 85 |
+
# Let's inspect the mask shape to infer Time dimensions.
|
| 86 |
+
|
| 87 |
+
mask = pred_masks[0, 0] # Take first batch, first predicted mask
|
| 88 |
+
# Resize mask to inputs size if needed?
|
| 89 |
+
# Usually SAM Audio outputs a mask corresponding to the spectrogram features.
|
| 90 |
+
|
| 91 |
+
# Let's try to reconstruct using a generic STFT approach
|
| 92 |
+
n_fft = 1024
|
| 93 |
+
hop_length = 320 # Common for 16k
|
| 94 |
+
stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
|
| 95 |
+
|
| 96 |
+
# stft shape: (1 + n_fft/2, time_frames)
|
| 97 |
+
# mask shape from model might be different. Resize mask to match stft.
|
| 98 |
+
|
| 99 |
+
# Convert mask to numpy
|
| 100 |
+
mask_np = mask.cpu().float().numpy()
|
| 101 |
+
|
| 102 |
+
# Resize mask to match STFT shape
|
| 103 |
+
# stft.shape is (freq, time)
|
| 104 |
+
import cv2
|
| 105 |
+
# cv2.resize expects (width, height) -> (time, freq)
|
| 106 |
+
try:
|
| 107 |
+
mask_resized = cv2.resize(mask_np, (stft.shape[1], stft.shape[0]), interpolation=cv2.INTER_LINEAR)
|
| 108 |
+
# Apply mask
|
| 109 |
+
stft_masked = stft * mask_resized
|
| 110 |
+
# ISTFT
|
| 111 |
+
audio_masked = librosa.istft(stft_masked, hop_length=hop_length)
|
| 112 |
+
|
| 113 |
+
# Save to temp file
|
| 114 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
| 115 |
+
sf.write(tmp.name, audio_masked, sr)
|
| 116 |
+
return tmp.name
|
| 117 |
+
except Exception as e_resize:
|
| 118 |
+
print(f"Error applying mask: {e_resize}. Returning original for debug.")
|
| 119 |
+
# Fallback to saving original just to show partial success
|
| 120 |
+
return audio_path
|
| 121 |
+
|
| 122 |
+
with gr.Blocks() as demo:
|
| 123 |
+
gr.Markdown(f"# SAM Audio WebUI ({MODEL_ID})")
|
| 124 |
+
gr.Markdown("Upload audio and provide a prompt to segment specific sounds.")
|
| 125 |
+
|
| 126 |
+
with gr.Row():
|
| 127 |
+
audio_input = gr.Audio(type="filepath", label="Input Audio")
|
| 128 |
+
text_input = gr.Textbox(label="Prompt (e.g., 'drums', 'vocals')")
|
| 129 |
+
|
| 130 |
+
submit_btn = gr.Button("Segment Audio")
|
| 131 |
+
audio_output = gr.Audio(label="Segmented Audio")
|
| 132 |
+
|
| 133 |
+
submit_btn.click(
|
| 134 |
+
fn=infer,
|
| 135 |
+
inputs=[audio_input, text_input],
|
| 136 |
+
outputs=[audio_output]
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
if __name__ == "__main__":
|
| 140 |
+
demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
| 2 |
+
torch>=2.0.0
|
| 3 |
+
transformers>=4.38.0
|
| 4 |
+
accelerate>=0.27.0
|
| 5 |
+
bitsandbytes>=0.41.0
|
| 6 |
+
scipy
|
| 7 |
+
librosa
|
| 8 |
+
opencv-python-headless
|
| 9 |
+
spaces
|
| 10 |
+
|
| 11 |
+
|