Spaces: Running on Zero

Peter Shi committed · Commit d4c742d · 1 parent: 0a54840

Switch to Docker SDK with Python 3.12

Files changed:
- Dockerfile +27 -0
- README.md +2 -5
- app.py +71 -128
- requirements.txt +4 -9
Dockerfile ADDED
@@ -0,0 +1,27 @@
+# Use Python 3.12 to satisfy the 'perception-models' requirement
+FROM python:3.12
+
+# Set the working directory
+WORKDIR /code
+
+# Install system dependencies (ffmpeg is required for audio)
+RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Set up a user (required by HF Spaces security)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR $HOME/app
+
+# Copy application files
+COPY --chown=user . $HOME/app
+
+# Start the app
+CMD ["python", "app.py"]
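A quick way to verify the new image locally is to build and run it with the port published, then probe the port the README declares (app_port: 7860). The snippet below is a minimal sketch: the image tag sam-audio-webui is illustrative, and it assumes the container was started with something like `docker build -t sam-audio-webui . && docker run -p 7860:7860 sam-audio-webui`.

# Minimal smoke test for the container (assumptions: image already built and
# running with port 7860 published, e.g. docker run -p 7860:7860 <image>).
import urllib.request

# app.py binds 0.0.0.0:7860 inside the container, matching app_port in README.md.
resp = urllib.request.urlopen("http://localhost:7860", timeout=10)
print(resp.status)  # expect 200 once the model has finished loading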
README.md CHANGED
@@ -3,13 +3,10 @@ title: Sam Audio Webui
 emoji: 🎵
 colorFrom: indigo
 colorTo: pink
-sdk: gradio
-
-app_file: app.py
+sdk: docker
+app_port: 7860
 pinned: false
 license: apache-2.0
-fullWidth: true
-python_version: 3.11
 ---
 
 # SAM Audio WebUI
app.py CHANGED
@@ -1,148 +1,91 @@
 import gradio as gr
 import torch
-try:
-    import spaces
-except ImportError:
-    class spaces:
-        @staticmethod
-        def GPU(duration=60):
-            def decorator(func):
-                return func
-            return decorator
-
-import gradio as gr
-import torch
-try:
-    import spaces
-except ImportError:
-    class spaces:
-        @staticmethod
-        def GPU(duration=60):
-            def decorator(func):
-                return func
-            return decorator
-
-from sam_audio import SAMAudio, SAMAudioProcessor
-import numpy as np
-import librosa
+import torchaudio
 import tempfile
-import soundfile as sf
+from sam_audio import SAMAudio, SAMAudioProcessor
 
-# Model configuration
-MODEL_ID = "facebook/sam-audio-small"
+# Configuration
+MODEL_NAME = "facebook/sam-audio-small"
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
-print(f"Loading {MODEL_ID}...")
+print(f"Loading {MODEL_NAME} on {device}...")
+
+# Load Model and Processor
 try:
-    processor = SAMAudioProcessor.from_pretrained(MODEL_ID)
-    model = SAMAudio.from_pretrained(
-        MODEL_ID,
-        device_map="auto",
-        torch_dtype=torch.float16
-    )
+    model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
+    processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
     print("Model loaded successfully.")
 except Exception as e:
-    print(f"Error loading model: {e}")
-
-    # The model class may not accept torch_dtype
-    # unless it inherits correctly. Let's try standard float32 if float16 fails, or keep the error.
-    print("Retrying with default precision...")
-    try:
-        processor = SAMAudioProcessor.from_pretrained(MODEL_ID)
-        model = SAMAudio.from_pretrained(MODEL_ID, device_map="auto")
-        print("Model loaded with default precision.")
-    except Exception as e2:
-        print(f"Critical error loading model: {e2}")
-        raise e2
+    print(f"Error loading model. Did you set HF_TOKEN in secrets? Error: {e}")
+    raise e
 
-
-@spaces.GPU(duration=60)
-def separate_audio(audio_path, prompt_text):
+def save_audio(tensor, sample_rate):
+    """Helper to save torch tensor to a temp file for Gradio output."""
+    if tensor.dim() == 1:
+        tensor = tensor.unsqueeze(0)
+    tensor = tensor.detach().cpu()
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        torchaudio.save(tmp.name, tensor, sample_rate)
+    return tmp.name
+
+def separate_audio(audio_path, text_prompt):
     if not audio_path:
-        return None
-
-    print(f"Processing audio: {audio_path}, Prompt: {prompt_text}")
+        return None, None
 
-    # Load audio
-    target_sr = 16000  # SAM Audio often works at 16k, or check processor.feature_extractor.sampling_rate
-    if hasattr(processor, "feature_extractor"):
-        target_sr = processor.feature_extractor.sampling_rate
-
-    audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
-
-    # Prepare inputs
+    # Process Inputs
    inputs = processor(
-        audios=[audio],
-        descriptions=[prompt_text],
-        sampling_rate=target_sr,
-        return_tensors="pt"
-    ).to(model.device)
+        audios=[audio_path],
+        descriptions=[text_prompt]
+    ).to(device)
 
+    # Inference
     with torch.no_grad():
-        outputs = model(**inputs)
-
-        # The model predicts masks over the input spectrogram.
-        # Apply a sigmoid to turn the logits into a soft mask.
-
-        # pred_masks shape: (batch_size, num_masks, freq, time) or similar.
-
-        pred_masks = torch.sigmoid(outputs.pred_masks)
-
-        # For audio reconstruction, we need to apply this mask to the STFT of the original audio.
-        # We calculate STFT using the same parameters as the model training if possible.
-        # If parameters are unknown, we try standard values or rely on processor logic if available.
-
-        # Standard STFT for AudioLDM/MusicGen etc often use n_fft=1024, hop=160.
-        # Let's inspect the mask shape to infer Time dimensions.
-
-        mask = pred_masks[0, 0]  # Take first batch, first predicted mask
-        # Resize mask to inputs size if needed?
-        # Usually SAM Audio outputs a mask corresponding to the spectrogram features.
-
-        # Let's try to reconstruct using a generic STFT approach
-        n_fft = 1024
-        hop_length = 320  # Common for 16k
-        stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
-
-        # stft shape: (1 + n_fft/2, time_frames)
-        # mask shape from model might be different. Resize mask to match stft.
-
-        # Convert mask to numpy
-        mask_np = mask.cpu().float().numpy()
+        result = model.separate(inputs)
+
+    # Extract Outputs
+    target_audio = result.target[0]  # The sound you asked for
+    residual_audio = result.residual[0]  # Everything else
 
-        # Resize the mask to the STFT resolution
-
-        import cv2
-        # cv2.resize expects (width, height) -> (time, freq)
-        try:
-            mask_resized = cv2.resize(mask_np, (stft.shape[1], stft.shape[0]), interpolation=cv2.INTER_LINEAR)
-            # Apply mask
-            stft_masked = stft * mask_resized
-            # ISTFT
-            audio_masked = librosa.istft(stft_masked, hop_length=hop_length)
-
-            # Save to temp file
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-                sf.write(tmp.name, audio_masked, sr)
-                return tmp.name
-        except Exception as e_resize:
-            print(f"Error applying mask: {e_resize}. Returning original for debug.")
-            # Fallback to saving original just to show partial success
-            return audio_path
+    # Get sampling rate from the processor config
+    sr = processor.feature_extractor.sampling_rate
 
-
-
-with gr.Blocks() as demo:
+    # Save to files
+    target_path = save_audio(target_audio, sr)
+    residual_path = save_audio(residual_audio, sr)
+
+    return target_path, residual_path
+
+# Build Gradio Interface
+with gr.Blocks(title="SAM-Audio Demo") as demo:
+    gr.Markdown(
+        """
+        # 🎵 SAM-Audio: Segment Anything for Audio
+        Isolate specific sounds from an audio file using natural language prompts.
+
+        **Model:** `facebook/sam-audio-small`
+        """
+    )
 
     with gr.Row():
-        with gr.Column():
-            input_audio = gr.Audio(label="Input Audio", type="filepath")
-            prompt_text = gr.Textbox(label="Prompt")
-            run_btn = gr.Button("Separate")
-        with gr.Column():
-            output_audio = gr.Audio(label="Output")
-    run_btn.click(
-        fn=separate_audio,
-        inputs=[input_audio, prompt_text],
-        outputs=[output_audio]
+        with gr.Column():
+            input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
+            text_prompt = gr.Textbox(
+                label="Text Prompt",
+                placeholder="e.g., 'dog barking', 'man speaking', 'typing keyboard'",
+                info="Describe the sound you want to isolate."
+            )
+            run_btn = gr.Button("Separate Audio", variant="primary")
+
+        with gr.Column():
+            output_target = gr.Audio(label="Isolated Sound (Target)")
+            output_residual = gr.Audio(label="Background (Residual)")
+
+    run_btn.click(
+        fn=separate_audio,
+        inputs=[input_audio, text_prompt],
+        outputs=[output_target, output_residual]
    )
 
-demo.launch()
-
+# Launch
+demo.queue().launch(server_name="0.0.0.0", server_port=7860)
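For reference, the new pipeline can also be driven headlessly. The sketch below assumes the sam_audio API exactly as exercised in the new app.py (processor(audios=..., descriptions=...), model.separate(...), result.target / result.residual); mix.wav and the prompt are placeholder inputs, and the outputs are assumed to be 1-D mono tensors, matching the dim() == 1 case handled by save_audio() above.

# Headless version of the separate_audio() flow; "mix.wav" and the prompt
# are placeholders.
import torch
import torchaudio
from sam_audio import SAMAudio, SAMAudioProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SAMAudio.from_pretrained("facebook/sam-audio-small").to(device).eval()
processor = SAMAudioProcessor.from_pretrained("facebook/sam-audio-small")

inputs = processor(audios=["mix.wav"], descriptions=["dog barking"]).to(device)
with torch.no_grad():
    result = model.separate(inputs)

sr = processor.feature_extractor.sampling_rate
# torchaudio.save expects a (channels, frames) tensor, hence the unsqueeze(0)
# on the assumed 1-D mono outputs.
torchaudio.save("target.wav", result.target[0].detach().cpu().unsqueeze(0), sr)
torchaudio.save("residual.wav", result.residual[0].detach().cpu().unsqueeze(0), sr)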
requirements.txt CHANGED
@@ -1,11 +1,6 @@
-gradio>=4.0.0
-torch>=2.0.0
-transformers>=4.38.0
-accelerate>=0.27.0
-bitsandbytes>=0.41.0
-scipy
-librosa
-opencv-python-headless
-spaces
 git+https://github.com/facebookresearch/sam-audio.git
+torch
 torchaudio
+gradio
+numpy
+scipy