Spaces:

lpeterl
/

sam-audio-webui

Running on Zero

App Files Files Community

Peter Shi committed on Dec 20, 2025

Commit

2922fa7

1 Parent(s): b02c18a

Add MP4 and video file support

Browse files

Files changed (1) hide show

app.py +168 -47

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import torch
 import torchaudio
 import tempfile
 import warnings
 warnings.filterwarnings("ignore")
 from sam_audio import SAMAudio, SAMAudioProcessor
@@ -11,96 +12,216 @@ from sam_audio import SAMAudio, SAMAudioProcessor
 # Configuration
 MODEL_NAME = "facebook/sam-audio-small"
-# Load model and processor (following official HuggingFace example)
 print(f"Loading {MODEL_NAME}...")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
 print(f"Model loaded on {device}.")
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         torchaudio.save(tmp.name, tensor, sample_rate)
         return tmp.name
 @spaces.GPU(duration=300)
-def separate_audio(audio_path, text_prompt):
-    if not audio_path:
-        return None, None, "❌ Please upload an audio file."
     if not text_prompt or not text_prompt.strip():
-        text_prompt = "vocals"
     try:
-        # Process and separate (following official example)
         inputs = processor(
-            audios=[audio_path],
             descriptions=[text_prompt.strip()]
         ).to(device)
         with torch.inference_mode():
             result = model.separate(inputs, predict_spans=False, reranking_candidates=1)
-        # Save results (following official example: result.target[0].unsqueeze(0).cpu())
         sample_rate = processor.audio_sampling_rate
         target_path = save_audio(result.target[0].unsqueeze(0).cpu(), sample_rate)
         residual_path = save_audio(result.residual[0].unsqueeze(0).cpu(), sample_rate)
-        return target_path, residual_path, f"✅ Successfully separated '{text_prompt}' from the audio."
     except Exception as e:
         import traceback
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
 # Build Gradio Interface
 with gr.Blocks(
-    theme=gr.themes.Soft(),
-    title="SAM-Audio - Segment Anything for Audio"
 ) as demo:
-    gr.Markdown(
-        """
-        # 🎵 SAM-Audio: Segment Anything for Audio
-        Isolate specific sounds from an audio file using natural language prompts.
-        **Model:** [facebook/sam-audio-small](https://huggingface.co/facebook/sam-audio-small)
-        """
-    )
     with gr.Row():
-        with gr.Column():
-            input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
             text_prompt = gr.Textbox(
-                label="Text Prompt",
-                placeholder="e.g., 'A man speaking', 'Piano playing', 'Dog barking'",
-                value="A man speaking",
-                info="Describe the sound you want to isolate."
             )
-            run_btn = gr.Button("🎯 Separate Audio", variant="primary", size="lg")
-        with gr.Column():
-            output_target = gr.Audio(label="Isolated Sound (Target)")
-            output_residual = gr.Audio(label="Background (Residual)")
-    info_output = gr.Markdown(value="📝 Upload an audio file and enter a prompt to start.")
     run_btn.click(
-        fn=separate_audio,
-        inputs=[input_audio, text_prompt],
-        outputs=[output_target, output_residual, info_output]
-    )
-    gr.Markdown(
-        """
-        ### Example Prompts
-        - "A person coughing"
-        - "Piano playing a melody"
-        - "Dog barking"
-        - "Car engine revving"
-        - "Raindrops falling"
-        """
     )
 if __name__ == "__main__":

 import torchaudio
 import tempfile
 import warnings
+import os
 warnings.filterwarnings("ignore")
 from sam_audio import SAMAudio, SAMAudioProcessor
 # Configuration
 MODEL_NAME = "facebook/sam-audio-small"
+# Load model and processor
 print(f"Loading {MODEL_NAME}...")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
 print(f"Model loaded on {device}.")
+# Supported file extensions
+SUPPORTED_EXTENSIONS = ['.mp3', '.wav', '.flac', '.ogg', '.m4a', '.mp4', '.mkv', '.avi', '.mov', '.webm']
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         torchaudio.save(tmp.name, tensor, sample_rate)
         return tmp.name
+def validate_file(file_path):
+    """Check if file extension is supported."""
+    if not file_path:
+        return False, "No file uploaded"
+    ext = os.path.splitext(file_path)[1].lower()
+    if ext not in SUPPORTED_EXTENSIONS:
+        return False, f"Unsupported format: {ext}. Supported: {', '.join(SUPPORTED_EXTENSIONS)}"
+    return True, "OK"
 @spaces.GPU(duration=300)
+def separate_audio(file_path, text_prompt):
+    if not file_path:
+        return None, None, "❌ Please upload an audio or video file."
+    # Validate file
+    valid, msg = validate_file(file_path)
+    if not valid:
+        return None, None, f"❌ {msg}"
     if not text_prompt or not text_prompt.strip():
+        return None, None, "❌ Please enter a text prompt describing the sound to isolate."
     try:
+        # SAM-Audio processor accepts both audio and video files directly
         inputs = processor(
+            audios=[file_path],
             descriptions=[text_prompt.strip()]
         ).to(device)
         with torch.inference_mode():
             result = model.separate(inputs, predict_spans=False, reranking_candidates=1)
         sample_rate = processor.audio_sampling_rate
         target_path = save_audio(result.target[0].unsqueeze(0).cpu(), sample_rate)
         residual_path = save_audio(result.residual[0].unsqueeze(0).cpu(), sample_rate)
+        return target_path, residual_path, f"✅ Successfully isolated **'{text_prompt}'**"
     except Exception as e:
         import traceback
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
+# Custom CSS for dark theme
+custom_css = """
+.gradio-container {
+    background: #0a0a0a !important;
+    max-width: 1400px !important;
+}
+.upload-box {
+    border: 2px dashed #444 !important;
+    border-radius: 12px !important;
+    background: #1a1a1a !important;
+    min-height: 200px !important;
+    transition: border-color 0.3s !important;
+}
+.upload-box:hover {
+    border-color: #e91e8c !important;
+}
+.result-card {
+    background: #1a1a1a !important;
+    border: 1px solid #333 !important;
+    border-radius: 12px !important;
+    padding: 1rem !important;
+}
+.primary-btn {
+    background: linear-gradient(135deg, #e91e8c, #9c27b0) !important;
+    border: none !important;
+    border-radius: 24px !important;
+}
+.sidebar-text {
+    color: #888 !important;
+    font-size: 0.9rem !important;
+}
+.step-text {
+    color: #ccc !important;
+    padding: 0.3rem 0 !important;
+}
+.pink-text {
+    color: #e91e8c !important;
+}
+"""
 # Build Gradio Interface
 with gr.Blocks(
+    title="SAM-Audio - Isolate Sounds",
+    theme=gr.themes.Base(
+        primary_hue="pink",
+        secondary_hue="purple",
+        neutral_hue="gray",
+    ).set(
+        body_background_fill="#0a0a0a",
+        body_background_fill_dark="#0a0a0a",
+        block_background_fill="#1a1a1a",
+        block_background_fill_dark="#1a1a1a",
+        input_background_fill="#1a1a1a",
+        input_background_fill_dark="#1a1a1a",
+        button_primary_background_fill="linear-gradient(135deg, #e91e8c, #9c27b0)",
+        button_primary_background_fill_hover="linear-gradient(135deg, #d1187d, #8a22a0)",
+        border_color_primary="#333",
+    ),
+    css=custom_css
 ) as demo:
     with gr.Row():
+        # Sidebar
+        with gr.Column(scale=1, min_width=250):
+            gr.Markdown("## 🎵 Isolate Sounds")
+            gr.Markdown("Extract and isolate any sound from audio or video using AI.", elem_classes=["sidebar-text"])
+            gr.Markdown("---")
+            gr.Markdown("### How it works")
+            gr.Markdown("**1.** Add audio or video", elem_classes=["step-text"])
+            gr.Markdown("**2.** Describe the sound", elem_classes=["step-text"])
+            gr.Markdown("**3.** Get separated tracks", elem_classes=["step-text"])
+            gr.Markdown("---")
+            gr.Markdown("**Model**")
+            gr.Markdown("🤖 SAM-Audio Small")
+            gr.Markdown("---")
+            gr.Markdown("**Supported Formats**")
+            gr.Markdown("🎵 MP3, WAV, FLAC, OGG, M4A", elem_classes=["sidebar-text"])
+            gr.Markdown("🎬 MP4, MKV, AVI, MOV, WebM", elem_classes=["sidebar-text"])
+        # Main content area
+        with gr.Column(scale=4):
+            gr.Markdown("### 📤 Upload Audio or Video")
+            # Use File component to accept both audio and video
+            input_file = gr.File(
+                label="Drop your audio or video file here",
+                file_types=SUPPORTED_EXTENSIONS,
+                elem_classes=["upload-box"]
+            )
+            gr.Markdown("### 💬 Describe the Sound to Isolate")
             text_prompt = gr.Textbox(
+                label="",
+                placeholder="e.g., 'A man speaking', 'Piano melody', 'Dog barking', 'Background music'",
+                lines=1
             )
+            with gr.Row():
+                run_btn = gr.Button(
+                    "🎯 Isolate Sound",
+                    variant="primary",
+                    size="lg",
+                    elem_classes=["primary-btn"]
+                )
+            status_output = gr.Markdown(
+                value="*Upload a file and describe what sound you want to isolate.*"
+            )
+            gr.Markdown("---")
+            gr.Markdown("### 🎧 Results")
+            with gr.Row():
+                with gr.Column(elem_classes=["result-card"]):
+                    gr.Markdown("**🎯 Isolated Sound** (Target)")
+                    output_target = gr.Audio(label="", show_label=False)
+                with gr.Column(elem_classes=["result-card"]):
+                    gr.Markdown("**🔇 Background** (Residual)")
+                    output_residual = gr.Audio(label="", show_label=False)
+            gr.Markdown("---")
+            gr.Markdown("### 💡 Example Prompts")
+            gr.Markdown("Click any example below to use it:")
+            with gr.Row():
+                for prompt in ["A man speaking", "A woman singing", "Piano", "Drums", "Guitar", "Dog barking"]:
+                    gr.Button(prompt, size="sm").click(
+                        fn=lambda p=prompt: p,
+                        outputs=[text_prompt]
+                    )
+    def process_file(file, prompt):
+        if file is None:
+            return None, None, "❌ Please upload a file."
+        return separate_audio(file.name, prompt)
     run_btn.click(
+        fn=process_file,
+        inputs=[input_file, text_prompt],
+        outputs=[output_target, output_residual, status_output]
     )
 if __name__ == "__main__":