Spaces:

lpeterl
/

sam-audio-webui

Running on Zero

App Files Files Community

Peter Shi committed on Dec 20, 2025

Commit

832604f

1 Parent(s): 2922fa7

Add video preview with tabs for audio/video upload

Browse files

Files changed (1) hide show

app.py +79 -103

app.py CHANGED Viewed

@@ -19,39 +19,24 @@ model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
 print(f"Model loaded on {device}.")
-# Supported file extensions
-SUPPORTED_EXTENSIONS = ['.mp3', '.wav', '.flac', '.ogg', '.m4a', '.mp4', '.mkv', '.avi', '.mov', '.webm']
 def save_audio(tensor, sample_rate):
     """Save a torch tensor to a temporary .wav file for Gradio output.
     ``tensor`` and ``sample_rate`` are passed unchanged to
     ``torchaudio.save``; the path of the written file is returned. The
     file is created with ``delete=False``, so it outlives this call and
     nothing in this helper removes it.
     """
     # Reserve a unique path, then close the handle before writing: a
     # NamedTemporaryFile that is still open cannot be reopened by name on
     # Windows, so saving inside the ``with`` block would fail there.
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         wav_path = tmp.name
     torchaudio.save(wav_path, tensor, sample_rate)
     return wav_path
-def validate_file(file_path):
-    """Check if file extension is supported."""
-    if not file_path:
-        return False, "No file uploaded"
-    ext = os.path.splitext(file_path)[1].lower()
-    if ext not in SUPPORTED_EXTENSIONS:
-        return False, f"Unsupported format: {ext}. Supported: {', '.join(SUPPORTED_EXTENSIONS)}"
-    return True, "OK"
 @spaces.GPU(duration=300)
-def separate_audio(file_path, text_prompt):
     if not file_path:
         return None, None, "❌ Please upload an audio or video file."
-    # Validate file
-    valid, msg = validate_file(file_path)
-    if not valid:
-        return None, None, f"❌ {msg}"
     if not text_prompt or not text_prompt.strip():
         return None, None, "❌ Please enter a text prompt describing the sound to isolate."
     try:
-        # SAM-Audio processor accepts both audio and video files directly
         inputs = processor(
             audios=[file_path],
             descriptions=[text_prompt.strip()]
@@ -71,156 +56,147 @@ def separate_audio(file_path, text_prompt):
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
-# Custom CSS for dark theme
 custom_css = """
 .gradio-container {
-    background: #0a0a0a !important;
     max-width: 1400px !important;
 }
-.upload-box {
-    border: 2px dashed #444 !important;
-    border-radius: 12px !important;
-    background: #1a1a1a !important;
-    min-height: 200px !important;
-    transition: border-color 0.3s !important;
 }
-.upload-box:hover {
-    border-color: #e91e8c !important;
 }
-.result-card {
-    background: #1a1a1a !important;
-    border: 1px solid #333 !important;
-    border-radius: 12px !important;
-    padding: 1rem !important;
 }
 .primary-btn {
     background: linear-gradient(135deg, #e91e8c, #9c27b0) !important;
     border: none !important;
     border-radius: 24px !important;
 }
-.sidebar-text {
-    color: #888 !important;
-    font-size: 0.9rem !important;
 }
-.step-text {
-    color: #ccc !important;
-    padding: 0.3rem 0 !important;
 }
-.pink-text {
-    color: #e91e8c !important;
 }
 """
 # Build Gradio Interface
 with gr.Blocks(
     title="SAM-Audio - Isolate Sounds",
-    theme=gr.themes.Base(
         primary_hue="pink",
-        secondary_hue="purple",
         neutral_hue="gray",
-    ).set(
-        body_background_fill="#0a0a0a",
-        body_background_fill_dark="#0a0a0a",
-        block_background_fill="#1a1a1a",
-        block_background_fill_dark="#1a1a1a",
-        input_background_fill="#1a1a1a",
-        input_background_fill_dark="#1a1a1a",
-        button_primary_background_fill="linear-gradient(135deg, #e91e8c, #9c27b0)",
-        button_primary_background_fill_hover="linear-gradient(135deg, #d1187d, #8a22a0)",
-        border_color_primary="#333",
     ),
     css=custom_css
 ) as demo:
     with gr.Row():
         # Sidebar
-        with gr.Column(scale=1, min_width=250):
-            gr.Markdown("## 🎵 Isolate Sounds")
-            gr.Markdown("Extract and isolate any sound from audio or video using AI.", elem_classes=["sidebar-text"])
             gr.Markdown("---")
             gr.Markdown("### How it works")
-            gr.Markdown("**1.** Add audio or video", elem_classes=["step-text"])
-            gr.Markdown("**2.** Describe the sound", elem_classes=["step-text"])
-            gr.Markdown("**3.** Get separated tracks", elem_classes=["step-text"])
-            gr.Markdown("---")
-            gr.Markdown("**Model**")
-            gr.Markdown("🤖 SAM-Audio Small")
             gr.Markdown("---")
-            gr.Markdown("**Supported Formats**")
-            gr.Markdown("🎵 MP3, WAV, FLAC, OGG, M4A", elem_classes=["sidebar-text"])
-            gr.Markdown("🎬 MP4, MKV, AVI, MOV, WebM", elem_classes=["sidebar-text"])
         # Main content area
         with gr.Column(scale=4):
-            gr.Markdown("### 📤 Upload Audio or Video")
-            # Use File component to accept both audio and video
-            input_file = gr.File(
-                label="Drop your audio or video file here",
-                file_types=SUPPORTED_EXTENSIONS,
-                elem_classes=["upload-box"]
-            )
-            gr.Markdown("### 💬 Describe the Sound to Isolate")
             text_prompt = gr.Textbox(
                 label="",
-                placeholder="e.g., 'A man speaking', 'Piano melody', 'Dog barking', 'Background music'",
                 lines=1
             )
-            with gr.Row():
-                run_btn = gr.Button(
-                    "🎯 Isolate Sound",
-                    variant="primary",
-                    size="lg",
-                    elem_classes=["primary-btn"]
-                )
             status_output = gr.Markdown(
-                value="*Upload a file and describe what sound you want to isolate.*"
             )
             gr.Markdown("---")
-            gr.Markdown("### 🎧 Results")
             with gr.Row():
-                with gr.Column(elem_classes=["result-card"]):
-                    gr.Markdown("**🎯 Isolated Sound** (Target)")
                     output_target = gr.Audio(label="", show_label=False)
-                with gr.Column(elem_classes=["result-card"]):
-                    gr.Markdown("**🔇 Background** (Residual)")
                     output_residual = gr.Audio(label="", show_label=False)
             gr.Markdown("---")
-            gr.Markdown("### 💡 Example Prompts")
-            gr.Markdown("Click any example below to use it:")
             with gr.Row():
-                for prompt in ["A man speaking", "A woman singing", "Piano", "Drums", "Guitar", "Dog barking"]:
-                    gr.Button(prompt, size="sm").click(
-                        fn=lambda p=prompt: p,
-                        outputs=[text_prompt]
-                    )
-    def process_file(file, prompt):
-        if file is None:
-            return None, None, "❌ Please upload a file."
-        return separate_audio(file.name, prompt)
     run_btn.click(
-        fn=process_file,
-        inputs=[input_file, text_prompt],
         outputs=[output_target, output_residual, status_output]
     )

 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
 print(f"Model loaded on {device}.")
 def save_audio(tensor, sample_rate):
     """Save a torch tensor to a temporary .wav file for Gradio output.
     ``tensor`` and ``sample_rate`` are passed unchanged to
     ``torchaudio.save``; the path of the written file is returned. The
     file is created with ``delete=False``, so it outlives this call and
     nothing in this helper removes it.
     """
     # Reserve a unique path, then close the handle before writing: a
     # NamedTemporaryFile that is still open cannot be reopened by name on
     # Windows, so saving inside the ``with`` block would fail there.
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         wav_path = tmp.name
     torchaudio.save(wav_path, tensor, sample_rate)
     return wav_path
 @spaces.GPU(duration=300)
+def separate_audio(audio_path, video_path, text_prompt):
+    # Determine which input to use
+    file_path = video_path if video_path else audio_path
     if not file_path:
         return None, None, "❌ Please upload an audio or video file."
     if not text_prompt or not text_prompt.strip():
         return None, None, "❌ Please enter a text prompt describing the sound to isolate."
     try:
         inputs = processor(
             audios=[file_path],
             descriptions=[text_prompt.strip()]
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
+# Custom CSS
 custom_css = """
 .gradio-container {
+    background: #0d0d0d !important;
     max-width: 1400px !important;
 }
+.gradio-container *, .gradio-container p, .gradio-container span,
+.gradio-container h1, .gradio-container h2, .gradio-container h3,
+.gradio-container label, .gradio-container .markdown-text {
+    color: #ffffff !important;
 }
+input, textarea {
+    background: #1a1a1a !important;
+    border: 1px solid #444 !important;
+    color: #ffffff !important;
 }
+input::placeholder, textarea::placeholder {
+    color: #888 !important;
 }
 .primary-btn {
     background: linear-gradient(135deg, #e91e8c, #9c27b0) !important;
     border: none !important;
     border-radius: 24px !important;
+    color: #ffffff !important;
+    font-weight: 600 !important;
 }
+.example-btn {
+    background: #2a2a2a !important;
+    border: 1px solid #444 !important;
+    color: #ffffff !important;
+    border-radius: 8px !important;
 }
+.example-btn:hover {
+    background: #3a3a3a !important;
+    border-color: #e91e8c !important;
 }
+hr {
+    border-color: #333 !important;
 }
 """
 # Build Gradio Interface
 with gr.Blocks(
     title="SAM-Audio - Isolate Sounds",
+    theme=gr.themes.Default(
         primary_hue="pink",
+        secondary_hue="purple",
         neutral_hue="gray",
     ),
     css=custom_css
 ) as demo:
     with gr.Row():
         # Sidebar
+        with gr.Column(scale=1, min_width=260):
+            gr.Markdown("# 🎵 Isolate Sounds")
+            gr.Markdown("Extract and isolate any sound from audio or video using AI.")
             gr.Markdown("---")
             gr.Markdown("### How it works")
+            gr.Markdown("**1.** Add audio or video")
+            gr.Markdown("**2.** Describe the sound")
+            gr.Markdown("**3.** Get separated tracks")
             gr.Markdown("---")
+            gr.Markdown("**Model:** SAM-Audio Small")
         # Main content area
         with gr.Column(scale=4):
+            gr.Markdown("## 📤 Upload Audio or Video")
+            with gr.Tabs():
+                with gr.TabItem("🎵 Audio"):
+                    input_audio = gr.Audio(
+                        label="Upload audio file (MP3, WAV, FLAC, etc.)",
+                        type="filepath"
+                    )
+                with gr.TabItem("🎬 Video"):
+                    input_video = gr.Video(
+                        label="Upload video file (MP4, MKV, AVI, etc.)"
+                    )
+            gr.Markdown("## 💬 Describe the Sound")
             text_prompt = gr.Textbox(
                 label="",
+                placeholder="e.g., 'A man speaking', 'Piano melody', 'Dog barking'",
                 lines=1
             )
+            run_btn = gr.Button(
+                "🎯 Isolate Sound",
+                variant="primary",
+                size="lg",
+                elem_classes=["primary-btn"]
+            )
             status_output = gr.Markdown(
+                value="Upload a file and describe what sound you want to isolate."
             )
             gr.Markdown("---")
+            gr.Markdown("## 🎧 Results")
             with gr.Row():
+                with gr.Column():
+                    gr.Markdown("**🎯 Isolated Sound (Target)**")
                     output_target = gr.Audio(label="", show_label=False)
+                with gr.Column():
+                    gr.Markdown("**🔇 Background (Residual)**")
                     output_residual = gr.Audio(label="", show_label=False)
             gr.Markdown("---")
+            gr.Markdown("## 💡 Example Prompts")
             with gr.Row():
+                btn1 = gr.Button("A man speaking", elem_classes=["example-btn"])
+                btn2 = gr.Button("A woman singing", elem_classes=["example-btn"])
+                btn3 = gr.Button("Piano", elem_classes=["example-btn"])
+                btn4 = gr.Button("Drums", elem_classes=["example-btn"])
+                btn5 = gr.Button("Guitar", elem_classes=["example-btn"])
+                btn6 = gr.Button("Dog barking", elem_classes=["example-btn"])
+            btn1.click(fn=lambda: "A man speaking", outputs=[text_prompt])
+            btn2.click(fn=lambda: "A woman singing", outputs=[text_prompt])
+            btn3.click(fn=lambda: "Piano", outputs=[text_prompt])
+            btn4.click(fn=lambda: "Drums", outputs=[text_prompt])
+            btn5.click(fn=lambda: "Guitar", outputs=[text_prompt])
+            btn6.click(fn=lambda: "Dog barking", outputs=[text_prompt])
     run_btn.click(
+        fn=separate_audio,
+        inputs=[input_audio, input_video, text_prompt],
         outputs=[output_target, output_residual, status_output]
     )