Spaces:

lpeterl
/

sam-audio-webui

Running on Zero

App Files Community

Peter Shi committed on 6 days ago

Commit

a11009d

1 Parent(s): cebdac8

Add progress bar, example only fills data, switch to video tab

Browse files

Files changed (1) hide show

app.py +35 -40

app.py CHANGED Viewed

@@ -29,11 +29,13 @@ current_model_name = None
 model = None
 processor = None
-def load_model(model_name):
     global current_model_name, model, processor
     model_id = MODELS.get(model_name, MODELS[DEFAULT_MODEL])
     if current_model_name == model_name and model is not None:
         return
     print(f"Loading {model_id}...")
     model = SAMAudio.from_pretrained(model_id).to(device).eval()
     processor = SAMAudioProcessor.from_pretrained(model_id)
@@ -48,9 +50,10 @@ def save_audio(tensor, sample_rate):
         return tmp.name
 @spaces.GPU(duration=300)
-def separate_audio(model_name, file_path, text_prompt):
     global model, processor
-    load_model(model_name)
     if not file_path:
         return None, None, "❌ Please upload an audio or video file."
@@ -58,35 +61,37 @@ def separate_audio(model_name, file_path, text_prompt):
         return None, None, "❌ Please enter a text prompt."
     try:
         inputs = processor(audios=[file_path], descriptions=[text_prompt.strip()]).to(device)
         with torch.inference_mode():
             result = model.separate(inputs, predict_spans=False, reranking_candidates=1)
         sample_rate = processor.audio_sampling_rate
         target_path = save_audio(result.target[0].unsqueeze(0).cpu(), sample_rate)
         residual_path = save_audio(result.residual[0].unsqueeze(0).cpu(), sample_rate)
         return target_path, residual_path, f"✅ Isolated '{text_prompt}' using {model_name}"
     except Exception as e:
         import traceback
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
-def process_audio(model_name, audio_path, prompt):
     if not audio_path:
         return None, None, "❌ Please upload an audio file."
-    return separate_audio(model_name, audio_path, prompt)
-def process_video(model_name, video_path, prompt):
     if not video_path:
         return None, None, "❌ Please upload a video file."
-    return separate_audio(model_name, video_path, prompt)
-def process_example(model_name, prompt):
-    if not os.path.exists(EXAMPLE_FILE):
-        return None, None, "❌ Example file not found."
-    return separate_audio(model_name, EXAMPLE_FILE, prompt)
-def load_example(prompt):
-    return EXAMPLE_FILE, prompt
 # Build Interface
 with gr.Blocks(title="SAM-Audio Demo") as demo:
@@ -105,10 +110,11 @@ with gr.Blocks(title="SAM-Audio Demo") as demo:
                 label="Model"
             )
-            with gr.Tabs():
-                with gr.TabItem("🎵 Audio"):
                     input_audio = gr.Audio(label="Upload Audio", type="filepath")
-                with gr.TabItem("🎬 Video"):
                     input_video = gr.Video(label="Upload Video")
             text_prompt = gr.Textbox(
@@ -116,7 +122,7 @@ with gr.Blocks(title="SAM-Audio Demo") as demo:
                 placeholder="e.g., 'A man speaking', 'Piano', 'Dog barking'"
             )
-            run_btn = gr.Button("🎯 Isolate Sound", variant="primary")
             status_output = gr.Markdown("")
         with gr.Column(scale=1):
@@ -125,7 +131,8 @@ with gr.Blocks(title="SAM-Audio Demo") as demo:
             output_residual = gr.Audio(label="Background (Residual)")
     gr.Markdown("---")
-    gr.Markdown("### 🎬 Demo Examples (click to auto-process)")
     with gr.Row():
         if os.path.exists(EXAMPLE_FILE):
@@ -133,40 +140,28 @@ with gr.Blocks(title="SAM-Audio Demo") as demo:
             example_btn2 = gr.Button("🎤 Woman Speaking")
             example_btn3 = gr.Button("🎵 Background Music")
-    # Audio processing
     run_btn.click(
         fn=lambda m, a, v, p: process_audio(m, a, p) if a else process_video(m, v, p),
         inputs=[model_selector, input_audio, input_video, text_prompt],
         outputs=[output_target, output_residual, status_output]
     )
-    # Example buttons
     if os.path.exists(EXAMPLE_FILE):
         example_btn1.click(
-            fn=lambda: (EXAMPLE_FILE, "A man speaking"),
-            outputs=[input_video, text_prompt]
-        ).then(
-            fn=lambda m: process_example(m, "A man speaking"),
-            inputs=[model_selector],
-            outputs=[output_target, output_residual, status_output]
         )
         example_btn2.click(
-            fn=lambda: (EXAMPLE_FILE, "A woman speaking"),
-            outputs=[input_video, text_prompt]
-        ).then(
-            fn=lambda m: process_example(m, "A woman speaking"),
-            inputs=[model_selector],
-            outputs=[output_target, output_residual, status_output]
         )
         example_btn3.click(
-            fn=lambda: (EXAMPLE_FILE, "Background music"),
-            outputs=[input_video, text_prompt]
-        ).then(
-            fn=lambda m: process_example(m, "Background music"),
-            inputs=[model_selector],
-            outputs=[output_target, output_residual, status_output]
         )
 if __name__ == "__main__":

 model = None
 processor = None
+def load_model(model_name, progress=None):
     global current_model_name, model, processor
     model_id = MODELS.get(model_name, MODELS[DEFAULT_MODEL])
     if current_model_name == model_name and model is not None:
         return
+    if progress:
+        progress(0.1, desc="Loading model...")
     print(f"Loading {model_id}...")
     model = SAMAudio.from_pretrained(model_id).to(device).eval()
     processor = SAMAudioProcessor.from_pretrained(model_id)
         return tmp.name
 @spaces.GPU(duration=300)
+def separate_audio(model_name, file_path, text_prompt, progress=gr.Progress()):
     global model, processor
+    progress(0.1, desc="Checking inputs...")
     if not file_path:
         return None, None, "❌ Please upload an audio or video file."
         return None, None, "❌ Please enter a text prompt."
     try:
+        progress(0.2, desc="Loading model...")
+        load_model(model_name)
+        progress(0.4, desc="Processing audio...")
         inputs = processor(audios=[file_path], descriptions=[text_prompt.strip()]).to(device)
+        progress(0.6, desc="Separating sounds...")
         with torch.inference_mode():
             result = model.separate(inputs, predict_spans=False, reranking_candidates=1)
+        progress(0.8, desc="Saving results...")
         sample_rate = processor.audio_sampling_rate
         target_path = save_audio(result.target[0].unsqueeze(0).cpu(), sample_rate)
         residual_path = save_audio(result.residual[0].unsqueeze(0).cpu(), sample_rate)
+        progress(1.0, desc="Done!")
         return target_path, residual_path, f"✅ Isolated '{text_prompt}' using {model_name}"
     except Exception as e:
         import traceback
         traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
+def process_audio(model_name, audio_path, prompt, progress=gr.Progress()):
     if not audio_path:
         return None, None, "❌ Please upload an audio file."
+    return separate_audio(model_name, audio_path, prompt, progress)
+def process_video(model_name, video_path, prompt, progress=gr.Progress()):
     if not video_path:
         return None, None, "❌ Please upload a video file."
+    return separate_audio(model_name, video_path, prompt, progress)
 # Build Interface
 with gr.Blocks(title="SAM-Audio Demo") as demo:
                 label="Model"
             )
+            tabs = gr.Tabs()
+            with tabs:
+                with gr.TabItem("🎵 Audio", id=0):
                     input_audio = gr.Audio(label="Upload Audio", type="filepath")
+                with gr.TabItem("🎬 Video", id=1):
                     input_video = gr.Video(label="Upload Video")
             text_prompt = gr.Textbox(
                 placeholder="e.g., 'A man speaking', 'Piano', 'Dog barking'"
             )
+            run_btn = gr.Button("🎯 Isolate Sound", variant="primary", size="lg")
             status_output = gr.Markdown("")
         with gr.Column(scale=1):
             output_residual = gr.Audio(label="Background (Residual)")
     gr.Markdown("---")
+    gr.Markdown("### 🎬 Demo Examples")
+    gr.Markdown("Click to load example, then click 'Isolate Sound' to process:")
     with gr.Row():
         if os.path.exists(EXAMPLE_FILE):
             example_btn2 = gr.Button("🎤 Woman Speaking")
             example_btn3 = gr.Button("🎵 Background Music")
+    # Main process button - check which tab has content
     run_btn.click(
         fn=lambda m, a, v, p: process_audio(m, a, p) if a else process_video(m, v, p),
         inputs=[model_selector, input_audio, input_video, text_prompt],
         outputs=[output_target, output_residual, status_output]
     )
+    # Example buttons - only fill in data, switch to video tab
     if os.path.exists(EXAMPLE_FILE):
         example_btn1.click(
+            fn=lambda: (EXAMPLE_FILE, "A man speaking", gr.Tabs(selected=1)),
+            outputs=[input_video, text_prompt, tabs]
         )
         example_btn2.click(
+            fn=lambda: (EXAMPLE_FILE, "A woman speaking", gr.Tabs(selected=1)),
+            outputs=[input_video, text_prompt, tabs]
         )
         example_btn3.click(
+            fn=lambda: (EXAMPLE_FILE, "Background music", gr.Tabs(selected=1)),
+            outputs=[input_video, text_prompt, tabs]
         )
 if __name__ == "__main__":