Spaces:

lpeterl
/

sam-audio-webui

Running on Zero

App Files Files Community

Peter Shi committed on Dec 20, 2025

Commit

79ced89

1 Parent(s): e299ffc

Follow official sam-audio example exactly

Browse files

Files changed (1) hide show

app.py +19 -23

app.py CHANGED Viewed

@@ -12,11 +12,11 @@ from sam_audio import SAMAudio, SAMAudioProcessor
 MODEL_NAME = "facebook/sam-audio-small"
 # Global model and processor
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Loading {MODEL_NAME} on {device}...")
-model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
-print("Model loaded successfully.")
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
@@ -28,7 +28,7 @@ def save_audio(tensor, sample_rate):
         torchaudio.save(tmp.name, tensor, sample_rate)
         return tmp.name
-@spaces.GPU(duration=180)
 def separate_audio(audio_path, text_prompt):
     if not audio_path:
         return None, None, "❌ Please upload an audio file."
@@ -37,30 +37,28 @@ def separate_audio(audio_path, text_prompt):
         text_prompt = "vocals"
     try:
-        # Process Inputs
-        inputs = processor(
             audios=[audio_path],
             descriptions=[text_prompt.strip()]
-        ).to(device)
-        # Inference
-        with torch.no_grad():
-            result = model.separate(inputs)
-        # Extract Outputs
-        target_audio = result.target[0]
-        residual_audio = result.residual[0]
-        # Get sampling rate from the processor config
-        sr = processor.feature_extractor.sampling_rate
         # Save to files
-        target_path = save_audio(target_audio, sr)
-        residual_path = save_audio(residual_audio, sr)
         return target_path, residual_path, f"✅ Successfully separated '{text_prompt}' from the audio."
     except Exception as e:
         return None, None, f"❌ Error: {str(e)}"
 # Build Gradio Interface
@@ -83,7 +81,7 @@ with gr.Blocks(
             input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
             text_prompt = gr.Textbox(
                 label="Text Prompt",
-                placeholder="e.g., 'drums', 'vocals', 'speech', 'piano'",
                 value="drums",
                 info="Describe the sound you want to isolate."
             )
@@ -104,9 +102,7 @@ with gr.Blocks(
     gr.Markdown(
         """
         ### Tips
-        - Use prompts like: `drums`, `vocals`, `speech`, `piano`, `guitar`, `bass`, `synth`
-        - For mixed audio with speech, try: `man speaking`, `woman singing`
-        - GPU recommended for faster inference
         """
     )

 MODEL_NAME = "facebook/sam-audio-small"
 # Global model and processor
+print(f"Loading {MODEL_NAME}...")
+model = SAMAudio.from_pretrained(MODEL_NAME)
 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
+model = model.eval().cuda()
+print("Model loaded on CUDA.")
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
         torchaudio.save(tmp.name, tensor, sample_rate)
         return tmp.name
+@spaces.GPU(duration=300)
 def separate_audio(audio_path, text_prompt):
     if not audio_path:
         return None, None, "❌ Please upload an audio file."
         text_prompt = "vocals"
     try:
+        # Process Inputs (following official example)
+        batch = processor(
             audios=[audio_path],
             descriptions=[text_prompt.strip()]
+        ).to("cuda")
+        # Inference using inference_mode (as per official docs)
+        with torch.inference_mode():
+            result = model.separate(batch, predict_spans=False, reranking_candidates=1)
+        # Get sampling rate
+        sample_rate = processor.audio_sampling_rate
         # Save to files
+        target_path = save_audio(result.target, sample_rate)
+        residual_path = save_audio(result.residual, sample_rate)
         return target_path, residual_path, f"✅ Successfully separated '{text_prompt}' from the audio."
     except Exception as e:
+        import traceback
+        traceback.print_exc()
         return None, None, f"❌ Error: {str(e)}"
 # Build Gradio Interface
             input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
             text_prompt = gr.Textbox(
                 label="Text Prompt",
+                placeholder="e.g., 'drums', 'vocals', 'A man speaking'",
                 value="drums",
                 info="Describe the sound you want to isolate."
             )
     gr.Markdown(
         """
         ### Tips
+        - Use prompts like: `drums`, `vocals`, `A man speaking`, `piano`, `guitar`
         """
     )