Spaces:

lpeterl
/

sam-audio-webui

Running on Zero

App Files Files Community

Peter Shi committed on Dec 20, 2025

Commit

b02c18a

1 Parent(s): 79ced89

Fix: follow official HF example exactly

Browse files

Files changed (1) hide show

app.py +21 -24

app.py CHANGED Viewed

@@ -11,19 +11,15 @@ from sam_audio import SAMAudio, SAMAudioProcessor
 # Configuration
 MODEL_NAME = "facebook/sam-audio-small"
-# Global model and processor
 print(f"Loading {MODEL_NAME}...")
-model = SAMAudio.from_pretrained(MODEL_NAME)
 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
-model = model.eval().cuda()
-print("Model loaded on CUDA.")
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
-    if tensor.dim() == 1:
-        tensor = tensor.unsqueeze(0)
-    tensor = tensor.detach().cpu()
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         torchaudio.save(tmp.name, tensor, sample_rate)
         return tmp.name
@@ -37,22 +33,19 @@ def separate_audio(audio_path, text_prompt):
         text_prompt = "vocals"
     try:
-        # Process Inputs (following official example)
-        batch = processor(
             audios=[audio_path],
             descriptions=[text_prompt.strip()]
-        ).to("cuda")
-        # Inference using inference_mode (as per official docs)
         with torch.inference_mode():
-            result = model.separate(batch, predict_spans=False, reranking_candidates=1)
-        # Get sampling rate
         sample_rate = processor.audio_sampling_rate
-        # Save to files
-        target_path = save_audio(result.target, sample_rate)
-        residual_path = save_audio(result.residual, sample_rate)
         return target_path, residual_path, f"✅ Successfully separated '{text_prompt}' from the audio."
@@ -81,8 +74,8 @@ with gr.Blocks(
             input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
             text_prompt = gr.Textbox(
                 label="Text Prompt",
-                placeholder="e.g., 'drums', 'vocals', 'A man speaking'",
-                value="drums",
                 info="Describe the sound you want to isolate."
             )
             run_btn = gr.Button("🎯 Separate Audio", variant="primary", size="lg")
@@ -101,8 +94,12 @@ with gr.Blocks(
     gr.Markdown(
         """
-        ### Tips
-        - Use prompts like: `drums`, `vocals`, `A man speaking`, `piano`, `guitar`
         """
     )

 # Configuration
 MODEL_NAME = "facebook/sam-audio-small"
+# Load model and processor (following official HuggingFace example)
 print(f"Loading {MODEL_NAME}...")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
 processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
+print(f"Model loaded on {device}.")
 def save_audio(tensor, sample_rate):
     """Helper to save torch tensor to a temp file for Gradio output."""
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
         torchaudio.save(tmp.name, tensor, sample_rate)
         return tmp.name
         text_prompt = "vocals"
     try:
+        # Process and separate (following official example)
+        inputs = processor(
             audios=[audio_path],
             descriptions=[text_prompt.strip()]
+        ).to(device)
         with torch.inference_mode():
+            result = model.separate(inputs, predict_spans=False, reranking_candidates=1)
+        # Save results (following official example: result.target[0].unsqueeze(0).cpu())
         sample_rate = processor.audio_sampling_rate
+        target_path = save_audio(result.target[0].unsqueeze(0).cpu(), sample_rate)
+        residual_path = save_audio(result.residual[0].unsqueeze(0).cpu(), sample_rate)
         return target_path, residual_path, f"✅ Successfully separated '{text_prompt}' from the audio."
             input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
             text_prompt = gr.Textbox(
                 label="Text Prompt",
+                placeholder="e.g., 'A man speaking', 'Piano playing', 'Dog barking'",
+                value="A man speaking",
                 info="Describe the sound you want to isolate."
             )
             run_btn = gr.Button("🎯 Separate Audio", variant="primary", size="lg")
     gr.Markdown(
         """
+        ### Example Prompts
+        - "A person coughing"
+        - "Piano playing a melody"
+        - "Dog barking"
+        - "Car engine revving"
+        - "Raindrops falling"
         """
     )