Peter Shi committed on
Commit
d4c742d
·
1 Parent(s): 0a54840

Switch to Docker SDK with Python 3.12

Browse files
Files changed (4) hide show
  1. Dockerfile +27 -0
  2. README.md +2 -5
  3. app.py +71 -128
  4. requirements.txt +4 -9
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use Python 3.12 to satisfy the 'perception-models' requirement
2
+ FROM python:3.12
3
+
4
+ # Set the working directory
5
+ WORKDIR /code
6
+
7
+ # Install system dependencies (ffmpeg is required for audio)
8
+ RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
9
+
10
+ # Copy requirements and install Python dependencies
11
+ COPY requirements.txt .
12
+ RUN pip install --no-cache-dir --upgrade pip
13
+ RUN pip install --no-cache-dir -r requirements.txt
14
+
15
+ # Set up a user (Required by HF Spaces security)
16
+ RUN useradd -m -u 1000 user
17
+ USER user
18
+ ENV HOME=/home/user \
19
+ PATH=/home/user/.local/bin:$PATH
20
+
21
+ WORKDIR $HOME/app
22
+
23
+ # Copy application files
24
+ COPY --chown=user . $HOME/app
25
+
26
+ # Start the app
27
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -3,13 +3,10 @@ title: Sam Audio Webui
3
  emoji: 🎵
4
  colorFrom: indigo
5
  colorTo: pink
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- fullWidth: true
12
- python_version: 3.11
13
  ---
14
 
15
  # SAM Audio WebUI
 
3
  emoji: 🎵
4
  colorFrom: indigo
5
  colorTo: pink
6
+ sdk: docker
7
+ app_port: 7860
 
8
  pinned: false
9
  license: apache-2.0
 
 
10
  ---
11
 
12
  # SAM Audio WebUI
app.py CHANGED
@@ -1,148 +1,91 @@
1
  import gradio as gr
2
  import torch
3
- try:
4
- import spaces
5
- except ImportError:
6
- class spaces:
7
- @staticmethod
8
- def GPU(duration=60):
9
- def decorator(func):
10
- return func
11
- return decorator
12
-
13
- import gradio as gr
14
- import torch
15
- try:
16
- import spaces
17
- except ImportError:
18
- class spaces:
19
- @staticmethod
20
- def GPU(duration=60):
21
- def decorator(func):
22
- return func
23
- return decorator
24
-
25
- from sam_audio import SAMAudio, SAMAudioProcessor
26
- import numpy as np
27
- import librosa
28
  import tempfile
29
- import soundfile as sf
30
 
31
- # Model configuration
32
- MODEL_ID = "facebook/sam-audio-small"
 
33
 
34
- print(f"Loading model: {MODEL_ID}...")
 
 
35
  try:
36
- processor = SAMAudioProcessor.from_pretrained(MODEL_ID)
37
- model = SAMAudio.from_pretrained(
38
- MODEL_ID,
39
- device_map="auto",
40
- torch_dtype=torch.float16
41
- )
42
  print("Model loaded successfully.")
43
  except Exception as e:
44
- print(f"Error loading model: {e}")
45
- # Fallback currently not fully supported for SAMAudio custom class in bitsandbytes via config directly
46
- # unless it inherits correctly. Let's try standard float32 if float16 fails, or keep the error.
47
- print("Retrying with default precision...")
48
- try:
49
- processor = SAMAudioProcessor.from_pretrained(MODEL_ID)
50
- model = SAMAudio.from_pretrained(MODEL_ID, device_map="auto")
51
- print("Model loaded with default precision.")
52
- except Exception as e2:
53
- print(f"Critical error loading model: {e2}")
54
- raise e2
55
 
56
- @spaces.GPU(duration=120)
57
- def infer(audio_path, prompt_text):
 
 
 
 
 
 
 
 
 
58
  if not audio_path:
59
- return None
60
-
61
- print(f"Processing audio: {audio_path}, Prompt: {prompt_text}")
62
 
63
- # Load audio with librosa (standardizes sample rate)
64
- target_sr = 16000 # SAM Audio often works at 16k, or check processor.feature_extractor.sampling_rate
65
- if hasattr(processor, "feature_extractor"):
66
- target_sr = processor.feature_extractor.sampling_rate
67
-
68
- audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
69
-
70
- # Prepare inputs
71
  inputs = processor(
72
- audios=[audio],
73
- sampling_rate=sr,
74
- text=[[prompt_text]] if prompt_text else None,
75
- return_tensors="pt"
76
- ).to(model.device)
77
 
 
78
  with torch.no_grad():
79
- outputs = model(**inputs)
80
-
81
- # Post-process to get likelihoods or masks
82
- # Note: transformers implementation details vary.
83
- # Usually we get logits. sigmoid -> prob.
84
- # pred_masks shape: (batch_size, num_masks, freq, time) or similar.
85
-
86
- pred_masks = torch.sigmoid(outputs.pred_masks)
87
-
88
- # For audio reconstruction, we need to apply this mask to the STFT of the original audio.
89
- # We calculate STFT using the same parameters as the model training if possible.
90
- # If parameters are unknown, we try standard values or rely on processor logic if available.
91
-
92
- # Standard STFT for AudioLDM/MusicGen etc often use n_fft=1024, hop=160.
93
- # Let's inspect the mask shape to infer Time dimensions.
94
-
95
- mask = pred_masks[0, 0] # Take first batch, first predicted mask
96
- # Resize mask to inputs size if needed?
97
- # Usually SAM Audio outputs a mask corresponding to the spectrogram features.
98
-
99
- # Let's try to reconstruct using a generic STFT approach
100
- n_fft = 1024
101
- hop_length = 320 # Common for 16k
102
- stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
103
-
104
- # stft shape: (1 + n_fft/2, time_frames)
105
- # mask shape from model might be different. Resize mask to match stft.
106
-
107
- # Convert mask to numpy
108
- mask_np = mask.cpu().float().numpy()
109
 
110
- # Resize mask to match STFT shape
111
- # stft.shape is (freq, time)
112
- import cv2
113
- # cv2.resize expects (width, height) -> (time, freq)
114
- try:
115
- mask_resized = cv2.resize(mask_np, (stft.shape[1], stft.shape[0]), interpolation=cv2.INTER_LINEAR)
116
- # Apply mask
117
- stft_masked = stft * mask_resized
118
- # ISTFT
119
- audio_masked = librosa.istft(stft_masked, hop_length=hop_length)
120
-
121
- # Save to temp file
122
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
123
- sf.write(tmp.name, audio_masked, sr)
124
- return tmp.name
125
- except Exception as e_resize:
126
- print(f"Error applying mask: {e_resize}. Returning original for debug.")
127
- # Fallback to saving original just to show partial success
128
- return audio_path
129
 
130
- with gr.Blocks() as demo:
131
- gr.Markdown(f"# SAM Audio WebUI ({MODEL_ID})")
132
- gr.Markdown("Upload audio and provide a prompt to segment specific sounds.")
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  with gr.Row():
135
- audio_input = gr.Audio(type="filepath", label="Input Audio")
136
- text_input = gr.Textbox(label="Prompt (e.g., 'drums', 'vocals')")
137
-
138
- submit_btn = gr.Button("Segment Audio")
139
- audio_output = gr.Audio(label="Segmented Audio")
140
-
141
- submit_btn.click(
142
- fn=infer,
143
- inputs=[audio_input, text_input],
144
- outputs=[audio_output]
 
 
 
 
 
 
 
145
  )
146
 
147
- if __name__ == "__main__":
148
- demo.launch()
 
1
  import gradio as gr
2
  import torch
3
+ import torchaudio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import tempfile
5
+ from sam_audio import SAMAudio, SAMAudioProcessor
6
 
7
+ # Configuration
8
+ MODEL_NAME = "facebook/sam-audio-small"
9
+ device = "cuda" if torch.cuda.is_available() else "cpu"
10
 
11
+ print(f"Loading {MODEL_NAME} on {device}...")
12
+
13
+ # Load Model and Processor
14
  try:
15
+ model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
16
+ processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
 
 
 
 
17
  print("Model loaded successfully.")
18
  except Exception as e:
19
+ print(f"Error loading model. Did you set HF_TOKEN in secrets? Error: {e}")
20
+ raise e
 
 
 
 
 
 
 
 
 
21
 
22
+ def save_audio(tensor, sample_rate):
23
+ """Helper to save torch tensor to a temp file for Gradio output."""
24
+ if tensor.dim() == 1:
25
+ tensor = tensor.unsqueeze(0)
26
+ tensor = tensor.detach().cpu()
27
+
28
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
29
+ torchaudio.save(tmp.name, tensor, sample_rate)
30
+ return tmp.name
31
+
32
+ def separate_audio(audio_path, text_prompt):
33
  if not audio_path:
34
+ return None, None
 
 
35
 
36
+ # Process Inputs
 
 
 
 
 
 
 
37
  inputs = processor(
38
+ audios=[audio_path],
39
+ descriptions=[text_prompt]
40
+ ).to(device)
 
 
41
 
42
+ # Inference
43
  with torch.no_grad():
44
+ result = model.separate(inputs)
45
+
46
+ # Extract Outputs
47
+ target_audio = result.target[0] # The sound you asked for
48
+ residual_audio = result.residual[0] # Everything else
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ # Get sampling rate from the processor config
51
+ sr = processor.feature_extractor.sampling_rate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
+ # Save to files
54
+ target_path = save_audio(target_audio, sr)
55
+ residual_path = save_audio(residual_audio, sr)
56
+
57
+ return target_path, residual_path
58
+
59
+ # Build Gradio Interface
60
+ with gr.Blocks(title="SAM-Audio Demo") as demo:
61
+ gr.Markdown(
62
+ """
63
+ # 🎵 SAM-Audio: Segment Anything for Audio
64
+ Isolate specific sounds from an audio file using natural language prompts.
65
+
66
+ **Model:** `facebook/sam-audio-small`
67
+ """
68
+ )
69
 
70
  with gr.Row():
71
+ with gr.Column():
72
+ input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
73
+ text_prompt = gr.Textbox(
74
+ label="Text Prompt",
75
+ placeholder="e.g., 'dog barking', 'man speaking', 'typing keyboard'",
76
+ info="Describe the sound you want to isolate."
77
+ )
78
+ run_btn = gr.Button("Separate Audio", variant="primary")
79
+
80
+ with gr.Column():
81
+ output_target = gr.Audio(label="Isolated Sound (Target)")
82
+ output_residual = gr.Audio(label="Background (Residual)")
83
+
84
+ run_btn.click(
85
+ fn=separate_audio,
86
+ inputs=[input_audio, text_prompt],
87
+ outputs=[output_target, output_residual]
88
  )
89
 
90
+ # Launch
91
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,11 +1,6 @@
1
- gradio>=4.0.0
2
- torch>=2.0.0
3
- transformers>=4.38.0
4
- accelerate>=0.27.0
5
- bitsandbytes>=0.41.0
6
- scipy
7
- librosa
8
- opencv-python-headless
9
- spaces
10
  git+https://github.com/facebookresearch/sam-audio.git
 
11
  torchaudio
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  git+https://github.com/facebookresearch/sam-audio.git
2
+ torch
3
  torchaudio
4
+ gradio
5
+ numpy
6
+ scipy