Peter Shi committed on
Commit
832604f
·
1 Parent(s): 2922fa7

Add video preview with tabs for audio/video upload

Browse files
Files changed (1) hide show
  1. app.py +79 -103
app.py CHANGED
@@ -19,39 +19,24 @@ model = SAMAudio.from_pretrained(MODEL_NAME).to(device).eval()
19
  processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
20
  print(f"Model loaded on {device}.")
21
 
22
- # Supported file extensions
23
- SUPPORTED_EXTENSIONS = ['.mp3', '.wav', '.flac', '.ogg', '.m4a', '.mp4', '.mkv', '.avi', '.mov', '.webm']
24
-
25
  def save_audio(tensor, sample_rate):
26
  """Helper to save torch tensor to a temp file for Gradio output."""
27
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
28
  torchaudio.save(tmp.name, tensor, sample_rate)
29
  return tmp.name
30
 
31
- def validate_file(file_path):
32
- """Check if file extension is supported."""
33
- if not file_path:
34
- return False, "No file uploaded"
35
- ext = os.path.splitext(file_path)[1].lower()
36
- if ext not in SUPPORTED_EXTENSIONS:
37
- return False, f"Unsupported format: {ext}. Supported: {', '.join(SUPPORTED_EXTENSIONS)}"
38
- return True, "OK"
39
-
40
  @spaces.GPU(duration=300)
41
- def separate_audio(file_path, text_prompt):
 
 
 
42
  if not file_path:
43
  return None, None, "❌ Please upload an audio or video file."
44
 
45
- # Validate file
46
- valid, msg = validate_file(file_path)
47
- if not valid:
48
- return None, None, f"❌ {msg}"
49
-
50
  if not text_prompt or not text_prompt.strip():
51
  return None, None, "❌ Please enter a text prompt describing the sound to isolate."
52
 
53
  try:
54
- # SAM-Audio processor accepts both audio and video files directly
55
  inputs = processor(
56
  audios=[file_path],
57
  descriptions=[text_prompt.strip()]
@@ -71,156 +56,147 @@ def separate_audio(file_path, text_prompt):
71
  traceback.print_exc()
72
  return None, None, f"❌ Error: {str(e)}"
73
 
74
- # Custom CSS for dark theme
75
  custom_css = """
76
  .gradio-container {
77
- background: #0a0a0a !important;
78
  max-width: 1400px !important;
79
  }
80
 
81
- .upload-box {
82
- border: 2px dashed #444 !important;
83
- border-radius: 12px !important;
84
- background: #1a1a1a !important;
85
- min-height: 200px !important;
86
- transition: border-color 0.3s !important;
87
  }
88
 
89
- .upload-box:hover {
90
- border-color: #e91e8c !important;
 
 
91
  }
92
 
93
- .result-card {
94
- background: #1a1a1a !important;
95
- border: 1px solid #333 !important;
96
- border-radius: 12px !important;
97
- padding: 1rem !important;
98
  }
99
 
100
  .primary-btn {
101
  background: linear-gradient(135deg, #e91e8c, #9c27b0) !important;
102
  border: none !important;
103
  border-radius: 24px !important;
 
 
104
  }
105
 
106
- .sidebar-text {
107
- color: #888 !important;
108
- font-size: 0.9rem !important;
 
 
109
  }
110
 
111
- .step-text {
112
- color: #ccc !important;
113
- padding: 0.3rem 0 !important;
114
  }
115
 
116
- .pink-text {
117
- color: #e91e8c !important;
118
  }
119
  """
120
 
121
  # Build Gradio Interface
122
  with gr.Blocks(
123
  title="SAM-Audio - Isolate Sounds",
124
- theme=gr.themes.Base(
125
  primary_hue="pink",
126
- secondary_hue="purple",
127
  neutral_hue="gray",
128
- ).set(
129
- body_background_fill="#0a0a0a",
130
- body_background_fill_dark="#0a0a0a",
131
- block_background_fill="#1a1a1a",
132
- block_background_fill_dark="#1a1a1a",
133
- input_background_fill="#1a1a1a",
134
- input_background_fill_dark="#1a1a1a",
135
- button_primary_background_fill="linear-gradient(135deg, #e91e8c, #9c27b0)",
136
- button_primary_background_fill_hover="linear-gradient(135deg, #d1187d, #8a22a0)",
137
- border_color_primary="#333",
138
  ),
139
  css=custom_css
140
  ) as demo:
141
 
142
  with gr.Row():
143
  # Sidebar
144
- with gr.Column(scale=1, min_width=250):
145
- gr.Markdown("## 🎵 Isolate Sounds")
146
- gr.Markdown("Extract and isolate any sound from audio or video using AI.", elem_classes=["sidebar-text"])
147
 
148
  gr.Markdown("---")
149
  gr.Markdown("### How it works")
150
- gr.Markdown("**1.** Add audio or video", elem_classes=["step-text"])
151
- gr.Markdown("**2.** Describe the sound", elem_classes=["step-text"])
152
- gr.Markdown("**3.** Get separated tracks", elem_classes=["step-text"])
153
-
154
- gr.Markdown("---")
155
- gr.Markdown("**Model**")
156
- gr.Markdown("🤖 SAM-Audio Small")
157
 
158
  gr.Markdown("---")
159
- gr.Markdown("**Supported Formats**")
160
- gr.Markdown("🎵 MP3, WAV, FLAC, OGG, M4A", elem_classes=["sidebar-text"])
161
- gr.Markdown("🎬 MP4, MKV, AVI, MOV, WebM", elem_classes=["sidebar-text"])
162
 
163
  # Main content area
164
  with gr.Column(scale=4):
165
- gr.Markdown("### 📤 Upload Audio or Video")
166
 
167
- # Use File component to accept both audio and video
168
- input_file = gr.File(
169
- label="Drop your audio or video file here",
170
- file_types=SUPPORTED_EXTENSIONS,
171
- elem_classes=["upload-box"]
172
- )
 
 
 
 
 
173
 
174
- gr.Markdown("### 💬 Describe the Sound to Isolate")
175
  text_prompt = gr.Textbox(
176
  label="",
177
- placeholder="e.g., 'A man speaking', 'Piano melody', 'Dog barking', 'Background music'",
178
  lines=1
179
  )
180
 
181
- with gr.Row():
182
- run_btn = gr.Button(
183
- "🎯 Isolate Sound",
184
- variant="primary",
185
- size="lg",
186
- elem_classes=["primary-btn"]
187
- )
188
 
189
  status_output = gr.Markdown(
190
- value="*Upload a file and describe what sound you want to isolate.*"
191
  )
192
 
193
  gr.Markdown("---")
194
- gr.Markdown("### 🎧 Results")
195
 
196
  with gr.Row():
197
- with gr.Column(elem_classes=["result-card"]):
198
- gr.Markdown("**🎯 Isolated Sound** (Target)")
199
  output_target = gr.Audio(label="", show_label=False)
200
 
201
- with gr.Column(elem_classes=["result-card"]):
202
- gr.Markdown("**🔇 Background** (Residual)")
203
  output_residual = gr.Audio(label="", show_label=False)
204
 
205
  gr.Markdown("---")
206
- gr.Markdown("### 💡 Example Prompts")
207
- gr.Markdown("Click any example below to use it:")
208
 
209
  with gr.Row():
210
- for prompt in ["A man speaking", "A woman singing", "Piano", "Drums", "Guitar", "Dog barking"]:
211
- gr.Button(prompt, size="sm").click(
212
- fn=lambda p=prompt: p,
213
- outputs=[text_prompt]
214
- )
215
-
216
- def process_file(file, prompt):
217
- if file is None:
218
- return None, None, "❌ Please upload a file."
219
- return separate_audio(file.name, prompt)
 
 
 
220
 
221
  run_btn.click(
222
- fn=process_file,
223
- inputs=[input_file, text_prompt],
224
  outputs=[output_target, output_residual, status_output]
225
  )
226
 
 
19
  processor = SAMAudioProcessor.from_pretrained(MODEL_NAME)
20
  print(f"Model loaded on {device}.")
21
 
 
 
 
22
  def save_audio(tensor, sample_rate):
23
  """Helper to save torch tensor to a temp file for Gradio output."""
24
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
25
  torchaudio.save(tmp.name, tensor, sample_rate)
26
  return tmp.name
27
 
 
 
 
 
 
 
 
 
 
28
  @spaces.GPU(duration=300)
29
+ def separate_audio(audio_path, video_path, text_prompt):
30
+ # Determine which input to use
31
+ file_path = video_path if video_path else audio_path
32
+
33
  if not file_path:
34
  return None, None, "❌ Please upload an audio or video file."
35
 
 
 
 
 
 
36
  if not text_prompt or not text_prompt.strip():
37
  return None, None, "❌ Please enter a text prompt describing the sound to isolate."
38
 
39
  try:
 
40
  inputs = processor(
41
  audios=[file_path],
42
  descriptions=[text_prompt.strip()]
 
56
  traceback.print_exc()
57
  return None, None, f"❌ Error: {str(e)}"
58
 
59
+ # Custom CSS
60
  custom_css = """
61
  .gradio-container {
62
+ background: #0d0d0d !important;
63
  max-width: 1400px !important;
64
  }
65
 
66
+ .gradio-container *, .gradio-container p, .gradio-container span,
67
+ .gradio-container h1, .gradio-container h2, .gradio-container h3,
68
+ .gradio-container label, .gradio-container .markdown-text {
69
+ color: #ffffff !important;
 
 
70
  }
71
 
72
+ input, textarea {
73
+ background: #1a1a1a !important;
74
+ border: 1px solid #444 !important;
75
+ color: #ffffff !important;
76
  }
77
 
78
+ input::placeholder, textarea::placeholder {
79
+ color: #888 !important;
 
 
 
80
  }
81
 
82
  .primary-btn {
83
  background: linear-gradient(135deg, #e91e8c, #9c27b0) !important;
84
  border: none !important;
85
  border-radius: 24px !important;
86
+ color: #ffffff !important;
87
+ font-weight: 600 !important;
88
  }
89
 
90
+ .example-btn {
91
+ background: #2a2a2a !important;
92
+ border: 1px solid #444 !important;
93
+ color: #ffffff !important;
94
+ border-radius: 8px !important;
95
  }
96
 
97
+ .example-btn:hover {
98
+ background: #3a3a3a !important;
99
+ border-color: #e91e8c !important;
100
  }
101
 
102
+ hr {
103
+ border-color: #333 !important;
104
  }
105
  """
106
 
107
  # Build Gradio Interface
108
  with gr.Blocks(
109
  title="SAM-Audio - Isolate Sounds",
110
+ theme=gr.themes.Default(
111
  primary_hue="pink",
112
+ secondary_hue="purple",
113
  neutral_hue="gray",
 
 
 
 
 
 
 
 
 
 
114
  ),
115
  css=custom_css
116
  ) as demo:
117
 
118
  with gr.Row():
119
  # Sidebar
120
+ with gr.Column(scale=1, min_width=260):
121
+ gr.Markdown("# 🎵 Isolate Sounds")
122
+ gr.Markdown("Extract and isolate any sound from audio or video using AI.")
123
 
124
  gr.Markdown("---")
125
  gr.Markdown("### How it works")
126
+ gr.Markdown("**1.** Add audio or video")
127
+ gr.Markdown("**2.** Describe the sound")
128
+ gr.Markdown("**3.** Get separated tracks")
 
 
 
 
129
 
130
  gr.Markdown("---")
131
+ gr.Markdown("**Model:** SAM-Audio Small")
 
 
132
 
133
  # Main content area
134
  with gr.Column(scale=4):
135
+ gr.Markdown("## 📤 Upload Audio or Video")
136
 
137
+ with gr.Tabs():
138
+ with gr.TabItem("🎵 Audio"):
139
+ input_audio = gr.Audio(
140
+ label="Upload audio file (MP3, WAV, FLAC, etc.)",
141
+ type="filepath"
142
+ )
143
+
144
+ with gr.TabItem("🎬 Video"):
145
+ input_video = gr.Video(
146
+ label="Upload video file (MP4, MKV, AVI, etc.)"
147
+ )
148
 
149
+ gr.Markdown("## 💬 Describe the Sound")
150
  text_prompt = gr.Textbox(
151
  label="",
152
+ placeholder="e.g., 'A man speaking', 'Piano melody', 'Dog barking'",
153
  lines=1
154
  )
155
 
156
+ run_btn = gr.Button(
157
+ "🎯 Isolate Sound",
158
+ variant="primary",
159
+ size="lg",
160
+ elem_classes=["primary-btn"]
161
+ )
 
162
 
163
  status_output = gr.Markdown(
164
+ value="Upload a file and describe what sound you want to isolate."
165
  )
166
 
167
  gr.Markdown("---")
168
+ gr.Markdown("## 🎧 Results")
169
 
170
  with gr.Row():
171
+ with gr.Column():
172
+ gr.Markdown("**🎯 Isolated Sound (Target)**")
173
  output_target = gr.Audio(label="", show_label=False)
174
 
175
+ with gr.Column():
176
+ gr.Markdown("**🔇 Background (Residual)**")
177
  output_residual = gr.Audio(label="", show_label=False)
178
 
179
  gr.Markdown("---")
180
+ gr.Markdown("## 💡 Example Prompts")
 
181
 
182
  with gr.Row():
183
+ btn1 = gr.Button("A man speaking", elem_classes=["example-btn"])
184
+ btn2 = gr.Button("A woman singing", elem_classes=["example-btn"])
185
+ btn3 = gr.Button("Piano", elem_classes=["example-btn"])
186
+ btn4 = gr.Button("Drums", elem_classes=["example-btn"])
187
+ btn5 = gr.Button("Guitar", elem_classes=["example-btn"])
188
+ btn6 = gr.Button("Dog barking", elem_classes=["example-btn"])
189
+
190
+ btn1.click(fn=lambda: "A man speaking", outputs=[text_prompt])
191
+ btn2.click(fn=lambda: "A woman singing", outputs=[text_prompt])
192
+ btn3.click(fn=lambda: "Piano", outputs=[text_prompt])
193
+ btn4.click(fn=lambda: "Drums", outputs=[text_prompt])
194
+ btn5.click(fn=lambda: "Guitar", outputs=[text_prompt])
195
+ btn6.click(fn=lambda: "Dog barking", outputs=[text_prompt])
196
 
197
  run_btn.click(
198
+ fn=separate_audio,
199
+ inputs=[input_audio, input_video, text_prompt],
200
  outputs=[output_target, output_residual, status_output]
201
  )
202