Spaces:
Running
on
Zero
Running
on
Zero
Adjust tabs
Browse files
app.py
CHANGED
|
@@ -2,6 +2,10 @@
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
import spaces
|
| 7 |
|
|
@@ -167,6 +171,37 @@ def speech_enhancement(
|
|
| 167 |
return None, f"Error: {str(e)}"
|
| 168 |
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
@spaces.GPU(duration=60)
|
| 171 |
def audio_super_resolution(
|
| 172 |
low_sr_audio,
|
|
@@ -187,9 +222,18 @@ def audio_super_resolution(
|
|
| 187 |
num_steps=num_steps,
|
| 188 |
output_path=output_path
|
| 189 |
)
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
except Exception as e:
|
| 192 |
-
return None, f"Error: {str(e)}"
|
| 193 |
|
| 194 |
|
| 195 |
@spaces.GPU(duration=60)
|
|
@@ -217,9 +261,24 @@ def video_to_audio(
|
|
| 217 |
return None, f"Error: {str(e)}"
|
| 218 |
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
# Create Gradio Interface
|
| 221 |
with gr.Blocks(
|
| 222 |
-
title="UniFlow-Audio Inference Demo",
|
|
|
|
|
|
|
| 223 |
) as demo:
|
| 224 |
gr.Markdown("# π UniFlow-Audio Inference Demo")
|
| 225 |
gr.Markdown(
|
|
@@ -228,7 +287,7 @@ with gr.Blocks(
|
|
| 228 |
|
| 229 |
with gr.Tabs():
|
| 230 |
# Tab 1: Text to Audio
|
| 231 |
-
with gr.Tab("π’ Text to Audio
|
| 232 |
with gr.Row():
|
| 233 |
with gr.Column():
|
| 234 |
t2a_caption = gr.Textbox(
|
|
@@ -279,7 +338,7 @@ with gr.Blocks(
|
|
| 279 |
)
|
| 280 |
|
| 281 |
# Tab 2: Text to Music
|
| 282 |
-
with gr.Tab("πΌ Text to Music
|
| 283 |
with gr.Row():
|
| 284 |
with gr.Column():
|
| 285 |
t2m_caption = gr.Textbox(
|
|
@@ -330,7 +389,7 @@ with gr.Blocks(
|
|
| 330 |
)
|
| 331 |
|
| 332 |
# Tab 3: Text to Speech
|
| 333 |
-
with gr.Tab("π£οΈ Text to Speech
|
| 334 |
with gr.Row():
|
| 335 |
with gr.Column():
|
| 336 |
tts_transcript = gr.Textbox(
|
|
@@ -393,7 +452,7 @@ with gr.Blocks(
|
|
| 393 |
)
|
| 394 |
|
| 395 |
# Tab 4: Singing Voice Synthesis
|
| 396 |
-
with gr.Tab("π€ Singing Voice Synthesis
|
| 397 |
with gr.Row():
|
| 398 |
with gr.Column():
|
| 399 |
svs_singer = gr.Dropdown(
|
|
@@ -487,7 +546,7 @@ with gr.Blocks(
|
|
| 487 |
)
|
| 488 |
|
| 489 |
# Tab 5: Speech Enhancement
|
| 490 |
-
with gr.Tab("π Speech Enhancement
|
| 491 |
with gr.Row():
|
| 492 |
with gr.Column():
|
| 493 |
se_input = gr.Audio(label="Noisy Speech", type="filepath")
|
|
@@ -533,7 +592,7 @@ with gr.Blocks(
|
|
| 533 |
)
|
| 534 |
|
| 535 |
# Tab 6: Audio Super Resolution
|
| 536 |
-
with gr.Tab("β¬οΈ Audio
|
| 537 |
with gr.Row():
|
| 538 |
with gr.Column():
|
| 539 |
sr_input = gr.Audio(
|
|
@@ -569,10 +628,21 @@ with gr.Blocks(
|
|
| 569 |
)
|
| 570 |
sr_status = gr.Textbox(label="Status")
|
| 571 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
sr_button.click(
|
| 573 |
fn=audio_super_resolution,
|
| 574 |
inputs=[sr_input, sr_model, sr_guidance, sr_steps],
|
| 575 |
-
outputs=[sr_output, sr_status]
|
| 576 |
)
|
| 577 |
|
| 578 |
gr.Examples(
|
|
@@ -583,7 +653,7 @@ with gr.Blocks(
|
|
| 583 |
)
|
| 584 |
|
| 585 |
# Tab 7: Video to Audio
|
| 586 |
-
with gr.Tab("π¬ Video to Audio
|
| 587 |
with gr.Row():
|
| 588 |
with gr.Column():
|
| 589 |
v2a_input = gr.Video(label="Input Video")
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import gradio as gr
|
| 5 |
+
import numpy as np
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
import librosa
|
| 8 |
+
import librosa.display
|
| 9 |
|
| 10 |
import spaces
|
| 11 |
|
|
|
|
| 171 |
return None, f"Error: {str(e)}"
|
| 172 |
|
| 173 |
|
| 174 |
+
def generate_spectrogram(audio_path, title="Spectrogram"):
    """Render a dB-scaled STFT spectrogram of an audio file to a PNG.

    Loads the audio at its native sample rate, plots a linear-frequency
    spectrogram of the STFT magnitude (in dB, referenced to the peak),
    and saves the figure next to the input file.

    Args:
        audio_path: Path to the input audio file.
        title: Plot title prefix; the detected sample rate is appended.

    Returns:
        Path to the saved spectrogram image, or None if anything failed
        (best-effort: errors are printed, not raised, so callers in the
        UI flow degrade gracefully).
    """
    try:
        # sr=None keeps the file's native sample rate (no resampling).
        y, sr = librosa.load(audio_path, sr=None)

        # Create figure
        fig, ax = plt.subplots(figsize=(10, 4))

        # Linear-frequency STFT magnitude in dB, referenced to the peak.
        # (Not a mel spectrogram: specshow below uses y_axis='hz'.)
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

        # Display spectrogram
        img = librosa.display.specshow(
            D, y_axis='hz', x_axis='time', sr=sr, ax=ax
        )
        ax.set_title(f'{title} (Sample Rate: {sr} Hz)')
        fig.colorbar(img, ax=ax, format='%+2.0f dB')

        # Build the output path from the input's stem. The previous
        # str.replace('.wav', ...) was a no-op for non-.wav inputs
        # (e.g. .mp3/.flac uploads), which made spec_path equal to
        # audio_path and OVERWROTE the audio file with the PNG.
        base, _ext = os.path.splitext(audio_path)
        spec_path = f"{base}_spec.png"
        plt.tight_layout()
        fig.savefig(spec_path, dpi=100, bbox_inches='tight')
        # Close explicitly so long-running Space processes don't leak figures.
        plt.close(fig)

        return spec_path
    except Exception as e:
        print(f"Error generating spectrogram: {str(e)}")
        return None
|
| 203 |
+
|
| 204 |
+
|
| 205 |
@spaces.GPU(duration=60)
|
| 206 |
def audio_super_resolution(
|
| 207 |
low_sr_audio,
|
|
|
|
| 222 |
num_steps=num_steps,
|
| 223 |
output_path=output_path
|
| 224 |
)
|
| 225 |
+
|
| 226 |
+
# Generate spectrograms for input and output
|
| 227 |
+
input_spec = generate_spectrogram(
|
| 228 |
+
low_sr_audio, "Input Audio Spectrogram"
|
| 229 |
+
)
|
| 230 |
+
output_spec = generate_spectrogram(
|
| 231 |
+
output_path, "Output Audio Spectrogram"
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
return output_path, "Super-resolution successful!", input_spec, output_spec
|
| 235 |
except Exception as e:
|
| 236 |
+
return None, f"Error: {str(e)}", None, None
|
| 237 |
|
| 238 |
|
| 239 |
@spaces.GPU(duration=60)
|
|
|
|
| 261 |
return None, f"Error: {str(e)}"
|
| 262 |
|
| 263 |
|
| 264 |
+
# Custom CSS for better tab display
|
| 265 |
+
custom_css = """
|
| 266 |
+
.tab-nav button {
|
| 267 |
+
font-size: 14px !important;
|
| 268 |
+
padding: 8px 12px !important;
|
| 269 |
+
min-width: fit-content !important;
|
| 270 |
+
}
|
| 271 |
+
.tab-nav {
|
| 272 |
+
overflow-x: auto !important;
|
| 273 |
+
flex-wrap: nowrap !important;
|
| 274 |
+
}
|
| 275 |
+
"""
|
| 276 |
+
|
| 277 |
# Create Gradio Interface
|
| 278 |
with gr.Blocks(
|
| 279 |
+
title="UniFlow-Audio Inference Demo",
|
| 280 |
+
theme=gr.themes.Soft(),
|
| 281 |
+
css=custom_css
|
| 282 |
) as demo:
|
| 283 |
gr.Markdown("# π UniFlow-Audio Inference Demo")
|
| 284 |
gr.Markdown(
|
|
|
|
| 287 |
|
| 288 |
with gr.Tabs():
|
| 289 |
# Tab 1: Text to Audio
|
| 290 |
+
with gr.Tab("π’ Text to Audio"):
|
| 291 |
with gr.Row():
|
| 292 |
with gr.Column():
|
| 293 |
t2a_caption = gr.Textbox(
|
|
|
|
| 338 |
)
|
| 339 |
|
| 340 |
# Tab 2: Text to Music
|
| 341 |
+
with gr.Tab("πΌ Text to Music"):
|
| 342 |
with gr.Row():
|
| 343 |
with gr.Column():
|
| 344 |
t2m_caption = gr.Textbox(
|
|
|
|
| 389 |
)
|
| 390 |
|
| 391 |
# Tab 3: Text to Speech
|
| 392 |
+
with gr.Tab("π£οΈ Text to Speech"):
|
| 393 |
with gr.Row():
|
| 394 |
with gr.Column():
|
| 395 |
tts_transcript = gr.Textbox(
|
|
|
|
| 452 |
)
|
| 453 |
|
| 454 |
# Tab 4: Singing Voice Synthesis
|
| 455 |
+
with gr.Tab("π€ Singing Voice Synthesis"):
|
| 456 |
with gr.Row():
|
| 457 |
with gr.Column():
|
| 458 |
svs_singer = gr.Dropdown(
|
|
|
|
| 546 |
)
|
| 547 |
|
| 548 |
# Tab 5: Speech Enhancement
|
| 549 |
+
with gr.Tab("π Speech Enhancement"):
|
| 550 |
with gr.Row():
|
| 551 |
with gr.Column():
|
| 552 |
se_input = gr.Audio(label="Noisy Speech", type="filepath")
|
|
|
|
| 592 |
)
|
| 593 |
|
| 594 |
# Tab 6: Audio Super Resolution
|
| 595 |
+
with gr.Tab("β¬οΈ Audio SR"):
|
| 596 |
with gr.Row():
|
| 597 |
with gr.Column():
|
| 598 |
sr_input = gr.Audio(
|
|
|
|
| 628 |
)
|
| 629 |
sr_status = gr.Textbox(label="Status")
|
| 630 |
|
| 631 |
+
# Spectrograms display
|
| 632 |
+
with gr.Row():
|
| 633 |
+
with gr.Column():
|
| 634 |
+
sr_input_spec = gr.Image(
|
| 635 |
+
label="Input Spectrogram", type="filepath"
|
| 636 |
+
)
|
| 637 |
+
with gr.Column():
|
| 638 |
+
sr_output_spec = gr.Image(
|
| 639 |
+
label="Output Spectrogram", type="filepath"
|
| 640 |
+
)
|
| 641 |
+
|
| 642 |
sr_button.click(
|
| 643 |
fn=audio_super_resolution,
|
| 644 |
inputs=[sr_input, sr_model, sr_guidance, sr_steps],
|
| 645 |
+
outputs=[sr_output, sr_status, sr_input_spec, sr_output_spec]
|
| 646 |
)
|
| 647 |
|
| 648 |
gr.Examples(
|
|
|
|
| 653 |
)
|
| 654 |
|
| 655 |
# Tab 7: Video to Audio
|
| 656 |
+
with gr.Tab("π¬ Video to Audio"):
|
| 657 |
with gr.Row():
|
| 658 |
with gr.Column():
|
| 659 |
v2a_input = gr.Video(label="Input Video")
|