chenxie95 commited on
Commit
2a74df9
·
verified ·
1 Parent(s): bbf0f02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -8
app.py CHANGED
@@ -1,5 +1,9 @@
1
  import gradio as gr
2
- import spaces
 
 
 
 
3
  import torch
4
  import numpy as np
5
  import librosa
@@ -9,6 +13,7 @@ import json5
9
  import torchaudio
10
  import tempfile
11
  import os
 
12
  from audio_controlnet.infer import AudioControlNet
13
 
14
  import logging
@@ -16,6 +21,84 @@ logging.getLogger("gradio").setLevel(logging.WARNING)
16
 
17
  MAX_DURATION = 10.0 # seconds
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # -----------------------------
20
  # Feature extraction utilities
21
  # -----------------------------
@@ -129,7 +212,7 @@ def save_temp_wav(audio):
129
  # -----------------------------
130
  # Generate audio
131
  # -----------------------------
132
- @spaces.GPU
133
  def generate_audio(text, cond_loudness, cond_pitch, cond_events):
134
  control = {}
135
  temp_files = []
@@ -169,23 +252,26 @@ def generate_audio(text, cond_loudness, cond_pitch, cond_events):
169
  # -----------------------------
170
  blue_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky", neutral_hue="slate")
171
 
 
 
172
  EVENTS_PLACEHOLDER = '''
173
  // example
174
  {
175
- "Video game sound": [[0.0, 10.0]],
176
- "Male speech, man speaking": [[0.015, 3.829], [4.293, 4.875], [5.089, 7.349], [8.071, 9.978]]
 
177
  }
178
  '''.strip()
179
 
180
  with gr.Blocks(theme=blue_theme, title="Audio ControlNet – Text to Audio") as demo:
181
  gr.Markdown("""
182
  # 🎵 Audio ControlNet
183
- ## Text-to-Audio Generation with Conditions
184
- Base T2A interface with conditional inputs for **Audio ControlNet**.
185
  """)
186
  gr.HTML("""
187
  <style>
188
- .plot-small { height: 250px !important; }
189
  </style>
190
  """)
191
 
@@ -193,7 +279,7 @@ with gr.Blocks(theme=blue_theme, title="Audio ControlNet – Text to Audio") as
193
  with gr.Column(scale=2):
194
  text_prompt = gr.Textbox(
195
  label="Text Prompt",
196
- placeholder="A calm ambient soundscape with soft pads and distant piano",
197
  lines=4,
198
  )
199
 
@@ -202,6 +288,7 @@ with gr.Blocks(theme=blue_theme, title="Audio ControlNet – Text to Audio") as
202
  with gr.Row():
203
  with gr.Column(scale=1):
204
  sound_events = gr.Textbox(label="Sound Events (JSON)", placeholder=EVENTS_PLACEHOLDER, lines=8)
 
205
  with gr.Column(scale=1):
206
  events_plot = gr.Plot(label="Sound Events Roll", elem_classes="plot-small")
207
 
@@ -227,6 +314,13 @@ with gr.Blocks(theme=blue_theme, title="Audio ControlNet – Text to Audio") as
227
  loudness_audio.change(fn=extract_loudness, inputs=loudness_audio, outputs=loudness_plot)
228
  pitch_audio.change(fn=extract_pitch, inputs=pitch_audio, outputs=pitch_plot)
229
  sound_events.change(fn=visualize_events, inputs=sound_events, outputs=events_plot)
 
 
 
 
 
 
 
230
 
231
  generate_btn.click(
232
  fn=generate_audio,
 
1
  import gradio as gr
2
# Hugging Face Spaces provides `spaces.GPU` to mark a function as GPU-requiring.
# Outside Spaces the package (or the attribute) is unavailable, so fall back to
# a no-op decorator that returns the function unchanged.
try:
    import spaces
    require_gpu = spaces.GPU
except (ImportError, AttributeError):
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate.
    def require_gpu(f):
        return f
7
  import torch
8
  import numpy as np
9
  import librosa
 
13
  import torchaudio
14
  import tempfile
15
  import os
16
+ import random
17
  from audio_controlnet.infer import AudioControlNet
18
 
19
  import logging
 
21
 
22
  MAX_DURATION = 10.0 # seconds
23
 
24
# -----------------------------
# Random Examples Data
# -----------------------------
# Each entry pairs a text caption with a "sound events" dict mapping an
# AudioSet-style event label to a list of [start, end] time spans in seconds
# (all spans lie within the 10-second MAX_DURATION window).
RANDOM_EXAMPLES = [
    {
        "caption": "People speak and clap, a child speaks and a camera clicks.",
        "events": {
            "Female speech, woman speaking": [[0.0, 3.969], [7.913, 8.157], [8.189, 9.654]],
            "Child speech, kid speaking": [[9.724, 10.0]]
        }
    },
    {
        "caption": "Background noise, tapping, and cat sounds are interspersed with purring.",
        "events": {
            "Cat": [[0.978, 2.291], [9.032, 10.0]]
        }
    },
    {
        "caption": "Animals, dogs, and people are growling, shouting, and speaking.",
        "events": {
            "Dog": [[0.005, 0.165], [0.717, 1.529], [1.981, 3.139], [3.569, 4.562], [4.87, 5.964], [6.389, 7.621], [8.067, 8.98], [9.299, 9.878]],
            "Speech": [[0.149, 0.738], [1.609, 1.954], [4.583, 4.886], [7.631, 8.024], [9.007, 9.288]],
            "Male speech, man speaking": [[3.202, 3.532], [5.975, 6.378], [9.878, 10.0]]
        }
    },
    {
        "caption": "Water flows and dishes clatter with child speech and laughter.",
        "events": {
            "Child speech, kid speaking": [[0.0, 1.503], [1.732, 2.12], [2.942, 3.541], [7.803, 8.493]],
            "Dishes, pots, and pans": [[1.983, 2.156], [3.175, 3.298], [4.774, 5.076], [5.711, 5.834], [6.076, 6.24], [6.423, 7.012]],
            "Male speech, man speaking": [[8.547, 9.557]],
            "Water tap, faucet": [[0.0, 10.0]]
        }
    },
    {
        "caption": "Speech babble and clattering dishes and silverware can be heard, along with a child's voice.",
        "events": {
            "Dishes, pots, and pans": [[0.85, 0.969], [1.386, 1.504], [7.717, 7.874]],
            "Male speech, man speaking": [[0.748, 1.173]],
            "Cutlery, silverware": [[4.693, 4.843], [5.299, 5.52]],
            "Female speech, woman speaking": [[1.63, 3.409]],
            "Child speech, kid speaking": [[8.756, 9.354]]
        }
    },
    {
        "caption": "A man is speaking, with background sounds of wind and a river, and another man sighing and speaking.",
        "events": {"Male speech, man speaking": [[0.0, 7.851], [8.903, 9.129], [9.328, 9.98]], "Conversation": [[0.0, 9.98]], "Wind": [[0.0, 9.98]], "Stream, river": [[0.0, 9.98]], "Sigh": [[8.157, 8.707]]}
    },
    {
        "caption": "Wind noise and cowbell are heard twice.",
        "events": {"Wind noise (microphone)": [[0.0, 1.15], [2.378, 2.961]], "Cowbell": [[0.0, 10.0]]}
    },
    {
        "caption": "There are mechanisms, bird calls, clicking, and male speech.",
        "events": {"Mechanisms": [[0.0, 10.0]], "Bird vocalization, bird call, bird song": [[1.122, 1.423]], "Clicking": [[1.139, 1.238], [4.737, 4.858]], "Male speech, man speaking": [[1.95, 2.875], [5.182, 5.795], [6.113, 6.807], [7.386, 8.138], [8.236, 8.803], [9.427, 10.0]]}
    },
    {
        "caption": "Propeller noise and a sound effect.",
        "events": {"Propeller, airscrew": [[1.779, 10.0]], "Sound effect": [[1.811, 2.868]]}
    },
    {
        "caption": "Women converse and laugh in a noisy crowd.",
        "events": {"Female speech, woman speaking": [[0.0, 1.669], [2.097, 2.976], [4.66, 8.98]], "Conversation": [[0.0, 9.379]], "Background noise": [[0.0, 9.379]], "Generic impact sounds": [[0.096, 0.318], [3.707, 3.944], [6.107, 6.314], [7.584, 7.695], [8.256, 8.367]], "Laughter": [[1.573, 2.947], [4.461, 6.174], [9.002, 9.364]], "Crowd": [[1.573, 2.954], [4.512, 6.129], [9.002, 9.379]], "Tick": [[1.691, 1.795], [4.276, 4.372]], "Sound effect": [[3.212, 4.416]]}
    }
]
89
def build_events_json_text(events):
    """Format a sound-events dict as indented JSON text for the events textbox.

    Parameters:
        events: mapping of event label -> list of [start, end] spans
            (numeric lists, whose Python repr is also valid JSON).

    Returns:
        str: a JSON object string with one ' "label": spans' entry per line,
        e.g. '{\\n "Dog": [[0.0, 1.0]]\\n}'.
    """
    # Handle the empty mapping explicitly instead of emitting '{\n\n}'.
    if not events:
        return '{}'
    entries = [f' "{label}": {spans}' for label, spans in events.items()]
    # Join with commas so there is no trailing comma: the original
    # `ret.strip(',')` never fired because the string ended with '\n',
    # which left invalid strict JSON behind.
    return '{\n' + ',\n'.join(entries) + '\n}'
95
+
96
def generate_random_example():
    """Draw one entry from RANDOM_EXAMPLES and return (caption, events JSON text)."""
    picked = random.choice(RANDOM_EXAMPLES)
    caption = picked["caption"]
    events_text = build_events_json_text(picked["events"])
    return caption, events_text
101
+
102
  # -----------------------------
103
  # Feature extraction utilities
104
  # -----------------------------
 
212
  # -----------------------------
213
  # Generate audio
214
  # -----------------------------
215
+ @require_gpu
216
  def generate_audio(text, cond_loudness, cond_pitch, cond_events):
217
  control = {}
218
  temp_files = []
 
252
  # -----------------------------
253
  blue_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="sky", neutral_hue="slate")
254
 
255
# Placeholder text shown (greyed out) in the caption textbox before input.
CAPTION_PLACEHOLDER = 'Water flows and dishes clatter with child speech and laughter.'

# Placeholder for the sound-events textbox. The leading "// example" line is a
# JSON5-style comment — NOTE(review): presumably tolerated because the file
# imports json5 for parsing; confirm the parser used on this field.
EVENTS_PLACEHOLDER = '''
// example
{
 "Child speech, kid speaking": [[0.0, 1.503], [1.732, 2.12], [2.942, 3.541], [7.803, 8.493]],
 "Dishes, pots, and pans": [[1.983, 2.156], [3.175, 3.298], [4.774, 5.076], [5.711, 5.834], [6.076, 6.24], [6.423, 7.012]],
 "Water tap, faucet": [[0.0, 10.0]]
}
'''.strip()
265
 
266
  with gr.Blocks(theme=blue_theme, title="Audio ControlNet – Text to Audio") as demo:
267
  gr.Markdown("""
268
  # 🎵 Audio ControlNet
269
+ ## Fine-Grained Text-to-Audio Generation with Conditions
270
+ T2A GUI interface with conditional inputs for **Audio ControlNet**.
271
  """)
272
  gr.HTML("""
273
  <style>
274
+ .plot-small { height: 280px !important; }
275
  </style>
276
  """)
277
 
 
279
  with gr.Column(scale=2):
280
  text_prompt = gr.Textbox(
281
  label="Text Prompt",
282
+ placeholder=CAPTION_PLACEHOLDER,
283
  lines=4,
284
  )
285
 
 
288
  with gr.Row():
289
  with gr.Column(scale=1):
290
  sound_events = gr.Textbox(label="Sound Events (JSON)", placeholder=EVENTS_PLACEHOLDER, lines=8)
291
+ random_example_btn = gr.Button("🎲 Random Example", variant="primary", size="sm")
292
  with gr.Column(scale=1):
293
  events_plot = gr.Plot(label="Sound Events Roll", elem_classes="plot-small")
294
 
 
314
  loudness_audio.change(fn=extract_loudness, inputs=loudness_audio, outputs=loudness_plot)
315
  pitch_audio.change(fn=extract_pitch, inputs=pitch_audio, outputs=pitch_plot)
316
  sound_events.change(fn=visualize_events, inputs=sound_events, outputs=events_plot)
317
+
318
+ # Random example button event
319
+ random_example_btn.click(
320
+ fn=generate_random_example,
321
+ inputs=[],
322
+ outputs=[text_prompt, sound_events]
323
+ )
324
 
325
  generate_btn.click(
326
  fn=generate_audio,