Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,52 +34,62 @@ class InferRunner:
|
|
| 34 |
|
| 35 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 36 |
runner = InferRunner(device)
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
|
| 39 |
with torch.no_grad():
|
| 40 |
latents = runner.pico_model.demo_inference(caption, runner.scheduler, num_steps=num_steps, guidance_scale=guidance_scale, num_samples_per_prompt=1, disable_progress=True)
|
| 41 |
mel = runner.vae.decode_first_stage(latents)
|
| 42 |
wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]
|
| 43 |
-
outpath = f"
|
| 44 |
sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
|
| 45 |
return outpath
|
| 46 |
-
with gr.Blocks() as demo:
|
| 47 |
-
with gr.Row():
|
| 48 |
-
gr.Markdown("## PicoAudio")
|
| 49 |
|
| 50 |
-
with gr.Row():
|
| 51 |
-
with gr.Column():
|
| 52 |
-
prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
|
| 53 |
-
value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
|
| 54 |
-
run_button = gr.Button()
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
run_button.click(fn=infer,
|
| 67 |
-
inputs=[prompt, num_steps, guidance_scale],
|
| 68 |
-
outputs=[outaudio])
|
| 69 |
-
# with gr.Row():
|
| 70 |
-
# with gr.Column():
|
| 71 |
-
# gr.Examples(
|
| 72 |
-
# examples = [['An amateur recording features a steel drum playing in a higher register',25,5,55],
|
| 73 |
-
# ['An instrumental song with a caribbean feel, happy mood, and featuring steel pan music, programmed percussion, and bass',25,5,55],
|
| 74 |
-
# ['This musical piece features a playful and emotionally melodic male vocal accompanied by piano',25,5,55],
|
| 75 |
-
# ['A eerie yet calming experimental electronic track featuring haunting synthesizer strings and pads',25,5,55],
|
| 76 |
-
# ['A slow tempo pop instrumental piece featuring only acoustic guitar with fingerstyle and percussive strumming techniques',25,5,55]],
|
| 77 |
-
# inputs = [prompt, ddim_steps, scale, seed],
|
| 78 |
-
# outputs = [outaudio],
|
| 79 |
-
# )
|
| 80 |
-
# cache_examples="lazy", # Turn on to cache.
|
| 81 |
-
# with gr.Column():
|
| 82 |
-
# pass
|
| 83 |
|
| 84 |
demo.launch()
|
| 85 |
|
|
|
|
| 34 |
|
| 35 |
# Pick the compute device: CUDA when torch reports it available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Module-level runner shared by every call to infer().
runner = InferRunner(device)
|
| 37 |
+
# Event names shown to users in the UI (joined into the "... events: ..."
# Markdown line). Presumably these are the sound classes the model
# understands in captions -- confirm against the model's training labels.
# The trailing comment on each entry is its index in the list.
event_list = [
    "burping_belching",  # 0
    "car_horn_honking",  # 1
    "cat_meowing",  # 2
    "cow_mooing",  # 3
    "dog_barking",  # 4
    "door_knocking",  # 5
    "door_slamming",  # 6
    "explosion",  # 7
    "gunshot",  # 8
    "sheep_goat_bleating",  # 9
    "sneeze",  # 10
    "spraying",  # 11
    "thump_thud",  # 12
    "train_horn",  # 13
    "tapping_clicking_clanking",  # 14
    "woman_laughing",  # 15
    "duck_quacking",  # 16
    "whistling",  # 17
]
|
| 57 |
def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
    """Generate audio for ``caption`` and return the path of the written WAV.

    Args:
        caption: Text prompt handed to ``runner.pico_model.demo_inference``.
        num_steps: Forwarded to ``demo_inference`` as ``num_steps``.
        guidance_scale: Forwarded to ``demo_inference`` as ``guidance_scale``.
        audio_len: Upper bound on how many leading elements of the first
            decoded waveform are kept. With the 16000 Hz rate used when
            writing, the default presumably corresponds to 10 seconds --
            confirm the waveform layout returned by ``decode_to_waveform``.

    Returns:
        The path ``"output.wav"``, relative to the working directory.
    """
    # Inference only: no autograd graph is needed for any of the model calls.
    with torch.no_grad():
        latents = runner.pico_model.demo_inference(
            caption,
            runner.scheduler,
            num_steps=num_steps,
            guidance_scale=guidance_scale,
            num_samples_per_prompt=1,
            disable_progress=True,
        )
        mel = runner.vae.decode_first_stage(latents)
        # Keep the first waveform of the batch, truncated to audio_len.
        wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]

    # NOTE: fixed file name, so every call overwrites the previous result.
    outpath = "output.wav"
    sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
    return outpath
|
|
|
|
|
|
|
|
|
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
# NOTE(review): these Markdown components are created at module level and are
# not passed to gr.Interface below; confirm whether they are expected to be
# rendered (Interface's title/description arguments may be the intended place).
gr.Markdown("## PicoAudio")
# Derive the count from the list so the text stays correct if events change.
gr.Markdown(f"{len(event_list)} events: " + ", ".join(event_list))

# Caption input, pre-filled with a two-event example.
prompt = gr.Textbox(
    label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
    value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",
)

# Output component for the WAV path returned by infer(). gr.Interface below
# references `outaudio`, so it has to exist before the Interface is built.
outaudio = gr.Audio()

# Sliders whose values are passed to infer() as num_steps and guidance_scale.
num_steps = gr.Slider(
    label="num_steps",
    minimum=1, maximum=300, value=200, step=1,
)
guidance_scale = gr.Slider(
    label="guidance_scale Scale:(Large => more relevant to text but the quality may drop)",
    minimum=0.1, maximum=8.0, value=3.0, step=0.1,
)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# Wire the input components to infer() and expose the result as audio.
gr_interface = gr.Interface(
    fn=infer,
    inputs=[prompt, num_steps, guidance_scale],
    outputs=[outaudio],
    # Flagging is disabled; Gradio expects the string form, and the boolean
    # form is deprecated.
    allow_flagging="never",
    # Each example row supplies only the caption (the first input).
    examples=[
        ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
        ["dog_barking at 0.562-2.562_4.25-6.25."],
        ["cow_mooing at 0.958-3.582_5.272-7.896."],
    ],
    cache_examples="lazy",  # Turn on to cache.
)
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# The app object is `gr_interface`; no `demo` is defined in this version.
gr_interface.launch()
|
| 95 |
|