Spaces:
Paused
Paused
Expose AudioToken/Syllable ratio as param to allow manual speed and audio length control
Browse files
app.py
CHANGED
|
@@ -27,11 +27,50 @@ def run_asr(audio):
|
|
| 27 |
|
| 28 |
return transcript.text, transcript.text, word_times
|
| 29 |
|
| 30 |
-
def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp, init_diversity, guidance, rescale, topk):
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
def run_inpainter_tts(input_text, voice_audio):
    """Synthesize speech for ``input_text`` using ``voice_audio`` as the voice.

    Wraps both arguments in a ``TTSInput`` and returns whatever
    ``inpainter.tts`` produces for it.
    """
    tts_request = TTSInput(output_text=input_text, voice=voice_audio)
    return inpainter.tts(tts_request)
|
| 35 |
|
| 36 |
if __name__ == '__main__':
|
| 37 |
with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
|
|
@@ -43,13 +82,7 @@ if __name__ == '__main__':
|
|
| 43 |
gr.Markdown("### Run the inpainter to generate the modified audio.")
|
| 44 |
gr.Markdown("### Note: The model and demo are currently targeted for English.")
|
| 45 |
|
| 46 |
-
|
| 47 |
-
num_steps_slider = gr.Slider(minimum=1, maximum=100, step=1, label="number of sampling steps codebook", value=30)
|
| 48 |
-
init_temp_slider = gr.Slider(minimum=0.5, maximum=10, step=0.1, label="Initial temperature", value=1)
|
| 49 |
-
init_diversity_slider = gr.Slider(minimum=0, maximum=10, step=0.1, label="Initial diversity", value=1)
|
| 50 |
-
guidance_slider = gr.Slider(minimum=0, maximum=10, step=0.1, label="guidance", value=0.5)
|
| 51 |
-
rescale_slider = gr.Slider(minimum=0, maximum=1, step=0.1, label="guidance rescale factor", value=0.7)
|
| 52 |
-
topk_slider = gr.Slider(minimum=1, maximum=10000, step=1, label="sampling from top-k logits", value=25)
|
| 53 |
|
| 54 |
with gr.Row():
|
| 55 |
audio_input = gr.Audio(label="Upload audio to be modified", sources=["upload", "microphone"], type="filepath")
|
|
@@ -71,10 +104,15 @@ if __name__ == '__main__':
|
|
| 71 |
audio_output = gr.Audio(label="Output audio")
|
| 72 |
|
| 73 |
asr_submit.click(run_asr, inputs=[audio_input], outputs=[text_input, text_output, word_times])
|
| 74 |
-
inpainter_submit.click(
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
with gr.Tab("Text to Speech"):
|
| 77 |
gr.Markdown("### Text to Speech")
|
|
|
|
|
|
|
| 78 |
tts_text = gr.Textbox(label="TTS Input", placeholder="Enter text to convert to speech", lines=2)
|
| 79 |
tts_voice = gr.Audio(label="Voice to use for TTS",
|
| 80 |
sources=["upload", "microphone"], type="filepath",
|
|
@@ -84,7 +122,7 @@ if __name__ == '__main__':
|
|
| 84 |
|
| 85 |
tts_submit.click(
|
| 86 |
run_inpainter_tts,
|
| 87 |
-
inputs=[tts_text, tts_voice],
|
| 88 |
outputs=[tts_output]
|
| 89 |
)
|
| 90 |
|
|
|
|
| 27 |
|
| 28 |
return transcript.text, transcript.text, word_times
|
| 29 |
|
| 30 |
+
def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
    """Build an ``InpaintInput`` from the UI values and run the inpainter.

    The sampling parameters are forwarded unchanged. The manual
    ``audio_token_syllable_ratio`` is only forwarded when
    ``use_manual_ratio`` is truthy; otherwise ``None`` is passed instead
    (presumably letting the model pick the ratio itself -- confirm against
    ``InpaintInput``).

    Returns:
        Whatever ``inpainter.inpaint`` returns for the assembled request.
    """
    # Ignore the number box entirely unless the manual checkbox is ticked.
    effective_ratio = audio_token_syllable_ratio if use_manual_ratio else None

    request = InpaintInput(
        input_text=input_text,
        output_text=output_text,
        input_word_times=word_times,
        audio=audio,
        num_steps=num_steps,
        init_temp=init_temp,
        init_diversity=init_diversity,
        guidance=guidance,
        rescale=rescale,
        topk=topk,
        audio_token_syllable_ratio=effective_ratio,
    )
    return inpainter.inpaint(request)
|
| 36 |
+
|
| 37 |
+
def run_inpainter_tts(input_text, voice_audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
    """Build a ``TTSInput`` from the UI values and run text-to-speech.

    ``input_text`` is sent as the text to speak and ``voice_audio`` as the
    voice. The sampling parameters are forwarded unchanged. The manual
    ``audio_token_syllable_ratio`` is only forwarded when
    ``use_manual_ratio`` is truthy; otherwise ``None`` is passed instead
    (presumably letting the model pick the ratio itself -- confirm against
    ``TTSInput``).

    Returns:
        Whatever ``inpainter.tts`` returns for the assembled request.
    """
    # Ignore the number box entirely unless the manual checkbox is ticked.
    effective_ratio = audio_token_syllable_ratio if use_manual_ratio else None

    request = TTSInput(
        output_text=input_text,
        voice=voice_audio,
        num_steps=num_steps,
        init_temp=init_temp,
        init_diversity=init_diversity,
        guidance=guidance,
        rescale=rescale,
        topk=topk,
        audio_token_syllable_ratio=effective_ratio,
    )
    return inpainter.tts(request)
|
| 43 |
+
|
| 44 |
+
def toggle_ratio_input(use_manual):
    """Return a Gradio update that shows and enables a component iff ``use_manual``.

    Both the ``visible`` and ``interactive`` properties are set to the
    checkbox value, so the target component is hidden and read-only while
    manual mode is off.
    """
    component_state = {"visible": use_manual, "interactive": use_manual}
    return gr.update(**component_state)
|
| 46 |
+
|
| 47 |
+
def create_advanced_options_accordion():
    """Create a collapsed "Advanced options" accordion and return its inputs.

    Must be called inside an active ``gr.Blocks`` context. The accordion
    holds six sampling sliders, a checkbox that switches to a manual audio
    token / syllable ratio, and the number box for that ratio. The number
    box starts hidden and non-interactive; the checkbox's ``change`` event
    is wired to ``toggle_ratio_input`` to reveal or hide it.

    Returns:
        An 8-tuple of components in this order: sampling steps, initial
        temperature, initial diversity, guidance, guidance rescale, top-k,
        manual-ratio checkbox, manual-ratio number box.
    """
    with gr.Accordion("Advanced options", open=False):
        steps = gr.Slider(minimum=1, maximum=100, value=30, step=1,
                          label="number of sampling steps codebook")
        temperature = gr.Slider(minimum=0.5, maximum=10, value=1, step=0.1,
                                label="Initial temperature")
        diversity = gr.Slider(minimum=0, maximum=10, value=1, step=0.1,
                              label="Initial diversity")
        guidance = gr.Slider(minimum=0, maximum=10, value=0.5, step=0.1,
                             label="guidance")
        rescale = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1,
                            label="guidance rescale factor")
        topk = gr.Slider(minimum=1, maximum=10000, value=25, step=1,
                         label="sampling from top-k logits")

        gr.Markdown("#### Audio Token Syllable Ratio")
        gr.Markdown("*Automatic calculation (recommended) provides the best results in most cases.*")
        manual_toggle = gr.Checkbox(label="Use manual audio token syllable ratio", value=False)
        # Hidden and locked until the checkbox above is ticked.
        ratio_box = gr.Number(
            label="Audio token syllable ratio (manual)",
            value=12.5,
            precision=2,
            minimum=5.0,
            maximum=25.0,
            visible=False,
            interactive=False,
        )
        manual_toggle.change(
            toggle_ratio_input,
            inputs=[manual_toggle],
            outputs=[ratio_box],
        )

    return (
        steps,
        temperature,
        diversity,
        guidance,
        rescale,
        topk,
        manual_toggle,
        ratio_box,
    )
|
| 73 |
|
|
|
|
|
|
|
| 74 |
|
| 75 |
if __name__ == '__main__':
|
| 76 |
with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
|
|
|
|
| 82 |
gr.Markdown("### Run the inpainter to generate the modified audio.")
|
| 83 |
gr.Markdown("### Note: The model and demo are currently targeted for English.")
|
| 84 |
|
| 85 |
+
inpaint_advanced_options = create_advanced_options_accordion()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
with gr.Row():
|
| 88 |
audio_input = gr.Audio(label="Upload audio to be modified", sources=["upload", "microphone"], type="filepath")
|
|
|
|
| 104 |
audio_output = gr.Audio(label="Output audio")
|
| 105 |
|
| 106 |
asr_submit.click(run_asr, inputs=[audio_input], outputs=[text_input, text_output, word_times])
|
| 107 |
+
inpainter_submit.click(
|
| 108 |
+
run_inpainter,
|
| 109 |
+
inputs=[text_input, text_output, word_times, audio_input] + list(inpaint_advanced_options),
|
| 110 |
+
outputs=[audio_output])
|
| 111 |
|
| 112 |
with gr.Tab("Text to Speech"):
|
| 113 |
gr.Markdown("### Text to Speech")
|
| 114 |
+
tts_advanced_options = create_advanced_options_accordion()
|
| 115 |
+
|
| 116 |
tts_text = gr.Textbox(label="TTS Input", placeholder="Enter text to convert to speech", lines=2)
|
| 117 |
tts_voice = gr.Audio(label="Voice to use for TTS",
|
| 118 |
sources=["upload", "microphone"], type="filepath",
|
|
|
|
| 122 |
|
| 123 |
tts_submit.click(
|
| 124 |
run_inpainter_tts,
|
| 125 |
+
inputs=[tts_text, tts_voice] + list(tts_advanced_options),
|
| 126 |
outputs=[tts_output]
|
| 127 |
)
|
| 128 |
|