|
|
|
|
|
import math

import gradio as gr
import numpy as np
import soundfile
import torch
from audiocraft import AudioGen
|
|
|
|
|
# Build the generator once at import time so every request reuses the same
# instance, then call .eval() on it and move the result to the CPU.
audiogen = AudioGen()
audiogen = audiogen.eval().to('cpu')
|
|
|
|
|
|
|
|
def _finite_or(value, fallback):
    """Return ``value`` as a finite float, or ``fallback`` if that is impossible.

    ``None`` (what an emptied numeric input field delivers), non-numeric
    values, NaN and +/-inf all yield ``fallback``.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return float(fallback)
    return number if math.isfinite(number) else float(fallback)


def audionar_tts(text='frogs',
                 duration=20.4,
                 max_tokens=24,
                 cache_lim=-1):
    """Generate audio for a text prompt and write it to a WAV file.

    A non-blank ``text`` is truncated to 64 characters and passed to the
    module-level ``audiogen.generate``; a blank or missing ``text`` produces
    16000 zero samples instead. Either way the samples are written to
    ``_vits_.wav`` with a sample rate of 16000.

    Numeric arguments that cannot be used (``None``, NaN, inf, non-numeric)
    fall back to the signature defaults instead of raising, so an emptied
    input field in the UI no longer crashes the handler.

    Args:
        text: Prompt describing the sound.
        duration: Requested length. 0.74 is added and the sum is raised to at
            least 2.0 before it is forwarded as ``duration``.
        max_tokens: Forwarded as ``max_tokens`` after being clamped to at
            least 7 and at most ``min(int(duration * 50 / 2), 1400)``.
        cache_lim: Forwarded as ``cache_lim`` with a floor of 6.

    Returns:
        The path of the written file, ``'_vits_.wav'``.
    """
    sample_rate = 16000

    if text and text.strip():
        # Fallback values mirror the defaults in the signature above.
        duration = max(_finite_or(duration, 20.4) + 0.74, 2.0)

        # Upper bound derived from the padded duration; presumably 50 is the
        # model's token rate -- TODO confirm against AudioGen.
        duration_cap = int(duration * 50 / 2)
        token_budget = min(max(7, int(_finite_or(max_tokens, 24))),
                           duration_cap,
                           1400)
        flush_limit = max(6, int(_finite_or(cache_lim, -1)))

        background_audio = audiogen.generate(
            text[:64],
            duration=duration,
            max_tokens=token_budget,
            cache_lim=flush_limit,
        ).numpy()
    else:
        # Nothing to synthesise: emit silence (sample_rate zero samples).
        background_audio = np.zeros(sample_rate, dtype=np.float32)

    # NOTE: the output path is fixed, so every call overwrites the previous
    # file; overlapping requests would share it.
    wavfile = '_vits_.wav'
    soundfile.write(wavfile, background_audio, sample_rate)
    return wavfile
|
|
|
|
|
|
|
|
|
|
|
# Page layout: one row holding the prompt, the three numeric controls and the
# trigger button, with the audio player underneath.
with gr.Blocks() as demo:
    with gr.Row():
        text = gr.Textbox(label="AudioGen Txt:",
                          placeholder="Describe sound - Type Any language",
                          lines=2,
                          value='dogs barg')
        duration = gr.Number(label="Duration (s)", value=7.24)
        n_tokens = gr.Number(label="Tokens", value=24)
        cache_lim = gr.Number(label="kv Flush", value=71)
        generate_button = gr.Button("Generate Audio", variant="primary")

    output_audio = gr.Audio(label="TTS Output")

    # The four inputs map positionally onto audionar_tts(text, duration,
    # max_tokens, cache_lim); the returned WAV path feeds the audio player.
    generate_button.click(
        fn=audionar_tts,
        inputs=[text, duration, n_tokens, cache_lim],
        outputs=[output_audio],
    )

# Start the web server as soon as the module is executed.
demo.launch(debug=True)
|
|
|