# -*- coding: utf-8 -*-
"""Gradio demo: generate a soundscape WAV file from a text prompt with AudioGen."""
import gradio as gr
import numpy as np
import torch
import soundfile
from audiocraft import AudioGen

# Single shared model instance, put in eval mode and kept on the CPU.
audiogen = AudioGen().eval().to('cpu')

# Sample rate handed to soundfile.write; also the length of the silent fallback (1 s).
_SAMPLE_RATE = 16000
# NOTE(review): every call writes to this same path, so concurrent requests
# overwrite each other's output - confirm this is acceptable for the demo.
_OUTPUT_WAV = '_vits_.wav'


def audionar_tts(text='frogs',
                 duration=20.4,  # seconds
                 max_tokens=24,  # True A/R steps (repeats the rest of duration)
                 cache_lim=-1
                 ):
    """Generate audio for *text* and write it to a WAV file.

    Args:
        text: Soundscape description. Only the first 64 characters are passed
            to the model. Empty or whitespace-only text yields one second of
            silence instead of calling the model.
        duration: Requested length in seconds. It is padded by 0.74 s and
            raised to at least 2.0 s before generation. ``None`` falls back
            to 20.4.
        max_tokens: Autoregressive step budget. Clamped to at least 7, at most
            ``int(duration * 50 / 2)`` and at most 1400. ``None`` falls back
            to 24.
        cache_lim: KV-cache limit forwarded to the model, raised to at least 6.
            ``None`` falls back to -1 (which is then raised to 6).

    Returns:
        The path of the WAV file that was written (``'_vits_.wav'``).
    """
    # gr.Number delivers None when its field is left empty; use the signature
    # defaults instead of failing on `None + 0.74` / `int(None)`.
    if duration is None:
        duration = 20.4
    if max_tokens is None:
        max_tokens = 24
    if cache_lim is None:
        cache_lim = -1

    if text and text.strip():
        duration = max(duration + 0.74, 2.0)
        # Don't use all A/R tokens if the duration is shorter than the tokens:
        # the budget is capped by int(duration * 50 / 2) and by 1400.
        token_budget = min(max(7, int(max_tokens)), int(duration * 50 / 2), 1400)
        background_audio = audiogen.generate(
            text[:64],  # soundscape text - discard if too long for cross attention
            duration=duration,
            max_tokens=token_budget,
            # kv cache lowest n_preserve (Sink Attn)
            cache_lim=max(6, int(cache_lim)),
        ).numpy()
    else:
        background_audio = np.zeros(_SAMPLE_RATE, dtype=np.float32)

    # soundfile needs [time, channels]
    # NOTE(review): the shape returned by audiogen.generate is not visible here - confirm.
    soundfile.write(_OUTPUT_WAV, background_audio, _SAMPLE_RATE)
    return _OUTPUT_WAV


# SOUNDSCAPES
# NOTE(review): the nesting of widgets inside gr.Row was reconstructed from a
# collapsed source line - confirm the intended layout.
with gr.Blocks() as demo:
    with gr.Row():
        text = gr.Textbox(
            label="AudioGen Txt:",
            placeholder="Describe sound - Type Any language",
            lines=2,
            value='dogs barg',
        )
        duration = gr.Number(
            label="Duration (s)",
            value=7.24,
        )
        n_tokens = gr.Number(
            label="Tokens",
            value=24,
        )
        cache_lim = gr.Number(
            label="kv Flush",
            value=71,
        )
        generate_button = gr.Button("Generate Audio", variant="primary")

    output_audio = gr.Audio(label="TTS Output")

    # Input order must match the positional parameters of audionar_tts.
    generate_button.click(
        fn=audionar_tts,
        inputs=[text, duration, n_tokens, cache_lim],
        outputs=[output_audio]
    )

demo.launch(debug=True)