# audiogen2 / app.py - Dionyssos - commit 6cf8a01 ("fx lims")
# -*- coding: utf-8 -*-
import gradio as gr
import numpy as np
import torch
import soundfile
from audiocraft import AudioGen
# Build the AudioGen model once at import time, switch it to eval mode and
# place it on the CPU; audionar_tts() below reuses this single instance.
audiogen = AudioGen().eval().to('cpu')
def audionar_tts(text='frogs',
                 duration=20.4,  # seconds
                 max_tokens=24,  # True A/R steps (repeats the rest of duration)
                 cache_lim=-1
                 ):
    """Generate a soundscape for ``text`` and return the path of the WAV file.

    Args:
        text: Description of the sound. Only the first 64 characters are
            passed to the model. ``None``, empty or whitespace-only text
            produces one second of silence instead of calling the model.
        duration: Requested length in seconds. 0.74 s is added and the result
            is floored at 2.0 s before it is handed to the model.
        max_tokens: Number of autoregressive steps. Clamped to the range
            ``[7, min(int(duration * 50 / 2), 1400)]``.
        cache_lim: Forwarded to ``audiogen.generate`` as ``cache_lim`` with a
            lower bound of 6.

    Returns:
        The path ``'_vits_.wav'`` of the 16 kHz file that was written.
    """
    # A cleared gr.Number field is delivered as None; fall back to the
    # signature defaults instead of failing with a TypeError further down.
    if duration is None:
        duration = 20.4
    if max_tokens is None:
        max_tokens = 24
    if cache_lim is None:
        cache_lim = -1

    if text and text.strip():
        duration = max(duration + 0.74, 2.0)

        # Do not spend more A/R tokens than the (padded) duration can hold,
        # and never more than 1400.
        # NOTE(review): the 50 / 2 factor presumably derives from the codec
        # frame rate - confirm against the AudioGen implementation.
        token_budget = max(7, int(max_tokens))  # kv cache lowest n_preserve
        token_budget = min(token_budget, int(duration * 50 / 2), 1400)

        background_audio = audiogen.generate(
            text[:64],  # soundscape text - discard if too long cross attention
            duration=duration,
            max_tokens=token_budget,
            cache_lim=max(6, int(cache_lim)),  # Sink Attn
        ).numpy()
    else:
        # No usable prompt: 16000 zero samples == 1 s of silence at 16 kHz.
        background_audio = np.zeros(16000, dtype=np.float32)

    # NOTE(review): the output name is fixed, so overlapping calls would
    # overwrite each other's file - confirm requests are serialized.
    wavfile = '_vits_.wav'
    soundfile.write(wavfile, background_audio, 16000)  # soundfile needs [time, channels]
    return wavfile
# SOUNDSCAPES
# Gradio front-end: one prompt box, three numeric controls, a button and an
# audio player wired to audionar_tts().
with gr.Blocks() as demo:
    # NOTE(review): the original indentation was lost; the grouping of the
    # controls inside this Row is a reconstruction - confirm the intended
    # layout. Component creation order is unchanged.
    with gr.Row():
        text = gr.Textbox(
            value='dogs barg',
            lines=2,
            placeholder="Describe sound - Type Any language",
            label="AudioGen Txt:",
        )
        duration = gr.Number(value=7.24, label="Duration (s)")
        n_tokens = gr.Number(value=24, label="Tokens")
        cache_lim = gr.Number(value=71, label="kv Flush")
        generate_button = gr.Button("Generate Audio", variant="primary")

    output_audio = gr.Audio(label="TTS Output")

    # Inputs are passed positionally, matching
    # audionar_tts(text, duration, max_tokens, cache_lim).
    generate_button.click(
        fn=audionar_tts,
        inputs=[text, duration, n_tokens, cache_lim],
        outputs=[output_audio],
    )

demo.launch(debug=True)