Spaces:

dkounadis
/

audiogen2

Running

audiogen2 / app.py

fx lims

6cf8a01 3 months ago

1.96 kB

	# -*- coding: utf-8 -*-
	import gradio as gr
	import numpy as np
	import torch
	import soundfile
	from audiocraft import AudioGen

	audiogen = AudioGen().eval().to('cpu')


	def audionar_tts(text='frogs',
	                 duration=20.4,  # seconds
	                 max_tokens=24,  # True A/R steps (repeats the rest of duration)
	                 cache_lim=-1
	                 ):
	    """Generate a soundscape for `text` and write it to a 16 kHz WAV file.

	    Args:
	        text: Description of the sound. Only the first 64 characters are
	            passed to the model. Empty, whitespace-only or None text skips
	            generation and one second of silence is written instead.
	        duration: Requested length in seconds. 0.74 is added and the result
	            is floored at 2.0 before it is passed to the model.
	        max_tokens: Autoregressive steps. Raised to at least 7, then capped
	            by int(duration * 50 / 2) and by 1400.
	        cache_lim: Forwarded to `audiogen.generate` after being raised to
	            at least 6.

	    Any of the numeric arguments may be None (a cleared gr.Number field);
	    the signature default is used in that case.

	    Returns:
	        Path of the written WAV file ('_vits_.wav').
	    """
	    # A cleared gr.Number field arrives as None; without this fallback the
	    # arithmetic / int() calls below raise TypeError.
	    duration = 20.4 if duration is None else duration
	    max_tokens = 24 if max_tokens is None else max_tokens
	    cache_lim = -1 if cache_lim is None else cache_lim

	    sample_rate = 16000

	    if text and text.strip():
	        duration = max(duration + 0.74, 2.0)

	        # Don't use all A/R tokens if the duration is shorter than the tokens.
	        token_budget = int(duration * 50 / 2)
	        n_steps = min(max(7, int(max_tokens)), token_budget, 1400)  # kv cache lowest n_preserve

	        background_audio = audiogen.generate(
	            text[:64],  # soundscape text - discard if too long for cross attention
	            duration=duration,
	            max_tokens=n_steps,
	            cache_lim=max(6, int(cache_lim)),  # Sink Attn
	        ).numpy()
	    else:
	        # No usable prompt: emit one second of silence.
	        background_audio = np.zeros(sample_rate, dtype=np.float32)

	    # NOTE(review): the fixed filename is overwritten on every call.
	    wavfile = '_vits_.wav'

	    # soundfile needs [time, channels] - assumes generate() already returns
	    # a compatible layout; TODO confirm.
	    soundfile.write(wavfile, background_audio, sample_rate)
	    return wavfile

	# SOUNDSCAPES

	# Gradio front-end: four inputs and a button feed audionar_tts, whose
	# returned WAV path is shown in the audio widget.
	# NOTE(review): nesting below is reconstructed (inputs + button in the Row,
	# output + event wiring in the Blocks) - confirm against the deployed app.
	with gr.Blocks() as demo:
	    with gr.Row():
	        # Prompt for the sound to generate.
	        text = gr.Textbox(label="AudioGen Txt:",
	                          placeholder="Describe sound - Type Any language",
	                          lines=2,
	                          value='dogs barg')
	        # Numeric controls, passed positionally to audionar_tts as
	        # duration, max_tokens and cache_lim.
	        duration = gr.Number(label="Duration (s)", value=7.24)
	        n_tokens = gr.Number(label="Tokens", value=24)
	        cache_lim = gr.Number(label="kv Flush", value=71)
	        generate_button = gr.Button("Generate Audio", variant="primary")

	    output_audio = gr.Audio(label="TTS Output")

	    # Clicking the button runs generation and displays the resulting file.
	    generate_button.click(fn=audionar_tts,
	                          inputs=[text, duration, n_tokens, cache_lim],
	                          outputs=[output_audio])

	demo.launch(debug=True)