# -*- coding: utf-8 -*-
import gradio as gr
import numpy as np
import torch
import soundfile
from audiocraft import AudioGen

# Single shared model instance, created once when the module is loaded:
# construct it, call .eval() on it, then move it to the CPU.
audiogen = (
    AudioGen()
    .eval()
    .to('cpu')
)


def audionar_tts(text='frogs',
                 duration=20.4,  # seconds
                 max_tokens=24,  # True A/R steps (repeats the rest of duration)
                 cache_lim=-1
                 ):
    """Generate a soundscape for ``text`` with AudioGen and save it as a wav.

    Args:
        text: Description of the sound. Only the first 64 characters are
            passed to the model. ``None``, empty or whitespace-only text
            skips generation and 16000 zero samples are written instead.
        duration: Requested length in seconds. 0.74 is added to it and the
            result is floored at 2.0 before being handed to the model.
        max_tokens: Requested number of autoregressive steps. Clamped to at
            least 7 and to at most ``min(int(duration * 50 / 2), 1400)``.
        cache_lim: Cache limit forwarded to the model, floored at 6.

    A numeric argument that arrives as ``None`` (which is what a cleared
    ``gr.Number`` field delivers) is replaced by its signature default
    instead of raising ``TypeError``.

    Returns:
        The path of the written file, ``'_vits_.wav'`` (16000 Hz).
    """
    sample_rate = 16000
    wavfile = '_vits_.wav'

    if not (text and text.strip()):
        # No usable prompt: emit sample_rate zero samples rather than
        # calling the model.
        background_audio = np.zeros(sample_rate, dtype=np.float32)
    else:
        # A cleared numeric field reaches us as None; fall back to the
        # signature defaults so the arithmetic / int() below cannot fail.
        if duration is None:
            duration = 20.4
        if max_tokens is None:
            max_tokens = 24
        if cache_lim is None:
            cache_lim = -1

        duration = max(duration + 0.74, 2.0)

        # Do not spend more A/R tokens than the (padded) duration allows, and
        # never more than 1400 or fewer than 7.
        # Original note on the floor of 7: "kv cache lowest n_preserve".
        token_budget = int(duration * 50 / 2)
        n_steps = min(min(max(7, int(max_tokens)), token_budget), 1400)

        background_audio = audiogen.generate(
            text[:64],                         # long prompts are truncated before cross attention
            duration=duration,
            max_tokens=n_steps,
            cache_lim=max(6, int(cache_lim)),  # original note: "Sink Attn"
        ).numpy()

    # NOTE(review): the output filename is fixed, so overlapping calls write
    # to the same file.
    # soundfile expects [time, channels] (or 1-D) data -- assumes generate()
    # already returns that layout; TODO confirm.
    soundfile.write(wavfile, background_audio, sample_rate)
    return wavfile

# SOUNDSCAPES

# UI layout: one row holding the prompt, the three numeric controls and the
# trigger button, with the audio player underneath.
with gr.Blocks() as demo:
    with gr.Row():
        text = gr.Textbox(label="AudioGen Txt:",
                          placeholder="Describe sound - Type Any language",
                          lines=2,
                          value='dogs barg')
        duration = gr.Number(label="Duration (s)", value=7.24)
        n_tokens = gr.Number(label="Tokens", value=24)
        cache_lim = gr.Number(label="kv Flush", value=71)
        generate_button = gr.Button("Generate Audio", variant="primary")

    output_audio = gr.Audio(label="TTS Output")

    # Clicking the button feeds the four inputs, in this order, to
    # audionar_tts and shows the returned wav path in the audio player.
    generate_button.click(fn=audionar_tts,
                          inputs=[text, duration, n_tokens, cache_lim],
                          outputs=[output_audio])

# Start the web server as soon as the module is executed.
demo.launch(debug=True)