"""HARP/Gradio endpoint that wraps the Kokoro open-weight TTS model.

Builds two KPipelines (American / British English), loads the 82M-parameter
KModel on CPU (and GPU when CUDA is available), and exposes a single
`process_fn` endpoint that synthesizes a full utterance and returns a wav file.
"""

import os
import random

import gradio as gr
import numpy as np
import spaces
import torch
from audiotools import AudioSignal
from kokoro import KModel, KPipeline
from pyharp.core import ModelCard, build_endpoint
from pyharp.media.audio import save_audio

model_card = ModelCard(
    name="Kokoro Text To Speech",
    description=(
        "Kokoro is an open-weight TTS model with 82 million parameters.\n"
        "Despite its lightweight architecture, it delivers comparable quality to larger models "
        "while being significantly faster and more cost-efficient."
    ),
    author="Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, Nima Mesgarani (StyleTTS2)",
    tags=["tts"],
)

# The "original" deployment lives under the hexgrad org; anything else is a duplicate.
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()

if not IS_DUPLICATE:
    # Version banner for the original Space only.
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)

# The original Space caps requests at 5000 characters; duplicates are uncapped.
CHAR_LIMIT = None if IS_DUPLICATE else 5000

# CPU model always; a second GPU copy only when CUDA is present.
models = {
    gpu: KModel().to('cuda' if gpu else 'cpu').eval()
    for gpu in [False] + ([True] if CUDA_AVAILABLE else [])
}
# One G2P pipeline per language code ('a' = American English, 'b' = British English).
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
# Pin the pronunciation of the model's own name in each lexicon.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'


@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    """Run the GPU model inside a (ZeroGPU) allocation window."""
    return models[True](ps, ref_s, speed)


def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield ``(sample_rate, waveform)`` chunks of synthesized speech for *text*.

    The text is truncated to CHAR_LIMIT when a limit is configured. If the GPU
    call raises a Gradio error (e.g. quota exhausted), synthesis falls back to
    the CPU model for the remaining chunks of the current request.
    """
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    # The first character of the voice id selects the language pipeline.
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector, indexed by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                gr.Warning(str(e))
                gr.Info('Switching to CPU')
                audio = models[False](ps, ref_s, speed)
            else:
                # gr.Error expects a message string, not the exception object.
                raise gr.Error(str(e))
        yield 24000, audio.numpy()


# Display name -> Kokoro voice id (the id's first character encodes the language).
CHOICES = {
    '[US-Female] Heart': 'af_heart',
    '[US-Female] Bella': 'af_bella',
    '[US-Female] Nicole': 'af_nicole',
    '[US-Female] Aoede': 'af_aoede',
    '[US-Female] Kore': 'af_kore',
    '[US-Female] Sarah': 'af_sarah',
    '[US-Female] Nova': 'af_nova',
    '[US-Female] Sky': 'af_sky',
    '[US-Female] Alloy': 'af_alloy',
    '[US-Female] Jessica': 'af_jessica',
    '[US-Female] River': 'af_river',
    '[US-Male] Michael': 'am_michael',
    '[US-Male] Fenrir': 'am_fenrir',
    '[US-Male] Puck': 'am_puck',
    '[US-Male] Echo': 'am_echo',
    '[US-Male] Eric': 'am_eric',
    '[US-Male] Liam': 'am_liam',
    '[US-Male] Onyx': 'am_onyx',
    '[US-Male] Santa': 'am_santa',
    '[US-Male] Adam': 'am_adam',
    '[UK-Female] Emma': 'bf_emma',
    '[UK-Female] Isabella': 'bf_isabella',
    '[UK-Female] Alice': 'bf_alice',
    '[UK-Female] Lily': 'bf_lily',
    '[UK-Male] George': 'bm_george',
    '[UK-Male] Fable': 'bm_fable',
    '[UK-Male] Lewis': 'bm_lewis',
    '[UK-Male] Daniel': 'bm_daniel',
}

# Pre-load every voice pack so the first synthesis request is not slowed by I/O.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)

API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
API_NAME = None if API_OPEN else False


def process_fn(text_input: str, speaker: str):
    """HARP endpoint: synthesize *text_input* with *speaker*, return a wav file path.

    Args:
        text_input: Raw text to speak.
        speaker: A display name that must be a key of CHOICES.

    Raises:
        gr.Error: if synthesis produced no audio (e.g. empty input).
    """
    voice = CHOICES[speaker]
    chunks = list(generate_all(text_input, voice, speed=1, use_gpu=True))
    if not chunks:
        # Guard against an empty generator so we fail with a clear message
        # instead of an IndexError on chunks[0].
        raise gr.Error('No audio was generated. Please provide some input text.')
    fs = chunks[0][0]
    wav = np.concatenate([audio for _, audio in chunks])
    sig = AudioSignal(wav.astype("float32"), sample_rate=fs)
    return save_audio(sig)


with gr.Blocks() as app:
    gr.Markdown("## 💬 Kokoro Text To Speech")

    # Inputs
    text_input = gr.Textbox(
        label="Text Input",
        info="Up to 5000 character text input. To get the best performance, please start a new line for each sentence."
    ).harp_required(True)
    speaker_dropdown = gr.Dropdown(
        list(CHOICES.keys()),
        value='[US-Female] Heart',
        label='Voice',
        info='US and UK accented male and female voices available'
    )

    # Outputs
    output_wav = gr.Audio(
        type="filepath",
        label="Synthesized Speech"
    )

    _ = build_endpoint(
        model_card=model_card,
        input_components=[
            text_input,
            speaker_dropdown
        ],
        output_components=[
            output_wav
        ],
        process_fn=process_fn
    )


if __name__ == '__main__':
    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)