import datetime
import json
import time

import gradio as gr
import pytz
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model

# Confidence thresholds for accepting a wake phrase.
THRESHOLD_G = 0.5   # general threshold (second word; first word of "EVAS GO"/"EVAS STOP")
THRESHOLD_H = 0.34  # relaxed threshold for the first word of "HI ..."/"HEY ..." phrases

# Flush the recognizer if no wake phrase was accepted within this many seconds,
# so stale audio does not accumulate across a long silent stream.
RECOGNIZER_RESET_SECONDS = 10


def load_vosk(model_id: str) -> Model:
    """Download a Vosk model snapshot from the Hugging Face Hub and load it.

    Args:
        model_id: Hub repository id of the Vosk model.

    Returns:
        A loaded ``vosk.Model`` backed by the downloaded snapshot directory.
    """
    model_dir = snapshot_download(model_id)
    return Model(model_path=model_dir)


# Allow "${load_vosk:...}" interpolations inside configs/models.yaml so each
# configured entry resolves to a ready-to-use Model instance.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))


def automatic_speech_recognition(model_id: str, stream, new_chunk):
    """Process one streamed audio chunk and scan it for wake phrases.

    Args:
        model_id: Key into ``models_config`` selecting the Vosk model.
        stream: Per-session state ``(rec, results, start_time)`` from the
            previous call, or ``None`` on the first chunk.
        new_chunk: ``(sample_rate, audio_array)`` tuple as produced by a
            streaming ``gr.Audio`` component with ``type="numpy"``.

    Returns:
        ``((rec, results, start_time), output_text)`` — the updated session
        state and the newline-joined log of detected wake phrases.
    """
    model = models_config[model_id]["model"]
    sample_rate, audio_array = new_chunk

    # Down-mix stereo to mono by keeping the first channel only.
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]
    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk of a new session: create a fresh recognizer.
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)
        rec.SetMaxAlternatives(0)
        results = []
        start_time = time.time()
    else:
        rec, results, start_time = stream

    # Periodic flush: discard whatever partial utterance has built up.
    if time.time() - start_time > RECOGNIZER_RESET_SECONDS:
        rec.FinalResult()
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result["text"]
        # BUG FIX: the original tested result["text"] twice in the same
        # condition; a single truthiness check is equivalent.
        if text_result:
            for phrase in ("HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"):
                if phrase not in text_result:
                    continue
                print(result)
                words = result.get("result", [])
                # BUG FIX: guard the [0]/[1] indexing — the original raised
                # IndexError when fewer than two word entries came back.
                if len(words) < 2:
                    continue
                # NOTE(review): confidences are taken from the first two words
                # of the whole utterance, not necessarily the wake-phrase
                # tokens themselves — confirm this is intended.
                conf_first = round(words[0]["conf"], 3)
                conf_second = round(words[1]["conf"], 3)
                # "HI"/"HEY" openers get the relaxed first-word threshold.
                first_threshold = (
                    THRESHOLD_H if ("HI" in phrase or "HEY" in phrase) else THRESHOLD_G
                )
                if conf_first > first_threshold and conf_second > THRESHOLD_G:
                    timestamp = datetime.datetime.now(
                        pytz.timezone("Asia/Taipei")
                    ).strftime("%H:%M:%S")
                    results.append(timestamp + " " + text_result)
                    # Accepted: flush the recognizer and restart the timer.
                    rec.FinalResult()
                    start_time = time.time()
                    break

    # "".join of an empty list is already "" — no length check needed.
    output_text = "\n".join(results)
    return (rec, results, start_time), output_text


demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)

with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )
    gr.Markdown(
        """
        # Wakeword Demo
        """
    )
    # Session state carried between streamed chunks: (rec, results, start_time).
    state = gr.State()
    audio = gr.Audio(
        label="錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="輸出"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
    )

# BUG FIX: the original called demo.launch() three times in a row; the second
# and third calls are redundant (and can fail on an already-bound port).
demo.launch()