import os

import gradio as gr

from transcribe import (
    finish_streaming_realtime,
    preload_streaming_assets,
    reset_realtime_stream,
    transcribe,
    transcribe_streaming_realtime,
)

# Streaming model variants; these are the only options offered on the
# streaming tab's dropdown.
STREAMING_CHOICES = [
    "Streaming 160ms",
    "Streaming 480ms",
    "Streaming 960ms",
    "Streaming 1920ms",
]

# Every selectable mode for the utterance-level tab: the offline model
# first, followed by each streaming variant in the same order as above.
MODEL_CHOICES = ["Offline Model", *STREAMING_CHOICES]

def predict_asr(input_audio, model_mode):
    """Transcribe one utterance for the batch tab.

    Args:
        input_audio: Value supplied by the audio component, or ``None``
            when nothing has been uploaded or recorded.
        model_mode: Selected entry from the model dropdown; forwarded to
            ``transcribe`` as its ``model_mode`` keyword.

    Returns:
        The ``"text"`` entry of the mapping returned by ``transcribe``, or
        a prompt string asking for audio when ``input_audio`` is ``None``.
    """
    if input_audio is not None:
        transcription = transcribe(input_audio, model_mode=model_mode)
        return transcription["text"]

    # No audio provided: tell the user instead of calling the model.
    return "Please Upload or Record Audio!"


# Relative paths of the two sample clips listed as clickable examples on the
# utterance-level tab (presumably a Chinese and an English sample, judging by
# the "zh"/"en" suffixes -- confirm against the actual files).
default_audio_path1 = "default_sample_zh.mp3"
default_audio_path2 = "default_sample_en.mp3"

# Disabled snippet that would create an empty placeholder file when the sample
# is missing. NOTE(review): it refers to `default_audio_path`, which is not one
# of the two names defined above, so it cannot be re-enabled as written.
# if not os.path.exists(default_audio_path):
#     with open(default_audio_path, "wb") as f:
#         pass

# Runs at import time, before the UI below is constructed.
# NOTE(review): presumably loads the streaming models up front so the first
# streaming request does not pay the load cost -- confirm in `transcribe`.
preload_streaming_assets()


# Build the two-tab UI. Components are laid out in the order they are created,
# and every event listener is registered while the Blocks context is open.
with gr.Blocks(
    title="X-ASR",
) as gradio_app:
    gr.Markdown("# X-ASR")

    # --- Tab 1: one-shot recognition of an uploaded or recorded clip --------
    with gr.Tab("Utterance-Level Speech Recognition"):
        # Accepts a file upload or a microphone recording; type="numpy" asks
        # Gradio to hand the audio to the callback in its numpy form.
        batch_audio = gr.Audio(
            label="Upload or Record",
            sources=["upload", "microphone"],
            type="numpy",
        )
        # All modes are selectable here, including the streaming variants.
        batch_mode = gr.Dropdown(
            label="Model Choices",
            choices=MODEL_CHOICES,
            value="Offline Model",
            interactive=True,
        )
        batch_button = gr.Button("Recognize", variant="primary")
        batch_output = gr.Textbox(label="Transcription")

        # Clickable sample rows that fill in the audio and mode inputs.
        # cache_examples=False: example outputs are not precomputed.
        gr.Examples(
            examples=[
                [default_audio_path1, "Offline Model"],
                [default_audio_path2, "Offline Model"],
            ],
            inputs=[batch_audio, batch_mode],
            outputs=batch_output,
            fn=predict_asr,
            cache_examples=False,
        )

    # --- Tab 2: live microphone recognition ---------------------------------
    with gr.Tab("Streaming Speech Recognition"):
        # Only the streaming variants are offered on this tab.
        stream_mode = gr.Dropdown(
            label="Model Choices",
            choices=STREAMING_CHOICES,
            value="Streaming 160ms",
            interactive=True,
        )
        # Microphone-only input with streaming enabled so audio is delivered
        # while recording is still in progress.
        stream_audio = gr.Audio(
            label="Microphone",
            sources=["microphone"],
            type="numpy",
            streaming=True,
        )
        stream_output = gr.Textbox(label="Transcription")
        # State value (initially an empty dict) that every streaming callback
        # below returns as its second output; the stream and stop callbacks
        # also receive it as an input.
        stream_state = gr.State({})

        with gr.Row():
            reset_button = gr.Button("Clear")

        # Main streaming listener: receives the latest audio, the state and
        # the selected mode, and updates the transcript and the state.
        # NOTE(review): stream_every and time_limit are in seconds per
        # Gradio's stream() documentation; concurrency_limit=1 allows one
        # running instance of this listener at a time.
        stream_audio.stream(
            fn=transcribe_streaming_realtime,
            inputs=[stream_audio, stream_state, stream_mode],
            outputs=[stream_output, stream_state],
            show_progress="hidden",
            stream_every=0.25,
            time_limit=3600,
            trigger_mode="multiple",
            concurrency_limit=1,
        )
        # Starting a new recording resets the transcript and the state.
        stream_audio.start_recording(
            fn=reset_realtime_stream,
            inputs=stream_mode,
            outputs=[stream_output, stream_state],
            show_progress="hidden",
        )
        # Stopping the recording hands the state to the finishing callback,
        # which writes the transcript and state one more time.
        stream_audio.stop_recording(
            fn=finish_streaming_realtime,
            inputs=[stream_state, stream_mode],
            outputs=[stream_output, stream_state],
            show_progress="hidden",
        )
        # Switching the streaming model also resets transcript and state.
        stream_mode.change(
            fn=reset_realtime_stream,
            inputs=stream_mode,
            outputs=[stream_output, stream_state],
            show_progress="hidden",
        )
        # The "Clear" button triggers the same reset.
        reset_button.click(
            fn=reset_realtime_stream,
            inputs=stream_mode,
            outputs=[stream_output, stream_state],
            show_progress="hidden",
        )

    # Wire the batch tab's "Recognize" button. This is registered after both
    # tabs are built but still inside the Blocks context.
    batch_button.click(
        fn=predict_asr,
        inputs=[batch_audio, batch_mode],
        outputs=batch_output,
    )


# Start the web server only when this file is executed as a script; importing
# the module builds `gradio_app` without launching it.
if __name__ == "__main__":
    # share=False: do not request a public Gradio share link.
    gradio_app.launch(share=False)