"""
Speech-to-text note taker Gradio app for Hugging Face Spaces
Supports two backends: Vosk (offline) and OpenAI Whisper (local model).

How to use:
1. Create a new Hugging Face Space (Gradio runtime) and upload this file as `app.py`.
2. Add the models you want to use for Vosk under a `models/vosk/` directory
   (e.g. `models/vosk/vosk-model-small-en-us-0.15`) and enter that path in the "Vosk model path" field in the UI.
3. Space requirements (put in `requirements.txt`):
   gradio
   pydub
   soundfile
   vosk
   openai-whisper
   numpy

Notes:
- Whisper model sizes can be large; choose `small` or `base` for Spaces with limited resources.
- Vosk requires pre-downloaded models and works offline.
- This app converts incoming audio to 16kHz mono WAV before transcribing.
"""

import os
import tempfile
import json
from pathlib import Path
from typing import Optional

import gradio as gr
from pydub import AudioSegment
import soundfile as sf
import numpy as np

# Module-level model caches, so each model is loaded at most once per process
# and reused by later calls. The backend libraries themselves (whisper, vosk)
# are imported lazily inside the transcribe_* functions.
# Keyed by Whisper model size name (e.g. "small").
_whisper_model_cache = {}
# Keyed by the Vosk model directory path.
_vosk_model_cache = {}


def ensure_wav_16k_mono(input_audio_path: str) -> str:
    """Convert an arbitrary audio file to a 16 kHz, mono, 16-bit WAV file.

    Args:
        input_audio_path: Path to an audio file that pydub can decode.

    Returns:
        Path to a newly created temporary ``.wav`` file. The file is not
        deleted automatically; the caller is responsible for removing it.

    Raises:
        Exception: Whatever ``AudioSegment.from_file`` or ``export`` raises
            when the input cannot be decoded or the output cannot be written.
            No temporary file is left behind in that case.
    """
    audio = AudioSegment.from_file(input_audio_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

    # Reserve a unique path, then release our own handle immediately so the
    # descriptor is not leaked and the path can be reopened for writing
    # (an open NamedTemporaryFile cannot be reopened by name on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name

    try:
        # export() hands back the output file object; close it explicitly so
        # the handle is released deterministically.
        audio.export(wav_path, format="wav").close()
    except Exception:
        # Do not orphan the reserved temp file when the export fails.
        try:
            os.unlink(wav_path)
        except OSError:
            pass
        raise
    return wav_path


def transcribe_with_whisper(wav_path: str, model_size: str = "small") -> str:
    """Transcribe a WAV file with the OpenAI Whisper package (local model).

    Loaded models are kept in the module-level ``_whisper_model_cache`` so a
    given model size is only loaded once per process.

    Args:
        wav_path: Path to the audio file to transcribe.
        model_size: Whisper model name passed to ``whisper.load_model``
            (e.g. ``"tiny"``, ``"base"``, ``"small"``).

    Returns:
        The transcript text on success. On any failure (import, model load,
        transcription) a human-readable error message is returned instead of
        raising, so the message can be shown directly in the UI.
    """
    # Broad catch on purpose: importing the package can fail in ways other
    # than ImportError, and this function reports errors as text.
    try:
        import whisper
    except Exception as e:
        return (
            f"Whisper import error: {e}. "
            "Make sure 'openai-whisper' is installed in requirements.txt."
        )

    # The PyPI distribution named 'whisper' is an unrelated project that also
    # installs a module called `whisper`; detect it and say so explicitly
    # instead of failing later with an opaque AttributeError.
    if not hasattr(whisper, "load_model"):
        return (
            "Whisper import error: the installed 'whisper' module is not OpenAI Whisper. "
            "Make sure 'openai-whisper' is installed in requirements.txt."
        )

    # The cache dict is only mutated, never rebound, so no `global` is needed.
    if model_size not in _whisper_model_cache:
        try:
            _whisper_model_cache[model_size] = whisper.load_model(model_size)
        except Exception as e:
            return f"Failed to load Whisper model '{model_size}': {e}"

    model = _whisper_model_cache[model_size]
    try:
        result = model.transcribe(wav_path)
        return result.get("text", "")
    except Exception as e:
        return f"Whisper transcription error: {e}"


def transcribe_with_vosk(wav_path: str, vosk_model_path: str) -> str:
    """Transcribe a WAV file with a local Vosk model.

    Loaded models are kept in the module-level ``_vosk_model_cache`` keyed by
    their directory path, so each model is only loaded once per process.

    Args:
        wav_path: Path to a 16 kHz, mono, 16-bit PCM WAV file.
        vosk_model_path: Path to a downloaded Vosk model directory.

    Returns:
        The transcript text on success. On any failure (import, invalid model
        path, model load, wrong audio format, read/recognition error) a
        human-readable error message is returned instead of raising, so the
        message can be shown directly in the UI.
    """
    # Broad catch on purpose: this function reports errors as text.
    try:
        from vosk import Model, KaldiRecognizer
    except Exception as e:
        return f"Vosk import error: {e}. Make sure 'vosk' is installed in requirements.txt."

    if not vosk_model_path or not os.path.isdir(vosk_model_path):
        return "Vosk model path is invalid or missing. Please provide a valid Vosk model directory."

    # The cache dict is only mutated, never rebound, so no `global` is needed.
    if vosk_model_path not in _vosk_model_cache:
        try:
            _vosk_model_cache[vosk_model_path] = Model(vosk_model_path)
        except Exception as e:
            return f"Failed to load Vosk model at '{vosk_model_path}': {e}"

    model = _vosk_model_cache[vosk_model_path]

    import wave

    text_parts = []
    try:
        # The context manager closes the file on every exit path, including
        # the early format-mismatch return, so the caller can delete it.
        with wave.open(wav_path, "rb") as wf:
            if (
                wf.getnchannels() != 1
                or wf.getframerate() != 16000
                or wf.getsampwidth() != 2
            ):
                return "Vosk expects 16kHz mono WAV. Conversion failed or wrong format."

            rec = KaldiRecognizer(model, wf.getframerate())
            rec.SetWords(True)

            while True:
                data = wf.readframes(4000)
                if not data:
                    break
                # AcceptWaveform returning true means an utterance result is
                # ready to be collected.
                if rec.AcceptWaveform(data):
                    text_parts.append(json.loads(rec.Result()).get("text", ""))
            # Collect whatever is still buffered after the last chunk.
            text_parts.append(json.loads(rec.FinalResult()).get("text", ""))
    except Exception as e:
        # Mirror the Whisper backend: report failures as text, do not raise.
        return f"Vosk transcription error: {e}"

    # Skip empty partial results so the transcript has no doubled spaces.
    return " ".join(part for part in text_parts if part)


def transcribe_audio(audio, backend: str, vosk_model_path: str, whisper_size: str) -> str:
    """Main handler called by Gradio: transcribe one recording or upload.

    Args:
        audio: The audio input. Accepted forms are a file path (``str`` or
            ``os.PathLike``) or a mapping with a ``"name"`` entry holding the
            path. ``None`` means no audio was provided.
        backend: ``"whisper"`` or ``"vosk"``.
        vosk_model_path: Vosk model directory, used when ``backend`` is
            ``"vosk"``.
        whisper_size: Whisper model size, used when ``backend`` is
            ``"whisper"``.

    Returns:
        The transcript, or a human-readable error message describing why no
        transcript could be produced.
    """
    if audio is None:
        return "No audio provided. Use the microphone or upload an audio file."

    # Resolve the input to a path; anything that is neither a path nor a
    # mapping (e.g. a tuple) is rejected instead of raising AttributeError.
    if isinstance(audio, (str, os.PathLike)):
        input_path = os.fspath(audio)
    elif hasattr(audio, "get"):
        input_path = audio.get("name", None)
    else:
        input_path = None
    if not input_path:
        return "Invalid audio input."

    # Validate the backend before doing the audio conversion so an invalid
    # choice does not cost a decode and a temp file.
    if backend not in ("whisper", "vosk"):
        return "Unknown backend chosen."

    # Convert to 16kHz mono WAV
    try:
        wav_path = ensure_wav_16k_mono(input_path)
    except Exception as e:
        return f"Audio conversion error: {e}"

    try:
        if backend == "whisper":
            return transcribe_with_whisper(wav_path, model_size=whisper_size)
        return transcribe_with_vosk(wav_path, vosk_model_path=vosk_model_path)
    finally:
        # Always remove the temporary WAV, even if a backend raises.
        # Deletion is best-effort: a failure here must not hide the result.
        try:
            os.unlink(wav_path)
        except OSError:
            pass


# Build Gradio UI
# Components are laid out in the order they are created inside the Blocks
# context, so the statement order below is the page layout.
with gr.Blocks(title="Speech-to-Text Note Taker") as demo:
    # NOTE(review): the intro text mentions downloading the transcript, but
    # the only output component defined below is a Textbox — confirm whether
    # a download control is intended.
    gr.Markdown(
        "# 🎙️ Speech-to-Text Note Taker\nChoose a backend (Vosk or Whisper), record or upload audio, and get a transcript you can edit or download."
    )

    # Backend selection. The choice strings must match the values that
    # transcribe_audio() compares against ("whisper" / "vosk").
    with gr.Row():
        backend = gr.Radio(
            choices=["whisper", "vosk"], value="whisper", label="Backend"
        )
        whisper_size = gr.Dropdown(
            choices=["tiny", "base", "small", "medium", "large"],
            value="small",
            label="Whisper model size (if using Whisper)",
        )

    # Directory of the Vosk model; only used when the Vosk backend is chosen.
    vosk_model_path = gr.Textbox(
        value="models/vosk/vosk-model-small-en-us-0.15",
        label="Vosk model path (if using Vosk)",
    )

    # Two separate audio inputs: one for recording, one for uploading.
    # Both are configured with type="filepath", so the handler is given a
    # path rather than raw samples.
    with gr.Row():
        mic = gr.Audio(
            sources=["microphone"],
            label="Record (microphone)",
            type="filepath",
            format="wav",
        )
        upload = gr.Audio(
            sources=["upload"],
            label="Or upload an audio file",
            type="filepath",
            format="wav",
        )

    transcribe_btn = gr.Button("Transcribe")
    output = gr.Textbox(label="Transcript", lines=8)

    def run(b, mfile, ufile, vpath, wsize):
        """Button handler: pick the audio source and delegate to transcribe_audio.

        Arguments arrive in the order of the ``inputs`` list passed to
        ``transcribe_btn.click``: backend, microphone audio, uploaded audio,
        Vosk model path, Whisper model size.
        """
        # The microphone recording wins when both inputs are set; if neither
        # is set, None is passed on and transcribe_audio reports that no
        # audio was provided.
        audio_input = None
        if mfile:
            audio_input = mfile
        elif ufile:
            audio_input = ufile
        return transcribe_audio(audio_input, b, vpath, wsize)

    # The order of `inputs` must match the parameter order of run().
    transcribe_btn.click(run, inputs=[backend, mic, upload, vosk_model_path, whisper_size], outputs=[output])

    gr.Markdown(
        "---\n**Tips:**\n- If using Vosk, download a small English model and enter the path in the Vosk model path field.\n- If using Whisper, choose a smaller model for faster transcriptions on CPU.\n"
    )

# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()