import gradio as gr
import numpy as np
from transformers import pipeline

# Hugging Face Hub checkpoint loaded by the speech-recognition pipeline below.
MODEL_ID = "openai/whisper-tiny"

# Sample rate (Hz) that stream_transcribe() resamples incoming audio to before
# handing it to run_asr().
TARGET_SR = 16000

# Single pipeline instance, built once at import time and shared by run_asr().
# device=-1 requests CPU execution per the transformers pipeline documentation.
asr = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_ID,
    device=-1,
    chunk_length_s=10,
)


def to_mono(audio_tuple):
    """Convert a ``(sample_rate, samples)`` pair to mono float32 audio.

    Args:
        audio_tuple: ``(sample_rate, samples)`` pair, or ``None``. ``samples``
            may be 1-D, or 2-D in which case axis 1 is averaged away
            (assumes a ``(n_samples, n_channels)`` layout - confirm against
            the caller). Integer and floating dtypes are accepted.

    Returns:
        ``(sample_rate, mono)`` where ``mono`` is a float32 array, or
        ``(None, None)`` when ``audio_tuple`` or its samples are ``None``.
        Integer input is divided by the maximum value of its dtype; floating
        input is only cast to float32.
    """
    if audio_tuple is None:
        return None, None
    sr, data = audio_tuple
    if data is None:
        return None, None
    data = np.asarray(data)

    # Scale integer PCM *before* downmixing. ndarray.mean() promotes integer
    # input to float64, so checking the dtype after the downmix would miss
    # multi-channel integer audio and leave it at its raw integer amplitude.
    if np.issubdtype(data.dtype, np.integer):
        full_scale = np.iinfo(data.dtype).max
        data = data.astype(np.float32) / full_scale
    else:
        data = data.astype(np.float32)

    if data.ndim == 2:
        data = data.mean(axis=1)
    return sr, data


def linear_resample(audio, orig_sr, target_sr=TARGET_SR):
    """Resample a 1-D signal to ``target_sr`` using linear interpolation.

    Args:
        audio: 1-D sequence of samples.
        orig_sr: Sample rate of ``audio`` in Hz.
        target_sr: Desired sample rate in Hz.

    Returns:
        ``audio`` itself (not copied, dtype untouched) when the two rates are
        equal. Otherwise a new float32 array spanning the same duration at
        ``target_sr``; an empty input yields an empty float32 array.
    """
    if orig_sr == target_sr:
        return audio

    n_in = len(audio)
    if n_in == 0:
        # np.interp raises ValueError when it is given no sample points, so
        # an empty chunk is answered directly instead of crashing.
        return np.zeros(0, dtype=np.float32)

    duration = n_in / orig_sr
    old_times = np.linspace(0, duration, num=n_in, endpoint=False)
    new_length = int(duration * target_sr)
    new_times = np.linspace(0, duration, num=new_length, endpoint=False)
    # Query times past the last input sample are clamped to that sample's
    # value by np.interp.
    return np.interp(new_times, old_times, audio).astype(np.float32)


def run_asr(audio_np, sr):
    """Transcribe audio with the module-level ``asr`` pipeline.

    Args:
        audio_np: Sequence of audio samples, or ``None``.
        sr: Sample rate of ``audio_np`` in Hz.

    Returns:
        The transcript with surrounding whitespace removed, or ``""`` when
        there is no audio or it is shorter than 0.4 seconds.
    """
    if audio_np is None:
        return ""
    min_samples = sr * 0.4
    if len(audio_np) < min_samples:
        return ""

    output = asr({"sampling_rate": sr, "raw": audio_np})
    # The pipeline result is expected to be a dict carrying "text"; anything
    # else is stringified as a fallback.
    text = output.get("text", "") if isinstance(output, dict) else str(output)
    return text.strip()


def stream_transcribe(audio, state):
    """Handle one streamed audio chunk and refresh the live transcript.

    The chunk is converted to mono float32, resampled to ``TARGET_SR`` and
    appended to ``state["buffer"]`` (capped at the most recent 20 seconds).
    The last 8 seconds of the buffer are then transcribed and stored as the
    partial text.

    NOTE(review): this function only reads ``state["stable"]``; it never
    writes it. Text that scrolls out of the 8-second preview window is
    therefore not retained here.

    Args:
        audio: ``(sample_rate, samples)`` pair for the new chunk, or ``None``.
        state: Dict with ``"buffer"``, ``"stable"`` and ``"partial"`` keys, or
            ``None`` to start from an empty state. Mutated in place.

    Returns:
        ``(state, partial, live)`` where ``live`` is the stable text followed
        by the partial text. When the chunk carries no samples the state is
        left untouched and the current texts are returned again.
    """
    if state is None:
        state = {"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""}

    sr, chunk = to_mono(audio)
    if chunk is None or chunk.size == 0:
        # Nothing new to transcribe. Return the same "stable + partial" text
        # the normal path produces so the transcript box does not briefly
        # drop the partial text, and skip resampling/ASR for an empty chunk.
        partial = state.get("partial", "")
        live = (state.get("stable", "") + " " + partial).strip()
        return state, partial, live

    chunk = linear_resample(chunk, sr, TARGET_SR)
    state["buffer"] = np.concatenate([state["buffer"], chunk])

    # Keep memory bounded: retain only the most recent 20 seconds.
    max_samples = TARGET_SR * 20
    if len(state["buffer"]) > max_samples:
        state["buffer"] = state["buffer"][-max_samples:]

    # Re-transcribe only the trailing 8 seconds on each call.
    preview_samples = TARGET_SR * 8
    preview_audio = state["buffer"][-preview_samples:]

    partial = run_asr(preview_audio, TARGET_SR)
    state["partial"] = partial
    live = (state["stable"] + " " + partial).strip()
    return state, partial, live


def finalize(state):
    """Fold the pending partial text into the stable transcript.

    Args:
        state: Dict with ``"buffer"``, ``"stable"`` and ``"partial"`` keys, or
            ``None`` to start from an empty state. Mutated in place.

    Returns:
        ``(state, "", stable)``: the updated state, an empty partial text and
        the new stable transcript. The audio buffer is reset to empty.
    """
    if state is None:
        state = {"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""}

    committed = state.get("stable", "").strip()
    pending = state.get("partial", "").strip()
    if pending:
        # The outer strip() drops the leading space when nothing was
        # committed yet.
        committed = " ".join((committed, pending)).strip()

    state.update(
        stable=committed,
        partial="",
        buffer=np.zeros(0, dtype=np.float32),
    )
    return state, "", committed


def clear():
    """Return reset values for the audio input, state and both text outputs.

    Returns:
        ``(None, state, "", "")`` where ``state`` is a brand-new dict holding
        an empty float32 buffer and empty stable/partial strings.
    """
    empty_buffer = np.zeros(0, dtype=np.float32)
    fresh_state = dict(buffer=empty_buffer, stable="", partial="")
    return None, fresh_state, "", ""


# Build the Gradio UI: a streaming microphone input feeding two text boxes.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # NeoScribe (pseudo-live)
        Stream audio from the browser microphone and transcribe in near real time.

        Next step: send audio chunks from your browser extension to this backend.
        """
    )

    # Transcription state handed to and returned from the handlers. It has the
    # same keys as the dicts built in stream_transcribe(), finalize() and
    # clear().
    state = gr.State({"buffer": np.zeros(0, dtype=np.float32), "stable": "", "partial": ""})

    # type="numpy" makes the handlers receive audio in the (sample_rate, samples)
    # form that to_mono() unpacks.
    audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="Input audio")
    partial = gr.Textbox(label="Partial text", lines=3)
    # While streaming, this box receives stream_transcribe()'s third return
    # value (stable text followed by the current partial text).
    final = gr.Textbox(label="Stable transcript", lines=10)

    clear_btn = gr.Button("Clear")

    # Run stream_transcribe() on the incoming audio stream.
    # NOTE(review): stream_every and time_limit are taken to be in seconds per
    # Gradio's stream() event parameters - confirm against the installed version.
    audio.stream(
        stream_transcribe,
        inputs=[audio, state],
        outputs=[state, partial, final],
        stream_every=0.8,
        time_limit=120,
    )

    # When recording stops, finalize() folds the last partial text into the
    # stable transcript and empties the audio buffer.
    audio.stop_recording(finalize, inputs=[state], outputs=[state, partial, final])

    # Reset the audio component, the state and both text boxes.
    clear_btn.click(clear, outputs=[audio, state, partial, final])


# Launch the app. There is no __main__ guard, so this also runs on import.
demo.launch()