"""Evoxtral — Expressive Tagged Transcription Demo (ZeroGPU)."""

import torch
import spaces
import gradio as gr
import numpy as np
from transformers import VoxtralForConditionalGeneration, AutoProcessor
from peft import PeftModel

MODEL_ID = "mistralai/Voxtral-Mini-3B-2507"
ADAPTER_ID = "YongkangZOU/evoxtral-rl"

# Load model on CPU at startup, ZeroGPU moves to GPU on demand
print("Loading model...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
base_model = VoxtralForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model.eval()
print("Model loaded!")


@spaces.GPU
def transcribe(audio_input):
    """Transcribe audio with expressive tags."""
    if audio_input is None:
        return "Please upload or record an audio file."

    sr, audio_array = audio_input
    # Convert to float32 and mono if needed
    audio_array = audio_array.astype(np.float32)
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)
    # Normalize to [-1, 1]
    if audio_array.max() > 1.0:
        audio_array = audio_array / 32768.0

    # Resample to 16kHz if needed
    if sr != 16000:
        import librosa
        audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)

    inputs = processor.apply_transcription_request(
        language="en",
        audio=[audio_array],
        format=["WAV"],
        model_id=MODEL_ID,
        return_tensors="pt",
    )
    inputs = {
        k: v.to(model.device, dtype=torch.bfloat16)
        if v.dtype in (torch.float32, torch.float16, torch.bfloat16)
        else v.to(model.device)
        for k, v in inputs.items()
    }

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    input_len = inputs["input_ids"].shape[1]
    transcription = processor.tokenizer.decode(
        output_ids[0][input_len:], skip_special_tokens=True
    )
    return transcription


EXAMPLES_TEXT = """
## What is Evoxtral?

Evoxtral is a fine-tuned version of [Voxtral-Mini-3B](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)
that produces transcriptions enriched with **inline expressive audio tags** from the
[ElevenLabs v3 tag set](https://elevenlabs.io/docs/api-reference/text-to-speech).

### Standard ASR output:
> So I was thinking maybe we could try that new restaurant downtown.

### Evoxtral output:
> [nervous] So... I was thinking maybe we could [clears throat] try that new restaurant downtown? [laughs nervously]

### Supported tags include:
`[laughs]` `[sighs]` `[gasps]` `[whispers]` `[clears throat]` `[pause]` `[nervous]` `[frustrated]` `[excited]` `[sad]` `[calm]` `[stammers]` `[yawns]` and more.

### Results

| Metric | Base Voxtral | Evoxtral | Improvement |
|--------|-------------|----------|-------------|
| WER | 6.64% | **4.47%** | 32.7% better |
| Tag F1 | 22.0% | **67.2%** | 3x better |
"""

with gr.Blocks(title="Evoxtral — Expressive Tagged Transcription", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Evoxtral — Expressive Tagged Transcription")
    gr.Markdown("Upload or record audio to get a transcription with inline expressive tags like `[sighs]`, `[laughs]`, `[whispers]`, etc.")

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio Input",
                type="numpy",
                sources=["upload", "microphone"],
            )
            submit_btn = gr.Button("Transcribe", variant="primary", size="lg")

        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Expressive Transcription",
                lines=8,
                show_copy_button=True,
            )

    submit_btn.click(fn=transcribe, inputs=audio_input, outputs=output_text)

    gr.Markdown(EXAMPLES_TEXT)

    gr.Markdown("""
---
Built for the **Mistral AI Online Hackathon 2026** (W&B Fine-Tuning Track) |
[Model](https://huggingface.co/YongkangZOU/evoxtral-lora) |
[W&B Dashboard](https://wandb.ai/yongkang-zou-ai/evoxtral) |
By Yongkang Zou
""")

demo.launch()