ethos / space /app.py
Lior-0618's picture
chore: merge master → dev/video-fer (SSE transcribe-stream)
aa15e90
"""Evoxtral — Expressive Tagged Transcription Demo (ZeroGPU)."""
import torch
import spaces
import gradio as gr
import numpy as np
from transformers import VoxtralForConditionalGeneration, AutoProcessor
from peft import PeftModel
# Hub id of the base Voxtral checkpoint; also passed to the processor when
# building transcription requests.
MODEL_ID = "mistralai/Voxtral-Mini-3B-2507"
# Hub id of the PEFT adapter applied on top of the base model.
ADAPTER_ID = "YongkangZOU/evoxtral-rl"
# Load model on CPU at startup, ZeroGPU moves to GPU on demand
print("Loading model...")
# The processor builds the model inputs and owns the tokenizer used for decoding.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Base weights are loaded in bfloat16.
base_model = VoxtralForConditionalGeneration.from_pretrained(
MODEL_ID, torch_dtype=torch.bfloat16,
)
# Wrap the base model with the fine-tuned adapter weights.
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
# Inference only: put the module in evaluation mode.
model.eval()
print("Model loaded!")
@spaces.GPU
def transcribe(audio_input):
    """Transcribe audio with expressive tags.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple as delivered by the
            ``gr.Audio(type="numpy")`` component, or ``None`` when the user
            has not provided any audio.

    Returns:
        The decoded transcription string. When no audio (or an empty
        recording) is supplied, a short prompt asking for audio instead.
    """
    if audio_input is None:
        return "Please upload or record an audio file."

    sr, audio_array = audio_input
    audio_array = _prepare_audio(sr, audio_array)
    # An empty recording cannot be transcribed; treat it like missing input
    # instead of failing inside the processor/model.
    if audio_array.size == 0:
        return "Please upload or record an audio file."

    inputs = processor.apply_transcription_request(
        language="en",
        audio=[audio_array],
        format=["WAV"],
        model_id=MODEL_ID,
        return_tensors="pt",
    )
    # Floating-point tensors are cast to bfloat16 to match the model weights;
    # everything else (e.g. token ids) is only moved to the model's device.
    float_dtypes = (torch.float32, torch.float16, torch.bfloat16)
    inputs = {
        k: v.to(model.device, dtype=torch.bfloat16)
        if v.dtype in float_dtypes
        else v.to(model.device)
        for k, v in inputs.items()
    }

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)

    # generate() returns prompt + continuation; decode only the new tokens.
    input_len = inputs["input_ids"].shape[1]
    transcription = processor.tokenizer.decode(
        output_ids[0][input_len:], skip_special_tokens=True
    )
    return transcription


# Sample rate the audio is resampled to before it is handed to the processor.
_TARGET_SAMPLE_RATE = 16000


def _prepare_audio(sr, audio_array):
    """Return *audio_array* as mono float32 in [-1, 1] at 16 kHz.

    Args:
        sr: Sample rate of ``audio_array`` in Hz.
        audio_array: 1-D ``(samples,)`` or 2-D ``(samples, channels)`` array.

    Returns:
        A 1-D ``np.float32`` array (possibly empty).
    """
    samples = np.asarray(audio_array)
    source_dtype = samples.dtype
    samples = samples.astype(np.float32)

    # Down-mix multi-channel audio by averaging the channel axis.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if samples.size == 0:
        return samples

    if np.issubdtype(source_dtype, np.signedinteger):
        # Signed integer PCM is always rescaled by its full-scale value
        # (32768 for int16). Deciding from the peak sample, as before, left
        # quiet or negative-peaked clips unscaled.
        samples = samples / (float(np.iinfo(source_dtype).max) + 1.0)
    elif np.abs(samples).max() > 1.0:
        # Non-integer input outside [-1, 1]: presumably 16-bit PCM values
        # stored as floats, so keep the original 32768 scale factor.
        samples = samples / 32768.0

    # Resample to 16kHz if needed
    if sr != _TARGET_SAMPLE_RATE:
        import librosa

        samples = librosa.resample(
            samples, orig_sr=sr, target_sr=_TARGET_SAMPLE_RATE
        )
    return samples
# Markdown shown below the transcription widgets: a short description of the
# model, a plain-ASR vs. tagged-output example, the supported tags, and a
# results table.
EXAMPLES_TEXT = """
## What is Evoxtral?
Evoxtral is a fine-tuned version of [Voxtral-Mini-3B](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)
that produces transcriptions enriched with **inline expressive audio tags** from the
[ElevenLabs v3 tag set](https://elevenlabs.io/docs/api-reference/text-to-speech).
### Standard ASR output:
> So I was thinking maybe we could try that new restaurant downtown.
### Evoxtral output:
> [nervous] So... I was thinking maybe we could [clears throat] try that new restaurant downtown? [laughs nervously]
### Supported tags include:
`[laughs]` `[sighs]` `[gasps]` `[whispers]` `[clears throat]` `[pause]` `[nervous]` `[frustrated]` `[excited]` `[sad]` `[calm]` `[stammers]` `[yawns]` and more.
### Results
| Metric | Base Voxtral | Evoxtral | Improvement |
|--------|-------------|----------|-------------|
| WER | 6.64% | **4.47%** | 32.7% better |
| Tag F1 | 22.0% | **67.2%** | 3x better |
"""
# Gradio UI: audio input and trigger button on the left, transcription output
# on the right, followed by the explanatory markdown and a footer.
with gr.Blocks(title="Evoxtral — Expressive Tagged Transcription", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Evoxtral — Expressive Tagged Transcription")
    gr.Markdown("Upload or record audio to get a transcription with inline expressive tags like `[sighs]`, `[laughs]`, `[whispers]`, etc.")

    with gr.Row():
        with gr.Column(scale=1):
            # type="numpy" hands transcribe() a (sample_rate, samples) tuple.
            audio_input = gr.Audio(
                label="Audio Input",
                type="numpy",
                sources=["upload", "microphone"],
            )
            submit_btn = gr.Button("Transcribe", variant="primary", size="lg")
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Expressive Transcription",
                lines=8,
                show_copy_button=True,
            )

    # Event listeners must be registered inside the Blocks context.
    submit_btn.click(fn=transcribe, inputs=audio_input, outputs=output_text)

    gr.Markdown(EXAMPLES_TEXT)
    # NOTE(review): the "Model" link below points at evoxtral-lora while the
    # adapter loaded at startup is ADAPTER_ID (evoxtral-rl) - confirm which
    # repository the footer should reference.
    gr.Markdown("""
---
Built for the **Mistral AI Online Hackathon 2026** (W&B Fine-Tuning Track) |
[Model](https://huggingface.co/YongkangZOU/evoxtral-lora) |
[W&B Dashboard](https://wandb.ai/yongkang-zou-ai/evoxtral) |
By Yongkang Zou
""")

demo.launch()