Spaces:
Running
Running
| """Evoxtral β Expressive Tagged Transcription Demo (ZeroGPU).""" | |
| import torch | |
| import spaces | |
| import gradio as gr | |
| import numpy as np | |
| from transformers import VoxtralForConditionalGeneration, AutoProcessor | |
| from peft import PeftModel | |
| MODEL_ID = "mistralai/Voxtral-Mini-3B-2507" | |
| ADAPTER_ID = "YongkangZOU/evoxtral-rl" | |
| # Load model on CPU at startup, ZeroGPU moves to GPU on demand | |
| print("Loading model...") | |
| processor = AutoProcessor.from_pretrained(MODEL_ID) | |
| base_model = VoxtralForConditionalGeneration.from_pretrained( | |
| MODEL_ID, torch_dtype=torch.bfloat16, | |
| ) | |
| model = PeftModel.from_pretrained(base_model, ADAPTER_ID) | |
| model.eval() | |
| print("Model loaded!") | |
| def transcribe(audio_input): | |
| """Transcribe audio with expressive tags.""" | |
| if audio_input is None: | |
| return "Please upload or record an audio file." | |
| sr, audio_array = audio_input | |
| # Convert to float32 and mono if needed | |
| audio_array = audio_array.astype(np.float32) | |
| if audio_array.ndim > 1: | |
| audio_array = audio_array.mean(axis=1) | |
| # Normalize to [-1, 1] | |
| if audio_array.max() > 1.0: | |
| audio_array = audio_array / 32768.0 | |
| # Resample to 16kHz if needed | |
| if sr != 16000: | |
| import librosa | |
| audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=16000) | |
| inputs = processor.apply_transcription_request( | |
| language="en", | |
| audio=[audio_array], | |
| format=["WAV"], | |
| model_id=MODEL_ID, | |
| return_tensors="pt", | |
| ) | |
| inputs = { | |
| k: v.to(model.device, dtype=torch.bfloat16) | |
| if v.dtype in (torch.float32, torch.float16, torch.bfloat16) | |
| else v.to(model.device) | |
| for k, v in inputs.items() | |
| } | |
| with torch.no_grad(): | |
| output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False) | |
| input_len = inputs["input_ids"].shape[1] | |
| transcription = processor.tokenizer.decode( | |
| output_ids[0][input_len:], skip_special_tokens=True | |
| ) | |
| return transcription | |
| EXAMPLES_TEXT = """ | |
| ## What is Evoxtral? | |
| Evoxtral is a fine-tuned version of [Voxtral-Mini-3B](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507) | |
| that produces transcriptions enriched with **inline expressive audio tags** from the | |
| [ElevenLabs v3 tag set](https://elevenlabs.io/docs/api-reference/text-to-speech). | |
| ### Standard ASR output: | |
| > So I was thinking maybe we could try that new restaurant downtown. | |
| ### Evoxtral output: | |
| > [nervous] So... I was thinking maybe we could [clears throat] try that new restaurant downtown? [laughs nervously] | |
| ### Supported tags include: | |
| `[laughs]` `[sighs]` `[gasps]` `[whispers]` `[clears throat]` `[pause]` `[nervous]` `[frustrated]` `[excited]` `[sad]` `[calm]` `[stammers]` `[yawns]` and more. | |
| ### Results | |
| | Metric | Base Voxtral | Evoxtral | Improvement | | |
| |--------|-------------|----------|-------------| | |
| | WER | 6.64% | **4.47%** | 32.7% better | | |
| | Tag F1 | 22.0% | **67.2%** | 3x better | | |
| """ | |
| with gr.Blocks(title="Evoxtral β Expressive Tagged Transcription", theme=gr.themes.Soft()) as demo: | |
| gr.Markdown("# Evoxtral β Expressive Tagged Transcription") | |
| gr.Markdown("Upload or record audio to get a transcription with inline expressive tags like `[sighs]`, `[laughs]`, `[whispers]`, etc.") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| audio_input = gr.Audio( | |
| label="Audio Input", | |
| type="numpy", | |
| sources=["upload", "microphone"], | |
| ) | |
| submit_btn = gr.Button("Transcribe", variant="primary", size="lg") | |
| with gr.Column(scale=1): | |
| output_text = gr.Textbox( | |
| label="Expressive Transcription", | |
| lines=8, | |
| show_copy_button=True, | |
| ) | |
| submit_btn.click(fn=transcribe, inputs=audio_input, outputs=output_text) | |
| gr.Markdown(EXAMPLES_TEXT) | |
| gr.Markdown(""" | |
| --- | |
| Built for the **Mistral AI Online Hackathon 2026** (W&B Fine-Tuning Track) | | |
| [Model](https://huggingface.co/YongkangZOU/evoxtral-lora) | | |
| [W&B Dashboard](https://wandb.ai/yongkang-zou-ai/evoxtral) | | |
| By Yongkang Zou | |
| """) | |
| demo.launch() | |