Spaces:

mistral-hackaton-2026
/

ethos

Running

App Files Files Community

ethos / space /app.py

Lior-0618

chore: merge master → dev/video-fer (SSE transcribe-stream)

aa15e90 6 days ago

raw

history blame contribute delete

4.12 kB

	"""Evoxtral — Expressive Tagged Transcription Demo (ZeroGPU)."""

	import torch
	import spaces
	import gradio as gr
	import numpy as np
	from transformers import VoxtralForConditionalGeneration, AutoProcessor
	from peft import PeftModel

	MODEL_ID = "mistralai/Voxtral-Mini-3B-2507"
	ADAPTER_ID = "YongkangZOU/evoxtral-rl"

	# Load model on CPU at startup, ZeroGPU moves to GPU on demand
	print("Loading model...")
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	base_model = VoxtralForConditionalGeneration.from_pretrained(
	MODEL_ID, torch_dtype=torch.bfloat16,
	)
	model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
	model.eval()
	print("Model loaded!")


	@spaces.GPU
	def transcribe(audio_input):
	"""Transcribe audio with expressive tags."""
	if audio_input is None:
	return "Please upload or record an audio file."

	sr, audio_array = audio_input
	# Convert to float32 and mono if needed
	audio_array = audio_array.astype(np.float32)
	if audio_array.ndim > 1:
	audio_array = audio_array.mean(axis=1)
	# Normalize to [-1, 1]
	if audio_array.max() > 1.0:
	audio_array = audio_array / 32768.0

	# Resample to 16kHz if needed
	if sr != 16000:
	import librosa
	audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=16000)

	inputs = processor.apply_transcription_request(
	language="en",
	audio=[audio_array],
	format=["WAV"],
	model_id=MODEL_ID,
	return_tensors="pt",
	)
	inputs = {
	k: v.to(model.device, dtype=torch.bfloat16)
	if v.dtype in (torch.float32, torch.float16, torch.bfloat16)
	else v.to(model.device)
	for k, v in inputs.items()
	}

	with torch.no_grad():
	output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)

	input_len = inputs["input_ids"].shape[1]
	transcription = processor.tokenizer.decode(
	output_ids[0][input_len:], skip_special_tokens=True
	)
	return transcription


	EXAMPLES_TEXT = """
	## What is Evoxtral?

	Evoxtral is a fine-tuned version of [Voxtral-Mini-3B](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)
	that produces transcriptions enriched with inline expressive audio tags from the
	[ElevenLabs v3 tag set](https://elevenlabs.io/docs/api-reference/text-to-speech).

	### Standard ASR output:
	> So I was thinking maybe we could try that new restaurant downtown.

	### Evoxtral output:
	> [nervous] So... I was thinking maybe we could [clears throat] try that new restaurant downtown? [laughs nervously]

	### Supported tags include:
	`[laughs]` `[sighs]` `[gasps]` `[whispers]` `[clears throat]` `[pause]` `[nervous]` `[frustrated]` `[excited]` `[sad]` `[calm]` `[stammers]` `[yawns]` and more.

	### Results

	\| Metric \| Base Voxtral \| Evoxtral \| Improvement \|
	\|--------\|-------------\|----------\|-------------\|
	\| WER \| 6.64% \| 4.47% \| 32.7% better \|
	\| Tag F1 \| 22.0% \| 67.2% \| 3x better \|
	"""

	with gr.Blocks(title="Evoxtral — Expressive Tagged Transcription", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# Evoxtral — Expressive Tagged Transcription")
	gr.Markdown("Upload or record audio to get a transcription with inline expressive tags like `[sighs]`, `[laughs]`, `[whispers]`, etc.")

	with gr.Row():
	with gr.Column(scale=1):
	audio_input = gr.Audio(
	label="Audio Input",
	type="numpy",
	sources=["upload", "microphone"],
	)
	submit_btn = gr.Button("Transcribe", variant="primary", size="lg")

	with gr.Column(scale=1):
	output_text = gr.Textbox(
	label="Expressive Transcription",
	lines=8,
	show_copy_button=True,
	)

	submit_btn.click(fn=transcribe, inputs=audio_input, outputs=output_text)

	gr.Markdown(EXAMPLES_TEXT)

	gr.Markdown("""
	---
	Built for the Mistral AI Online Hackathon 2026 (W&B Fine-Tuning Track) \|
	[Model](https://huggingface.co/YongkangZOU/evoxtral-lora) \|
	[W&B Dashboard](https://wandb.ai/yongkang-zou-ai/evoxtral) \|
	By Yongkang Zou
	""")

	demo.launch()