Spaces:

pluttodk
/

Milo-ASR-Demo

Runtime error

App Files Files Community

Milo-ASR-Demo / app.py

pluttodk

Fix No API found error: disable SSR, pin Gradio 5.9.1

5f4bc28 2 months ago

raw

history blame contribute delete

6.25 kB

	"""Milo-ASR: Danish Speech Recognition - Hugging Face Space."""

	import base64
	import io
	import tempfile
	import time

	import gradio as gr
	import numpy as np
	from scipy.io.wavfile import write as wav_write

	# Hugging Face id of the fine-tuned Danish ASR checkpoint loaded by _load_model.
	MODEL_ID = "pluttodk/Milo-ASR"
	# Model id passed to Qwen3ASRModel.from_pretrained as `forced_aligner` when
	# word-level timestamps are requested.
	ALIGNER_ID = "Qwen/Qwen3-ForcedAligner-0.6B"

	# Lazily created model instances, filled in by _load_model on first use and
	# then reused for every later request.
	_model = None  # model without the forced aligner
	_model_ts = None  # model created with the forced aligner attached


	def _load_model(with_timestamps: bool):
	    """Return the cached ASR model, constructing it on the first request.

	    Two separate instances are kept in module globals: a plain one and one
	    created with the forced aligner attached. Each is built at most once and
	    then reused. Both are loaded in float32 on the CPU.

	    Args:
	        with_timestamps: When true, return the instance that was created
	            with ``forced_aligner=ALIGNER_ID``; otherwise the plain one.

	    Returns:
	        The ``Qwen3ASRModel`` instance matching the requested mode.
	    """
	    global _model, _model_ts

	    # Imported here rather than at module import time.
	    from qwen_asr import Qwen3ASRModel

	    if not with_timestamps:
	        # Plain transcription: no aligner is attached.
	        if _model is None:
	            _model = Qwen3ASRModel.from_pretrained(
	                MODEL_ID,
	                dtype="float32",
	                device_map="cpu",
	            )
	        return _model

	    if _model_ts is None:
	        # The aligner gets the same dtype/device settings as the ASR model.
	        aligner_options = dict(
	            dtype="float32",
	            device_map="cpu",
	        )
	        _model_ts = Qwen3ASRModel.from_pretrained(
	            MODEL_ID,
	            dtype="float32",
	            device_map="cpu",
	            forced_aligner=ALIGNER_ID,
	            forced_aligner_kwargs=aligner_options,
	        )
	    return _model_ts


	def _normalize_audio(wav):
	    """Convert raw samples to a float32 signal bounded to [-1, 1].

	    Input with more than one dimension is averaged over its last axis.
	    If the absolute peak exceeds 1 (plus a small tolerance) the signal is
	    divided by that peak; otherwise it is left unscaled. The result is
	    always clipped to [-1, 1].

	    Args:
	        wav: Array-like of audio samples.

	    Returns:
	        A float32 NumPy array.
	    """
	    samples = np.asarray(wav, dtype=np.float32)

	    # Collapse the last axis by averaging when the input is multi-dimensional.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=-1)

	    # An empty array has no peak; treat it as silence.
	    peak = 0.0
	    if samples.size:
	        peak = np.abs(samples).max()

	    # Rescale only when the signal is clearly outside the unit range.
	    if peak > 1.0 + 1e-6:
	        samples = samples / peak

	    return np.clip(samples, -1.0, 1.0)


	def _make_timestamp_html(sr, audio, timestamps):
	    """Render word-level timestamps as HTML cards with inline audio clips.

	    Each usable timestamp becomes a box showing the word, its time range and
	    an ``<audio>`` element whose source is the matching slice of ``audio``
	    encoded as a base64 WAV data URI.

	    Args:
	        sr: Sample rate used to convert times to sample indices and to
	            write each WAV clip.
	        audio: Sample array that is sliced per word; values are clipped to
	            [-1, 1] before conversion to int16.
	        timestamps: Iterable of dicts with ``"text"``, ``"start_time"`` and
	            ``"end_time"`` keys (times in seconds).

	    Returns:
	        An HTML string, or ``""`` when ``timestamps`` is empty. Entries
	        whose time range or sample range is empty are skipped.
	    """
	    # Function-scope import: the module does not import `html` at top level.
	    from html import escape

	    if not timestamps:
	        return ""

	    # Collect fragments and join once; repeated `+=` on a string that
	    # carries base64 audio payloads re-copies the whole page per word.
	    parts = ["""
	<style>
	.ts-container { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px; }
	.ts-box {
	border: 1px solid #ddd; border-radius: 8px; padding: 8px 12px;
	background: #f9f9f9; box-shadow: 0 1px 3px rgba(0,0,0,0.06);
	text-align: center;
	}
	.ts-word { font-size: 16px; font-weight: 700; margin-bottom: 4px; }
	.ts-time { font-size: 11px; color: #666; margin-bottom: 6px; }
	.ts-audio audio { width: 120px; height: 28px; }
	</style>
	<details open>
	<summary style="font-weight: 700; cursor: pointer; margin-bottom: 8px;">
	Word-level Timestamps (click to play each segment)
	</summary>
	<div class="ts-container">
	"""]

	    for item in timestamps:
	        word = item["text"]
	        start = float(item["start_time"])
	        end = float(item["end_time"])
	        if end <= start:
	            continue

	        # Clamp the slice to the bounds of the audio buffer.
	        s_idx = max(0, int(start * sr))
	        e_idx = min(len(audio), int(end * sr))
	        if e_idx <= s_idx:
	            continue

	        seg = (np.clip(audio[s_idx:e_idx], -1.0, 1.0) * 32767).astype(np.int16)
	        buf = io.BytesIO()
	        wav_write(buf, sr, seg)
	        b64 = base64.b64encode(buf.getvalue()).decode()

	        # The word is model output inserted into markup: escape it so that
	        # characters such as `<` or `&` cannot break or inject HTML.
	        safe_word = escape(str(word))

	        parts.append(f"""
	<div class="ts-box">
	<div class="ts-word">{safe_word}</div>
	<div class="ts-time">{start:.2f}s - {end:.2f}s</div>
	<div class="ts-audio">
	<audio controls preload="none" src="data:audio/wav;base64,{b64}"></audio>
	</div>
	</div>
	""")

	    parts.append("</div></details>")
	    return "".join(parts)


	def transcribe(audio, use_timestamps):
	    """Transcribe one recording and build the values for the three outputs.

	    Args:
	        audio: ``None`` when nothing was provided, otherwise a
	            ``(sample_rate, samples)`` pair.
	        use_timestamps: Whether to request word-level timestamps (this also
	            selects the model instance created with the forced aligner).

	    Returns:
	        A ``(text, info, timestamp_html)`` tuple of strings. ``info`` holds
	        the inference time and, when loading took more than a second, the
	        model load time. ``timestamp_html`` is empty unless timestamps were
	        requested and returned.
	    """
	    # Function-scope import: the module does not import `os` at top level.
	    import os

	    if audio is None:
	        return "Please upload or record an audio file.", "", ""

	    sr, raw = audio
	    normalized = _normalize_audio(raw)

	    # The model is given a file path, so the audio goes through a temp WAV.
	    int16_data = (normalized * 32767).astype(np.int16)

	    # Reserve a path and close the handle before writing to it by name;
	    # writing to a still-open temp file by name fails on some platforms.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        wav_path = tmp.name

	    try:
	        wav_write(wav_path, sr, int16_data)

	        t0 = time.perf_counter()
	        model = _load_model(with_timestamps=use_timestamps)
	        load_time = time.perf_counter() - t0

	        t1 = time.perf_counter()
	        results = model.transcribe(
	            audio=wav_path,
	            language="Danish",
	            return_time_stamps=use_timestamps,
	        )
	        inference_time = time.perf_counter() - t1
	    finally:
	        # The file was created with delete=False, so it must be removed
	        # explicitly or every request leaves a WAV behind. Cleanup is
	        # best-effort and must not mask an error from the model.
	        try:
	            os.unlink(wav_path)
	        except OSError:
	            pass

	    # Tolerate an empty result list instead of raising IndexError.
	    r = results[0] if results else None
	    text = getattr(r, "text", "") or ""

	    info = f"Inference: {inference_time:.1f}s"
	    # Only mention loading when it actually happened (a cached model
	    # returns almost instantly).
	    if load_time > 1.0:
	        info += f" (model load: {load_time:.1f}s)"

	    ts_html = ""
	    if use_timestamps and hasattr(r, "time_stamps") and r.time_stamps:
	        ts_data = [
	            {
	                "text": getattr(t, "text", ""),
	                "start_time": getattr(t, "start_time", 0),
	                "end_time": getattr(t, "end_time", 0),
	            }
	            for t in r.time_stamps.items
	        ]
	        ts_html = _make_timestamp_html(sr, normalized, ts_data)

	    return text, info, ts_html


	# Soft theme with a Google-hosted font and generic fallbacks.
	theme = gr.themes.Soft(
	    font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
	)

	# Page layout: intro text, an input column (audio, timestamp toggle, button),
	# an output column (transcription, timing info), the timestamp HTML area and
	# a footer with links.
	with gr.Blocks(theme=theme, title="Milo-ASR") as demo:
	    gr.Markdown(
	        """
	# Milo-ASR - Danish Speech Recognition

	Model: [`pluttodk/Milo-ASR`](https://huggingface.co/pluttodk/Milo-ASR) (finetuned Qwen3-ASR-1.7B)

	Upload an audio file or record with your microphone to transcribe Danish speech.
	Running on CPU -- the first request will be slow while the model loads, and inference takes longer than on GPU.
	"""
	    )

	    with gr.Row():
	        with gr.Column(scale=1):
	            # type="numpy" matches the (sample_rate, samples) pair that
	            # transcribe() unpacks.
	            audio_in = gr.Audio(
	                label="Audio",
	                sources=["upload", "microphone"],
	                type="numpy",
	            )
	            ts_checkbox = gr.Checkbox(
	                label="Word-level timestamps",
	                value=False,
	                info="Uses Qwen3-ForcedAligner for word alignment",
	            )
	            btn = gr.Button("Transcribe", variant="primary", size="lg")

	        with gr.Column(scale=1):
	            out_text = gr.Textbox(
	                label="Transcription",
	                lines=6,
	                show_copy_button=True,
	                interactive=False,
	            )
	            out_info = gr.Textbox(
	                label="Info",
	                lines=1,
	                interactive=False,
	            )

	    # Receives the HTML produced by _make_timestamp_html (empty otherwise).
	    out_ts = gr.HTML()

	    # Output order matches the (text, info, html) tuple from transcribe().
	    btn.click(
	        fn=transcribe,
	        inputs=[audio_in, ts_checkbox],
	        outputs=[out_text, out_info, out_ts],
	    )

	    # The backslash before the pipe is written as `\\` so the literal holds
	    # the same characters as before without an invalid `\|` escape sequence.
	    gr.Markdown(
	        """
	---
	Links: [Model Card](https://huggingface.co/pluttodk/Milo-ASR) \\|
	Based on [Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) finetuned on CoRal v2 Danish speech data.
	"""
	    )


	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    # Server-side rendering is switched off; the commit note for this file
	    # says this was done to fix a "No API found" error.
	    demo.launch(ssr_mode=False)