Milo-ASR-Demo / app.py
pluttodk's picture
Fix No API found error: disable SSR, pin Gradio 5.9.1
5f4bc28
"""Milo-ASR: Danish Speech Recognition - Hugging Face Space."""
import base64
import io
import tempfile
import time
import gradio as gr
import numpy as np
from scipy.io.wavfile import write as wav_write
# Identifier of the ASR checkpoint handed to Qwen3ASRModel.from_pretrained.
MODEL_ID = "pluttodk/Milo-ASR"
# Identifier of the forced aligner; only passed when timestamps are requested.
ALIGNER_ID = "Qwen/Qwen3-ForcedAligner-0.6B"
# Lazily filled caches written by _load_model: the plain model ...
_model = None
# ... and the variant constructed together with the forced aligner.
_model_ts = None
def _load_model(with_timestamps: bool):
    """Return the cached ASR model, constructing it on first use.

    Two variants are cached independently in module globals: a plain model
    (``_model``) and one built with the forced aligner (``_model_ts``).
    Both are created with float32 weights on the CPU.

    Args:
        with_timestamps: When true, return the aligner-equipped variant.

    Returns:
        The ``Qwen3ASRModel`` instance for the requested variant.
    """
    global _model, _model_ts
    # Imported here rather than at module import time.
    from qwen_asr import Qwen3ASRModel

    cpu_fp32 = {"dtype": "float32", "device_map": "cpu"}

    if not with_timestamps:
        if _model is None:
            _model = Qwen3ASRModel.from_pretrained(MODEL_ID, **cpu_fp32)
        return _model

    if _model_ts is None:
        _model_ts = Qwen3ASRModel.from_pretrained(
            MODEL_ID,
            forced_aligner=ALIGNER_ID,
            forced_aligner_kwargs=dict(cpu_fp32),
            **cpu_fp32,
        )
    return _model_ts
def _normalize_audio(wav):
x = np.asarray(wav, dtype=np.float32)
if x.ndim > 1:
x = np.mean(x, axis=-1)
m = np.max(np.abs(x)) if x.size else 0.0
if m > 1.0 + 1e-6:
x = x / m
return np.clip(x, -1.0, 1.0)
def _make_timestamp_html(sr, audio, timestamps):
if not timestamps:
return ""
html = """
<style>
.ts-container { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px; }
.ts-box {
border: 1px solid #ddd; border-radius: 8px; padding: 8px 12px;
background: #f9f9f9; box-shadow: 0 1px 3px rgba(0,0,0,0.06);
text-align: center;
}
.ts-word { font-size: 16px; font-weight: 700; margin-bottom: 4px; }
.ts-time { font-size: 11px; color: #666; margin-bottom: 6px; }
.ts-audio audio { width: 120px; height: 28px; }
</style>
<details open>
<summary style="font-weight: 700; cursor: pointer; margin-bottom: 8px;">
Word-level Timestamps (click to play each segment)
</summary>
<div class="ts-container">
"""
for item in timestamps:
word = item["text"]
start = float(item["start_time"])
end = float(item["end_time"])
if end <= start:
continue
s_idx = max(0, int(start * sr))
e_idx = min(len(audio), int(end * sr))
if e_idx <= s_idx:
continue
seg = (np.clip(audio[s_idx:e_idx], -1.0, 1.0) * 32767).astype(np.int16)
buf = io.BytesIO()
wav_write(buf, sr, seg)
b64 = base64.b64encode(buf.getvalue()).decode()
html += f"""
<div class="ts-box">
<div class="ts-word">{word}</div>
<div class="ts-time">{start:.2f}s - {end:.2f}s</div>
<div class="ts-audio">
<audio controls preload="none" src="data:audio/wav;base64,{b64}"></audio>
</div>
</div>
"""
html += "</div></details>"
return html
def transcribe(audio, use_timestamps):
    """Transcribe Danish speech from a Gradio audio input.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by the
            ``gr.Audio(type="numpy")`` component, or ``None`` when nothing
            was provided.
        use_timestamps: When true, load the aligner-equipped model and
            render word-level timestamps.

    Returns:
        A ``(text, info, timestamp_html)`` tuple of strings. ``info`` reports
        the inference time and, when it exceeded one second, the model load
        time. ``timestamp_html`` is ``""`` unless timestamps were requested
        and returned by the model.
    """
    # Local import: only needed for temp-file cleanup.
    import os

    if audio is None:
        return "Please upload or record an audio file.", "", ""

    sr, raw = audio
    normalized = _normalize_audio(raw)
    if normalized.size == 0:
        # A zero-length recording carries nothing to transcribe.
        return "Please upload or record an audio file.", "", ""

    # The model is given a file path, so write the audio to a temp WAV file.
    int16_data = (normalized * 32767).astype(np.int16)
    # Reserve a path and close the handle right away: writing by name while
    # the handle is still open fails on platforms that lock open files.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name

    try:
        wav_write(wav_path, sr, int16_data)

        t0 = time.perf_counter()
        model = _load_model(with_timestamps=use_timestamps)
        load_time = time.perf_counter() - t0

        t1 = time.perf_counter()
        results = model.transcribe(
            audio=wav_path,
            language="Danish",
            return_time_stamps=use_timestamps,
        )
        inference_time = time.perf_counter() - t1
    finally:
        # delete=False means nothing else removes the file; without this
        # every request would leave a WAV behind. Cleanup is best-effort.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    # Tolerate an empty result list instead of raising IndexError.
    first = results[0] if results else None
    text = getattr(first, "text", "") or ""

    info = f"Inference: {inference_time:.1f}s"
    if load_time > 1.0:
        info += f" (model load: {load_time:.1f}s)"

    ts_html = ""
    stamps = getattr(first, "time_stamps", None)
    if use_timestamps and stamps:
        ts_data = [
            {
                "text": getattr(t, "text", ""),
                "start_time": getattr(t, "start_time", 0),
                "end_time": getattr(t, "end_time", 0),
            }
            for t in stamps.items
        ]
        ts_html = _make_timestamp_html(sr, normalized, ts_data)

    return text, info, ts_html
# Markdown shown above the controls.
_HEADER_MD = """
# Milo-ASR - Danish Speech Recognition
**Model:** [`pluttodk/Milo-ASR`](https://huggingface.co/pluttodk/Milo-ASR) (finetuned Qwen3-ASR-1.7B)
Upload an audio file or record with your microphone to transcribe Danish speech.
Running on CPU -- the first request will be slow while the model loads, and inference takes longer than on GPU.
"""

# Markdown shown below the controls.
_FOOTER_MD = """
---
**Links:** [Model Card](https://huggingface.co/pluttodk/Milo-ASR) |
Based on [Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) finetuned on CoRal v2 Danish speech data.
"""

# Soft theme with a Google font first and generic fallbacks after it.
theme = gr.themes.Soft(
    font=[
        gr.themes.GoogleFont("Source Sans Pro"),
        "Arial",
        "sans-serif",
    ],
)

with gr.Blocks(theme=theme, title="Milo-ASR") as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=1):
            audio_in = gr.Audio(
                label="Audio",
                sources=["upload", "microphone"],
                type="numpy",
            )
            ts_checkbox = gr.Checkbox(
                label="Word-level timestamps",
                value=False,
                info="Uses Qwen3-ForcedAligner for word alignment",
            )
            btn = gr.Button("Transcribe", variant="primary", size="lg")

        # Right column: read-only outputs.
        with gr.Column(scale=1):
            out_text = gr.Textbox(
                label="Transcription",
                lines=6,
                show_copy_button=True,
                interactive=False,
            )
            out_info = gr.Textbox(
                label="Info",
                lines=1,
                interactive=False,
            )
            out_ts = gr.HTML()

    # Wire the button to the transcription handler.
    btn.click(
        fn=transcribe,
        inputs=[audio_in, ts_checkbox],
        outputs=[out_text, out_info, out_ts],
    )

    gr.Markdown(_FOOTER_MD)

if __name__ == "__main__":
    # Launch with server-side rendering turned off.
    demo.launch(ssr_mode=False)