"""Milo-ASR: Danish Speech Recognition - Hugging Face Space.""" import base64 import io import tempfile import time import gradio as gr import numpy as np from scipy.io.wavfile import write as wav_write MODEL_ID = "pluttodk/Milo-ASR" ALIGNER_ID = "Qwen/Qwen3-ForcedAligner-0.6B" _model = None _model_ts = None def _load_model(with_timestamps: bool): global _model, _model_ts from qwen_asr import Qwen3ASRModel if with_timestamps: if _model_ts is None: _model_ts = Qwen3ASRModel.from_pretrained( MODEL_ID, dtype="float32", device_map="cpu", forced_aligner=ALIGNER_ID, forced_aligner_kwargs=dict( dtype="float32", device_map="cpu", ), ) return _model_ts else: if _model is None: _model = Qwen3ASRModel.from_pretrained( MODEL_ID, dtype="float32", device_map="cpu", ) return _model def _normalize_audio(wav): x = np.asarray(wav, dtype=np.float32) if x.ndim > 1: x = np.mean(x, axis=-1) m = np.max(np.abs(x)) if x.size else 0.0 if m > 1.0 + 1e-6: x = x / m return np.clip(x, -1.0, 1.0) def _make_timestamp_html(sr, audio, timestamps): if not timestamps: return "" html = """
Word-level Timestamps (click to play each segment)
""" for item in timestamps: word = item["text"] start = float(item["start_time"]) end = float(item["end_time"]) if end <= start: continue s_idx = max(0, int(start * sr)) e_idx = min(len(audio), int(end * sr)) if e_idx <= s_idx: continue seg = (np.clip(audio[s_idx:e_idx], -1.0, 1.0) * 32767).astype(np.int16) buf = io.BytesIO() wav_write(buf, sr, seg) b64 = base64.b64encode(buf.getvalue()).decode() html += f"""
{word}
{start:.2f}s - {end:.2f}s
""" html += "
" return html def transcribe(audio, use_timestamps): if audio is None: return "Please upload or record an audio file.", "", "" sr, raw = audio normalized = _normalize_audio(raw) # Write to temp WAV file for the model int16_data = (normalized * 32767).astype(np.int16) tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) wav_write(tmp.name, sr, int16_data) tmp.close() t0 = time.perf_counter() model = _load_model(with_timestamps=use_timestamps) load_time = time.perf_counter() - t0 t1 = time.perf_counter() results = model.transcribe( audio=tmp.name, language="Danish", return_time_stamps=use_timestamps, ) inference_time = time.perf_counter() - t1 r = results[0] text = getattr(r, "text", "") or "" info = f"Inference: {inference_time:.1f}s" if load_time > 1.0: info += f" (model load: {load_time:.1f}s)" ts_html = "" if use_timestamps and hasattr(r, "time_stamps") and r.time_stamps: ts_data = [ { "text": getattr(t, "text", ""), "start_time": getattr(t, "start_time", 0), "end_time": getattr(t, "end_time", 0), } for t in r.time_stamps.items ] ts_html = _make_timestamp_html(sr, normalized, ts_data) return text, info, ts_html theme = gr.themes.Soft( font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"], ) with gr.Blocks(theme=theme, title="Milo-ASR") as demo: gr.Markdown( """ # Milo-ASR - Danish Speech Recognition **Model:** [`pluttodk/Milo-ASR`](https://huggingface.co/pluttodk/Milo-ASR) (finetuned Qwen3-ASR-1.7B) Upload an audio file or record with your microphone to transcribe Danish speech. Running on CPU -- the first request will be slow while the model loads, and inference takes longer than on GPU. """ ) with gr.Row(): with gr.Column(scale=1): audio_in = gr.Audio( label="Audio", sources=["upload", "microphone"], type="numpy", ) ts_checkbox = gr.Checkbox( label="Word-level timestamps", value=False, info="Uses Qwen3-ForcedAligner for word alignment", ) btn = gr.Button("Transcribe", variant="primary", size="lg") with gr.Column(scale=1): out_text = gr.Textbox( label="Transcription", lines=6, show_copy_button=True, interactive=False, ) out_info = gr.Textbox( label="Info", lines=1, interactive=False, ) out_ts = gr.HTML() btn.click( fn=transcribe, inputs=[audio_in, ts_checkbox], outputs=[out_text, out_info, out_ts], ) gr.Markdown( """ --- **Links:** [Model Card](https://huggingface.co/pluttodk/Milo-ASR) | Based on [Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) finetuned on CoRal v2 Danish speech data. """ ) if __name__ == "__main__": demo.launch(ssr_mode=False)