Spaces:
Runtime error
Runtime error
| """Milo-ASR: Danish Speech Recognition - Hugging Face Space.""" | |
| import base64 | |
| import io | |
| import tempfile | |
| import time | |
| import gradio as gr | |
| import numpy as np | |
| from scipy.io.wavfile import write as wav_write | |
# Hugging Face Hub repository ids passed to ``Qwen3ASRModel.from_pretrained``.
MODEL_ID = "pluttodk/Milo-ASR"
ALIGNER_ID = "Qwen/Qwen3-ForcedAligner-0.6B"

# Lazily populated model caches (filled in by ``_load_model``):
# ``_model`` is the plain ASR model, ``_model_ts`` the variant that is
# loaded together with the forced aligner.
_model = _model_ts = None
def _load_model(with_timestamps: bool):
    """Return the cached ASR model, loading it on first use.

    Two independent caches are kept in module globals: ``_model`` for the
    plain model and ``_model_ts`` for the model loaded together with the
    forced aligner. Both are loaded as float32 on CPU.

    Args:
        with_timestamps: When true, return the model that was loaded with
            ``ALIGNER_ID`` as its forced aligner.

    Returns:
        The ``Qwen3ASRModel`` instance for the requested mode.
    """
    global _model, _model_ts
    # Imported here rather than at module level, so ``qwen_asr`` is only
    # pulled in when a model is actually requested.
    from qwen_asr import Qwen3ASRModel

    if not with_timestamps:
        if _model is None:
            _model = Qwen3ASRModel.from_pretrained(
                MODEL_ID, dtype="float32", device_map="cpu"
            )
        return _model

    if _model_ts is None:
        cpu_fp32 = {"dtype": "float32", "device_map": "cpu"}
        _model_ts = Qwen3ASRModel.from_pretrained(
            MODEL_ID,
            forced_aligner=ALIGNER_ID,
            forced_aligner_kwargs=dict(cpu_fp32),
            **cpu_fp32,
        )
    return _model_ts
def _normalize_audio(wav):
    """Convert *wav* to float32 samples confined to ``[-1.0, 1.0]``.

    Input with more than one dimension is averaged over its last axis.
    If the absolute peak exceeds 1.0 (beyond a 1e-6 tolerance) the signal
    is divided by that peak; the result is always clipped to ``[-1, 1]``.

    Args:
        wav: Array-like of audio samples.

    Returns:
        A float32 ``numpy.ndarray``.
    """
    samples = np.asarray(wav, dtype=np.float32)
    if samples.ndim > 1:
        samples = samples.mean(axis=-1)

    # An empty signal has no peak; treat it as 0 so no scaling happens.
    peak = 0.0
    if samples.size:
        peak = np.abs(samples).max()

    if peak > 1.0 + 1e-6:
        samples = samples / peak
    return np.clip(samples, -1.0, 1.0)
def _make_timestamp_html(sr, audio, timestamps):
    """Render word-level timestamps as HTML cards with inline audio clips.

    Each usable timestamp becomes a small card showing the word, its time
    span, and an ``<audio>`` player whose source is the matching slice of
    ``audio`` encoded as a base64 16-bit PCM WAV data URI.

    Args:
        sr: Sample rate of ``audio`` in Hz.
        audio: Sample sequence that is sliced per word; values are clipped
            to ``[-1, 1]`` before scaling to int16.
            NOTE(review): presumably 1-D mono float samples -- confirm
            against the caller.
        timestamps: Sequence of dicts with keys ``"text"``,
            ``"start_time"`` and ``"end_time"`` (times in seconds).

    Returns:
        The HTML fragment, or ``""`` when ``timestamps`` is empty. Entries
        whose end is not after their start, or whose sample range is empty
        after clamping to the audio length, are skipped.
    """
    # Function-scope import: the word text is model output and must be
    # escaped before it is embedded in markup.
    import html as html_lib

    if not timestamps:
        return ""

    parts = [
        """
<style>
.ts-container { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px; }
.ts-box {
border: 1px solid #ddd; border-radius: 8px; padding: 8px 12px;
background: #f9f9f9; box-shadow: 0 1px 3px rgba(0,0,0,0.06);
text-align: center;
}
.ts-word { font-size: 16px; font-weight: 700; margin-bottom: 4px; }
.ts-time { font-size: 11px; color: #666; margin-bottom: 6px; }
.ts-audio audio { width: 120px; height: 28px; }
</style>
<details open>
<summary style="font-weight: 700; cursor: pointer; margin-bottom: 8px;">
Word-level Timestamps (click to play each segment)
</summary>
<div class="ts-container">
"""
    ]

    n_samples = len(audio)
    for item in timestamps:
        start = float(item["start_time"])
        end = float(item["end_time"])
        if end <= start:
            continue

        # Clamp the sample window to the available audio.
        s_idx = max(0, int(start * sr))
        e_idx = min(n_samples, int(end * sr))
        if e_idx <= s_idx:
            continue

        seg = (np.clip(audio[s_idx:e_idx], -1.0, 1.0) * 32767).astype(np.int16)
        buf = io.BytesIO()
        wav_write(buf, sr, seg)
        b64 = base64.b64encode(buf.getvalue()).decode()

        # Escape so characters such as "<" or "&" in the transcript cannot
        # break or inject markup.
        word = html_lib.escape(str(item["text"]))
        parts.append(
            f"""
<div class="ts-box">
<div class="ts-word">{word}</div>
<div class="ts-time">{start:.2f}s - {end:.2f}s</div>
<div class="ts-audio">
<audio controls preload="none" src="data:audio/wav;base64,{b64}"></audio>
</div>
</div>
"""
        )

    parts.append("</div></details>")
    return "".join(parts)
def transcribe(audio, use_timestamps):
    """Transcribe a Gradio audio input as Danish speech.

    The samples are normalized, written to a temporary 16-bit WAV file,
    and passed to the model by path. The temporary file is removed again
    once the model call has finished, whether it succeeded or raised.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was
            provided.
        use_timestamps: Whether to load the aligner-enabled model and
            request word-level timestamps.

    Returns:
        A ``(text, info, timestamp_html)`` tuple of strings. When no audio
        (or an empty recording) is given, ``text`` carries a prompt to
        upload or record audio and the other two entries are empty.
    """
    # Function-scope import: only needed for removing the temporary file.
    import os

    if audio is None:
        return "Please upload or record an audio file.", "", ""

    sr, raw = audio
    # An empty recording cannot be transcribed; treat it like no input.
    if np.size(raw) == 0:
        return "Please upload or record an audio file.", "", ""

    normalized = _normalize_audio(raw)
    int16_data = (normalized * 32767).astype(np.int16)

    # Reserve a path and close the handle before writing by name, so the
    # file is not opened twice at once.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name

    try:
        wav_write(wav_path, sr, int16_data)

        t0 = time.perf_counter()
        model = _load_model(with_timestamps=use_timestamps)
        load_time = time.perf_counter() - t0

        t1 = time.perf_counter()
        results = model.transcribe(
            audio=wav_path,
            language="Danish",
            return_time_stamps=use_timestamps,
        )
        inference_time = time.perf_counter() - t1
    finally:
        # Best-effort cleanup: a failed delete must not mask the result or
        # the original exception.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    # Tolerate an empty result list instead of raising IndexError.
    r = results[0] if results else None
    text = getattr(r, "text", "") or ""

    info = f"Inference: {inference_time:.1f}s"
    # Only mention load time when a load was slow enough to notice.
    if load_time > 1.0:
        info += f" (model load: {load_time:.1f}s)"

    ts_html = ""
    time_stamps = getattr(r, "time_stamps", None)
    if use_timestamps and time_stamps:
        ts_data = [
            {
                "text": getattr(t, "text", ""),
                "start_time": getattr(t, "start_time", 0),
                "end_time": getattr(t, "end_time", 0),
            }
            for t in time_stamps.items
        ]
        ts_html = _make_timestamp_html(sr, normalized, ts_data)

    return text, info, ts_html
# Markdown shown above and below the transcription controls.
_INTRO_MARKDOWN = """
# Milo-ASR - Danish Speech Recognition
**Model:** [`pluttodk/Milo-ASR`](https://huggingface.co/pluttodk/Milo-ASR) (finetuned Qwen3-ASR-1.7B)
Upload an audio file or record with your microphone to transcribe Danish speech.
Running on CPU -- the first request will be slow while the model loads, and inference takes longer than on GPU.
"""

_FOOTER_MARKDOWN = """
---
**Links:** [Model Card](https://huggingface.co/pluttodk/Milo-ASR) |
Based on [Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) finetuned on CoRal v2 Danish speech data.
"""

_ui_fonts = [gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
theme = gr.themes.Soft(font=_ui_fonts)

with gr.Blocks(theme=theme, title="Milo-ASR") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column(scale=1):
            audio_in = gr.Audio(
                label="Audio", sources=["upload", "microphone"], type="numpy"
            )
            ts_checkbox = gr.Checkbox(
                label="Word-level timestamps",
                value=False,
                info="Uses Qwen3-ForcedAligner for word alignment",
            )
            btn = gr.Button("Transcribe", variant="primary", size="lg")

        # Right column: read-only outputs.
        with gr.Column(scale=1):
            out_text = gr.Textbox(
                label="Transcription",
                lines=6,
                show_copy_button=True,
                interactive=False,
            )
            out_info = gr.Textbox(label="Info", lines=1, interactive=False)
            out_ts = gr.HTML()

    # The three values returned by ``transcribe`` map onto the three outputs.
    btn.click(
        fn=transcribe,
        inputs=[audio_in, ts_checkbox],
        outputs=[out_text, out_info, out_ts],
    )

    gr.Markdown(_FOOTER_MARKDOWN)


if __name__ == "__main__":
    demo.launch(ssr_mode=False)