Spaces:

pluttodk
/

Milo-ASR-Demo

Runtime error

File size: 6,245 Bytes

"""Milo-ASR: Danish Speech Recognition - Hugging Face Space."""

import base64
import html
import io
import os
import tempfile
import time

import gradio as gr
import numpy as np
from scipy.io.wavfile import write as wav_write

# Identifier of the ASR checkpoint handed to Qwen3ASRModel.from_pretrained().
MODEL_ID = "pluttodk/Milo-ASR"
# Identifier passed as `forced_aligner` when word-level timestamps are requested.
ALIGNER_ID = "Qwen/Qwen3-ForcedAligner-0.6B"

# Lazily populated by _load_model(): the plain model and the aligner-equipped
# model are cached separately so each is built at most once per process.
_model = None
_model_ts = None


def _load_model(with_timestamps: bool):
    """Return a cached ``Qwen3ASRModel``, building it on first use.

    Two independent module-level caches are kept: ``_model`` for plain
    transcription and ``_model_ts`` for the variant constructed with a forced
    aligner. Both are loaded in float32 on the CPU.

    Args:
        with_timestamps: When true, return the aligner-equipped model;
            otherwise return the plain model.

    Returns:
        The cached model instance for the requested variant.
    """
    global _model, _model_ts

    # Imported on every call rather than at module import time.
    from qwen_asr import Qwen3ASRModel

    if not with_timestamps:
        if _model is None:
            _model = Qwen3ASRModel.from_pretrained(
                MODEL_ID,
                dtype="float32",
                device_map="cpu",
            )
        return _model

    if _model_ts is None:
        aligner_options = {"dtype": "float32", "device_map": "cpu"}
        _model_ts = Qwen3ASRModel.from_pretrained(
            MODEL_ID,
            dtype="float32",
            device_map="cpu",
            forced_aligner=ALIGNER_ID,
            forced_aligner_kwargs=aligner_options,
        )
    return _model_ts


def _normalize_audio(wav):
    """Return *wav* as float32 samples limited to the range [-1, 1].

    Input with more than one dimension is averaged over its last axis. If the
    absolute peak exceeds 1 (beyond a 1e-6 tolerance) the signal is divided by
    that peak; the result is always clipped to [-1, 1].

    Args:
        wav: Array-like of audio samples.

    Returns:
        A float32 ``numpy.ndarray``.
    """
    samples = np.asarray(wav, dtype=np.float32)

    # Collapse the trailing axis by averaging.
    if samples.ndim > 1:
        samples = samples.mean(axis=-1)

    # An empty array has no peak; treat it as silence.
    peak = np.abs(samples).max() if samples.size else 0.0

    # Rescale only when the data is louder than full scale.
    if peak > 1.0 + 1e-6:
        samples = samples / peak

    return np.clip(samples, -1.0, 1.0)


def _make_timestamp_html(sr, audio, timestamps):
    """Render word-level timestamps as HTML cards with inline audio players.

    For every timestamp entry the matching slice of *audio* is encoded as a
    16-bit WAV, base64-embedded in an ``<audio>`` element and shown together
    with the word and its start/end times.

    Args:
        sr: Sample rate used to convert times to sample indices and written
            into each WAV clip.
        audio: 1-D array of samples; values are clipped to [-1, 1] before
            being scaled to int16.
        timestamps: Sequence of mappings with the keys ``"text"``,
            ``"start_time"`` and ``"end_time"``.

    Returns:
        The HTML fragment, or ``""`` when *timestamps* is empty. Entries whose
        end does not lie after their start, or whose sample range is empty
        after clamping to the audio length, are skipped.
    """
    if not timestamps:
        return ""

    # Collect fragments and join once instead of growing a string with "+=".
    parts = [
        """
    <style>
        .ts-container { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px; }
        .ts-box {
            border: 1px solid #ddd; border-radius: 8px; padding: 8px 12px;
            background: #f9f9f9; box-shadow: 0 1px 3px rgba(0,0,0,0.06);
            text-align: center;
        }
        .ts-word { font-size: 16px; font-weight: 700; margin-bottom: 4px; }
        .ts-time { font-size: 11px; color: #666; margin-bottom: 6px; }
        .ts-audio audio { width: 120px; height: 28px; }
    </style>
    <details open>
        <summary style="font-weight: 700; cursor: pointer; margin-bottom: 8px;">
            Word-level Timestamps (click to play each segment)
        </summary>
        <div class="ts-container">
    """
    ]

    for item in timestamps:
        # The word comes from model output; escape it so characters such as
        # "<" or "&" are displayed literally instead of being parsed as markup.
        word = html.escape(str(item["text"]))
        start = float(item["start_time"])
        end = float(item["end_time"])
        if end <= start:
            continue

        # Clamp the sample range to the bounds of the audio array.
        s_idx = max(0, int(start * sr))
        e_idx = min(len(audio), int(end * sr))
        if e_idx <= s_idx:
            continue

        seg = (np.clip(audio[s_idx:e_idx], -1.0, 1.0) * 32767).astype(np.int16)
        buf = io.BytesIO()
        wav_write(buf, sr, seg)
        b64 = base64.b64encode(buf.getvalue()).decode()

        parts.append(
            f"""
        <div class="ts-box">
            <div class="ts-word">{word}</div>
            <div class="ts-time">{start:.2f}s - {end:.2f}s</div>
            <div class="ts-audio">
                <audio controls preload="none" src="data:audio/wav;base64,{b64}"></audio>
            </div>
        </div>
        """
        )

    parts.append("</div></details>")
    return "".join(parts)


def transcribe(audio, use_timestamps):
    """Transcribe one audio clip and return the values for the three outputs.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` pair.
        use_timestamps: When true, the aligner-equipped model is used and
            word-level timestamp HTML is produced.

    Returns:
        A ``(text, info, timestamp_html)`` tuple of strings. ``info`` reports
        the inference time and, when it took more than one second, the model
        load time. ``timestamp_html`` is empty unless timestamps were
        requested and returned.
    """
    if audio is None:
        return "Please upload or record an audio file.", "", ""

    sr, raw = audio
    normalized = _normalize_audio(raw)
    if normalized.size == 0:
        # A zero-length clip has nothing to transcribe.
        return "Please upload or record an audio file.", "", ""

    int16_data = (normalized * 32767).astype(np.int16)

    # Reserve a temp path and close the handle before writing to it by name;
    # writing while the handle is still open fails on some platforms.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name

    try:
        wav_write(wav_path, sr, int16_data)

        t0 = time.perf_counter()
        model = _load_model(with_timestamps=use_timestamps)
        load_time = time.perf_counter() - t0

        t1 = time.perf_counter()
        results = model.transcribe(
            audio=wav_path,
            language="Danish",
            return_time_stamps=use_timestamps,
        )
        inference_time = time.perf_counter() - t1
    finally:
        # delete=False means nothing else removes the file; without this every
        # request would leave a WAV behind. Cleanup is best-effort on purpose:
        # a failed removal must not mask the transcription result or error.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    info = f"Inference: {inference_time:.1f}s"
    if load_time > 1.0:
        info += f" (model load: {load_time:.1f}s)"

    if not results:
        # No result entry returned; report an empty transcription.
        return "", info, ""

    r = results[0]
    text = getattr(r, "text", "") or ""

    ts_html = ""
    stamps = getattr(r, "time_stamps", None)
    if use_timestamps and stamps:
        ts_data = [
            {
                "text": getattr(t, "text", ""),
                "start_time": getattr(t, "start_time", 0),
                "end_time": getattr(t, "end_time", 0),
            }
            for t in stamps.items
        ]
        ts_html = _make_timestamp_html(sr, normalized, ts_data)

    return text, info, ts_html


# Soft theme; fonts are tried in order: the "Source Sans Pro" Google font,
# then Arial, then the generic sans-serif family.
theme = gr.themes.Soft(
    font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
)

# Build the UI. `demo` is the app object launched at the bottom of the file.
# Components are created inside nested context managers, so statement order
# here determines the page layout.
with gr.Blocks(theme=theme, title="Milo-ASR") as demo:
    # Page header and usage notes.
    gr.Markdown(
        """
# Milo-ASR - Danish Speech Recognition

**Model:** [`pluttodk/Milo-ASR`](https://huggingface.co/pluttodk/Milo-ASR) (finetuned Qwen3-ASR-1.7B)

Upload an audio file or record with your microphone to transcribe Danish speech.
Running on CPU -- the first request will be slow while the model loads, and inference takes longer than on GPU.
"""
    )

    with gr.Row():
        # First column (inputs): audio source, timestamp toggle, submit button.
        with gr.Column(scale=1):
            # transcribe() unpacks this component's value as a
            # (sample_rate, samples) pair.
            audio_in = gr.Audio(
                label="Audio",
                sources=["upload", "microphone"],
                type="numpy",
            )
            ts_checkbox = gr.Checkbox(
                label="Word-level timestamps",
                value=False,
                info="Uses Qwen3-ForcedAligner for word alignment",
            )
            btn = gr.Button("Transcribe", variant="primary", size="lg")

        # Second column (outputs): transcription text and timing info.
        with gr.Column(scale=1):
            out_text = gr.Textbox(
                label="Transcription",
                lines=6,
                show_copy_button=True,
                interactive=False,
            )
            out_info = gr.Textbox(
                label="Info",
                lines=1,
                interactive=False,
            )

    # Created outside the Row; receives the third value returned by
    # transcribe() (the timestamp HTML, possibly empty).
    out_ts = gr.HTML()

    # Clicking the button calls transcribe(audio, use_timestamps) and routes
    # its (text, info, timestamp_html) tuple to the three output components.
    btn.click(
        fn=transcribe,
        inputs=[audio_in, ts_checkbox],
        outputs=[out_text, out_info, out_ts],
    )

    # Footer with links.
    gr.Markdown(
        """
---
**Links:** [Model Card](https://huggingface.co/pluttodk/Milo-ASR) |
Based on [Qwen3-ASR-1.7B](https://huggingface.co/Qwen/Qwen3-ASR-1.7B) finetuned on CoRal v2 Danish speech data.
"""
    )


# Launch the app only when this file is executed as a script, not on import.
# NOTE(review): ssr_mode=False presumably turns off Gradio's server-side
# rendering -- confirm against the Gradio launch() documentation.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)