"""
Dutch ASR Leaderboard — Gradio Space

Reads one JSON result file per model from ``results/`` and renders them as a
sortable, filterable leaderboard table plus an "About" tab.
"""

import json
import math
from pathlib import Path

import gradio as gr
import pandas as pd

# Directory (relative to the working directory) holding one JSON file per model.
RESULTS_DIR = Path("results")

# Keys used inside each record's "results" mapping, in display order.
DATASET_KEYS = ["fleurs_nl", "voxpopuli_nl", "mls_nl"]

# Human-readable label for every dataset key; also used as column prefix.
DATASET_LABELS = {
    "fleurs_nl": "FLEURS",
    "voxpopuli_nl": "VoxPopuli",
    "mls_nl": "Multilingual LibriSpeech",
}

# Models to hide when "Show Proprietary" is OFF
PROPRIETARY_MODELS = [
    "ElevenLabs/scribe_v2",
    "voxtral-mini-2602",
    "Resonate-1 2026-05-22",
    "Murmel 2026-05-04",
]


def load_results() -> list[dict]:
    """Load every ``*.json`` file in ``RESULTS_DIR`` as one result record.

    Loading is best-effort: a file that cannot be read, is not valid JSON, or
    does not contain a JSON object is reported with a warning on stdout and
    skipped, so a single bad submission never takes the leaderboard down.

    Returns:
        The parsed records, ordered by file path.
    """
    records: list[dict] = []
    for path in sorted(RESULTS_DIR.glob("*.json")):
        try:
            # Explicit encoding: the platform default is locale-dependent.
            with open(path, encoding="utf-8") as f:
                record = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Warning: could not load {path}: {e}")
            continue
        if not isinstance(record, dict):
            # build_dataframe() calls .get() on every record.
            print(
                f"Warning: could not load {path}: "
                f"expected a JSON object, got {type(record).__name__}"
            )
            continue
        records.append(record)
    return records


def _as_float(value) -> float | None:
    """Return *value* as a finite float, or ``None`` if missing or unusable."""
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if math.isfinite(number) else None


def _as_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def build_dataframe(
    records: list[dict],
    sort_by: str = "average",
    show_proprietary: bool = True,
    datasets_visible: dict | None = None,
) -> pd.DataFrame:
    """Turn raw result records into the table shown on the leaderboard.

    Args:
        records: Result records as returned by ``load_results()``.
        sort_by: ``"average"`` to sort by average WER, or a key of
            ``DATASET_LABELS`` to sort by that dataset's WER. Sorting is
            ascending with missing values last. Any other value, or a dataset
            that is currently hidden, leaves the rows in input order.
        show_proprietary: When false, rows whose model name contains any entry
            of ``PROPRIETARY_MODELS`` are dropped.
        datasets_visible: Optional mapping of dataset key to visibility.
            Keys that are absent count as visible; ``None`` or an empty
            mapping shows every dataset.

    Returns:
        A DataFrame of display strings with the columns ``Model``,
        ``License``, ``Parameters (B)``, ``Average WER`` followed by a
        ``<label> WER`` / ``<label> RTF`` pair per visible dataset. The
        average only covers visible datasets that have a WER. An entirely
        empty DataFrame is returned when no row remains.
    """
    visible_keys = [
        key
        for key in DATASET_KEYS
        if not datasets_visible or datasets_visible.get(key, True)
    ]
    should_sort = sort_by == "average" or sort_by in visible_keys

    rows: list[dict] = []
    # Raw numeric sort key per row (NaN = missing). Sorting on these instead
    # of re-parsing the formatted cells keeps full precision.
    sort_keys: list[float] = []

    for r in records:
        model_name = str(r.get("model_name") or r.get("model_id") or "?")

        # Skip proprietary models if toggle is off
        if not show_proprietary and any(p in model_name for p in PROPRIETARY_MODELS):
            continue

        results = _as_dict(r.get("results"))
        params = r.get("params_billions")

        dataset_cells: dict[str, str] = {}
        wer_values: dict[str, float] = {}
        for ds_key in visible_keys:
            label = DATASET_LABELS[ds_key]
            ds_result = _as_dict(results.get(ds_key))

            wer = _as_float(ds_result.get("wer"))
            rtf = _as_float(ds_result.get("rtf"))
            # "or 0" also absorbs an explicit JSON null.
            n_failed = _as_float(ds_result.get("n_failed")) or 0
            failure_rate = _as_float(ds_result.get("failure_rate_pct")) or 0.0

            wer_str = f"{wer:.1f}" if wer is not None else "—"
            if n_failed > 0:
                # Flag scores computed while some utterances failed.
                wer_str += f" ⚠{failure_rate:.0f}%"

            dataset_cells[f"{label} WER"] = wer_str
            dataset_cells[f"{label} RTF"] = f"{rtf:.3f}" if rtf is not None else "—"

            if wer is not None:
                wer_values[ds_key] = wer

        # Calculate average WER
        average = sum(wer_values.values()) / len(wer_values) if wer_values else None

        # Insertion order here is the final column order of the table.
        rows.append(
            {
                "Model": model_name,
                "License": r.get("license") or "?",
                "Parameters (B)": f"{params}" if params is not None else "—",
                "Average WER": f"{average:.1f}" if average is not None else "—",
                **dataset_cells,
            }
        )

        key_value = average if sort_by == "average" else wer_values.get(sort_by)
        sort_keys.append(float("nan") if key_value is None else key_value)

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)

    # Sort by selected metric
    if should_sort:
        df["_sort"] = sort_keys
        # Stable sort keeps input (file) order between equal scores.
        df = df.sort_values("_sort", na_position="last", kind="stable")
        df = df.drop(columns=["_sort"])

    return df.reset_index(drop=True)


def refresh(
    sort_by: str, show_proprietary: bool, selected_datasets: list[str]
) -> pd.DataFrame:
    """Reload the result files and rebuild the table for the current controls.

    Args:
        sort_by: Value of the "Sort by" dropdown.
        show_proprietary: State of the "Show proprietary models" checkbox.
        selected_datasets: Dataset labels (values of ``DATASET_LABELS``) that
            are ticked; ``None`` is treated as nothing selected.

    Returns:
        The DataFrame produced by ``build_dataframe()``.
    """
    selected = set(selected_datasets or ())
    datasets_visible = {key: DATASET_LABELS[key] in selected for key in DATASET_KEYS}
    return build_dataframe(
        load_results(), sort_by, show_proprietary, datasets_visible
    )


# Markdown shown on the "About" tab.
ABOUT_TEXT = """
## Repository

Evaluation code and submission instructions are on GitHub: [tvosch/Dutch-ASR-leaderboard](https://github.com/tvosch/Dutch-ASR-leaderboard)

## Evaluation Setup

Local models are benchmarked on **H100 HBM2e GPUs** for consistent performance measurements.

## Metrics

**Word Error Rate (WER)** measures the percentage of words transcribed incorrectly compared to a reference transcript. It is calculated as `(substitutions + deletions + insertions) / total reference words × 100`. Lower is better. Results are normalized before scoring: lowercase, no punctuation, digits expanded to words, fillers removed.

**Real-Time Factor (RTF)** measures how fast a model transcribes relative to the audio duration. An RTF of 0.1 means 1 second of audio is processed in 100 ms. Lower is faster. Measured here at batch size 1; RTF measured via HTTP API includes network overhead.

## Datasets

| Dataset | Speech Type |
|---------|-------------|
| **FLEURS** | Read, studio quality |
| **VoxPopuli** | Formal, Parliament speech |
| **MLS** | Audiobook style |
"""


with gr.Blocks(title="Dutch ASR Leaderboard", theme=gr.themes.Default()) as demo:
    gr.Markdown(
        "# Dutch ASR Leaderboard\n"
        "**An independent, community-driven benchmark for Dutch automatic speech recognition.** \n"
        "Models are evaluated on standardized public test sets. Lower WER is better. "
        "Rankings serve as a proxy for comparison, performance on your data may differ.\n\n"
        "> **Note:** Some models may be benchmaxxed: trained or fine-tuned on data that overlaps "
        "with these test sets. Treat results as indicative, not definitive. "
        "[Submit your model on GitHub →](https://github.com/tvosch/Dutch-ASR-leaderboard)"
    )

    with gr.Tabs():
        with gr.Tab("🏆 Leaderboard"):
            with gr.Row():
                sort_by = gr.Dropdown(
                    choices=[
                        ("Average WER", "average"),
                        ("FLEURS", "fleurs_nl"),
                        ("VoxPopuli", "voxpopuli_nl"),
                        ("MLS", "mls_nl"),
                    ],
                    value="average",
                    label="Sort by",
                    scale=1,
                    min_width=160,
                )
                dataset_selector = gr.CheckboxGroup(
                    choices=list(DATASET_LABELS.values()),
                    value=list(DATASET_LABELS.values()),
                    label="Datasets",
                    scale=2,
                )
                show_proprietary = gr.Checkbox(
                    value=True,
                    label="Show proprietary models",
                    scale=1,
                )

            leaderboard_table = gr.Dataframe(
                label="Results",
                interactive=False,
                wrap=True,
                min_width=800,
            )

            # Initial load and event handlers: every control re-runs refresh()
            # with the full set of inputs and replaces the table.
            _inputs = [sort_by, show_proprietary, dataset_selector]
            demo.load(fn=refresh, inputs=_inputs, outputs=[leaderboard_table])
            for _control in (sort_by, show_proprietary, dataset_selector):
                _control.change(fn=refresh, inputs=_inputs, outputs=[leaderboard_table])

        with gr.Tab("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)


if __name__ == "__main__":
    demo.launch()