Spaces:
Running
Running
| """ | |
| Dutch ASR Leaderboard — Gradio Space | |
| """ | |
| import json | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
# Directory that is scanned for one JSON result file per evaluated model.
RESULTS_DIR = Path("results")

# Dataset key (as used inside the result files) -> name shown in the UI.
DATASET_LABELS = {
    "fleurs_nl": "FLEURS",
    "voxpopuli_nl": "VoxPopuli",
    "mls_nl": "Multilingual LibriSpeech",
}

# Column order of the datasets; follows the insertion order of DATASET_LABELS.
DATASET_KEYS = list(DATASET_LABELS)

# Models to hide when "Show Proprietary" is OFF
PROPRIETARY_MODELS = [
    "ElevenLabs/scribe_v2",
    "voxtral-mini-2602",
    "Resonate-1 2026-05-22",
    "Murmel 2026-05-04",
]
def load_results() -> list[dict]:
    """Load every result record stored as a ``*.json`` file in ``RESULTS_DIR``.

    Files are visited in sorted path order. A file that cannot be read, is not
    valid UTF-8 / JSON, or whose top-level value is not a JSON object is
    skipped with a printed warning instead of raising.

    Returns:
        The decoded JSON objects, one ``dict`` per successfully loaded file.
    """
    records = []
    for path in sorted(RESULTS_DIR.glob("*.json")):
        try:
            # Read as UTF-8 explicitly rather than relying on the locale default.
            with open(path, encoding="utf-8") as f:
                record = json.load(f)
        except (OSError, ValueError, RecursionError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # RecursionError covers pathologically nested documents.
            print(f"Warning: could not load {path}: {e}")
            continue
        if not isinstance(record, dict):
            # build_dataframe() calls .get() on every record, so anything
            # other than a JSON object would crash the table build later on.
            print(
                f"Warning: could not load {path}: "
                f"expected a JSON object, got {type(record).__name__}"
            )
            continue
        records.append(record)
    return records
def build_dataframe(
    records: list[dict],
    sort_by: str = "average",
    show_proprietary: bool = True,
    datasets_visible: dict | None = None,
) -> pd.DataFrame:
    """Turn raw result records into the display table of the leaderboard.

    Args:
        records: Result dicts. Keys read here are ``model_name`` (falling back
            to ``model_id``), ``license``, ``params_billions`` and ``results``,
            a mapping from dataset key to a dict with ``wer``, ``rtf``,
            ``n_failed`` and ``failure_rate_pct``. Missing keys and JSON
            ``null`` values are both treated as "not available".
        sort_by: ``"average"`` to rank by average WER, or a key of
            ``DATASET_LABELS`` to rank by that dataset's WER. Any other value,
            or a dataset that is currently hidden, leaves the input order
            unchanged.
        show_proprietary: When False, drop every model whose name contains an
            entry of ``PROPRIETARY_MODELS``.
        datasets_visible: Optional ``{dataset_key: bool}`` map. Datasets mapped
            to False are left out of the table and of the average; keys that
            are absent count as visible.

    Returns:
        A DataFrame of display strings with the columns "Model", "License",
        "Parameters (B)" and "Average WER", followed by "<label> WER" and
        "<label> RTF" for each visible dataset. Rows are ordered by ascending
        score with unavailable scores last; ties keep their input order.
        An empty DataFrame is returned when no record survives filtering.
    """
    missing = float("nan")
    rows = []
    # Unrounded score each row is ranked by (NaN when unavailable). Ranking on
    # the raw numbers avoids re-parsing the rounded display strings.
    sort_keys: list[float] = []

    for r in records:
        # `or` instead of a .get() default so that an explicit JSON null falls
        # through to the next candidate as well.
        model_name = str(r.get("model_name") or r.get("model_id") or "?")

        # Skip proprietary models if toggle is off
        if not show_proprietary and any(p in model_name for p in PROPRIETARY_MODELS):
            continue

        results = r.get("results") or {}
        params = r.get("params_billions")
        row = {
            "Model": model_name,
            "License": r.get("license", "?"),
            "Parameters (B)": f"{params}" if params is not None else "—",
        }

        wer_values = {}
        for ds_key in DATASET_KEYS:
            if datasets_visible and not datasets_visible.get(ds_key, True):
                continue
            label = DATASET_LABELS[ds_key]
            ds_result = results.get(ds_key) or {}
            wer = ds_result.get("wer")
            rtf = ds_result.get("rtf")
            n_failed = ds_result.get("n_failed") or 0
            failure_rate = ds_result.get("failure_rate_pct") or 0.0

            wer_str = f"{wer:.1f}" if wer is not None else "—"
            if n_failed > 0:
                # Flag scores that were computed while some samples failed.
                wer_str += f" ⚠{failure_rate:.0f}%"
            row[f"{label} WER"] = wer_str
            row[f"{label} RTF"] = f"{rtf:.3f}" if rtf is not None else "—"
            if wer is not None:
                wer_values[ds_key] = wer

        # Average over the visible datasets that actually have a WER.
        average = sum(wer_values.values()) / len(wer_values) if wer_values else None
        row["Average WER"] = f"{average:.1f}" if average is not None else "—"
        rows.append(row)

        if sort_by == "average":
            sort_keys.append(average if average is not None else missing)
        else:
            # Unknown or hidden datasets yield NaN for every row, which the
            # stable sort below turns into "keep the input order".
            sort_keys.append(wer_values.get(sort_by, missing))

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    front = [c for c in ["Model", "License", "Parameters (B)", "Average WER"] if c in df.columns]
    rest = [c for c in df.columns if c not in front]
    df = df[front + rest]

    # Rank through a separate Series so no helper column is ever written into
    # the display frame.
    order = (
        pd.Series(sort_keys, index=df.index, dtype=float)
        .sort_values(na_position="last", kind="stable")
        .index
    )
    return df.loc[order].reset_index(drop=True)
def refresh(sort_by: str, show_proprietary: bool, selected_datasets: list[str]):
    """Reload the result files and rebuild the table for the given control values.

    Args:
        sort_by: Sort key forwarded to ``build_dataframe``.
        show_proprietary: Proprietary-model toggle forwarded to ``build_dataframe``.
        selected_datasets: Dataset labels (values of ``DATASET_LABELS``) that
            should be shown.

    Returns:
        Whatever ``build_dataframe`` returns for the freshly loaded records.
    """
    loaded = load_results()

    # Translate the selected labels back into a per-key visibility map.
    visibility = {}
    for dataset_key in DATASET_KEYS:
        visibility[dataset_key] = DATASET_LABELS[dataset_key] in selected_datasets

    return build_dataframe(
        loaded,
        sort_by=sort_by,
        show_proprietary=show_proprietary,
        datasets_visible=visibility,
    )
# Markdown rendered on the "About" tab. Every heading, paragraph and table is
# separated by a blank line so each one renders as its own Markdown block
# (adjacent text lines would otherwise be merged into a single paragraph).
ABOUT_TEXT = """
## Repository

Evaluation code and submission instructions are on GitHub: [tvosch/Dutch-ASR-leaderboard](https://github.com/tvosch/Dutch-ASR-leaderboard)

## Evaluation Setup

Local models are benchmarked on **H100 HBM2e GPUs** for consistent performance measurements.

## Metrics

**Word Error Rate (WER)** measures the percentage of words transcribed incorrectly compared to a reference transcript. It is calculated as `(substitutions + deletions + insertions) / total reference words × 100`. Lower is better. Results are normalized before scoring: lowercase, no punctuation, digits expanded to words, fillers removed.

**Real-Time Factor (RTF)** measures how fast a model transcribes relative to the audio duration. An RTF of 0.1 means 1 second of audio is processed in 100 ms. Lower is faster. Measured here at batch size 1; RTF measured via HTTP API includes network overhead.

## Datasets

| Dataset | Speech Type |
|---------|-------------|
| **FLEURS** | Read, studio quality |
| **VoxPopuli** | Formal, Parliament speech |
| **MLS** | Audiobook style |
"""
# UI definition: a header, a "Leaderboard" tab (controls + results table) and
# an "About" tab. Every control re-runs refresh() so the table always reflects
# the current sort key, dataset selection and proprietary-model toggle.
with gr.Blocks(title="Dutch ASR Leaderboard", theme=gr.themes.Default()) as demo:
    gr.Markdown(
        "# Dutch ASR Leaderboard\n"
        # Two trailing spaces before the newline form a Markdown hard line
        # break; a single space would let the next sentence run on.
        "**An independent, community-driven benchmark for Dutch automatic speech recognition.**  \n"
        "Models are evaluated on standardized public test sets. Lower WER is better. "
        "Rankings serve as a proxy for comparison, performance on your data may differ.\n\n"
        "> **Note:** Some models may be benchmaxxed: trained or fine-tuned on data that overlaps "
        "with these test sets. Treat results as indicative, not definitive. "
        "[Submit your model on GitHub →](https://github.com/tvosch/Dutch-ASR-leaderboard)"
    )

    with gr.Tabs():
        with gr.Tab("🏆 Leaderboard"):
            with gr.Row():
                sort_by = gr.Dropdown(
                    choices=[
                        ("Average WER", "average"),
                        ("FLEURS", "fleurs_nl"),
                        ("VoxPopuli", "voxpopuli_nl"),
                        ("MLS", "mls_nl"),
                    ],
                    value="average",
                    label="Sort by",
                    scale=1,
                    min_width=160,
                )
                dataset_selector = gr.CheckboxGroup(
                    choices=list(DATASET_LABELS.values()),
                    value=list(DATASET_LABELS.values()),
                    label="Datasets",
                    scale=2,
                )
                show_proprietary = gr.Checkbox(
                    value=True,
                    label="Show proprietary models",
                    scale=1,
                )

            leaderboard_table = gr.Dataframe(
                label="Results",
                interactive=False,
                wrap=True,
                min_width=800,
            )

            # Initial load and event handlers. The order of _inputs matches
            # the parameter order of refresh().
            _inputs = [sort_by, show_proprietary, dataset_selector]
            demo.load(fn=refresh, inputs=_inputs, outputs=[leaderboard_table])
            for _control in _inputs:
                _control.change(fn=refresh, inputs=_inputs, outputs=[leaderboard_table])

        with gr.Tab("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)

if __name__ == "__main__":
    demo.launch()