tvosch's picture
RTX with batch size of 1
dcdbf5e
"""
Dutch ASR Leaderboard — Gradio Space
"""
import json
from pathlib import Path
import gradio as gr
import pandas as pd
# Directory scanned for per-model result files.
RESULTS_DIR = Path("results")

# Display label for each dataset key; insertion order is the column order.
DATASET_LABELS = {
    "fleurs_nl": "FLEURS",
    "voxpopuli_nl": "VoxPopuli",
    "mls_nl": "Multilingual LibriSpeech",
}
# Dataset keys in display order (same order as the label mapping above).
DATASET_KEYS = list(DATASET_LABELS)

# Models to hide when "Show Proprietary" is OFF
PROPRIETARY_MODELS = [
    "ElevenLabs/scribe_v2",
    "voxtral-mini-2602",
    "Resonate-1 2026-05-22",
    "Murmel 2026-05-04",
]
def load_results() -> list[dict]:
    """Load every ``*.json`` result record found directly in ``RESULTS_DIR``.

    Files are read in sorted path order. Loading is best-effort: a file that
    cannot be read, is not valid UTF-8/JSON, or does not contain a JSON object
    is reported with a warning on stdout and skipped.

    Returns:
        The parsed records (one ``dict`` per accepted file), in path order.
    """
    records: list[dict] = []
    for path in sorted(RESULTS_DIR.glob("*.json")):
        try:
            # Explicit UTF-8 so decoding does not depend on the host locale.
            with open(path, encoding="utf-8") as f:
                record = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"Warning: could not load {path}: {e}")
            continue
        if not isinstance(record, dict):
            # Downstream code calls ``.get`` on each record, so anything that
            # is not a JSON object would crash table building.
            print(
                f"Warning: could not load {path}: "
                f"expected a JSON object, got {type(record).__name__}"
            )
            continue
        records.append(record)
    return records
def build_dataframe(
    records: list[dict],
    sort_by: str = "average",
    show_proprietary: bool = True,
    datasets_visible: dict | None = None,
) -> pd.DataFrame:
    """Build the leaderboard table from raw result records.

    Args:
        records: Result records; each is read with ``.get`` for the keys
            ``model_name``/``model_id``, ``license``, ``params_billions`` and
            ``results`` (a mapping from dataset key to a dict with ``wer``,
            ``rtf``, ``n_failed`` and ``failure_rate_pct``).
        sort_by: ``"average"`` to rank by average WER, or a dataset key from
            ``DATASET_LABELS`` to rank by that dataset's WER. Any other value
            leaves the rows in input order.
        show_proprietary: When false, records whose model name contains any
            entry of ``PROPRIETARY_MODELS`` are dropped.
        datasets_visible: Optional mapping of dataset key to visibility flag.
            Keys that are absent count as visible; ``None`` or an empty
            mapping shows every dataset.

    Returns:
        A DataFrame of display strings with columns ``Model``, ``License``,
        ``Parameters (B)``, ``Average WER`` followed by ``<label> WER`` and
        ``<label> RTF`` for each visible dataset. Rows are sorted ascending by
        the chosen metric, with rows lacking that metric last. An empty,
        column-less DataFrame is returned when no record remains.
    """
    # Each entry pairs the numeric sort key (None when unavailable) with the
    # display row. Ranking uses the raw numbers rather than values re-parsed
    # from the formatted cells: re-parsing loses precision to rounding and
    # would pick up the failure percentage from a cell such as "— ⚠5%".
    ranked_rows: list[tuple[float | None, dict]] = []

    for r in records:
        model_name = r.get("model_name", r.get("model_id", "?"))

        # Skip proprietary models if toggle is off. ``str`` keeps the
        # substring test safe when the name is not a string (e.g. JSON null).
        if not show_proprietary and any(
            p in str(model_name) for p in PROPRIETARY_MODELS
        ):
            continue

        # ``or {}`` also covers an explicit JSON null, not just a missing key.
        results = r.get("results") or {}
        params = r.get("params_billions")

        dataset_cells: dict[str, str] = {}
        wer_values: dict[str, float] = {}
        for ds_key in DATASET_KEYS:
            if datasets_visible and not datasets_visible.get(ds_key, True):
                continue

            label = DATASET_LABELS[ds_key]
            ds_result = results.get(ds_key) or {}
            wer = ds_result.get("wer")
            rtf = ds_result.get("rtf")
            # Treat null counters like missing ones instead of raising.
            n_failed = ds_result.get("n_failed") or 0
            failure_rate = ds_result.get("failure_rate_pct") or 0.0

            wer_str = f"{wer:.1f}" if wer is not None else "—"
            if n_failed > 0:
                wer_str += f" ⚠{failure_rate:.0f}%"

            dataset_cells[f"{label} WER"] = wer_str
            dataset_cells[f"{label} RTF"] = f"{rtf:.3f}" if rtf is not None else "—"

            if wer is not None:
                wer_values[ds_key] = wer

        # Average WER over the visible datasets that actually have a score.
        average = sum(wer_values.values()) / len(wer_values) if wer_values else None

        row = {
            "Model": model_name,
            "License": r.get("license", "?"),
            "Parameters (B)": f"{params}" if params is not None else "—",
            "Average WER": f"{average:.1f}" if average is not None else "—",
            **dataset_cells,
        }

        # Hidden or unknown datasets yield None, which keeps input order below.
        sort_key = average if sort_by == "average" else wer_values.get(sort_by)
        ranked_rows.append((sort_key, row))

    if not ranked_rows:
        return pd.DataFrame()

    # Sort by selected metric. ``list.sort`` is stable, so ties and rows
    # without a score keep their input order; rows without a score go last.
    if sort_by == "average" or sort_by in DATASET_LABELS:
        ranked_rows.sort(
            key=lambda pair: (
                pair[0] is None,
                pair[0] if pair[0] is not None else 0.0,
            )
        )

    return pd.DataFrame([row for _, row in ranked_rows])
def refresh(sort_by: str, show_proprietary: bool, selected_datasets: list[str]):
    """Reload the result files and rebuild the leaderboard table.

    ``selected_datasets`` holds dataset display labels; it is translated into
    a per-dataset-key visibility mapping before the table is built.
    """
    loaded = load_results()
    visibility = {}
    for ds_key in DATASET_KEYS:
        visibility[ds_key] = DATASET_LABELS[ds_key] in selected_datasets
    return build_dataframe(loaded, sort_by, show_proprietary, visibility)
# Markdown source for the "About" tab: repository link, evaluation setup,
# metric definitions (WER, RTF) and a short description of each dataset.
ABOUT_TEXT = """
## Repository
Evaluation code and submission instructions are on GitHub: [tvosch/Dutch-ASR-leaderboard](https://github.com/tvosch/Dutch-ASR-leaderboard)
## Evaluation Setup
Local models are benchmarked on **H100 HBM2e GPUs** for consistent performance measurements.
## Metrics
**Word Error Rate (WER)** measures the percentage of words transcribed incorrectly compared to a reference transcript. It is calculated as `(substitutions + deletions + insertions) / total reference words × 100`. Lower is better. Results are normalized before scoring: lowercase, no punctuation, digits expanded to words, fillers removed.
**Real-Time Factor (RTF)** measures how fast a model transcribes relative to the audio duration. An RTF of 0.1 means 1 second of audio is processed in 100 ms. Lower is faster. Measured here at batch size 1; RTF measured via HTTP API includes network overhead.
## Datasets
| Dataset | Speech Type |
|---------|-------------|
| **FLEURS** | Read, studio quality |
| **VoxPopuli** | Formal, Parliament speech |
| **MLS** | Audiobook style |
"""
# UI layout: a header, then a "Leaderboard" tab (controls + results table)
# and an "About" tab with the static description.
with gr.Blocks(title="Dutch ASR Leaderboard", theme=gr.themes.Default()) as demo:
    gr.Markdown(
        "# Dutch ASR Leaderboard\n"
        "**An independent, community-driven benchmark for Dutch automatic speech recognition.** \n"
        "Models are evaluated on standardized public test sets. Lower WER is better. "
        "Rankings serve as a proxy for comparison, performance on your data may differ.\n\n"
        "> **Note:** Some models may be benchmaxxed: trained or fine-tuned on data that overlaps "
        "with these test sets. Treat results as indicative, not definitive. "
        "[Submit your model on GitHub →](https://github.com/tvosch/Dutch-ASR-leaderboard)"
    )

    with gr.Tabs():
        with gr.Tab("🏆 Leaderboard"):
            # Control strip: sort metric, dataset filter, proprietary toggle.
            with gr.Row():
                sort_by = gr.Dropdown(
                    choices=[
                        ("Average WER", "average"),
                        ("FLEURS", "fleurs_nl"),
                        ("VoxPopuli", "voxpopuli_nl"),
                        ("MLS", "mls_nl"),
                    ],
                    value="average",
                    label="Sort by",
                    scale=1,
                    min_width=160,
                )
                dataset_selector = gr.CheckboxGroup(
                    choices=list(DATASET_LABELS.values()),
                    value=list(DATASET_LABELS.values()),
                    label="Datasets",
                    scale=2,
                )
                show_proprietary = gr.Checkbox(
                    value=True,
                    label="Show proprietary models",
                    scale=1,
                )

            leaderboard_table = gr.Dataframe(
                label="Results",
                interactive=False,
                wrap=True,
                min_width=800,
            )

            # Initial load and event handlers: every trigger calls `refresh`
            # with the same inputs and writes to the same table.
            _inputs = [sort_by, show_proprietary, dataset_selector]
            _wiring = dict(fn=refresh, inputs=_inputs, outputs=[leaderboard_table])
            demo.load(**_wiring)
            for _control in (sort_by, show_proprietary, dataset_selector):
                _control.change(**_wiring)

        with gr.Tab("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()