Spaces:

treble-technologies
/

ffasr

Running on CPU Upgrade

App Files Files Community

ffasr / analytics.py

whojavumusic

changes and cleanup in text

af0dc0a 21 days ago

Raw

History Blame Contribute Delete

44.3 kB

	"""
	Interactive Plotly charts for the Analysis tab (legend toggles traces, zoom, pan, hover).

	Radar = compare models on normalized axes. Middle chart = WER across scenarios (one trace per model).
	Bar chart = grouped WER by scenario for top models.
	"""

	from __future__ import annotations

	import threading
	from typing import Sequence

	import numpy as np
	import pandas as pd
	import plotly.graph_objects as go
	from plotly.colors import qualitative

	from metrics_config import (
	HEATMAP_SCENARIO_KEYS,
	LIVE_SCENARIO_KEYS,
	SCENARIO_METRICS,
	heatmap_label_for_key,
	metric_by_key,
	)

	# Mean WER over realistic SNR splits (Pareto X-axis and ranking).
	SNR_WER_KEYS: tuple[str, ...] = (
	"wer_realistic_high_snr",
	"wer_realistic_mid_snr",
	"wer_realistic_low_snr",
	)


	def _wer_pct(v) -> float \| None:
	"""Fractional WER (0–1) → percent for chart display."""
	if v is None or (isinstance(v, float) and not np.isfinite(v)):
	return None
	try:
	f = float(v)
	except (TypeError, ValueError):
	return None
	if not np.isfinite(f):
	return None
	return f * 100.0


	def _pareto_efficient_frontier_wer_rtf(
	wers: Sequence[float], rtfs: Sequence[float]
	) -> tuple[list[float], list[float]]:
	"""Layer-1 envelope: X = WER (lower better), Y = RTF (higher better)."""
	pts = [
	(float(w), float(r))
	for w, r in zip(wers, rtfs)
	if np.isfinite(w) and np.isfinite(r)
	]
	if not pts:
	return [], []
	frontier: list[tuple[float, float]] = []
	for i, (wi, ri) in enumerate(pts):
	dominated = False
	for j, (wj, rj) in enumerate(pts):
	if i == j:
	continue
	if wj <= wi and rj >= ri and (wj < wi or rj > ri):
	dominated = True
	break
	if not dominated:
	frontier.append((wi, ri))
	frontier.sort(key=lambda p: p[0])
	return [p[0] for p in frontier], [p[1] for p in frontier]

	# Consistent height for Gradio Plot
	_FIG_HEIGHT = 460
	_TEMPLATE = "plotly_white"

	# Weight-file size lookup for the "Memory" radar axis. Populated lazily on first render;
	# falls back to NaN if the hub call fails (e.g. gated repo without token).
	_model_size_cache: dict[str, float] = {}
	_model_size_lock = threading.Lock()
	_WEIGHT_EXTS = (".safetensors", ".bin", ".pt", ".pth", ".gguf", ".ckpt", ".msgpack", ".h5")


	def _cached_model_size_mb(model_id: str) -> float:
	with _model_size_lock:
	if model_id in _model_size_cache:
	return _model_size_cache[model_id]
	total = 0
	try:
	from huggingface_hub import HfApi

	info = HfApi().model_info(model_id, files_metadata=True)
	for s in info.siblings or []:
	name = getattr(s, "rfilename", "") or ""
	size = getattr(s, "size", None) or 0
	if name.lower().endswith(_WEIGHT_EXTS):
	total += int(size)
	except Exception:
	total = 0
	mb = float(total) / (1024.0 * 1024.0) if total > 0 else float("nan")
	with _model_size_lock:
	_model_size_cache[model_id] = mb
	return mb


	def _coerce_wer(v) -> float \| None:
	"""Non-negative finite WER, or None if missing / invalid."""
	if v is None or v == "":
	return None
	try:
	x = float(v)
	return x if np.isfinite(x) and x >= 0 else None
	except (TypeError, ValueError):
	return None


	def _coerce_rtf(v) -> float \| None:
	"""Non-negative finite RTF (audio sec / inference sec), or None if missing."""
	if v is None or v == "":
	return None
	try:
	x = float(v)
	return x if np.isfinite(x) and x >= 0 else None
	except (TypeError, ValueError):
	return None


	def snr_avg_wer(row: dict) -> float \| None:
	"""Mean WER over high / mid / low SNR scenarios (skip missing)."""
	vals: list[float] = []
	for k in SNR_WER_KEYS:
	w = _coerce_wer(row.get(k))
	if w is not None:
	vals.append(w)
	if not vals:
	return None
	return float(sum(vals) / len(vals))


	def compute_pareto_layers(rows: list[dict]) -> dict[str, int \| None]:
	"""
	Pareto non-dominated layers over (snr_avg_wer ↓, eval_rtf ↑).

	Layer 1 = efficient frontier; peel and repeat. Missing WER or RTF → ``None``.
	"""
	pts: list[tuple[str, float, float]] = []
	out: dict[str, int \| None] = {}

	for r in rows:
	mid = (r.get("model_id") or "").strip()
	if not mid:
	continue
	w = snr_avg_wer(r)
	rtf = _coerce_rtf(r.get("eval_rtf"))
	if w is None or rtf is None:
	out[mid] = None
	else:
	pts.append((mid, w, rtf))

	unassigned = list(pts)
	layer = 1
	while unassigned:
	frontier: list[str] = []
	for i, (mid_i, w_i, r_i) in enumerate(unassigned):
	dominated = False
	for j, (_, w_j, r_j) in enumerate(unassigned):
	if i == j:
	continue
	if w_j <= w_i and r_j >= r_i and (w_j < w_i or r_j > r_i):
	dominated = True
	break
	if not dominated:
	frontier.append(mid_i)
	frontier_set = set(frontier)
	for mid in frontier:
	out[mid] = layer
	unassigned = [p for p in unassigned if p[0] not in frontier_set]
	layer += 1

	return out


	def _avg_wer_for_row(row: dict) -> float:
	"""Mean WER over live scenarios; ``inf`` when nothing populated."""
	vals: list[float] = []
	for k in LIVE_SCENARIO_KEYS:
	w = _coerce_wer(row.get(k))
	if w is not None:
	vals.append(w)
	if not vals:
	return float("inf")
	return sum(vals) / len(vals)


	def sort_leaderboard_rows_inplace(rows: list[dict]) -> None:
	"""Sort rows by Average WER ascending (lower WER first)."""
	rows.sort(key=lambda r: (_avg_wer_for_row(r), (r.get("model_id") or "")))


	def frontier_label(layer: int \| None) -> str:
	return str(layer) if layer is not None else "NA"


	def _log_normalize(values: np.ndarray, *, inverse: bool, strength_floor: float = 0.1) -> np.ndarray:
	"""
	Log10-normalize an array across all models to a "strength" score (default display floor 0.1).

	* `inverse=True` -> lower raw value is better (WER, memory). Strength = 1 - norm.
	* `inverse=False` -> higher raw value is better (RTF). Strength = norm.

	Returns NaN for entries whose raw value is missing or non-positive.
	"""
	v = np.asarray(values, dtype=float)
	out = np.full_like(v, np.nan, dtype=float)
	mask = np.isfinite(v) & (v > 0)
	if not mask.any():
	return out
	logv = np.log10(np.maximum(v[mask], 1e-12))
	lo = float(np.min(logv))
	hi = float(np.max(logv))
	if hi - lo < 1e-9:
	out[mask] = 0.5
	else:
	out[mask] = (logv - lo) / (hi - lo)
	if inverse:
	out[mask] = 1.0 - out[mask]
	fin = np.isfinite(out)
	out[fin] = np.clip(np.maximum(out[fin], strength_floor), strength_floor, 1.0)
	return out


	# Axis spec: (strength_label, primary col, fallback col, inverse, raw_display_name, raw_unit)
	# strength_label: chart axis label; always phrased so "farther = better".
	# inverse : True if lower raw value is better (e.g. WER, memory); False if higher is better (RTF).
	# raw_display_name: human-readable name of the underlying metric, shown in the hover tooltip.
	# raw_unit : optional unit suffix for the raw value in the tooltip.
	#
	# Live far-field scenario axes + Speed (RTF) + Compactness (parameters / Hub size).
	_RADAR_AXES: tuple[tuple[str, str, str \| None, bool, str, str], ...] = (
	("Near Field Speech", "wer_anechoic_speech", None, True, "Near field speech WER", ""),
	("Lab Measured", "wer_lab_measured", None, True, "Lab measured WER", ""),
	("Lab Simulated", "wer_lab_simulated", None, True, "Lab simulated WER", ""),
	("High SNR", "wer_realistic_high_snr", None, True, "High SNR WER", ""),
	("Mid SNR", "wer_realistic_mid_snr", None, True, "Mid SNR WER", ""),
	("Low SNR", "wer_realistic_low_snr", None, True, "Low SNR WER", ""),
	("Moving Sources", "wer_moving_sources", None, True, "Moving Sources WER", ""),
	("Speed", "eval_rtf", None, False, "RTFx", "× realtime"),
	# Compactness: smaller model -> more compact -> farther from centre.
	# `inverse=True` makes `_log_normalize` emit `1 - norm`, so the smallest
	# observed model maps to strength = 1.0 (radar edge) and the largest
	# maps to 0.0 (centre).
	#
	# Primary signal: parameter count (millions), which is recorded for every
	# backend by `attach_params` and is much more reliable than the Hub
	# weight-file size lookup (which can return NaN for gated/private repos
	# or repos that ship weights via LFS pointers without size metadata).
	# `model_size_mb` is the fallback for legacy rows that pre-date param
	# tracking.
	("Compactness", "num_params_m", "model_size_mb", True, "Parameters", " M"),
	)

	# Hover-label override when a radar axis falls back from `primary` to
	# `fallback`. Without this the tooltip would still claim the value is in the
	# primary metric (e.g. "Parameters: 1500.0000 M") when in fact the rendered
	# value came from the fallback column (`Model size: 1500.0000 MB`).
	_RADAR_FALLBACK_DISPLAY: dict[str, tuple[str, str]] = {
	"num_params_m": ("Model size", " MB"),
	}


	def _empty_fig(message: str) -> go.Figure:
	fig = go.Figure()
	fig.add_annotation(
	text=message,
	xref="paper",
	yref="paper",
	x=0.5,
	y=0.5,
	showarrow=False,
	font=dict(size=14),
	)
	fig.update_layout(
	template=_TEMPLATE,
	height=400,
	margin=dict(l=40, r=40, t=40, b=40),
	xaxis=dict(visible=False),
	yaxis=dict(visible=False),
	)
	return fig


	def _plotly_legend_config() -> dict:
	"""Legend click toggles traces; double-click isolates."""
	return dict(
	legend=dict(
	orientation="v",
	yanchor="top",
	y=1,
	xanchor="left",
	x=1.02,
	font=dict(size=10),
	),
	hovermode="closest",
	)


	def _raw_to_analytics_df(raw: list[dict]) -> pd.DataFrame:
	"""
	Parse leaderboard rows; coerce scenario WER + timing columns to float (NaN if missing).

	Computes:
	* ``avg_wer``: mean of live scenario WERs present for that row (reference).
	* ``model_size_mb``: total weight-file size fetched from the Hub (cached).
	"""
	from init import normalize_legacy_csv_row

	if not raw:
	return pd.DataFrame()
	rows = [normalize_legacy_csv_row(dict(r)) for r in raw]
	df = pd.DataFrame(rows)
	if "model_id" not in df.columns:
	return pd.DataFrame()
	for m in SCENARIO_METRICS:
	k = m.key
	if k not in df.columns:
	df[k] = np.nan
	else:
	df[k] = pd.to_numeric(df[k], errors="coerce")
	for k in ("eval_wall_time_s", "eval_rtf", "eval_audio_seconds", "num_params"):
	if k not in df.columns:
	df[k] = np.nan
	else:
	df[k] = pd.to_numeric(df[k], errors="coerce")
	# Millions of parameters; compact display.
	df["num_params_m"] = df["num_params"] / 1e6
	live_cols = [c for c in LIVE_SCENARIO_KEYS if c in df.columns]
	df["avg_wer"] = df[live_cols].mean(axis=1, skipna=True) if live_cols else np.nan
	layers = compute_pareto_layers(rows)
	df["pareto_layer"] = [
	layers.get((r.get("model_id") or "").strip()) for r in rows
	]
	df["snr_avg_wer"] = [snr_avg_wer(r) for r in rows]
	df["model_size_mb"] = df["model_id"].apply(_cached_model_size_mb)
	return df


	def _sort_df_leaderboard_order(df: pd.DataFrame) -> pd.DataFrame:
	"""Average WER ascending (lower is better)."""
	if df.empty:
	return df
	d = df.copy()
	if "avg_wer" in d.columns:
	return d.sort_values("avg_wer", ascending=True, na_position="last")
	return d


	def available_metric_keys(df: pd.DataFrame) -> list[str]:
	"""Scenario keys that have at least one non-NaN value."""
	keys = []
	for m in SCENARIO_METRICS:
	if m.key in df.columns and df[m.key].notna().any():
	keys.append(m.key)
	return keys


	# Brand-aligned colors per HF org / company. Lower-cased org prefix → hex.
	# Unknown orgs fall back to a deterministic palette so that the same company
	# keeps the same color across the Intelligence / Speed charts.
	_COMPANY_COLORS: dict[str, str] = {
	"nvidia": "#76B900", # NVIDIA green
	"openai": "#111111", # OpenAI black
	"openai-community": "#111111",
	"qwen": "#615CED", # Qwen purple
	"ibm-granite": "#0F62FE", # IBM blue
	"ibm": "#0F62FE",
	"coherelabs": "#39594D", # Cohere muted green
	"cohere": "#39594D",
	"efficient-speech": "#F2994A",
	"usefulsensors": "#22A7F0",
	"google": "#4285F4", # Google blue
	"google-t5": "#4285F4",
	"meta-llama": "#1877F2", # Meta blue
	"facebook": "#1877F2",
	"microsoft": "#00A4EF",
	"deepseek-ai": "#1D6FE0",
	"deepseek": "#1D6FE0",
	"moonshotai": "#111111",
	"mistralai": "#FF6F00",
	"anthropic": "#CC9B73",
	"x-ai": "#000000",
	"xai": "#000000",
	"zhipuai": "#1F8AFF",
	"zai-org": "#1F8AFF",
	}

	_FALLBACK_PALETTE: tuple[str, ...] = (
	"#1F77B4", "#FF7F0E", "#2CA02C", "#D62728", "#9467BD",
	"#8C564B", "#E377C2", "#7F7F7F", "#BCBD22", "#17BECF",
	)


	def _company_key(model_id: str) -> str:
	"""Normalized company key from a HF model id (`org/name` → `org`)."""
	mid = (model_id or "").strip()
	if "/" in mid:
	return mid.split("/", 1)[0].lower()
	return mid.lower()


	def _company_color_map(model_ids: Sequence[str]) -> dict[str, str]:
	"""Stable model_id → hex color mapping; same company shares a color."""
	out: dict[str, str] = {}
	fallback_assign: dict[str, str] = {}
	unknown_companies: list[str] = []
	for mid in model_ids:
	comp = _company_key(mid)
	if comp in _COMPANY_COLORS:
	out[mid] = _COMPANY_COLORS[comp]
	else:
	if comp not in fallback_assign:
	unknown_companies.append(comp)
	out[mid] = "" # filled in below
	# Deterministic palette assignment for unknown orgs (sorted alphabetically).
	for i, comp in enumerate(sorted(set(unknown_companies))):
	fallback_assign[comp] = _FALLBACK_PALETTE[i % len(_FALLBACK_PALETTE)]
	for mid in model_ids:
	if not out[mid]:
	out[mid] = fallback_assign[_company_key(mid)]
	return out


	def _short_model_label(model_id: str, max_len: int = 26) -> str:
	name = (model_id or "").split("/", 1)[-1]
	return name if len(name) <= max_len else name[: max_len - 1] + "…"


	def _hex_to_rgb(hex_color: str) -> tuple[int, int, int]:
	h = hex_color.lstrip("#")
	return int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)


	def _mix(c: int, target: int, t: float) -> int:
	return int(round(c + (target - c) * max(0.0, min(1.0, t))))


	def _lighten(hex_color: str, t: float) -> str:
	"""t in [0,1] -> mix toward white (t=0 returns the original color)."""
	r, g, b = _hex_to_rgb(hex_color)
	return "#{:02X}{:02X}{:02X}".format(_mix(r, 255, t), _mix(g, 255, t), _mix(b, 255, t))


	def _darken(hex_color: str, t: float) -> str:
	"""t in [0,1] -> mix toward black (t=0 returns the original color)."""
	r, g, b = _hex_to_rgb(hex_color)
	return "#{:02X}{:02X}{:02X}".format(_mix(r, 0, t), _mix(g, 0, t), _mix(b, 0, t))


	def _to_rgba(hex_color: str, alpha: float) -> str:
	r, g, b = _hex_to_rgb(hex_color)
	return f"rgba({r},{g},{b},{alpha:.3f})"


	def _company_shade_assignments(
	model_ids: Sequence[str],
	color_map: dict[str, str],
	) -> tuple[list[str], list[str], list[float]]:
	"""
	For bars whose company appears multiple times in this chart, vary the
	luminance + opacity so adjacent same-color bars remain distinguishable.

	Returns parallel lists (one entry per model_id):
	* fill rgba color (with per-bar alpha baked in)
	* border hex color (a darker shade of the base brand color)
	* text opacity (kept ≥ 0.85 so in-bar labels stay legible)
	"""
	counts: dict[str, int] = {}
	for mid in model_ids:
	counts[_company_key(mid)] = counts.get(_company_key(mid), 0) + 1

	seen: dict[str, int] = {}
	fills: list[str] = []
	borders: list[str] = []
	text_alphas: list[float] = []
	for mid in model_ids:
	base = color_map[mid]
	comp = _company_key(mid)
	n = counts[comp]
	idx = seen.get(comp, 0)
	seen[comp] = idx + 1
	if n > 1:
	# First instance keeps base color; subsequent ones lighten ~12% per step
	# (capped at ~45%) and reduce alpha slightly so the boundary is obvious
	# even when two same-company bars sit next to each other.
	lighten_t = min(0.45, 0.12 * idx)
	alpha = max(0.65, 1.0 - 0.08 * idx)
	shaded = _lighten(base, lighten_t)
	fills.append(_to_rgba(shaded, alpha))
	borders.append(_darken(base, 0.35))
	text_alphas.append(max(0.9, 1.0 - 0.05 * idx))
	else:
	fills.append(_to_rgba(base, 1.0))
	borders.append(_darken(base, 0.35))
	text_alphas.append(1.0)
	return fills, borders, text_alphas


	def _plot_company_bars(
	labels: Sequence[str],
	values: Sequence[float],
	model_ids: Sequence[str],
	*,
	title: str,
	subtitle: str,
	value_fmt: str,
	hover_value_fmt: str,
	hover_metric_label: str,
	) -> go.Figure:
	"""
	Shared renderer for the Intelligence / Speed vertical bar charts.

	- Bars are colored by HF org via `_company_color_map`.
	- Same-company bars in the same chart get progressively lighter fills +
	slightly reduced alpha so they remain distinguishable.
	- Each bar has a thin darker outline ("corner line") tracing the rounded
	corners so it stands out from the white background.
	- Values are drawn inside the bar (white) with rounded corners.
	- Hover shows the full model id and the metric value.
	"""
	color_map = _company_color_map(list(model_ids))
	fills, borders, text_alphas = _company_shade_assignments(
	list(model_ids), color_map
	)

	text_in = [value_fmt.format(v) for v in values]
	text_colors = [f"rgba(255,255,255,{a:.3f})" for a in text_alphas]

	# Use integer x-positions so duplicate display labels (e.g. two models that
	# share the same truncated short name) don't collapse into a single stacked
	# bar. Tick labels are mapped back onto these positions below.
	x_positions = list(range(len(labels)))

	fig = go.Figure(
	go.Bar(
	x=x_positions,
	y=values,
	customdata=list(model_ids),
	marker=dict(
	color=fills,
	line=dict(color=borders, width=1.6),
	cornerradius=8,
	),
	text=text_in,
	textposition="inside",
	insidetextanchor="middle",
	textfont=dict(
	color=text_colors, size=13, family="Inter, Arial, sans-serif"
	),
	hovertemplate=(
	"<b>%{customdata}</b><br>"
	f"{hover_metric_label}: {hover_value_fmt}<extra></extra>"
	),
	)
	)

	max_v = max([float(v) for v in values if v is not None] or [1.0])
	fig.update_layout(
	title=dict(
	text=(
	f"<b>{title}</b>"
	f"<br><span style='font-size:12px;color:#6b7280'>{subtitle}</span>"
	),
	x=0.0,
	xanchor="left",
	font=dict(size=18),
	),
	xaxis=dict(
	tickmode="array",
	tickvals=x_positions,
	ticktext=list(labels),
	tickangle=-40,
	automargin=True,
	showgrid=False,
	showline=False,
	),
	yaxis=dict(
	visible=False,
	range=[0, max_v * 1.18],
	),
	template=_TEMPLATE,
	height=_FIG_HEIGHT,
	margin=dict(l=20, r=20, t=80, b=120),
	showlegend=False,
	bargap=0.25,
	plot_bgcolor="white",
	)
	return fig


	def plot_avg_wer_bars(
	df: pd.DataFrame,
	top_n: int = 10,
	) -> go.Figure:
	"""
	Ranked vertical bars of average WER (``avg_wer``) as a percent.

	Top models sorted ascending (lower WER is better); value inside the bar,
	rounded corners, one color per HF org.
	"""
	if df.empty or "avg_wer" not in df.columns:
	return _empty_fig("No leaderboard WER data yet.")

	d = df[["model_id", "avg_wer"]].dropna(subset=["avg_wer"]).copy()
	if d.empty:
	return _empty_fig("No WER values computed.")

	d["avg_wer"] = pd.to_numeric(d["avg_wer"], errors="coerce")
	d = d.dropna(subset=["avg_wer"])
	d["wer_pct"] = d["avg_wer"].apply(lambda v: _wer_pct(float(v)))
	d = d.dropna(subset=["wer_pct"])
	d = d.sort_values("wer_pct", ascending=True).head(max(1, int(top_n)))

	labels = [_short_model_label(m) for m in d["model_id"].tolist()]
	return _plot_company_bars(
	labels=labels,
	values=[float(v) for v in d["wer_pct"].tolist()],
	model_ids=d["model_id"].tolist(),
	title="WER",
	subtitle="Average WER (%) · Lower is better",
	value_fmt="{:.1f}",
	hover_value_fmt="%{y:.2f}%",
	hover_metric_label="Average WER",
	)


	def plot_speed_bars(
	df: pd.DataFrame,
	top_n: int = 10,
	) -> go.Figure:
	"""
	Ranked vertical bars of throughput (`eval_rtf`, × realtime).

	Companion to `plot_avg_wer_bars`: same color-per-company convention,
	same in-bar value labels, rounded corners.
	"""
	if df.empty or "eval_rtf" not in df.columns:
	return _empty_fig("No RTF measurements yet.")

	d = df[["model_id", "eval_rtf"]].copy()
	d["eval_rtf"] = pd.to_numeric(d["eval_rtf"], errors="coerce")
	d = d.dropna(subset=["eval_rtf"])
	d = d[d["eval_rtf"] > 0]
	if d.empty:
	return _empty_fig("No RTF measurements yet.")

	d = d.sort_values("eval_rtf", ascending=False).head(max(1, int(top_n)))

	labels = [_short_model_label(m) for m in d["model_id"].tolist()]
	values = [float(v) for v in d["eval_rtf"].tolist()]
	# Show ints when large, one decimal when small (matches AA "314" / "0.7" feel).
	fmt = "{:.0f}" if max(values) >= 10 else "{:.1f}"
	return _plot_company_bars(
	labels=labels,
	values=values,
	model_ids=d["model_id"].tolist(),
	title="Speed",
	subtitle="RTFx (audio sec / inference sec) · Higher is better",
	value_fmt=fmt,
	hover_value_fmt="%{y:.2f}×",
	hover_metric_label="RTFx",
	)


	def plot_leaderboard_score_bars(
	df: pd.DataFrame,
	top_n: int = 40,
	title: str \| None = None,
	) -> go.Figure:
	"""
	Ranked horizontal bar chart of average WER (percent).
	"""
	if df.empty or "avg_wer" not in df.columns:
	return _empty_fig("No leaderboard WER data yet.")
	cols = ["model_id", "avg_wer"]
	if "pareto_layer" in df.columns:
	cols.append("pareto_layer")
	d = df[cols].dropna(subset=["avg_wer"]).copy()
	if d.empty:
	return _empty_fig("No WER values computed.")
	n = max(1, int(top_n))
	d = _sort_df_leaderboard_order(d).head(n)
	labels = d["model_id"].str.split("/").str[-1].str[:42]
	scores = d["avg_wer"].astype(float).apply(lambda v: _wer_pct(v) or 0.0)
	fig = go.Figure(
	go.Bar(
	x=scores,
	y=labels,
	orientation="h",
	marker=dict(
	color=scores,
	colorscale="RdYlGn_r",
	showscale=True,
	colorbar=dict(title="WER (%)"),
	),
	text=[f"{s:.1f}" for s in scores],
	textposition="outside",
	hovertemplate="<b>%{y}</b><br>Average WER: %{x:.2f}%<extra></extra>",
	)
	)
	fig.update_layout(
	title=dict(
	text=title or f"Average WER (top {len(d)} models)",
	x=0.5,
	xanchor="center",
	),
	xaxis=dict(title="WER (%)", rangemode="tozero"),
	yaxis=dict(autorange="reversed"),
	template=_TEMPLATE,
	height=int(min(920, max(_FIG_HEIGHT, 100 + 22 * len(d)))),
	margin=dict(l=200, r=100, t=60, b=50),
	)
	return fig


	def _radar_absolute_value(row: pd.Series, axis: str) -> tuple[float, str]:
	"""Map a leaderboard row to an absolute 0–1 radar coordinate (higher = better)."""
	if axis == "WER":
	wer = pd.to_numeric(row.get("avg_wer"), errors="coerce")
	if not np.isfinite(wer) or float(wer) < 0:
	return 0.0, "Average WER: N/A"
	w = float(wer)
	pct = w * 100.0
	v = float(max(0.0, min(1.0, 1.0 / (1.0 + w))))
	return v, f"Average WER: {pct:.2f}% → strength {v:.3f}"
	if axis == "Speed":
	rtf = pd.to_numeric(row.get("eval_rtf"), errors="coerce")
	if not np.isfinite(rtf) or float(rtf) < 0:
	return 0.0, "RTFx: N/A"
	r = float(rtf)
	v = float(max(0.0, min(1.0, r / (1.0 + r))))
	return v, f"RTFx: {r:.4f}× → speed {v:.3f} (RTFx/(1+RTFx))"
	if axis == "Compactness":
	pm = pd.to_numeric(row.get("num_params_m"), errors="coerce")
	if not np.isfinite(pm) or float(pm) < 0:
	return 0.0, "Parameters: N/A"
	p = float(pm)
	v = float(max(0.0, min(1.0, 1.0 / (1.0 + p / 1000.0))))
	return v, f"Parameters: {p:.2f} M → compactness {v:.3f}"
	return 0.0, f"{axis}: N/A"


	_RADAR_ABSOLUTE_AXES: tuple[str, ...] = ("WER", "Speed", "Compactness")


	def plot_robustness_radar(
	df: pd.DataFrame,
	model_ids: Sequence[str],
	title: str = "Robustness radar (absolute 0–1; outward is better)",
	) -> go.Figure:
	"""
	Three-axis radar with absolute coordinates in [0, 1] (not relative to other models).

	WER strength = 1 / (1 + avg_wer); Speed = RTFx / (1 + RTFx);
	Compactness = 1 / (1 + num_params_m / 1000). Missing values map to 0.
	"""
	if df.empty:
	return _empty_fig("No leaderboard data yet.")

	d = df.copy().reset_index(drop=True)
	labels = list(_RADAR_ABSOLUTE_AXES)
	labels_closed = labels + [labels[0]]

	selected = [m for m in (model_ids or []) if m in set(d["model_id"])]
	if not selected:
	selected = d["model_id"].tolist()[: min(5, len(d))]

	fig = go.Figure()
	palette = qualitative.Plotly * 3

	for i, mid in enumerate(selected):
	rows = d.index[d["model_id"] == mid].tolist()
	if not rows:
	continue
	row = d.iloc[rows[0]]
	r_vals: list[float] = []
	raw_text: list[str] = []
	for ax in labels:
	v, tip = _radar_absolute_value(row, ax)
	r_vals.append(v)
	raw_text.append(tip)

	r_closed = r_vals + [r_vals[0]]
	raw_closed = raw_text + [raw_text[0]]

	short = mid.split("/")[-1][:28]
	color = palette[i % len(palette)]
	fig.add_trace(
	go.Scatterpolar(
	r=r_closed,
	theta=labels_closed,
	customdata=raw_closed,
	fill="none",
	name=short,
	line=dict(width=2.2, color=color),
	marker=dict(size=6, color=color),
	opacity=1.0,
	hovertemplate=(
	f"<b>{short}</b><br>"
	"%{theta}: %{r:.2f}<br>"
	"%{customdata}<extra></extra>"
	),
	)
	)

	fig.update_layout(
	polar=dict(
	radialaxis=dict(
	visible=True,
	range=[0, 1],
	showticklabels=True,
	tickformat=".2f",
	tickvals=[0, 0.25, 0.5, 0.75, 1.0],
	),
	angularaxis=dict(direction="clockwise", rotation=90),
	),
	title=dict(text=title, x=0.5, xanchor="center"),
	template=_TEMPLATE,
	height=_FIG_HEIGHT,
	margin=dict(l=50, r=120, t=60, b=50),
	showlegend=True,
	**_plotly_legend_config(),
	)
	return fig


	def plot_radar_compare(
	df: pd.DataFrame,
	model_ids: Sequence[str],
	metric_keys: Sequence[str] \| None = None,
	title: str = "Robustness profile (relative strength; outward is better)",
	) -> go.Figure:
	"""Deprecated alias for older UI bindings; forwards to `plot_robustness_radar`."""
	return plot_robustness_radar(df, model_ids, title=title)


	def plot_compare_models_across_scenarios(
	df: pd.DataFrame,
	model_ids: Sequence[str],
	metric_keys: Sequence[str],
	title: str = "WER by model (one trace per scenario; use legend to toggle)",
	) -> go.Figure:
	"""
	Line chart: x = model (sorted by Average WER), y = WER, one trace per scenario.

	Scaling to more models now just extends the x axis instead of adding more traces,
	which keeps the legend small (≤ #scenarios) and makes cross-scenario comparisons
	for the same model trivial (read a vertical slice).

	`model_ids` filters which models appear on the x axis; empty / None → all rows.
	"""
	metric_keys = [k for k in metric_keys if k in df.columns]
	if df.empty or not metric_keys:
	return _empty_fig("Not enough data.")

	selected = [m for m in (model_ids or []) if m]
	sub = df[df["model_id"].isin(selected)].copy() if selected else df.copy()
	if sub.empty:
	return _empty_fig("No matching models.")

	# Sort models by Average WER (lower is better).
	if "avg_wer" in sub.columns:
	sub = sub.sort_values("avg_wer", ascending=True, na_position="last")
	sub = sub.reset_index(drop=True)

	x_labels = sub["model_id"].str.split("/").str[-1].str[:32].tolist()
	full_ids = sub["model_id"].tolist()

	fig = go.Figure()
	palette = qualitative.Plotly * 3

	for i, k in enumerate(metric_keys):
	m = metric_by_key(k)
	scen_label = m.short if m else k
	ys = [
	_wer_pct(v) if pd.notna(v) else None for v in sub[k].tolist()
	]
	fig.add_trace(
	go.Scatter(
	x=x_labels,
	y=ys,
	mode="lines+markers",
	name=scen_label,
	customdata=full_ids,
	line=dict(width=2, color=palette[i % len(palette)]),
	marker=dict(size=8),
	hovertemplate=(
	f"<b>%{{customdata}}</b><br>{scen_label}: %{{y:.2f}}%<extra></extra>"
	),
	)
	)

	n_models = len(x_labels)
	tickangle = -45 if n_models > 8 else -25
	fig.update_layout(
	title=dict(text=title, x=0.5, xanchor="center"),
	xaxis=dict(title="Model (sorted by Average WER)", tickangle=tickangle, automargin=True),
	yaxis=dict(title="WER (%) — lower is better", rangemode="tozero"),
	template=_TEMPLATE,
	height=_FIG_HEIGHT,
	margin=dict(l=60, r=120, t=60, b=140),
	**_plotly_legend_config(),
	)
	return fig


	def plot_scenario_heatmap(
	df: pd.DataFrame,
	metric_keys: Sequence[str],
	top_n: int = 30,
	title: str \| None = None,
	) -> go.Figure:
	"""
	Models × scenarios heatmap of WER values (lower = greener).

	This replaces the per-scenario line chart for the common case of N≫few models
	and a small fixed set of conditions: every row is one model and every column is
	one condition, so the eye can immediately spot:
	- which models do badly on which conditions (hot rows / cells),
	- which conditions are universally hard (whole columns red),
	- and how the top of the leaderboard compares to the rest.

	`metric_keys` selects which scenario columns to show (in given order). Models
	are sorted by Average WER and capped at `top_n` rows so the chart
	stays legible even with hundreds of leaderboard entries.
	"""
	metric_keys = [k for k in metric_keys if k in df.columns]
	if df.empty or not metric_keys:
	return _empty_fig("Not enough data for the heatmap.")

	d = _sort_df_leaderboard_order(df.copy())
	d = d.head(max(1, int(top_n))).reset_index(drop=True)

	# Omit scenarios with no data in the visible rows (avoids sparse heatmaps).
	keep_keys: list[str] = []
	for k in metric_keys:
	col = pd.to_numeric(d[k], errors="coerce")
	if col.notna().any():
	keep_keys.append(k)
	if not keep_keys:
	return _empty_fig("No WER values for the selected scenarios.")

	order_index = {k: i for i, k in enumerate(HEATMAP_SCENARIO_KEYS)}
	keep_keys = sorted(keep_keys, key=lambda k: order_index.get(k, len(order_index)))

	x_labels = [heatmap_label_for_key(k) for k in keep_keys]
	y_labels = d["model_id"].str.split("/").str[-1].str[:42].tolist()

	z: list[list[float]] = []
	cell_text: list[list[str]] = []
	for _, row in d.iterrows():
	z_row: list[float] = []
	t_row: list[str] = []
	for k in keep_keys:
	v = pd.to_numeric(row.get(k), errors="coerce")
	if pd.isna(v):
	z_row.append(float("nan"))
	t_row.append("")
	else:
	pct = _wer_pct(float(v))
	if pct is None:
	z_row.append(float("nan"))
	t_row.append("")
	else:
	z_row.append(float(pct))
	t_row.append(f"{pct:.1f}")
	z.append(z_row)
	cell_text.append(t_row)

	flat_vals = [v for row in z for v in row if np.isfinite(v)]
	if flat_vals:
	raw_max = float(max(flat_vals))
	if raw_max <= 25:
	zmax = 25.0
	elif raw_max <= 60:
	zmax = float(np.ceil(raw_max / 5.0) * 5.0)
	elif raw_max <= 100:
	zmax = float(np.ceil(raw_max / 10.0) * 10.0)
	else:
	zmax = float(np.ceil(raw_max / 25.0) * 25.0)
	else:
	zmax = 100.0

	fig = go.Figure(
	data=go.Heatmap(
	z=z,
	x=x_labels,
	y=y_labels,
	text=cell_text,
	texttemplate="%{text}",
	textfont=dict(size=10, color="#1a1a1a"),
	hovertemplate="<b>%{y}</b><br>%{x}: %{z:.2f}%<extra></extra>",
	colorscale="RdYlGn_r",
	zmin=0.0,
	zmax=zmax,
	xgap=2,
	ygap=2,
	colorbar=dict(title="WER (%)", tickformat=".0f"),
	)
	)

	n_rows = max(1, len(y_labels))
	# Dynamic height so 30 models gets ~460 px and 80 models gets a taller chart,
	# without becoming uselessly tall.
	height = int(min(900, max(_FIG_HEIGHT, 80 + 18 * n_rows)))
	fig.update_layout(
	title=dict(
	text=title
	or f"WER heatmap: top {n_rows} models × {len(keep_keys)} scenarios (by Average WER)",
	x=0.5,
	xanchor="center",
	),
	xaxis=dict(
	title=None,
	tickangle=-35,
	side="top",
	automargin=True,
	tickfont=dict(size=10),
	),
	yaxis=dict(title="Model (sorted by Average WER)", automargin=True, autorange="reversed"),
	template=_TEMPLATE,
	height=height,
	margin=dict(l=80, r=60, t=110, b=60),
	)
	# Scenario tick labels sit on top (side="top"); placing the axis title there too
	# collides with the plot title, so render the "Scenario" label at the bottom.
	fig.add_annotation(
	text="Scenario",
	xref="paper",
	yref="paper",
	x=0.5,
	y=-0.02,
	yanchor="top",
	showarrow=False,
	font=dict(size=13),
	)
	return fig


	def plot_clean_vs_reverb_scatter(
	df: pd.DataFrame,
	title: str = "Near Field Speech versus Lab Simulated WER",
	) -> go.Figure:
	"""
	Scatter: x = near-field speech WER, y = lab-simulated WER. Models often sit above y=x when simulation is harder.
	"""
	c1, c2 = "wer_anechoic_speech", "wer_lab_simulated"
	if df.empty or c1 not in df.columns or c2 not in df.columns:
	return _empty_fig("Need near-field speech and lab-simulated WER columns in the leaderboard.")

	d = df[["model_id", c1, c2]].copy()
	d[c1] = pd.to_numeric(d[c1], errors="coerce")
	d[c2] = pd.to_numeric(d[c2], errors="coerce")
	d = d.dropna(subset=[c1, c2])
	if d.empty:
	return _empty_fig("No complete near-field speech / lab-simulated WER pairs yet.")

	fig = go.Figure()
	x_pct = [_wer_pct(v) for v in d[c1]]
	y_pct = [_wer_pct(v) for v in d[c2]]
	mx = float(max((v for v in x_pct + y_pct if v is not None), default=0.0))
	fig.add_trace(
	go.Scatter(
	x=[0, mx],
	y=[0, mx],
	mode="lines",
	name="y = x (equal WER)",
	line=dict(dash="dash", color="rgba(0,0,0,0.35)", width=2),
	hoverinfo="skip",
	)
	)
	short = d["model_id"].str.split("/").str[-1].str[:28]
	fig.add_trace(
	go.Scatter(
	x=x_pct,
	y=y_pct,
	mode="markers",
	text=short,
	marker=dict(size=10, opacity=0.85),
	hovertemplate=(
	"<b>%{text}</b><br>Near field speech WER: %{x:.2f}%<br>"
	"Lab simulated WER: %{y:.2f}%<extra></extra>"
	),
	)
	)

	fig.update_layout(
	title=dict(text=title, x=0.5, xanchor="center"),
	xaxis=dict(title="WER (%), Near Field Speech (lower is better)", rangemode="tozero"),
	yaxis=dict(title="WER (%), Lab Simulated (lower is better)", rangemode="tozero"),
	template=_TEMPLATE,
	height=_FIG_HEIGHT,
	margin=dict(l=60, r=40, t=60, b=60),
	showlegend=True,
	)
	return fig


	def plot_pareto_frontier(
	df: pd.DataFrame,
	title: str \| None = None,
	) -> go.Figure:
	"""
	Pareto front: X = Average WER (%, live scenarios), Y = RTFx (log scale).

	Frontier models are highlighted with star markers + name labels and connected
	by a dashed line; non-frontier models render as light dots.
	"""
	if df.empty:
	return _empty_fig("No leaderboard data yet.")

	need = {"model_id", "avg_wer", "eval_rtf"}
	if not need.issubset(df.columns):
	return _empty_fig("Missing Average WER or RTFx columns for the Pareto chart.")

	d = df[["model_id", "avg_wer", "eval_rtf"]].copy()
	d["avg_wer"] = pd.to_numeric(d["avg_wer"], errors="coerce")
	d["eval_rtf"] = pd.to_numeric(d["eval_rtf"], errors="coerce")
	d = d.dropna(subset=["avg_wer", "eval_rtf"])
	d = d[d["eval_rtf"] > 0]
	if d.empty:
	return _empty_fig(
	"No models with Average WER and RTFx. Re-run evaluations to populate timing."
	)

	d["wer_pct"] = d["avg_wer"].apply(lambda v: _wer_pct(float(v)))
	d = d.dropna(subset=["wer_pct"]).reset_index(drop=True)

	# Layer-1 frontier on (avg_wer, eval_rtf): lower WER + higher RTF is better.
	fx_raw, fy = _pareto_efficient_frontier_wer_rtf(
	d["avg_wer"].tolist(),
	d["eval_rtf"].tolist(),
	)
	frontier_pairs = set(zip([round(w, 8) for w in fx_raw], [round(r, 8) for r in fy]))

	def _is_frontier(row) -> bool:
	return (round(float(row["avg_wer"]), 8), round(float(row["eval_rtf"]), 8)) in frontier_pairs

	d["_frontier"] = d.apply(_is_frontier, axis=1)
	frontier_df = d[d["_frontier"]].sort_values("wer_pct").reset_index(drop=True)
	other_df = d[~d["_frontier"]]

	accent = "#1f77ff" # cool blue (matches screenshot)
	accent_light = "rgba(31, 119, 255, 0.30)"

	fig = go.Figure()

	if not other_df.empty:
	fig.add_trace(
	go.Scatter(
	x=other_df["wer_pct"],
	y=other_df["eval_rtf"],
	mode="markers",
	name="Other models",
	marker=dict(size=9, color=accent_light, line=dict(width=0, color="white")),
	text=other_df["model_id"],
	hovertemplate=(
	"<b>%{text}</b><br>"
	"Average WER: %{x:.2f}%<br>"
	"RTFx: %{y:.2f}×<extra></extra>"
	),
	)
	)

	if not frontier_df.empty:
	fig.add_trace(
	go.Scatter(
	x=frontier_df["wer_pct"],
	y=frontier_df["eval_rtf"],
	mode="lines+markers+text",
	name="Pareto frontier",
	line=dict(color=accent, width=2, dash="dash"),
	marker=dict(size=14, color=accent, symbol="star", line=dict(width=0, color="white")),
	text=frontier_df["model_id"],
	textposition="top center",
	textfont=dict(size=11, color=accent),
	cliponaxis=False,
	hovertemplate=(
	"<b>%{text}</b><br>"
	"Average WER: %{x:.2f}%<br>"
	"RTFx: %{y:.2f}×<extra></extra>"
	),
	)
	)

	ttl = title or "Pareto Front: Average WER vs RTFx"
	rtf_min = float(d["eval_rtf"].min())
	rtf_max = float(d["eval_rtf"].max())
	log_y_lo = np.log10(max(rtf_min * 0.85, 1e-4))
	log_y_hi = np.log10(rtf_max * 1.2)
	fig.update_layout(
	title=dict(text=ttl, x=0.0, xanchor="left", font=dict(size=18)),
	xaxis=dict(
	title="Average WER (lower is better)",
	ticksuffix="",
	zeroline=False,
	),
	template=_TEMPLATE,
	height=_FIG_HEIGHT,
	autosize=True,
	margin=dict(l=70, r=40, t=70, b=60),
	showlegend=True,
	legend=dict(
	yanchor="top",
	y=0.98,
	xanchor="right",
	x=0.99,
	bgcolor="rgba(255,255,255,0.7)",
	borderwidth=0,
	),
	)
	fig.update_yaxes(
	title="RTFx (higher is better)",
	type="log",
	zeroline=False,
	range=[log_y_lo, log_y_hi],
	tickformat=".2g",
	)
	fig.update_xaxes(
	title="Average WER (lower is better)",
	zeroline=False,
	range=[0, 100],
	)
	return fig


	def plot_scenario_bar_summary(df: pd.DataFrame, top_n: int = 8) -> go.Figure:
	"""Grouped bars: one trace per model; legend toggles models."""
	if df.empty:
	return _empty_fig("No leaderboard data yet.")

	cols = [c for c in HEATMAP_SCENARIO_KEYS if c in df.columns]
	if not cols:
	return _empty_fig("No core WER columns.")

	d = df.copy()
	if "avg_wer" in d.columns:
	d = d.sort_values("avg_wer", ascending=True, na_position="last").head(int(top_n))
	else:
	d["_avg"] = d[cols].mean(axis=1, skipna=True)
	d = d.sort_values("_avg", ascending=True).head(int(top_n))

	scenarios = cols
	x_labels = [metric_by_key(c).short if metric_by_key(c) else c for c in scenarios]
	fig = go.Figure()
	palette = qualitative.Plotly * 3

	for i, (_, row) in enumerate(d.iterrows()):
	heights = [
	_wer_pct(row[c]) if pd.notna(row[c]) else None for c in scenarios
	]
	short = row["model_id"].split("/")[-1][:22]
	fig.add_trace(
	go.Bar(
	name=short,
	x=x_labels,
	y=heights,
	marker_color=palette[i % len(palette)],
	hovertemplate=f"<b>{short}</b><br>%{{x}}: %{{y:.2f}}%<extra></extra>",
	)
	)

	fig.update_layout(
	barmode="group",
	title=dict(
	text=f"WER by scenario: top {len(d)} models by Average WER (legend toggles models)",
	x=0.5,
	xanchor="center",
	),
	xaxis=dict(title="Scenario", tickangle=-20),
	yaxis=dict(title="WER (%)", rangemode="tozero"),
	template=_TEMPLATE,
	height=_FIG_HEIGHT,
	margin=dict(l=60, r=120, t=60, b=100),
	**_plotly_legend_config(),
	)
	return fig


	def plot_lines_by_rank(df: pd.DataFrame, metric_keys: Sequence[str], title: str \| None = None) -> go.Figure:
	"""Deprecated: use plot_clean_vs_reverb_scatter or plot_latency_vs_wer."""
	return plot_clean_vs_reverb_scatter(df, title=title or "Clean vs reverberant WER")