from __future__ import annotations

# Static configuration for the MOOD Bench leaderboard: page title, intro
# text, the evaluation groups, and the column names of the results table.

# Page title.
TITLE = "MOOD Bench Leaderboard"

# Identifier and human-readable label of the reported metric.
# NOTE(review): METRIC_KEY is presumably the key under which the metric is
# stored in result records -- it is not used in this module; confirm against
# the loader.
METRIC_KEY = "tpr@fpr0.01"
METRIC_LABEL = "TPR @ FPR=1%"

# Intro text shown above the table. The blank lines matter: Markdown needs
# the heading on its own line and an empty line between paragraphs.
INTRO_MARKDOWN = f"""
# MOOD Bench Leaderboard

Multi-domain out-of-distribution safety detection on the
[`mood-bench`](https://huggingface.co/datasets/mood-bench/mood-bench) test split.

Each row is a **detection method**. For every out-of-distribution unsafe
domain we report the true-positive rate at a fixed **1% false-positive rate**
against the pooled in-distribution *safe* conversations. The **Overall** column
is the unweighted mean across OOD-unsafe domains.

Use the **Columns** picker below to show the per-domain breakdown, or the
filter controls to narrow the table down.

Sorted by Overall {METRIC_LABEL}.
"""

# Group key -> display label. Insertion order is the column order, so this
# mapping is the single source of truth for both the keys and their labels.
GROUP_LABELS: dict[str, str] = {
    "overall": "Overall",
    "id": "ID",
    "controlling": "Controlling",
    "insecure-code": "Insecure Code",
    "scheming": "Scheming",
    "jailbroken": "Jailbroken",
    "sycophantic": "Sycophantic",
    "function-calling-missing": "FC Missing",
    "function-calling-inappropriate": "FC Inapprop.",
    "swahili": "Swahili",
}

# Ordered group keys. Derived from GROUP_LABELS so the two cannot drift apart.
GROUPS: list[str] = list(GROUP_LABELS)

# Non-metric columns describing a submission.
META_COLUMNS: list[str] = ["Method", "Model", "Submitted By", "Submitted At"]


def metric_column(group: str) -> str:
    """Return the stable column header for *group* in the flattened DataFrame.

    Args:
        group: One of the keys listed in ``GROUPS``.

    Returns:
        The display label registered for *group* in ``GROUP_LABELS``.

    Raises:
        KeyError: If *group* is not a known group key.
    """
    try:
        return GROUP_LABELS[group]
    except KeyError:
        # Same exception type as a plain lookup, but name the valid keys so
        # a typo is easy to spot.
        raise KeyError(
            f"unknown group {group!r}; expected one of {GROUPS}"
        ) from None


# Headers of every metric column, in GROUPS order.
ALL_METRIC_COLUMNS: list[str] = [metric_column(g) for g in GROUPS]

# Columns shown before the user opens the column picker.
DEFAULT_VISIBLE_COLUMNS: list[str] = [
    "Method",
    "Model",
    metric_column("overall"),
    metric_column("id"),
]

# Column the table is sorted by.
PRIMARY_SORT_COLUMN = metric_column("overall")