leaderboard / src /about.py
Dylan123's picture
Upload folder using huggingface_hub
5479db5 verified
from __future__ import annotations
# Heading text for the leaderboard; the same wording is used as the H1 of
# INTRO_MARKDOWN below.
TITLE = "MOOD Bench Leaderboard"
# Machine-readable identifier of the reported metric.
# NOTE(review): presumably the key under which scores are stored in result
# records — confirm against the code that loads submissions.
METRIC_KEY = "tpr@fpr0.01"
# Human-readable name of the metric; interpolated into INTRO_MARKDOWN.
METRIC_LABEL = "TPR @ FPR=1%"
# Markdown introduction describing the benchmark and how to read the table.
# This is an f-string: {METRIC_LABEL} is substituted once, when the module is
# imported, so later changes to METRIC_LABEL do not affect this text.
INTRO_MARKDOWN = f"""
# MOOD Bench Leaderboard
Multi-domain out-of-distribution safety detection on the
[`mood-bench`](https://huggingface.co/datasets/mood-bench/mood-bench) test split.
Each row is a **detection method**. For every out-of-distribution unsafe domain we
report the true-positive rate at a fixed **1% false-positive rate** against the
pooled in-distribution *safe* conversations. The **Overall** column is the unweighted
mean across OOD-unsafe domains.
Use the **Columns** picker below to show the per-domain breakdown, or the filter
controls to narrow the table down. Sorted by Overall {METRIC_LABEL}.
"""
# Display label for every group key. The insertion order of this mapping is
# the canonical group order, so GROUPS below is derived from it rather than
# being maintained as a second hand-written list.
GROUP_LABELS: dict[str, str] = {
    "overall": "Overall",
    "id": "ID",
    "controlling": "Controlling",
    "insecure-code": "Insecure Code",
    "scheming": "Scheming",
    "jailbroken": "Jailbroken",
    "sycophantic": "Sycophantic",
    "function-calling-missing": "FC Missing",
    "function-calling-inappropriate": "FC Inapprop.",
    "swahili": "Swahili",
}

# Group keys in display order (dicts preserve insertion order).
GROUPS: list[str] = list(GROUP_LABELS)

# Non-metric columns that describe a submission.
META_COLUMNS: list[str] = [
    "Method",
    "Model",
    "Submitted By",
    "Submitted At",
]
def metric_column(group: str) -> str:
    """Return the column header used for ``group`` in the flattened DataFrame.

    The header is the group's entry in ``GROUP_LABELS``; a key that is not in
    that mapping raises ``KeyError``.
    """
    label = GROUP_LABELS[group]
    return label
# One metric column header per entry of GROUPS, in the same order.
ALL_METRIC_COLUMNS: list[str] = list(map(metric_column, GROUPS))

# Default visible column set: the two identifying columns followed by the
# "overall" and "id" metric headers.
DEFAULT_VISIBLE_COLUMNS: list[str] = [
    "Method",
    "Model",
    *(metric_column(key) for key in ("overall", "id")),
]

# Header of the "overall" metric column, named as the primary sort column.
PRIMARY_SORT_COLUMN = metric_column("overall")