Spaces:

appliedscientific
/

refusalbench

Running

App Files Files Community

VibeCodingScientist committed 1 day ago

Commit

5eaec60

verified ·

1 Parent(s): 49bc134

Redesign UI: theme-aware leaderboard, thesis-forward hero, cleaner cells

Browse files

Files changed (1) hide show

app.py +481 -245

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""RefusalBench — HuggingFace Space
 Interactive leaderboard and figures for the RefusalBench paper.
 Data: data/adjudicated.csv  (13,389 adjudicated rows, v1.1-frozen snapshot)
@@ -19,19 +19,28 @@ import pandas as pd
 # ── Typography ────────────────────────────────────────────────────────────────
 mpl.rcParams.update(
     {
-        "font.family": "serif",
-        "font.serif": ["Times New Roman", "Times", "DejaVu Serif"],
-        "mathtext.fontset": "stix",
-        "axes.titlesize": 12,
         "axes.labelsize": 11,
         "xtick.labelsize": 9,
         "ytick.labelsize": 9,
         "legend.fontsize": 9,
     }
 )
 # ── Model metadata ────────────────────────────────────────────────────────────
-# (model_id) → (display_name, org, provider_key, jurisdiction)
 MODEL_META: dict[str, tuple[str, str, str, str]] = {
     "anthropic/claude-opus-4.7":              ("Claude Opus 4.7",          "Anthropic",   "anthropic", "US"),
     "anthropic/claude-opus-4.6":              ("Claude Opus 4.6",          "Anthropic",   "anthropic", "US"),
@@ -45,13 +54,18 @@ MODEL_META: dict[str, tuple[str, str, str, str]] = {
     "moonshotai/kimi-k2.6-20260420":          ("Kimi K2.6",                "Moonshot AI", "moonshot",  "Asia"),
     "minimax/minimax-m2.7-20260318":          ("MiniMax M2.7",             "MiniMax",     "minimax",   "Asia"),
     "us.amazon.nova-pro-v1:0":                ("Amazon Nova Pro",          "Amazon",      "amazon",    "US"),
-    "us.meta.llama3-3-70b-instruct-v1:0":     ("Llama 3.3 70B †",         "Meta",        "meta",      "US"),
     "mistral.mistral-large-3-675b-instruct":  ("Mistral Large 3",          "Mistral",     "mistral",   "EU"),
     "deepseek.v3.2":                          ("DeepSeek V3.2",            "DeepSeek",    "deepseek",  "Asia"),
     "us.deepseek.r1-v1:0":                    ("DeepSeek R1",              "DeepSeek",    "deepseek",  "Asia"),
     "qwen.qwen3-next-80b-a3b":                ("Qwen3 Next 80B",           "Qwen",        "qwen",      "Asia"),
     "zai.glm-5":                              ("GLM-5",                    "Z.AI",        "zai",       "Asia"),
-    "nvidia.nemotron-super-3-120b":           ("Nemotron 3 Super 120B ★",  "NVIDIA",      "nvidia",    "US"),
 }
 # PC Tier from should-refuse positive control (TPR thresholds: A ≥ 95%, B 9–73%, C ≤ 1.3%; "—" = gap zone between tiers)
@@ -77,24 +91,31 @@ PC_TIER: dict[str, str] = {
     "us.meta.llama3-3-70b-instruct-v1:0":     "—",
 }
 PROVIDER_COLORS: dict[str, str] = {
-    "anthropic": "#E53E3E",
-    "openai":    "#38A169",
-    "google":    "#3182CE",
-    "amazon":    "#DD6B20",
-    "meta":      "#805AD5",
-    "mistral":   "#2B6CB0",
-    "deepseek":  "#2C7A7B",
-    "qwen":      "#D69E2E",
-    "zai":       "#319795",
-    "xai":       "#4A5568",
-    "moonshot":  "#D53F8C",
-    "minimax":   "#6B46C1",
     "nvidia":    "#76B900",
-    "other":     "#718096",
 }
-TIER_COLORS = {"benign": "#38A169", "borderline": "#DD6B20", "dual_use": "#E53E3E"}
 TIER_LABELS = {"benign": "Benign", "borderline": "Borderline", "dual_use": "Dual-use"}
 JURS = {"US": "🇺🇸", "EU": "🇪🇺", "Asia": "🌏"}
@@ -170,29 +191,307 @@ def overall_stats(stats: pd.DataFrame) -> pd.DataFrame:
     return pd.DataFrame(rows).sort_values("refusal_rate", ascending=False)
 # ── Leaderboard HTML ──────────────────────────────────────────────────────────
-_TIER_BADGE = {
-    "A": '<span style="background:#C6F6D5;color:#276749;border-radius:4px;padding:1px 7px;font-weight:600;font-size:0.82em;">A</span>',
-    "B": '<span style="background:#FEFCBF;color:#744210;border-radius:4px;padding:1px 7px;font-weight:600;font-size:0.82em;">B</span>',
-    "C": '<span style="background:#FED7D7;color:#9B2335;border-radius:4px;padding:1px 7px;font-weight:600;font-size:0.82em;">C</span>',
-    "—": '<span style="background:#EDF2F7;color:#4A5568;border-radius:4px;padding:1px 7px;font-weight:500;font-size:0.82em;">—</span>',
 }
 def build_leaderboard_html(
     stats: pd.DataFrame,
     overall: pd.DataFrame,
     jur_filter: str = "All",
     sort_by: str = "Overall",
 ) -> str:
-    # ── pivot per-tier data keyed by model_id ─────────────────────────────────
     pivot: dict[str, dict] = {}
     for _, row in stats.iterrows():
         mid = row["model_id"]
         if mid not in pivot:
             pivot[mid] = {
                 "model": row["model"],
                 "org": row["org"],
                 "provider": row["provider"],
@@ -208,8 +507,6 @@ def build_leaderboard_html(
             )
     rows_data = list(pivot.values())
-    # Filter & sort
     if jur_filter != "All":
         rows_data = [r for r in rows_data if r["jurisdiction"] == jur_filter]
@@ -221,113 +518,68 @@ def build_leaderboard_html(
     }.get(sort_by, lambda r: r.get("overall", (0,))[0])
     rows_data.sort(key=sort_key, reverse=True)
-    # ── cell renderer with heatmap tint ───────────────────────────────────────
-    def rate_cell(t: tuple | None, tier_color: str = "#3182CE") -> str:
-        if t is None:
-            return '<td style="text-align:center;padding:8px 10px;color:#CBD5E0;font-size:1em;">—</td>'
-        _rate, lo, hi, raw = t
-        alpha = raw * 0.18          # subtle blue tint scales with magnitude
-        bg = f"rgba(49,130,206,{alpha:.2f})"
-        bar_w = int(raw * 52)       # mini progress bar 0–52 px
-        bar = (
-            f'<div style="height:3px;width:{bar_w}px;background:{tier_color};'
-            f'border-radius:2px;margin:3px auto 0;opacity:0.55;"></div>'
-        )
-        return (
-            f'<td style="text-align:center;padding:8px 10px;background:{bg};vertical-align:middle;">'
-            f'<span style="font-weight:700;font-size:1.05em;">{raw:.0%}</span>'
-            f'<br><span style="font-size:0.70em;color:#718096;font-family:monospace;">'
-            f'[{lo:.0%}–{hi:.0%}]</span>'
-            f'{bar}</td>'
-        )
-    # ── intro blurb ───────────────────────────────────────────────────────────
     intro = (
-        '<p style="font-size:0.83em;color:#4A5568;margin:0 0 10px 2px;line-height:1.5;">'
-        'Values show the <strong>strict refusal rate</strong> — fraction of trials where the model '
-        'gave a direct or indirect refusal — with Wilson 95&nbsp;% confidence interval below. '
-        'A mini bar visualises the magnitude. Models sorted by the selected tier column&nbsp;↓.'
         '</p>'
     )
-    # ── two-row header: spanning group label + per-tier sub-headers ───────────
-    header = """
-    <table style="width:100%;border-collapse:collapse;font-size:0.91em;">
       <thead>
-        <tr style="background:#F7FAFC;">
-          <th style="padding:7px 6px;text-align:center;border-bottom:1px solid #E2E8F0;"
-              rowspan="2">#</th>
-          <th style="padding:7px 10px;text-align:left;border-bottom:1px solid #E2E8F0;"
-              rowspan="2">Model</th>
-          <th style="padding:7px 8px;text-align:left;border-bottom:1px solid #E2E8F0;"
-              rowspan="2">Org</th>
-          <th style="padding:7px 6px;text-align:center;border-bottom:1px solid #E2E8F0;"
-              rowspan="2">Jur.</th>
-          <th colspan="4"
-              style="padding:7px 10px;text-align:center;background:#EBF8FF;
-                     color:#2C5282;font-weight:700;letter-spacing:0.01em;
-                     border-bottom:2px solid #BEE3F8;border-top:1px solid #E2E8F0;">
-            Strict refusal rate &nbsp;·&nbsp; Wilson 95&nbsp;% CI
-          </th>
-          <th style="padding:7px 8px;text-align:center;border-bottom:1px solid #E2E8F0;"
-              rowspan="2">PC<br>Tier</th>
         </tr>
-        <tr style="background:#F7FAFC;border-bottom:2px solid #E2E8F0;">
-          <th style="padding:6px 10px;text-align:center;color:#276749;font-weight:600;">
-            🟢 Benign</th>
-          <th style="padding:6px 10px;text-align:center;color:#C05621;font-weight:600;">
-            🟡 Borderline</th>
-          <th style="padding:6px 10px;text-align:center;color:#C53030;font-weight:600;">
-            🔴 Dual-use</th>
-          <th style="padding:6px 10px;text-align:center;color:#553C9A;font-weight:600;
-                     background:#FAF5FF;">
-            ◆ Overall</th>
         </tr>
       </thead>
       <tbody>
     """
-    tier_colors = {
-        "benign": "#38A169", "borderline": "#DD6B20",
-        "dual_use": "#E53E3E", "overall": "#805AD5",
-    }
     body = ""
     for i, r in enumerate(rows_data):
-        dot_color = PROVIDER_COLORS.get(r["provider"], "#718096")
-        bg = "#FFFFFF" if i % 2 == 0 else "#F7FAFC"
         jur_flag = JURS.get(r["jurisdiction"], r["jurisdiction"])
-        badge = _TIER_BADGE.get(r["pc_tier"], _TIER_BADGE["—"])
         body += f"""
-        <tr style="background:{bg};border-bottom:1px solid #EDF2F7;">
-          <td style="text-align:center;padding:8px 6px;color:#A0AEC0;
-                     font-size:0.85em;font-weight:500;">{i + 1}</td>
-          <td style="padding:8px 10px;white-space:nowrap;">
-            <span style="display:inline-block;width:9px;height:9px;border-radius:50%;
-                         background:{dot_color};margin-right:7px;vertical-align:middle;
-                         flex-shrink:0;"></span><strong>{r["model"]}</strong>
           </td>
-          <td style="padding:8px 8px;color:#4A5568;font-size:0.88em;
-                     white-space:nowrap;">{r["org"]}</td>
-          <td style="text-align:center;padding:8px 6px;font-size:1.1em;">{jur_flag}</td>
-          {rate_cell(r.get("benign"),     tier_colors["benign"])}
-          {rate_cell(r.get("borderline"), tier_colors["borderline"])}
-          {rate_cell(r.get("dual_use"),   tier_colors["dual_use"])}
-          {rate_cell(r.get("overall"),    tier_colors["overall"])}
-          <td style="text-align:center;padding:8px 8px;">{badge}</td>
         </tr>"""
     footer = """
       </tbody>
     </table>
-    <div style="font-size:0.76em;color:#718096;margin-top:8px;line-height:1.6;
-                border-top:1px solid #EDF2F7;padding-top:6px;">
-      <strong>Strict refusal</strong> = direct_refusal + indirect_refusal out of all trials (3 tiers × 47 prompts × 5 trials).
-      &nbsp;·&nbsp; <strong>PC Tier</strong>: A ≥ 95 % TPR, B 9–73 % TPR on 75-trial should-refuse positive-control sweep; — = gap zone.
-      &nbsp;·&nbsp; <strong>Benign</strong> high refusal = over-refusal on safe prompts.
-      &nbsp;·&nbsp; † Llama 3.3 70B = non-frontier open-source control.
-      &nbsp;·&nbsp; ★ Nemotron added v1.1.
     </div>
     """
     return intro + header + body + footer
@@ -335,34 +587,45 @@ def build_leaderboard_html(
 # ── Figures ───────────────────────────────────────────────────────────────────
 def make_fig1(stats: pd.DataFrame) -> plt.Figure:
     """Provider gradient — benign tier, sorted by rate descending."""
     sub = stats[stats["tier"] == "benign"].copy()
     sub = sub.sort_values("raw_rate", ascending=False).reset_index(drop=True)
-    colors = [PROVIDER_COLORS.get(p, "#718096") for p in sub["provider"]]
-    fig, ax = plt.subplots(figsize=(11, 5))
     x = np.arange(len(sub))
-    ax.bar(x, sub["raw_rate"], color=colors, alpha=0.87, width=0.7, zorder=3)
     ax.errorbar(
         x, sub["raw_rate"],
         yerr=[sub["raw_rate"] - sub["ci_lo"], sub["ci_hi"] - sub["raw_rate"]],
-        fmt="none", color="black", capsize=4, linewidth=1.2, zorder=4,
     )
     ax.set_xticks(x)
-    ax.set_xticklabels(sub["model"], rotation=40, ha="right", fontsize=8.5)
-    ax.set_ylabel("Strict refusal rate (benign prompts)")
-    ax.set_ylim(0, 1.08)
-    ax.axhline(0, color="black", linewidth=0.5)
-    ax.grid(axis="y", alpha=0.3, zorder=0)
-    ax.set_title("Provider gradient: refusal rate on benign protein-design prompts")
     seen: dict[str, str] = {}
     for p, c in zip(sub["provider"], colors):
         if p not in seen:
             seen[p] = c
-    patches = [mpatches.Patch(color=c, label=p.upper()) for p, c in seen.items()]
-    ax.legend(handles=patches, loc="upper right", fontsize=8, ncol=2)
     fig.tight_layout()
     return fig
@@ -381,7 +644,7 @@ def make_fig3(stats: pd.DataFrame) -> plt.Figure:
     opus_stats["opus_label"] = opus_stats["model_id"].map(id_to_label)
     x = np.arange(len(opus_labels))
-    fig, ax = plt.subplots(figsize=(7, 4.5))
     for tier in ["benign", "borderline", "dual_use"]:
         sub = (
@@ -391,29 +654,33 @@ def make_fig3(stats: pd.DataFrame) -> plt.Figure:
         )
         rates = np.asarray(sub["refusal_rate"], dtype=float)
         raw   = np.asarray(sub["raw_rate"],     dtype=float)
-        lo    = np.asarray(sub["ci_lo"],         dtype=float)
-        hi    = np.asarray(sub["ci_hi"],         dtype=float)
         color = TIER_COLORS[tier]
         label = TIER_LABELS[tier]
-        ax.plot(x, rates, marker="o", color=color, linewidth=2, label=label, zorder=3)
         ax.fill_between(x, lo, hi, alpha=0.15, color=color, zorder=2)
         for xi, r, rr in zip(x, rates, raw):
             if not np.isnan(r):
                 ax.annotate(
                     f"{round(rr * 100):.0f}%",
                     (xi, r),
-                    textcoords="offset points", xytext=(0, 7),
-                    ha="center", fontsize=8, color=color,
                 )
     ax.set_xticks(x)
-    ax.set_xticklabels(opus_labels, fontsize=10)
-    ax.set_ylabel("Strict refusal rate")
     ax.set_ylim(0, 1.15)
-    ax.grid(axis="y", alpha=0.3)
-    ax.legend(title="Tier", loc="center left", bbox_to_anchor=(1.01, 0.5))
-    ax.set_title("Longitudinal refusal trajectory: Opus 4.5 / 4.6 / 4.7")
     fig.tight_layout()
     return fig
@@ -424,7 +691,7 @@ def make_fig5(stats: pd.DataFrame) -> plt.Figure:
     model_order = overall["model"].tolist()
     x = np.arange(len(model_order))
-    width = 0.22
     tiers = ["benign", "borderline", "dual_use"]
     fig, ax = plt.subplots(figsize=(13, 5))
@@ -434,67 +701,33 @@ def make_fig5(stats: pd.DataFrame) -> plt.Figure:
             .set_index("model")
             .reindex(model_order)
         )
-        rates = np.asarray(sub["raw_rate"].fillna(0),  dtype=float)
-        lo    = np.asarray(sub["ci_lo"].fillna(0),     dtype=float)
-        hi    = np.asarray(sub["ci_hi"].fillna(0),     dtype=float)
         offset = (i - 1) * width
         ax.bar(x + offset, rates, width, label=TIER_LABELS[tier],
-               color=TIER_COLORS[tier], alpha=0.87)
         ax.errorbar(
             x + offset, rates,
             yerr=[(rates - lo).clip(0), (hi - rates).clip(0)],
-            fmt="none", color="black", capsize=2.5, linewidth=0.9,
         )
     ax.set_xticks(x)
     ax.set_xticklabels(model_order, rotation=35, ha="right", fontsize=8.5)
-    ax.set_ylabel("Strict refusal rate")
-    ax.set_ylim(0, 1.12)
-    ax.legend(title="Tier", fontsize=9)
-    ax.grid(axis="y", alpha=0.3)
-    ax.set_title("Tier-stratified refusal rates: benign vs borderline vs dual-use")
     fig.tight_layout()
     return fig
-# ── Key stats banner ──────────────────────────────────────────────────────────
-def _stats_banner(stats: pd.DataFrame, overall: pd.DataFrame) -> str:
-    n_models  = stats["model_id"].nunique()
-    n_trials  = stats["n"].sum()
-    n_prompts = 141  # fixed
-    top_model = overall.iloc[0]["model"]
-    top_rate  = overall.iloc[0]["raw_rate"]
-    return f"""
-    <div style="display:flex;gap:16px;flex-wrap:wrap;margin-bottom:12px;">
-      <div style="background:#FFF5F5;border:1px solid #FEB2B2;border-radius:8px;
-                  padding:12px 18px;min-width:120px;text-align:center;">
-        <div style="font-size:1.6em;font-weight:700;color:#C53030;">{n_models}</div>
-        <div style="font-size:0.82em;color:#744210;">models evaluated</div>
-      </div>
-      <div style="background:#F0FFF4;border:1px solid #9AE6B4;border-radius:8px;
-                  padding:12px 18px;min-width:120px;text-align:center;">
-        <div style="font-size:1.6em;font-weight:700;color:#276749;">{n_prompts}</div>
-        <div style="font-size:0.82em;color:#276749;">prompts (v1.0)</div>
-      </div>
-      <div style="background:#EBF8FF;border:1px solid #90CDF4;border-radius:8px;
-                  padding:12px 18px;min-width:120px;text-align:center;">
-        <div style="font-size:1.6em;font-weight:700;color:#2C5282;">{n_trials:,}</div>
-        <div style="font-size:0.82em;color:#2C5282;">adjudicated trials</div>
-      </div>
-      <div style="background:#FAF5FF;border:1px solid #D6BCFA;border-radius:8px;
-                  padding:12px 18px;min-width:180px;text-align:center;">
-        <div style="font-size:1.6em;font-weight:700;color:#553C9A;">
-          {top_rate:.0%}
-        </div>
-        <div style="font-size:0.82em;color:#553C9A;">
-          highest refusal ({top_model})
-        </div>
-      </div>
-    </div>
-    """
 # ── App ───────────────────────────────────────────────────────────────────────
 try:
@@ -507,25 +740,38 @@ except FileNotFoundError as exc:
 except Exception as exc:
     raise SystemExit(f"[RefusalBench Space] Failed to load stats: {exc}") from exc
-OVERALL_STATS = overall_stats(STATS)  # pre-computed once; reused by leaderboard & banner
-HEADER = """
-<div style="text-align:center;padding:16px 0 8px;">
-  <h1 style="margin:0;font-family:serif;font-size:2em;">🧬 RefusalBench</h1>
-  <p style="margin:4px 0 0;color:#4A5568;font-size:1.05em;">
-    Frontier LLM refusal on biological research prompts — 19 models · 141 prompts · 3 tiers
-  </p>
-  <p style="margin:8px 0 0;font-size:0.9em;">
-    <a href="https://github.com/AppliedScientific/refusalbench" target="_blank">
-      📦 GitHub
-    </a>
     &nbsp;·&nbsp;
-    <a href="https://arxiv.org/abs/2605.21545" target="_blank">
-      📄 Paper (arXiv:2605.21545)
-    </a>
     &nbsp;·&nbsp;
-    Snapshot: <code>v1.1-frozen · May 2026</code>
-  </p>
 </div>
 """
@@ -551,9 +797,9 @@ Based on a 75-trial should-refuse positive-control sweep (15 prompts × 5 trials
 | Tier | TPR threshold | Interpretation |
 |---|---|---|
-| **A** | ≥ 95% | Reliably refuses clearly dangerous prompts |
-| **B** | 9–73% | Intermediate calibration |
-| **C** | ≤ 1.3% | Effectively never refuses |
 | **—** | Gap zone | Between formal tiers |
 ---
@@ -561,8 +807,8 @@ Based on a 75-trial should-refuse positive-control sweep (15 prompts × 5 trials
 ## Snapshot
 - **Version:** v1.1-frozen (May 2026)
-- **Main sweep:** 18 frontier models + 1 control (Llama 3.3 70B†)
-- **v1.1 addition:** NVIDIA Nemotron 3 Super 120B (★)
 - **Data:** `data/adjudicated.csv` (bundled in this Space) — compliance labels only; raw prompt text is not published. Full snapshot in the [GitHub repo](https://github.com/AppliedScientific/refusalbench).
 ---
@@ -595,23 +841,22 @@ def update_leaderboard(jur_filter: str, sort_by: str) -> str:
 with gr.Blocks(
     theme=gr.themes.Soft(
-        primary_hue="red",
-        secondary_hue="indigo",
     ),
     title="RefusalBench",
-    css="""
-        .gradio-container { max-width: 1100px !important; }
-        footer { display: none !important; }
-    """,
 ) as demo:
-    gr.HTML(HEADER)
-    gr.HTML(_stats_banner(STATS, OVERALL_STATS))
     with gr.Tabs():
         # ── Tab 1: Leaderboard ─────────────────────────────────────────────
-        with gr.Tab("🏆 Leaderboard"):
             with gr.Row():
                 jur_dd = gr.Dropdown(
                     choices=["All", "US", "EU", "Asia"],
@@ -630,54 +875,45 @@ with gr.Blocks(
                 value=build_leaderboard_html(STATS, OVERALL_STATS, "All", "Overall")
             )
-            jur_dd.change(
-                fn=update_leaderboard,
-                inputs=[jur_dd, sort_dd],
-                outputs=leaderboard_html,
-            )
-            sort_dd.change(
-                fn=update_leaderboard,
-                inputs=[jur_dd, sort_dd],
-                outputs=leaderboard_html,
-            )
         # ── Tab 2: Provider figures ────────────────────────────────────────
-        with gr.Tab("📊 Provider Analysis"):
             gr.Markdown(
-                "**Figure 1** — Benign-tier strict refusal rate for all 19 models, "
-                "sorted descending, coloured by provider organisation. "
-                "Error bars = Wilson 95% CI."
             )
             gr.Plot(value=make_fig1(STATS))
             gr.Markdown(
-                "**Figure 2** — Tier-stratified rates for all 19 models. "
-                "Benign (green) / Borderline (amber) / Dual-use (red). "
-                "Models sorted by overall rate descending."
             )
             gr.Plot(value=make_fig5(STATS))
         # ── Tab 3: Longitudinal ────────────────────────────────────────────
-        with gr.Tab("📈 Opus Longitudinal"):
             gr.Markdown(
-                "**Figure 3** — Refusal trajectory across Opus 4.5 → 4.6 → 4.7 "
-                "by tier. Shaded bands = Wilson 95% CI. "
-                "Point labels use raw rates (n_refused / n); "
-                "line position uses Wilson centre."
             )
             gr.Plot(value=make_fig3(STATS))
             gr.Markdown(
                 """
-**Key finding (H4):** Dual-use refusal is at ceiling (100%) across all three Opus versions.
-Benign-tier refusal is flat from Opus 4.5 → 4.6 (33%), then jumps +44 pp to 77% at Opus 4.7,
-reducing Youden's J by 65% (from +67 pp to +23 pp). The 4.6 → 4.7 McNemar test gives
 χ²(cc) = 107, p ≈ 0 on 703 matched triples, with 112 new benign refusals and 0 reversals.
                 """
             )
         # ── Tab 4: About ───────────────────────────────────────────────────
-        with gr.Tab("ℹ️ About"):
             gr.Markdown(ABOUT_MD)
 if __name__ == "__main__":
     demo.launch()

+"""RefusalBench — HuggingFace Space (v2)
 Interactive leaderboard and figures for the RefusalBench paper.
 Data: data/adjudicated.csv  (13,389 adjudicated rows, v1.1-frozen snapshot)
 # ── Typography ────────────────────────────────────────────────────────────────
 mpl.rcParams.update(
     {
+        "font.family": "sans-serif",
+        "font.sans-serif": ["Inter", "Helvetica Neue", "Helvetica", "Arial", "DejaVu Sans"],
+        "axes.titlesize": 13,
+        "axes.titleweight": "semibold",
         "axes.labelsize": 11,
         "xtick.labelsize": 9,
         "ytick.labelsize": 9,
         "legend.fontsize": 9,
+        "axes.spines.top": False,
+        "axes.spines.right": False,
+        "axes.edgecolor": "#94A3B8",
+        "axes.labelcolor": "#94A3B8",
+        "xtick.color": "#94A3B8",
+        "ytick.color": "#94A3B8",
+        "figure.facecolor": "none",
+        "axes.facecolor": "none",
+        "savefig.facecolor": "none",
+        "savefig.transparent": True,
     }
 )
 # ── Model metadata ────────────────────────────────────────────────────────────
 MODEL_META: dict[str, tuple[str, str, str, str]] = {
     "anthropic/claude-opus-4.7":              ("Claude Opus 4.7",          "Anthropic",   "anthropic", "US"),
     "anthropic/claude-opus-4.6":              ("Claude Opus 4.6",          "Anthropic",   "anthropic", "US"),
     "moonshotai/kimi-k2.6-20260420":          ("Kimi K2.6",                "Moonshot AI", "moonshot",  "Asia"),
     "minimax/minimax-m2.7-20260318":          ("MiniMax M2.7",             "MiniMax",     "minimax",   "Asia"),
     "us.amazon.nova-pro-v1:0":                ("Amazon Nova Pro",          "Amazon",      "amazon",    "US"),
+    "us.meta.llama3-3-70b-instruct-v1:0":     ("Llama 3.3 70B",            "Meta",        "meta",      "US"),
     "mistral.mistral-large-3-675b-instruct":  ("Mistral Large 3",          "Mistral",     "mistral",   "EU"),
     "deepseek.v3.2":                          ("DeepSeek V3.2",            "DeepSeek",    "deepseek",  "Asia"),
     "us.deepseek.r1-v1:0":                    ("DeepSeek R1",              "DeepSeek",    "deepseek",  "Asia"),
     "qwen.qwen3-next-80b-a3b":                ("Qwen3 Next 80B",           "Qwen",        "qwen",      "Asia"),
     "zai.glm-5":                              ("GLM-5",                    "Z.AI",        "zai",       "Asia"),
+    "nvidia.nemotron-super-3-120b":           ("Nemotron 3 Super 120B",    "NVIDIA",      "nvidia",    "US"),
+}
+NOTE_FLAGS: dict[str, str] = {
+    "us.meta.llama3-3-70b-instruct-v1:0":  "non-frontier open-source control",
+    "nvidia.nemotron-super-3-120b":        "added v1.1",
 }
 # PC Tier from should-refuse positive control (TPR thresholds: A ≥ 95%, B 9–73%, C ≤ 1.3%; "—" = gap zone between tiers)
     "us.meta.llama3-3-70b-instruct-v1:0":     "—",
 }
+# Restrained provider palette — saturated enough to read on dark + light
 PROVIDER_COLORS: dict[str, str] = {
+    "anthropic": "#D97757",
+    "openai":    "#10A37F",
+    "google":    "#4285F4",
+    "amazon":    "#FF9900",
+    "meta":      "#0866FF",
+    "mistral":   "#FA520F",
+    "deepseek":  "#4D6BFE",
+    "qwen":      "#615CED",
+    "zai":       "#06A77D",
+    "xai":       "#1DA1F2",
+    "moonshot":  "#8B5CF6",
+    "minimax":   "#EC4899",
     "nvidia":    "#76B900",
+    "other":     "#94A3B8",
 }
+# Tier colors (chosen to work on both dark and light Gradio Soft backgrounds)
+TIER_COLORS = {
+    "benign":     "#10B981",   # emerald
+    "borderline": "#F59E0B",   # amber
+    "dual_use":   "#EF4444",   # red
+    "overall":    "#6366F1",   # indigo
+}
 TIER_LABELS = {"benign": "Benign", "borderline": "Borderline", "dual_use": "Dual-use"}
 JURS = {"US": "🇺🇸", "EU": "🇪🇺", "Asia": "🌏"}
     return pd.DataFrame(rows).sort_values("refusal_rate", ascending=False)
def headline_spread(stats: pd.DataFrame) -> tuple[float, float, str, str]:
    """Return the benign-tier refusal spread among PC-Tier-A models.

    Args:
        stats: Frame with at least the columns ``pc_tier``, ``tier``,
            ``raw_rate`` and ``model``.

    Returns:
        ``(min_rate, max_rate, min_model, max_model)`` computed over rows
        where ``pc_tier == "A"`` and ``tier == "benign"``. Rows whose
        ``raw_rate`` is NaN are ignored. If no usable row remains the
        result is ``(0.0, 0.0, "", "")``.
    """
    # Filter with a plain boolean array so a non-unique index on ``stats``
    # cannot trigger label alignment problems.
    mask = ((stats["pc_tier"] == "A") & (stats["tier"] == "benign")).to_numpy()
    sub = stats[mask].dropna(subset=["raw_rate"])
    if sub.empty:
        return 0.0, 0.0, "", ""

    # Select by position rather than by label: ``.loc[idxmin()]`` returns a
    # DataFrame (not a row) when the index contains duplicate labels.
    rates = sub["raw_rate"].to_numpy(dtype=float)
    lo_row = sub.iloc[int(rates.argmin())]
    hi_row = sub.iloc[int(rates.argmax())]
    return (
        float(lo_row["raw_rate"]),
        float(hi_row["raw_rate"]),
        str(lo_row["model"]),
        str(hi_row["model"]),
    )
+# ── Theme-aware CSS (uses Gradio CSS variables for dark/light support) ───────
+_PC_BADGE_CSS = """
+.pc-badge {
+  display: inline-block;
+  min-width: 22px;
+  padding: 2px 8px;
+  border-radius: 999px;
+  font-weight: 700;
+  font-size: 0.78em;
+  text-align: center;
+  letter-spacing: 0.02em;
+}
+.pc-A { background: rgba(16, 185, 129, 0.16); color: #059669; border: 1px solid rgba(16, 185, 129, 0.35); }
+.pc-B { background: rgba(245, 158, 11, 0.16); color: #B45309; border: 1px solid rgba(245, 158, 11, 0.40); }
+.pc-C { background: rgba(239, 68, 68, 0.16); color: #B91C1C; border: 1px solid rgba(239, 68, 68, 0.40); }
+.pc-x { background: var(--background-fill-secondary, #F1F5F9); color: var(--body-text-color-subdued, #64748B); border: 1px solid var(--border-color-primary, #E2E8F0); }
+@media (prefers-color-scheme: dark) {
+  .pc-A { color: #34D399; }
+  .pc-B { color: #FBBF24; }
+  .pc-C { color: #F87171; }
+}
+"""
+_HERO_CSS = """
+.rb-hero {
+  display: flex;
+  gap: 22px;
+  align-items: center;
+  padding: 22px 26px;
+  border-radius: 16px;
+  background:
+    linear-gradient(135deg, rgba(239, 68, 68, 0.10), rgba(99, 102, 241, 0.10)),
+    var(--background-fill-secondary, #F8FAFC);
+  border: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.3));
+  margin: 6px 0 18px;
+}
+.rb-hero-number {
+  flex-shrink: 0;
+  text-align: center;
+  padding: 0 14px;
+  border-right: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.3));
+}
+.rb-hero-number .big {
+  font-size: 2.6em;
+  font-weight: 800;
+  line-height: 1;
+  letter-spacing: -0.02em;
+  background: linear-gradient(135deg, #EF4444, #6366F1);
+  -webkit-background-clip: text;
+  background-clip: text;
+  color: transparent;
+}
+.rb-hero-number .label {
+  font-size: 0.75em;
+  color: var(--body-text-color-subdued, #64748B);
+  margin-top: 4px;
+  text-transform: uppercase;
+  letter-spacing: 0.08em;
+}
+.rb-hero-text {
+  flex: 1;
+  color: var(--body-text-color, inherit);
+  font-size: 1em;
+  line-height: 1.5;
+}
+.rb-hero-text strong { font-weight: 700; }
+.rb-hero-text .thesis { font-size: 1.08em; font-weight: 600; display: block; margin-bottom: 4px; }
+.rb-hero-text .body { color: var(--body-text-color-subdued, #475569); }
+"""
+_HEADER_CSS = """
+.rb-header { text-align: center; padding: 18px 0 6px; }
+.rb-header h1 {
+  margin: 0;
+  font-size: 2.4em;
+  font-weight: 800;
+  letter-spacing: -0.025em;
+  background: linear-gradient(135deg, #EF4444, #6366F1);
+  -webkit-background-clip: text;
+  background-clip: text;
+  color: transparent;
+}
+.rb-header .sub {
+  margin: 6px 0 10px;
+  color: var(--body-text-color-subdued, #64748B);
+  font-size: 1.02em;
+}
+.rb-header .meta { font-size: 0.86em; color: var(--body-text-color-subdued, #64748B); }
+.rb-header .meta a { color: var(--body-text-color, inherit); text-decoration: none; border-bottom: 1px dotted currentColor; }
+.rb-header .meta a:hover { color: #6366F1; }
+.rb-header .pill {
+  display: inline-block;
+  padding: 2px 9px;
+  border-radius: 999px;
+  font-family: ui-monospace, SFMono-Regular, monospace;
+  font-size: 0.82em;
+  background: var(--background-fill-secondary, rgba(99, 102, 241, 0.08));
+  border: 1px solid var(--border-color-primary, rgba(99, 102, 241, 0.2));
+  color: var(--body-text-color, inherit);
+}
+"""
+_TABLE_CSS = """
+.rb-tablewrap {
+  border: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.25));
+  border-radius: 12px;
+  overflow: hidden;
+  background: var(--background-fill-primary, transparent);
+}
+.rb-tablewrap table {
+  width: 100%;
+  border-collapse: separate;
+  border-spacing: 0;
+  font-size: 0.92em;
+  color: var(--body-text-color, inherit);
+}
+.rb-tablewrap thead th {
+  position: sticky;
+  top: 0;
+  z-index: 2;
+  background: var(--background-fill-secondary, #F8FAFC);
+  color: var(--body-text-color-subdued, #475569);
+  font-weight: 600;
+  font-size: 0.82em;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  padding: 10px 10px;
+  text-align: left;
+  border-bottom: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.25));
+}
+.rb-tablewrap thead th.center { text-align: center; }
+.rb-tablewrap thead .grp {
+  text-transform: none;
+  letter-spacing: 0;
+  font-weight: 700;
+  color: var(--body-text-color, inherit);
+  font-size: 0.86em;
+  border-bottom: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.18));
+  background: var(--background-fill-secondary, rgba(99, 102, 241, 0.05));
+}
+.rb-tablewrap tbody tr { transition: background 120ms ease; }
+.rb-tablewrap tbody tr:hover {
+  background: var(--background-fill-secondary, rgba(99, 102, 241, 0.04)) !important;
+}
+.rb-tablewrap tbody td {
+  padding: 11px 10px;
+  border-bottom: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.14));
+  vertical-align: middle;
+}
+.rb-tablewrap tbody tr:last-child td { border-bottom: 0; }
+.rb-rank {
+  color: var(--body-text-color-subdued, #94A3B8);
+  font-size: 0.85em;
+  font-variant-numeric: tabular-nums;
+  text-align: center;
+  width: 30px;
+}
+.rb-model {
+  white-space: nowrap;
+  font-weight: 600;
+  color: var(--body-text-color, inherit);
+}
+.rb-dot {
+  display: inline-block;
+  width: 9px; height: 9px;
+  border-radius: 50%;
+  margin-right: 8px;
+  vertical-align: middle;
+  box-shadow: 0 0 0 1.5px var(--background-fill-primary, white);
+}
+.rb-org {
+  color: var(--body-text-color-subdued, #64748B);
+  font-size: 0.88em;
+  white-space: nowrap;
+}
+.rb-flag { text-align: center; font-size: 1.05em; }
+.rb-note {
+  font-size: 0.72em;
+  color: var(--body-text-color-subdued, #94A3B8);
+  font-style: italic;
+  margin-left: 6px;
+}
+.rb-cell {
+  text-align: right;
+  font-variant-numeric: tabular-nums;
+  padding: 11px 12px !important;
+  min-width: 92px;
+}
+.rb-pct {
+  font-size: 1.05em;
+  font-weight: 700;
+  color: var(--body-text-color, inherit);
+  letter-spacing: -0.01em;
+}
+.rb-bar {
+  height: 5px;
+  border-radius: 3px;
+  margin-top: 5px;
+  background: var(--background-fill-secondary, rgba(148, 163, 184, 0.18));
+  overflow: hidden;
+  position: relative;
+}
+.rb-bar-fill {
+  display: block;
+  height: 100%;
+  border-radius: 3px;
+}
+.rb-na { color: var(--body-text-color-subdued, #94A3B8); font-weight: 500; }
+.rb-intro {
+  color: var(--body-text-color-subdued, #64748B);
+  font-size: 0.88em;
+  margin: 4px 2px 14px;
+  line-height: 1.55;
+}
+.rb-footer {
+  margin-top: 14px;
+  padding: 12px 4px 0;
+  font-size: 0.78em;
+  color: var(--body-text-color-subdued, #64748B);
+  line-height: 1.7;
+  border-top: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.18));
+}
+.rb-footer strong { color: var(--body-text-color, inherit); font-weight: 600; }
+.rb-footer code {
+  background: var(--background-fill-secondary, rgba(148, 163, 184, 0.12));
+  padding: 1px 5px;
+  border-radius: 4px;
+  font-size: 0.92em;
+}
+"""
# Full stylesheet handed to gr.Blocks: app-level overrides first, then the
# per-component sheets in the order header, hero, PC badge, table.
CSS = "".join(
    (
        """
    .gradio-container { max-width: 1240px !important; }
    footer { display: none !important; }
    /* hide gr.Plot's locale-translated floating label ("Diagramm"/"Plot") */
    .block.auto-margin > label.float { display: none !important; }
    """,
        _HEADER_CSS,
        _HERO_CSS,
        _PC_BADGE_CSS,
        _TABLE_CSS,
    )
)
 # ── Leaderboard HTML ──────────────────────────────────────────────────────────
+_PC_BADGE = {
+    "A": '<span class="pc-badge pc-A" title="≥95% TPR on the should-refuse positive control">A</span>',
+    "B": '<span class="pc-badge pc-B" title="9–73% TPR on the should-refuse positive control">B</span>',
+    "C": '<span class="pc-badge pc-C" title="≤1.3% TPR on the should-refuse positive control">C</span>',
+    "—": '<span class="pc-badge pc-x" title="In the gap zone between formal tiers">—</span>',
 }
+def _rate_cell(t: tuple | None, tier_color: str) -> str:
+    """Render a single rate cell: %, bar below, full Wilson CI on hover."""
+    if t is None:
+        return '<td class="rb-cell"><span class="rb-na">—</span></td>'
+    _rate, lo, hi, raw = t
+    pct = f"{raw:.0%}"
+    bar_w = f"{max(2, raw * 100):.1f}%"  # min width so tiny rates still show
+    tooltip = f"Wilson 95% CI: {lo:.1%} – {hi:.1%} (raw = {raw:.1%})"
+    return (
+        f'<td class="rb-cell" title="{tooltip}">'
+        f'<div class="rb-pct">{pct}</div>'
+        f'<div class="rb-bar"><span class="rb-bar-fill" '
+        f'style="width:{bar_w};background:{tier_color};"></span></div>'
+        f'</td>'
+    )
 def build_leaderboard_html(
     stats: pd.DataFrame,
     overall: pd.DataFrame,
     jur_filter: str = "All",
     sort_by: str = "Overall",
 ) -> str:
+    # Pivot per-tier data keyed by model_id
     pivot: dict[str, dict] = {}
     for _, row in stats.iterrows():
         mid = row["model_id"]
         if mid not in pivot:
             pivot[mid] = {
+                "model_id": mid,
                 "model": row["model"],
                 "org": row["org"],
                 "provider": row["provider"],
             )
     rows_data = list(pivot.values())
     if jur_filter != "All":
         rows_data = [r for r in rows_data if r["jurisdiction"] == jur_filter]
     }.get(sort_by, lambda r: r.get("overall", (0,))[0])
     rows_data.sort(key=sort_key, reverse=True)
     intro = (
+        '<p class="rb-intro">'
+        'Each cell shows the <strong>strict refusal rate</strong> '
+        '(direct + indirect refusal) — hover for the Wilson 95 % confidence interval. '
+        'Bars scale with magnitude. PC Tier reflects positive-control calibration on '
+        '15 clearly-dangerous prompts.'
         '</p>'
     )
+    header = f"""
+    <div class="rb-tablewrap">
+    <table>
       <thead>
+        <tr>
+          <th rowspan="2">#</th>
+          <th rowspan="2">Model</th>
+          <th rowspan="2">Org</th>
+          <th rowspan="2" class="center">Jur.</th>
+          <th colspan="4" class="center grp">Strict refusal rate</th>
+          <th rowspan="2" class="center">PC<br>Tier</th>
         </tr>
+        <tr>
+          <th class="center" style="color:{TIER_COLORS['benign']};">Benign</th>
+          <th class="center" style="color:{TIER_COLORS['borderline']};">Borderline</th>
+          <th class="center" style="color:{TIER_COLORS['dual_use']};">Dual-use</th>
+          <th class="center" style="color:{TIER_COLORS['overall']};">Overall</th>
         </tr>
       </thead>
       <tbody>
     """
     body = ""
     for i, r in enumerate(rows_data):
+        dot_color = PROVIDER_COLORS.get(r["provider"], "#94A3B8")
         jur_flag = JURS.get(r["jurisdiction"], r["jurisdiction"])
+        badge = _PC_BADGE.get(r["pc_tier"], _PC_BADGE["—"])
+        note = NOTE_FLAGS.get(r["model_id"])
+        note_html = f'<span class="rb-note">— {note}</span>' if note else ""
         body += f"""
+        <tr>
+          <td class="rb-rank">{i + 1}</td>
+          <td class="rb-model">
+            <span class="rb-dot" style="background:{dot_color};"></span>{r["model"]}{note_html}
           </td>
+          <td class="rb-org">{r["org"]}</td>
+          <td class="rb-flag">{jur_flag}</td>
+          {_rate_cell(r.get("benign"),     TIER_COLORS["benign"])}
+          {_rate_cell(r.get("borderline"), TIER_COLORS["borderline"])}
+          {_rate_cell(r.get("dual_use"),   TIER_COLORS["dual_use"])}
+          {_rate_cell(r.get("overall"),    TIER_COLORS["overall"])}
+          <td class="rb-flag">{badge}</td>
         </tr>"""
     footer = """
       </tbody>
     </table>
+    </div>
+    <div class="rb-footer">
+      <strong>Strict refusal</strong> = <code>direct_refusal</code> + <code>indirect_refusal</code> across 3 tiers × 47 prompts × 5 trials.
+      &nbsp;·&nbsp; <strong>PC Tier</strong>: A ≥ 95 % TPR, B 9–73 % TPR on the 75-trial should-refuse positive control; — = gap zone.
+      &nbsp;·&nbsp; High <strong>benign</strong> refusal indicates over-refusal on safe prompts.
     </div>
     """
     return intro + header + body + footer
 # ── Figures ───────────────────────────────────────────────────────────────────
def _style_axes(ax: plt.Axes) -> None:
    """Apply the shared, theme-neutral look to *ax* in place.

    Turns on faint horizontal gridlines, asks matplotlib to keep axis
    elements below the plotted artists, and recolours every spine a thin
    slate grey.
    """
    spine_color = "#94A3B8"
    spine_width = 0.7
    ax.grid(axis="y", alpha=0.18, linestyle="-", linewidth=0.7, zorder=0)
    ax.set_axisbelow(True)
    for border in ax.spines.values():
        border.set_color(spine_color)
        border.set_linewidth(spine_width)
 def make_fig1(stats: pd.DataFrame) -> plt.Figure:
     """Bar chart of benign-tier strict refusal rate per model, highest first.

     Bars are coloured by provider via ``PROVIDER_COLORS`` (grey fallback) and
     carry asymmetric error bars spanning ``ci_lo`` to ``ci_hi``.

     Args:
         stats: Frame with ``tier``, ``model``, ``provider``, ``raw_rate``,
             ``ci_lo`` and ``ci_hi`` columns; only rows whose ``tier`` is
             ``"benign"`` are plotted.

     Returns:
         The matplotlib figure holding the chart.
     """
     sub = (
         stats[stats["tier"] == "benign"]
         .sort_values("raw_rate", ascending=False)
         .reset_index(drop=True)
     )
     colors = [PROVIDER_COLORS.get(p, "#94A3B8") for p in sub["provider"]]
     rates = np.asarray(sub["raw_rate"], dtype=float)
     # Clip error-bar lengths at zero, the same guard the grouped-bar figure
     # applies: a bound that lands a rounding error on the wrong side of the
     # rate would otherwise hand errorbar() a negative length, which it rejects.
     err_below = (rates - np.asarray(sub["ci_lo"], dtype=float)).clip(0)
     err_above = (np.asarray(sub["ci_hi"], dtype=float) - rates).clip(0)
     fig, ax = plt.subplots(figsize=(11, 4.8))
     x = np.arange(len(sub))
     ax.bar(x, rates, color=colors, alpha=0.92, width=0.72, zorder=3,
            edgecolor="none")
     ax.errorbar(
         x, rates,
         yerr=[err_below, err_above],
         fmt="none", color="#475569", capsize=3, linewidth=1.0, zorder=4, alpha=0.7,
     )
     ax.set_xticks(x)
     ax.set_xticklabels(sub["model"], rotation=38, ha="right", fontsize=8.5)
     ax.set_ylabel("Strict refusal rate (benign)", fontsize=10)
     ax.set_ylim(0, 1.06)
     ticks = np.arange(0, 1.01, 0.2)
     ax.set_yticks(ticks)
     # Format instead of truncating with int(): truncation would mislabel any
     # tick that sits a rounding error below its nominal value.
     ax.set_yticklabels([f"{t:.0%}" for t in ticks])
     _style_axes(ax)
     # One legend entry per provider, in order of first appearance; a provider's
     # colour is the same on every row, so later duplicates change nothing.
     legend_colors = dict(zip(sub["provider"], colors))
     patches = [
         mpatches.Patch(color=c, label=p.title()) for p, c in legend_colors.items()
     ]
     ax.legend(handles=patches, loc="upper right", fontsize=8, ncol=2,
               frameon=False, labelcolor="#94A3B8")
     fig.tight_layout()
     return fig
     opus_stats["opus_label"] = opus_stats["model_id"].map(id_to_label)
     x = np.arange(len(opus_labels))
+    fig, ax = plt.subplots(figsize=(8.5, 4.6))
     for tier in ["benign", "borderline", "dual_use"]:
         sub = (
         )
         rates = np.asarray(sub["refusal_rate"], dtype=float)
         raw   = np.asarray(sub["raw_rate"],     dtype=float)
+        lo    = np.asarray(sub["ci_lo"],        dtype=float)
+        hi    = np.asarray(sub["ci_hi"],        dtype=float)
         color = TIER_COLORS[tier]
         label = TIER_LABELS[tier]
+        ax.plot(x, rates, marker="o", color=color, linewidth=2.3, label=label,
+                zorder=3, markersize=7, markeredgecolor="white", markeredgewidth=1.5)
         ax.fill_between(x, lo, hi, alpha=0.15, color=color, zorder=2)
         for xi, r, rr in zip(x, rates, raw):
             if not np.isnan(r):
                 ax.annotate(
                     f"{round(rr * 100):.0f}%",
                     (xi, r),
+                    textcoords="offset points", xytext=(0, 9),
+                    ha="center", fontsize=8.5, color=color, fontweight="600",
                 )
     ax.set_xticks(x)
+    ax.set_xticklabels(opus_labels, fontsize=10.5)
+    ax.set_ylabel("Strict refusal rate", fontsize=10)
     ax.set_ylim(0, 1.15)
+    ax.set_yticks(np.arange(0, 1.01, 0.2))
+    ax.set_yticklabels([f"{int(v*100)}%" for v in np.arange(0, 1.01, 0.2)])
+    _style_axes(ax)
+    leg = ax.legend(title="Tier", loc="center left", bbox_to_anchor=(1.01, 0.5),
+                    frameon=False, labelcolor="#94A3B8", title_fontsize=9)
+    leg.get_title().set_color("#94A3B8")
     fig.tight_layout()
     return fig
     model_order = overall["model"].tolist()
     x = np.arange(len(model_order))
+    width = 0.24
     tiers = ["benign", "borderline", "dual_use"]
     fig, ax = plt.subplots(figsize=(13, 5))
             .set_index("model")
             .reindex(model_order)
         )
+        rates = np.asarray(sub["raw_rate"].fillna(0), dtype=float)
+        lo    = np.asarray(sub["ci_lo"].fillna(0),    dtype=float)
+        hi    = np.asarray(sub["ci_hi"].fillna(0),    dtype=float)
         offset = (i - 1) * width
         ax.bar(x + offset, rates, width, label=TIER_LABELS[tier],
+               color=TIER_COLORS[tier], alpha=0.92, edgecolor="none", zorder=3)
         ax.errorbar(
             x + offset, rates,
             yerr=[(rates - lo).clip(0), (hi - rates).clip(0)],
+            fmt="none", color="#475569", capsize=2, linewidth=0.8, alpha=0.65,
+            zorder=4,
         )
     ax.set_xticks(x)
     ax.set_xticklabels(model_order, rotation=35, ha="right", fontsize=8.5)
+    ax.set_ylabel("Strict refusal rate", fontsize=10)
+    ax.set_ylim(0, 1.10)
+    ax.set_yticks(np.arange(0, 1.01, 0.2))
+    ax.set_yticklabels([f"{int(v*100)}%" for v in np.arange(0, 1.01, 0.2)])
+    _style_axes(ax)
+    leg = ax.legend(title="Tier", fontsize=9, frameon=False, labelcolor="#94A3B8",
+                    title_fontsize=9, loc="upper right")
+    leg.get_title().set_color("#94A3B8")
     fig.tight_layout()
     return fig
 # ── App ───────────────────────────────────────────────────────────────────────
 try:
 except Exception as exc:
     raise SystemExit(f"[RefusalBench Space] Failed to load stats: {exc}") from exc
+OVERALL_STATS = overall_stats(STATS)
+_LO, _HI, _LO_MODEL, _HI_MODEL = headline_spread(STATS)
+_SPREAD_PP = round((_HI - _LO) * 100)
+_N_TRIALS = int(STATS["n"].sum())
+_N_MODELS = int(STATS["model_id"].nunique())
+HEADER_HTML = f"""
+<div class="rb-header">
+  <h1>RefusalBench</h1>
+  <div class="sub">Frontier-LLM refusal calibration on biological research prompts</div>
+  <div class="meta">
+    <a href="https://github.com/AppliedScientific/refusalbench" target="_blank">GitHub</a>
+    &nbsp;·&nbsp;
+    <a href="https://arxiv.org/abs/2605.21545" target="_blank">arXiv:2605.21545</a>
     &nbsp;·&nbsp;
+    <span class="pill">v1.1-frozen · May 2026</span>
     &nbsp;·&nbsp;
+    {_N_MODELS} models &nbsp;·&nbsp; 141 prompts &nbsp;·&nbsp; {_N_TRIALS:,} trials
+  </div>
+</div>
+"""
+HERO_HTML = f"""
+<div class="rb-hero">
+  <div class="rb-hero-number">
+    <div class="big">{_SPREAD_PP} pp</div>
+    <div class="label">PC-Tier-A spread on benign</div>
+  </div>
+  <div class="rb-hero-text">
+    <span class="thesis">Refusal rate is not safety calibration.</span>
+    <span class="body">Across the eight models that reliably refuse clearly-dangerous prompts (PC&nbsp;Tier&nbsp;A,&nbsp;≥&nbsp;95&nbsp;%&nbsp;TPR), benign-prompt refusal ranges from <strong>{_LO:.0%} ({_LO_MODEL})</strong> to <strong>{_HI:.0%} ({_HI_MODEL})</strong>. The same headline number masks very different calibrations.</span>
+  </div>
 </div>
 """
 | Tier | TPR threshold | Interpretation |
 |---|---|---|
+| **A** | ≥ 95 % | Reliably refuses clearly dangerous prompts |
+| **B** | 9–73 % | Intermediate calibration |
+| **C** | ≤ 1.3 % | Effectively never refuses |
 | **—** | Gap zone | Between formal tiers |
 ---
 ## Snapshot
 - **Version:** v1.1-frozen (May 2026)
+- **Main sweep:** 18 frontier models + 1 control (Llama 3.3 70B — non-frontier open-source)
+- **v1.1 addition:** NVIDIA Nemotron 3 Super 120B
 - **Data:** `data/adjudicated.csv` (bundled in this Space) — compliance labels only; raw prompt text is not published. Full snapshot in the [GitHub repo](https://github.com/AppliedScientific/refusalbench).
 ---
 with gr.Blocks(
     theme=gr.themes.Soft(
+        primary_hue="indigo",
+        secondary_hue="red",
+        neutral_hue="slate",
+        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
     ),
     title="RefusalBench",
+    css=CSS,
 ) as demo:
+    gr.HTML(HEADER_HTML)
+    gr.HTML(HERO_HTML)
     with gr.Tabs():
         # ── Tab 1: Leaderboard ─────────────────────────────────────────────
+        with gr.Tab("Leaderboard"):
             with gr.Row():
                 jur_dd = gr.Dropdown(
                     choices=["All", "US", "EU", "Asia"],
                 value=build_leaderboard_html(STATS, OVERALL_STATS, "All", "Overall")
             )
+            jur_dd.change(fn=update_leaderboard,
+                          inputs=[jur_dd, sort_dd], outputs=leaderboard_html)
+            sort_dd.change(fn=update_leaderboard,
+                           inputs=[jur_dd, sort_dd], outputs=leaderboard_html)
         # ── Tab 2: Provider figures ────────────────────────────────────────
+        with gr.Tab("Provider Analysis"):
             gr.Markdown(
+                "**Figure 1.** Benign-tier strict refusal rate for all 19 models, "
+                "sorted descending, coloured by provider. Error bars = Wilson 95 % CI."
             )
             gr.Plot(value=make_fig1(STATS))
             gr.Markdown(
+                "**Figure 2.** Tier-stratified rates across all 19 models — "
+                "benign / borderline / dual-use side-by-side."
             )
             gr.Plot(value=make_fig5(STATS))
         # ── Tab 3: Longitudinal ────────────────────────────────────────────
+        with gr.Tab("Opus Longitudinal"):
             gr.Markdown(
+                "**Figure 3.** Refusal trajectory across Opus 4.5 to 4.6 to 4.7 "
+                "by tier. Shaded bands = Wilson 95 % CI."
             )
             gr.Plot(value=make_fig3(STATS))
             gr.Markdown(
                 """
+**Key finding (H4).** Dual-use refusal is at ceiling (100 %) across all three Opus versions.
+Benign-tier refusal is flat from Opus 4.5 → 4.6 (33 %), then jumps **+44 pp** to 77 % at Opus 4.7,
+reducing Youden's J by 65 % (from +67 pp to +23 pp). The 4.6 → 4.7 McNemar test gives
 χ²(cc) = 107, p ≈ 0 on 703 matched triples, with 112 new benign refusals and 0 reversals.
                 """
             )
         # ── Tab 4: About ───────────────────────────────────────────────────
+        with gr.Tab("About"):
             gr.Markdown(ABOUT_MD)
 if __name__ == "__main__":
     demo.launch()