VibeCodingScientist committed on
Commit
5eaec60
·
verified ·
1 Parent(s): 49bc134

Redesign UI: theme-aware leaderboard, thesis-forward hero, cleaner cells

Browse files
Files changed (1) hide show
  1. app.py +481 -245
app.py CHANGED
@@ -1,4 +1,4 @@
1
- """RefusalBench — HuggingFace Space
2
  Interactive leaderboard and figures for the RefusalBench paper.
3
 
4
  Data: data/adjudicated.csv (13,389 adjudicated rows, v1.1-frozen snapshot)
@@ -19,19 +19,28 @@ import pandas as pd
19
  # ── Typography ────────────────────────────────────────────────────────────────
20
  mpl.rcParams.update(
21
  {
22
- "font.family": "serif",
23
- "font.serif": ["Times New Roman", "Times", "DejaVu Serif"],
24
- "mathtext.fontset": "stix",
25
- "axes.titlesize": 12,
26
  "axes.labelsize": 11,
27
  "xtick.labelsize": 9,
28
  "ytick.labelsize": 9,
29
  "legend.fontsize": 9,
 
 
 
 
 
 
 
 
 
 
30
  }
31
  )
32
 
33
  # ── Model metadata ────────────────────────────────────────────────────────────
34
- # (model_id) → (display_name, org, provider_key, jurisdiction)
35
  MODEL_META: dict[str, tuple[str, str, str, str]] = {
36
  "anthropic/claude-opus-4.7": ("Claude Opus 4.7", "Anthropic", "anthropic", "US"),
37
  "anthropic/claude-opus-4.6": ("Claude Opus 4.6", "Anthropic", "anthropic", "US"),
@@ -45,13 +54,18 @@ MODEL_META: dict[str, tuple[str, str, str, str]] = {
45
  "moonshotai/kimi-k2.6-20260420": ("Kimi K2.6", "Moonshot AI", "moonshot", "Asia"),
46
  "minimax/minimax-m2.7-20260318": ("MiniMax M2.7", "MiniMax", "minimax", "Asia"),
47
  "us.amazon.nova-pro-v1:0": ("Amazon Nova Pro", "Amazon", "amazon", "US"),
48
- "us.meta.llama3-3-70b-instruct-v1:0": ("Llama 3.3 70B", "Meta", "meta", "US"),
49
  "mistral.mistral-large-3-675b-instruct": ("Mistral Large 3", "Mistral", "mistral", "EU"),
50
  "deepseek.v3.2": ("DeepSeek V3.2", "DeepSeek", "deepseek", "Asia"),
51
  "us.deepseek.r1-v1:0": ("DeepSeek R1", "DeepSeek", "deepseek", "Asia"),
52
  "qwen.qwen3-next-80b-a3b": ("Qwen3 Next 80B", "Qwen", "qwen", "Asia"),
53
  "zai.glm-5": ("GLM-5", "Z.AI", "zai", "Asia"),
54
- "nvidia.nemotron-super-3-120b": ("Nemotron 3 Super 120B", "NVIDIA", "nvidia", "US"),
 
 
 
 
 
55
  }
56
 
57
  # PC Tier from should-refuse positive control (TPR threshold: A ≥ 95%, B 9–73%)
@@ -77,24 +91,31 @@ PC_TIER: dict[str, str] = {
77
  "us.meta.llama3-3-70b-instruct-v1:0": "—",
78
  }
79
 
 
80
  PROVIDER_COLORS: dict[str, str] = {
81
- "anthropic": "#E53E3E",
82
- "openai": "#38A169",
83
- "google": "#3182CE",
84
- "amazon": "#DD6B20",
85
- "meta": "#805AD5",
86
- "mistral": "#2B6CB0",
87
- "deepseek": "#2C7A7B",
88
- "qwen": "#D69E2E",
89
- "zai": "#319795",
90
- "xai": "#4A5568",
91
- "moonshot": "#D53F8C",
92
- "minimax": "#6B46C1",
93
  "nvidia": "#76B900",
94
- "other": "#718096",
95
  }
96
 
97
- TIER_COLORS = {"benign": "#38A169", "borderline": "#DD6B20", "dual_use": "#E53E3E"}
 
 
 
 
 
 
98
  TIER_LABELS = {"benign": "Benign", "borderline": "Borderline", "dual_use": "Dual-use"}
99
  JURS = {"US": "🇺🇸", "EU": "🇪🇺", "Asia": "🌏"}
100
 
@@ -170,29 +191,307 @@ def overall_stats(stats: pd.DataFrame) -> pd.DataFrame:
170
  return pd.DataFrame(rows).sort_values("refusal_rate", ascending=False)
171
 
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  # ── Leaderboard HTML ──────────────────────────────────────────────────────────
174
 
175
- _TIER_BADGE = {
176
- "A": '<span style="background:#C6F6D5;color:#276749;border-radius:4px;padding:1px 7px;font-weight:600;font-size:0.82em;">A</span>',
177
- "B": '<span style="background:#FEFCBF;color:#744210;border-radius:4px;padding:1px 7px;font-weight:600;font-size:0.82em;">B</span>',
178
- "C": '<span style="background:#FED7D7;color:#9B2335;border-radius:4px;padding:1px 7px;font-weight:600;font-size:0.82em;">C</span>',
179
- "—": '<span style="background:#EDF2F7;color:#4A5568;border-radius:4px;padding:1px 7px;font-weight:500;font-size:0.82em;">—</span>',
180
  }
181
 
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  def build_leaderboard_html(
184
  stats: pd.DataFrame,
185
  overall: pd.DataFrame,
186
  jur_filter: str = "All",
187
  sort_by: str = "Overall",
188
  ) -> str:
189
-
190
- # ── pivot per-tier data keyed by model_id ─────────────────────────────────
191
  pivot: dict[str, dict] = {}
192
  for _, row in stats.iterrows():
193
  mid = row["model_id"]
194
  if mid not in pivot:
195
  pivot[mid] = {
 
196
  "model": row["model"],
197
  "org": row["org"],
198
  "provider": row["provider"],
@@ -208,8 +507,6 @@ def build_leaderboard_html(
208
  )
209
 
210
  rows_data = list(pivot.values())
211
-
212
- # Filter & sort
213
  if jur_filter != "All":
214
  rows_data = [r for r in rows_data if r["jurisdiction"] == jur_filter]
215
 
@@ -221,113 +518,68 @@ def build_leaderboard_html(
221
  }.get(sort_by, lambda r: r.get("overall", (0,))[0])
222
  rows_data.sort(key=sort_key, reverse=True)
223
 
224
- # ── cell renderer with heatmap tint ───────────────────────────────────────
225
- def rate_cell(t: tuple | None, tier_color: str = "#3182CE") -> str:
226
- if t is None:
227
- return '<td style="text-align:center;padding:8px 10px;color:#CBD5E0;font-size:1em;">—</td>'
228
- _rate, lo, hi, raw = t
229
- alpha = raw * 0.18 # subtle blue tint scales with magnitude
230
- bg = f"rgba(49,130,206,{alpha:.2f})"
231
- bar_w = int(raw * 52) # mini progress bar 0–52 px
232
- bar = (
233
- f'<div style="height:3px;width:{bar_w}px;background:{tier_color};'
234
- f'border-radius:2px;margin:3px auto 0;opacity:0.55;"></div>'
235
- )
236
- return (
237
- f'<td style="text-align:center;padding:8px 10px;background:{bg};vertical-align:middle;">'
238
- f'<span style="font-weight:700;font-size:1.05em;">{raw:.0%}</span>'
239
- f'<br><span style="font-size:0.70em;color:#718096;font-family:monospace;">'
240
- f'[{lo:.0%}–{hi:.0%}]</span>'
241
- f'{bar}</td>'
242
- )
243
-
244
- # ── intro blurb ───────────────────────────────────────────────────────────
245
  intro = (
246
- '<p style="font-size:0.83em;color:#4A5568;margin:0 0 10px 2px;line-height:1.5;">'
247
- 'Values show the <strong>strict refusal rate</strong> — fraction of trials where the model '
248
- 'gave a direct or indirect refusal — with Wilson 95&nbsp;% confidence interval below. '
249
- 'A mini bar visualises the magnitude. Models sorted by the selected tier column&nbsp;↓.'
 
250
  '</p>'
251
  )
252
 
253
- # ── two-row header: spanning group label + per-tier sub-headers ───────────
254
- header = """
255
- <table style="width:100%;border-collapse:collapse;font-size:0.91em;">
256
  <thead>
257
- <tr style="background:#F7FAFC;">
258
- <th style="padding:7px 6px;text-align:center;border-bottom:1px solid #E2E8F0;"
259
- rowspan="2">#</th>
260
- <th style="padding:7px 10px;text-align:left;border-bottom:1px solid #E2E8F0;"
261
- rowspan="2">Model</th>
262
- <th style="padding:7px 8px;text-align:left;border-bottom:1px solid #E2E8F0;"
263
- rowspan="2">Org</th>
264
- <th style="padding:7px 6px;text-align:center;border-bottom:1px solid #E2E8F0;"
265
- rowspan="2">Jur.</th>
266
- <th colspan="4"
267
- style="padding:7px 10px;text-align:center;background:#EBF8FF;
268
- color:#2C5282;font-weight:700;letter-spacing:0.01em;
269
- border-bottom:2px solid #BEE3F8;border-top:1px solid #E2E8F0;">
270
- Strict refusal rate &nbsp;·&nbsp; Wilson 95&nbsp;% CI
271
- </th>
272
- <th style="padding:7px 8px;text-align:center;border-bottom:1px solid #E2E8F0;"
273
- rowspan="2">PC<br>Tier</th>
274
  </tr>
275
- <tr style="background:#F7FAFC;border-bottom:2px solid #E2E8F0;">
276
- <th style="padding:6px 10px;text-align:center;color:#276749;font-weight:600;">
277
- 🟢 Benign</th>
278
- <th style="padding:6px 10px;text-align:center;color:#C05621;font-weight:600;">
279
- 🟡 Borderline</th>
280
- <th style="padding:6px 10px;text-align:center;color:#C53030;font-weight:600;">
281
- 🔴 Dual-use</th>
282
- <th style="padding:6px 10px;text-align:center;color:#553C9A;font-weight:600;
283
- background:#FAF5FF;">
284
- ◆ Overall</th>
285
  </tr>
286
  </thead>
287
  <tbody>
288
  """
289
 
290
- tier_colors = {
291
- "benign": "#38A169", "borderline": "#DD6B20",
292
- "dual_use": "#E53E3E", "overall": "#805AD5",
293
- }
294
-
295
  body = ""
296
  for i, r in enumerate(rows_data):
297
- dot_color = PROVIDER_COLORS.get(r["provider"], "#718096")
298
- bg = "#FFFFFF" if i % 2 == 0 else "#F7FAFC"
299
  jur_flag = JURS.get(r["jurisdiction"], r["jurisdiction"])
300
- badge = _TIER_BADGE.get(r["pc_tier"], _TIER_BADGE["—"])
 
 
301
 
302
  body += f"""
303
- <tr style="background:{bg};border-bottom:1px solid #EDF2F7;">
304
- <td style="text-align:center;padding:8px 6px;color:#A0AEC0;
305
- font-size:0.85em;font-weight:500;">{i + 1}</td>
306
- <td style="padding:8px 10px;white-space:nowrap;">
307
- <span style="display:inline-block;width:9px;height:9px;border-radius:50%;
308
- background:{dot_color};margin-right:7px;vertical-align:middle;
309
- flex-shrink:0;"></span><strong>{r["model"]}</strong>
310
  </td>
311
- <td style="padding:8px 8px;color:#4A5568;font-size:0.88em;
312
- white-space:nowrap;">{r["org"]}</td>
313
- <td style="text-align:center;padding:8px 6px;font-size:1.1em;">{jur_flag}</td>
314
- {rate_cell(r.get("benign"), tier_colors["benign"])}
315
- {rate_cell(r.get("borderline"), tier_colors["borderline"])}
316
- {rate_cell(r.get("dual_use"), tier_colors["dual_use"])}
317
- {rate_cell(r.get("overall"), tier_colors["overall"])}
318
- <td style="text-align:center;padding:8px 8px;">{badge}</td>
319
  </tr>"""
320
 
321
  footer = """
322
  </tbody>
323
  </table>
324
- <div style="font-size:0.76em;color:#718096;margin-top:8px;line-height:1.6;
325
- border-top:1px solid #EDF2F7;padding-top:6px;">
326
- <strong>Strict refusal</strong> = direct_refusal + indirect_refusal out of all trials (3 tiers × 47 prompts × 5 trials).
327
- &nbsp;·&nbsp; <strong>PC Tier</strong>: A ≥ 95 % TPR, B 9–73 % TPR on 75-trial should-refuse positive-control sweep; — = gap zone.
328
- &nbsp;·&nbsp; <strong>Benign</strong> high refusal = over-refusal on safe prompts.
329
- &nbsp;·&nbsp; † Llama 3.3 70B = non-frontier open-source control.
330
- &nbsp;·&nbsp; ★ Nemotron added v1.1.
331
  </div>
332
  """
333
  return intro + header + body + footer
@@ -335,34 +587,45 @@ def build_leaderboard_html(
335
 
336
  # ── Figures ───────────────────────────────────────────────────────────────────
337
 
 
 
 
 
 
 
 
 
 
338
  def make_fig1(stats: pd.DataFrame) -> plt.Figure:
339
  """Provider gradient — benign tier, sorted by rate descending."""
340
  sub = stats[stats["tier"] == "benign"].copy()
341
  sub = sub.sort_values("raw_rate", ascending=False).reset_index(drop=True)
342
 
343
- colors = [PROVIDER_COLORS.get(p, "#718096") for p in sub["provider"]]
344
- fig, ax = plt.subplots(figsize=(11, 5))
345
  x = np.arange(len(sub))
346
- ax.bar(x, sub["raw_rate"], color=colors, alpha=0.87, width=0.7, zorder=3)
 
347
  ax.errorbar(
348
  x, sub["raw_rate"],
349
  yerr=[sub["raw_rate"] - sub["ci_lo"], sub["ci_hi"] - sub["raw_rate"]],
350
- fmt="none", color="black", capsize=4, linewidth=1.2, zorder=4,
351
  )
352
  ax.set_xticks(x)
353
- ax.set_xticklabels(sub["model"], rotation=40, ha="right", fontsize=8.5)
354
- ax.set_ylabel("Strict refusal rate (benign prompts)")
355
- ax.set_ylim(0, 1.08)
356
- ax.axhline(0, color="black", linewidth=0.5)
357
- ax.grid(axis="y", alpha=0.3, zorder=0)
358
- ax.set_title("Provider gradient: refusal rate on benign protein-design prompts")
359
 
360
  seen: dict[str, str] = {}
361
  for p, c in zip(sub["provider"], colors):
362
  if p not in seen:
363
  seen[p] = c
364
- patches = [mpatches.Patch(color=c, label=p.upper()) for p, c in seen.items()]
365
- ax.legend(handles=patches, loc="upper right", fontsize=8, ncol=2)
 
366
  fig.tight_layout()
367
  return fig
368
 
@@ -381,7 +644,7 @@ def make_fig3(stats: pd.DataFrame) -> plt.Figure:
381
  opus_stats["opus_label"] = opus_stats["model_id"].map(id_to_label)
382
 
383
  x = np.arange(len(opus_labels))
384
- fig, ax = plt.subplots(figsize=(7, 4.5))
385
 
386
  for tier in ["benign", "borderline", "dual_use"]:
387
  sub = (
@@ -391,29 +654,33 @@ def make_fig3(stats: pd.DataFrame) -> plt.Figure:
391
  )
392
  rates = np.asarray(sub["refusal_rate"], dtype=float)
393
  raw = np.asarray(sub["raw_rate"], dtype=float)
394
- lo = np.asarray(sub["ci_lo"], dtype=float)
395
- hi = np.asarray(sub["ci_hi"], dtype=float)
396
  color = TIER_COLORS[tier]
397
  label = TIER_LABELS[tier]
398
 
399
- ax.plot(x, rates, marker="o", color=color, linewidth=2, label=label, zorder=3)
 
400
  ax.fill_between(x, lo, hi, alpha=0.15, color=color, zorder=2)
401
  for xi, r, rr in zip(x, rates, raw):
402
  if not np.isnan(r):
403
  ax.annotate(
404
  f"{round(rr * 100):.0f}%",
405
  (xi, r),
406
- textcoords="offset points", xytext=(0, 7),
407
- ha="center", fontsize=8, color=color,
408
  )
409
 
410
  ax.set_xticks(x)
411
- ax.set_xticklabels(opus_labels, fontsize=10)
412
- ax.set_ylabel("Strict refusal rate")
413
  ax.set_ylim(0, 1.15)
414
- ax.grid(axis="y", alpha=0.3)
415
- ax.legend(title="Tier", loc="center left", bbox_to_anchor=(1.01, 0.5))
416
- ax.set_title("Longitudinal refusal trajectory: Opus 4.5 / 4.6 / 4.7")
 
 
 
417
  fig.tight_layout()
418
  return fig
419
 
@@ -424,7 +691,7 @@ def make_fig5(stats: pd.DataFrame) -> plt.Figure:
424
  model_order = overall["model"].tolist()
425
 
426
  x = np.arange(len(model_order))
427
- width = 0.22
428
  tiers = ["benign", "borderline", "dual_use"]
429
 
430
  fig, ax = plt.subplots(figsize=(13, 5))
@@ -434,67 +701,33 @@ def make_fig5(stats: pd.DataFrame) -> plt.Figure:
434
  .set_index("model")
435
  .reindex(model_order)
436
  )
437
- rates = np.asarray(sub["raw_rate"].fillna(0), dtype=float)
438
- lo = np.asarray(sub["ci_lo"].fillna(0), dtype=float)
439
- hi = np.asarray(sub["ci_hi"].fillna(0), dtype=float)
440
  offset = (i - 1) * width
441
  ax.bar(x + offset, rates, width, label=TIER_LABELS[tier],
442
- color=TIER_COLORS[tier], alpha=0.87)
443
  ax.errorbar(
444
  x + offset, rates,
445
  yerr=[(rates - lo).clip(0), (hi - rates).clip(0)],
446
- fmt="none", color="black", capsize=2.5, linewidth=0.9,
 
447
  )
448
 
449
  ax.set_xticks(x)
450
  ax.set_xticklabels(model_order, rotation=35, ha="right", fontsize=8.5)
451
- ax.set_ylabel("Strict refusal rate")
452
- ax.set_ylim(0, 1.12)
453
- ax.legend(title="Tier", fontsize=9)
454
- ax.grid(axis="y", alpha=0.3)
455
- ax.set_title("Tier-stratified refusal rates: benign vs borderline vs dual-use")
 
 
 
456
  fig.tight_layout()
457
  return fig
458
 
459
 
460
- # ── Key stats banner ──────────────────────────────────────────────────────────
461
-
462
- def _stats_banner(stats: pd.DataFrame, overall: pd.DataFrame) -> str:
463
- n_models = stats["model_id"].nunique()
464
- n_trials = stats["n"].sum()
465
- n_prompts = 141 # fixed
466
- top_model = overall.iloc[0]["model"]
467
- top_rate = overall.iloc[0]["raw_rate"]
468
- return f"""
469
- <div style="display:flex;gap:16px;flex-wrap:wrap;margin-bottom:12px;">
470
- <div style="background:#FFF5F5;border:1px solid #FEB2B2;border-radius:8px;
471
- padding:12px 18px;min-width:120px;text-align:center;">
472
- <div style="font-size:1.6em;font-weight:700;color:#C53030;">{n_models}</div>
473
- <div style="font-size:0.82em;color:#744210;">models evaluated</div>
474
- </div>
475
- <div style="background:#F0FFF4;border:1px solid #9AE6B4;border-radius:8px;
476
- padding:12px 18px;min-width:120px;text-align:center;">
477
- <div style="font-size:1.6em;font-weight:700;color:#276749;">{n_prompts}</div>
478
- <div style="font-size:0.82em;color:#276749;">prompts (v1.0)</div>
479
- </div>
480
- <div style="background:#EBF8FF;border:1px solid #90CDF4;border-radius:8px;
481
- padding:12px 18px;min-width:120px;text-align:center;">
482
- <div style="font-size:1.6em;font-weight:700;color:#2C5282;">{n_trials:,}</div>
483
- <div style="font-size:0.82em;color:#2C5282;">adjudicated trials</div>
484
- </div>
485
- <div style="background:#FAF5FF;border:1px solid #D6BCFA;border-radius:8px;
486
- padding:12px 18px;min-width:180px;text-align:center;">
487
- <div style="font-size:1.6em;font-weight:700;color:#553C9A;">
488
- {top_rate:.0%}
489
- </div>
490
- <div style="font-size:0.82em;color:#553C9A;">
491
- highest refusal ({top_model})
492
- </div>
493
- </div>
494
- </div>
495
- """
496
-
497
-
498
  # ── App ───────────────────────────────────────────────────────────────────────
499
 
500
  try:
@@ -507,25 +740,38 @@ except FileNotFoundError as exc:
507
  except Exception as exc:
508
  raise SystemExit(f"[RefusalBench Space] Failed to load stats: {exc}") from exc
509
 
510
- OVERALL_STATS = overall_stats(STATS) # pre-computed once; reused by leaderboard & banner
511
-
512
- HEADER = """
513
- <div style="text-align:center;padding:16px 0 8px;">
514
- <h1 style="margin:0;font-family:serif;font-size:2em;">🧬 RefusalBench</h1>
515
- <p style="margin:4px 0 0;color:#4A5568;font-size:1.05em;">
516
- Frontier LLM refusal on biological research prompts — 19 models · 141 prompts · 3 tiers
517
- </p>
518
- <p style="margin:8px 0 0;font-size:0.9em;">
519
- <a href="https://github.com/AppliedScientific/refusalbench" target="_blank">
520
- 📦 GitHub
521
- </a>
 
 
522
  &nbsp;·&nbsp;
523
- <a href="https://arxiv.org/abs/2605.21545" target="_blank">
524
- 📄 Paper (arXiv:2605.21545)
525
- </a>
526
  &nbsp;·&nbsp;
527
- Snapshot: <code>v1.1-frozen · May 2026</code>
528
- </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  </div>
530
  """
531
 
@@ -551,9 +797,9 @@ Based on a 75-trial should-refuse positive-control sweep (15 prompts × 5 trials
551
 
552
  | Tier | TPR threshold | Interpretation |
553
  |---|---|---|
554
- | **A** | ≥ 95% | Reliably refuses clearly dangerous prompts |
555
- | **B** | 9–73% | Intermediate calibration |
556
- | **C** | ≤ 1.3% | Effectively never refuses |
557
  | **—** | Gap zone | Between formal tiers |
558
 
559
  ---
@@ -561,8 +807,8 @@ Based on a 75-trial should-refuse positive-control sweep (15 prompts × 5 trials
561
  ## Snapshot
562
 
563
  - **Version:** v1.1-frozen (May 2026)
564
- - **Main sweep:** 18 frontier models + 1 control (Llama 3.3 70B)
565
- - **v1.1 addition:** NVIDIA Nemotron 3 Super 120B (★)
566
  - **Data:** `data/adjudicated.csv` (bundled in this Space) — compliance labels only; raw prompt text is not published. Full snapshot in the [GitHub repo](https://github.com/AppliedScientific/refusalbench).
567
 
568
  ---
@@ -595,23 +841,22 @@ def update_leaderboard(jur_filter: str, sort_by: str) -> str:
595
 
596
  with gr.Blocks(
597
  theme=gr.themes.Soft(
598
- primary_hue="red",
599
- secondary_hue="indigo",
 
 
600
  ),
601
  title="RefusalBench",
602
- css="""
603
- .gradio-container { max-width: 1100px !important; }
604
- footer { display: none !important; }
605
- """,
606
  ) as demo:
607
 
608
- gr.HTML(HEADER)
609
- gr.HTML(_stats_banner(STATS, OVERALL_STATS))
610
 
611
  with gr.Tabs():
612
 
613
  # ── Tab 1: Leaderboard ─────────────────────────────────────────────
614
- with gr.Tab("🏆 Leaderboard"):
615
  with gr.Row():
616
  jur_dd = gr.Dropdown(
617
  choices=["All", "US", "EU", "Asia"],
@@ -630,54 +875,45 @@ with gr.Blocks(
630
  value=build_leaderboard_html(STATS, OVERALL_STATS, "All", "Overall")
631
  )
632
 
633
- jur_dd.change(
634
- fn=update_leaderboard,
635
- inputs=[jur_dd, sort_dd],
636
- outputs=leaderboard_html,
637
- )
638
- sort_dd.change(
639
- fn=update_leaderboard,
640
- inputs=[jur_dd, sort_dd],
641
- outputs=leaderboard_html,
642
- )
643
 
644
  # ── Tab 2: Provider figures ────────────────────────────────────────
645
- with gr.Tab("📊 Provider Analysis"):
646
  gr.Markdown(
647
- "**Figure 1** Benign-tier strict refusal rate for all 19 models, "
648
- "sorted descending, coloured by provider organisation. "
649
- "Error bars = Wilson 95% CI."
650
  )
651
  gr.Plot(value=make_fig1(STATS))
652
 
653
  gr.Markdown(
654
- "**Figure 2** Tier-stratified rates for all 19 models. "
655
- "Benign (green) / Borderline (amber) / Dual-use (red). "
656
- "Models sorted by overall rate descending."
657
  )
658
  gr.Plot(value=make_fig5(STATS))
659
 
660
  # ── Tab 3: Longitudinal ────────────────────────────────────────────
661
- with gr.Tab("📈 Opus Longitudinal"):
662
  gr.Markdown(
663
- "**Figure 3** Refusal trajectory across Opus 4.5 4.6 4.7 "
664
- "by tier. Shaded bands = Wilson 95% CI. "
665
- "Point labels use raw rates (n_refused / n); "
666
- "line position uses Wilson centre."
667
  )
668
  gr.Plot(value=make_fig3(STATS))
669
  gr.Markdown(
670
  """
671
- **Key finding (H4):** Dual-use refusal is at ceiling (100%) across all three Opus versions.
672
- Benign-tier refusal is flat from Opus 4.5 → 4.6 (33%), then jumps +44 pp to 77% at Opus 4.7,
673
- reducing Youden's J by 65% (from +67 pp to +23 pp). The 4.6 → 4.7 McNemar test gives
674
  χ²(cc) = 107, p ≈ 0 on 703 matched triples, with 112 new benign refusals and 0 reversals.
675
  """
676
  )
677
 
678
  # ── Tab 4: About ───────────────────────────────────────────────────
679
- with gr.Tab("ℹ️ About"):
680
  gr.Markdown(ABOUT_MD)
681
 
 
682
  if __name__ == "__main__":
683
  demo.launch()
 
1
+ """RefusalBench — HuggingFace Space (v2)
2
  Interactive leaderboard and figures for the RefusalBench paper.
3
 
4
  Data: data/adjudicated.csv (13,389 adjudicated rows, v1.1-frozen snapshot)
 
19
  # ── Typography ────────────────────────────────────────────────────────────────
20
  mpl.rcParams.update(
21
  {
22
+ "font.family": "sans-serif",
23
+ "font.sans-serif": ["Inter", "Helvetica Neue", "Helvetica", "Arial", "DejaVu Sans"],
24
+ "axes.titlesize": 13,
25
+ "axes.titleweight": "semibold",
26
  "axes.labelsize": 11,
27
  "xtick.labelsize": 9,
28
  "ytick.labelsize": 9,
29
  "legend.fontsize": 9,
30
+ "axes.spines.top": False,
31
+ "axes.spines.right": False,
32
+ "axes.edgecolor": "#94A3B8",
33
+ "axes.labelcolor": "#94A3B8",
34
+ "xtick.color": "#94A3B8",
35
+ "ytick.color": "#94A3B8",
36
+ "figure.facecolor": "none",
37
+ "axes.facecolor": "none",
38
+ "savefig.facecolor": "none",
39
+ "savefig.transparent": True,
40
  }
41
  )
42
 
43
  # ── Model metadata ────────────────────────────────────────────────────────────
 
44
  MODEL_META: dict[str, tuple[str, str, str, str]] = {
45
  "anthropic/claude-opus-4.7": ("Claude Opus 4.7", "Anthropic", "anthropic", "US"),
46
  "anthropic/claude-opus-4.6": ("Claude Opus 4.6", "Anthropic", "anthropic", "US"),
 
54
  "moonshotai/kimi-k2.6-20260420": ("Kimi K2.6", "Moonshot AI", "moonshot", "Asia"),
55
  "minimax/minimax-m2.7-20260318": ("MiniMax M2.7", "MiniMax", "minimax", "Asia"),
56
  "us.amazon.nova-pro-v1:0": ("Amazon Nova Pro", "Amazon", "amazon", "US"),
57
+ "us.meta.llama3-3-70b-instruct-v1:0": ("Llama 3.3 70B", "Meta", "meta", "US"),
58
  "mistral.mistral-large-3-675b-instruct": ("Mistral Large 3", "Mistral", "mistral", "EU"),
59
  "deepseek.v3.2": ("DeepSeek V3.2", "DeepSeek", "deepseek", "Asia"),
60
  "us.deepseek.r1-v1:0": ("DeepSeek R1", "DeepSeek", "deepseek", "Asia"),
61
  "qwen.qwen3-next-80b-a3b": ("Qwen3 Next 80B", "Qwen", "qwen", "Asia"),
62
  "zai.glm-5": ("GLM-5", "Z.AI", "zai", "Asia"),
63
+ "nvidia.nemotron-super-3-120b": ("Nemotron 3 Super 120B", "NVIDIA", "nvidia", "US"),
64
+ }
65
+
66
+ NOTE_FLAGS: dict[str, str] = {
67
+ "us.meta.llama3-3-70b-instruct-v1:0": "non-frontier open-source control",
68
+ "nvidia.nemotron-super-3-120b": "added v1.1",
69
  }
70
 
71
  # PC Tier from should-refuse positive control (TPR threshold: A ≥ 95%, B 9–73%)
 
91
  "us.meta.llama3-3-70b-instruct-v1:0": "—",
92
  }
93
 
94
+ # Restrained provider palette — saturated enough to read on dark + light
95
  PROVIDER_COLORS: dict[str, str] = {
96
+ "anthropic": "#D97757",
97
+ "openai": "#10A37F",
98
+ "google": "#4285F4",
99
+ "amazon": "#FF9900",
100
+ "meta": "#0866FF",
101
+ "mistral": "#FA520F",
102
+ "deepseek": "#4D6BFE",
103
+ "qwen": "#615CED",
104
+ "zai": "#06A77D",
105
+ "xai": "#1DA1F2",
106
+ "moonshot": "#8B5CF6",
107
+ "minimax": "#EC4899",
108
  "nvidia": "#76B900",
109
+ "other": "#94A3B8",
110
  }
111
 
112
+ # Tier colors (chosen to work on both dark and light Gradio Soft backgrounds)
113
+ TIER_COLORS = {
114
+ "benign": "#10B981", # emerald
115
+ "borderline": "#F59E0B", # amber
116
+ "dual_use": "#EF4444", # red
117
+ "overall": "#6366F1", # indigo
118
+ }
119
  TIER_LABELS = {"benign": "Benign", "borderline": "Borderline", "dual_use": "Dual-use"}
120
  JURS = {"US": "🇺🇸", "EU": "🇪🇺", "Asia": "🌏"}
121
 
 
191
  return pd.DataFrame(rows).sort_values("refusal_rate", ascending=False)
192
 
193
 
194
def headline_spread(stats: pd.DataFrame) -> tuple[float, float, str, str]:
    """Return the benign-tier refusal-rate spread among PC-Tier-A models.

    Only rows with ``pc_tier == "A"`` and ``tier == "benign"`` are considered.
    Rows whose ``raw_rate`` is missing (NaN/None) are ignored so they can never
    be picked as the minimum or maximum.

    Args:
        stats: Frame with at least the columns ``pc_tier``, ``tier``,
            ``raw_rate`` and ``model``.

    Returns:
        ``(min_rate, max_rate, min_model, max_model)``. When no usable row
        matches, the neutral value ``(0.0, 0.0, "", "")`` is returned. Ties
        resolve to the first matching row in ``stats`` order.
    """
    mask = (stats["pc_tier"] == "A") & (stats["tier"] == "benign")
    # Pass the mask as a plain array so selection is purely positional and
    # does not depend on the index labels of ``stats`` being unique.
    sub = stats.loc[mask.to_numpy(), ["model", "raw_rate"]].dropna(
        subset=["raw_rate"]
    )
    if sub.empty:
        return 0.0, 0.0, "", ""

    # Positional argmin/argmax instead of label-based idxmin/idxmax + .loc:
    # a duplicated index label would make .loc return a DataFrame, not a row.
    rates = sub["raw_rate"].to_numpy(dtype=float)
    lo_pos = int(rates.argmin())
    hi_pos = int(rates.argmax())
    models = sub["model"]
    return (
        float(rates[lo_pos]),
        float(rates[hi_pos]),
        str(models.iloc[lo_pos]),
        str(models.iloc[hi_pos]),
    )
207
+
208
+
209
+ # ── Theme-aware CSS (uses Gradio CSS variables for dark/light support) ───────
210
+
211
+ _PC_BADGE_CSS = """
212
+ .pc-badge {
213
+ display: inline-block;
214
+ min-width: 22px;
215
+ padding: 2px 8px;
216
+ border-radius: 999px;
217
+ font-weight: 700;
218
+ font-size: 0.78em;
219
+ text-align: center;
220
+ letter-spacing: 0.02em;
221
+ }
222
+ .pc-A { background: rgba(16, 185, 129, 0.16); color: #059669; border: 1px solid rgba(16, 185, 129, 0.35); }
223
+ .pc-B { background: rgba(245, 158, 11, 0.16); color: #B45309; border: 1px solid rgba(245, 158, 11, 0.40); }
224
+ .pc-C { background: rgba(239, 68, 68, 0.16); color: #B91C1C; border: 1px solid rgba(239, 68, 68, 0.40); }
225
+ .pc-x { background: var(--background-fill-secondary, #F1F5F9); color: var(--body-text-color-subdued, #64748B); border: 1px solid var(--border-color-primary, #E2E8F0); }
226
+ @media (prefers-color-scheme: dark) {
227
+ .pc-A { color: #34D399; }
228
+ .pc-B { color: #FBBF24; }
229
+ .pc-C { color: #F87171; }
230
+ }
231
+ """
232
+
233
+ _HERO_CSS = """
234
+ .rb-hero {
235
+ display: flex;
236
+ gap: 22px;
237
+ align-items: center;
238
+ padding: 22px 26px;
239
+ border-radius: 16px;
240
+ background:
241
+ linear-gradient(135deg, rgba(239, 68, 68, 0.10), rgba(99, 102, 241, 0.10)),
242
+ var(--background-fill-secondary, #F8FAFC);
243
+ border: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.3));
244
+ margin: 6px 0 18px;
245
+ }
246
+ .rb-hero-number {
247
+ flex-shrink: 0;
248
+ text-align: center;
249
+ padding: 0 14px;
250
+ border-right: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.3));
251
+ }
252
+ .rb-hero-number .big {
253
+ font-size: 2.6em;
254
+ font-weight: 800;
255
+ line-height: 1;
256
+ letter-spacing: -0.02em;
257
+ background: linear-gradient(135deg, #EF4444, #6366F1);
258
+ -webkit-background-clip: text;
259
+ background-clip: text;
260
+ color: transparent;
261
+ }
262
+ .rb-hero-number .label {
263
+ font-size: 0.75em;
264
+ color: var(--body-text-color-subdued, #64748B);
265
+ margin-top: 4px;
266
+ text-transform: uppercase;
267
+ letter-spacing: 0.08em;
268
+ }
269
+ .rb-hero-text {
270
+ flex: 1;
271
+ color: var(--body-text-color, inherit);
272
+ font-size: 1em;
273
+ line-height: 1.5;
274
+ }
275
+ .rb-hero-text strong { font-weight: 700; }
276
+ .rb-hero-text .thesis { font-size: 1.08em; font-weight: 600; display: block; margin-bottom: 4px; }
277
+ .rb-hero-text .body { color: var(--body-text-color-subdued, #475569); }
278
+ """
279
+
280
+ _HEADER_CSS = """
281
+ .rb-header { text-align: center; padding: 18px 0 6px; }
282
+ .rb-header h1 {
283
+ margin: 0;
284
+ font-size: 2.4em;
285
+ font-weight: 800;
286
+ letter-spacing: -0.025em;
287
+ background: linear-gradient(135deg, #EF4444, #6366F1);
288
+ -webkit-background-clip: text;
289
+ background-clip: text;
290
+ color: transparent;
291
+ }
292
+ .rb-header .sub {
293
+ margin: 6px 0 10px;
294
+ color: var(--body-text-color-subdued, #64748B);
295
+ font-size: 1.02em;
296
+ }
297
+ .rb-header .meta { font-size: 0.86em; color: var(--body-text-color-subdued, #64748B); }
298
+ .rb-header .meta a { color: var(--body-text-color, inherit); text-decoration: none; border-bottom: 1px dotted currentColor; }
299
+ .rb-header .meta a:hover { color: #6366F1; }
300
+ .rb-header .pill {
301
+ display: inline-block;
302
+ padding: 2px 9px;
303
+ border-radius: 999px;
304
+ font-family: ui-monospace, SFMono-Regular, monospace;
305
+ font-size: 0.82em;
306
+ background: var(--background-fill-secondary, rgba(99, 102, 241, 0.08));
307
+ border: 1px solid var(--border-color-primary, rgba(99, 102, 241, 0.2));
308
+ color: var(--body-text-color, inherit);
309
+ }
310
+ """
311
+
312
+ _TABLE_CSS = """
313
+ .rb-tablewrap {
314
+ border: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.25));
315
+ border-radius: 12px;
316
+ overflow: hidden;
317
+ background: var(--background-fill-primary, transparent);
318
+ }
319
+ .rb-tablewrap table {
320
+ width: 100%;
321
+ border-collapse: separate;
322
+ border-spacing: 0;
323
+ font-size: 0.92em;
324
+ color: var(--body-text-color, inherit);
325
+ }
326
+ .rb-tablewrap thead th {
327
+ position: sticky;
328
+ top: 0;
329
+ z-index: 2;
330
+ background: var(--background-fill-secondary, #F8FAFC);
331
+ color: var(--body-text-color-subdued, #475569);
332
+ font-weight: 600;
333
+ font-size: 0.82em;
334
+ letter-spacing: 0.04em;
335
+ text-transform: uppercase;
336
+ padding: 10px 10px;
337
+ text-align: left;
338
+ border-bottom: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.25));
339
+ }
340
+ .rb-tablewrap thead th.center { text-align: center; }
341
+ .rb-tablewrap thead .grp {
342
+ text-transform: none;
343
+ letter-spacing: 0;
344
+ font-weight: 700;
345
+ color: var(--body-text-color, inherit);
346
+ font-size: 0.86em;
347
+ border-bottom: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.18));
348
+ background: var(--background-fill-secondary, rgba(99, 102, 241, 0.05));
349
+ }
350
+ .rb-tablewrap tbody tr { transition: background 120ms ease; }
351
+ .rb-tablewrap tbody tr:hover {
352
+ background: var(--background-fill-secondary, rgba(99, 102, 241, 0.04)) !important;
353
+ }
354
+ .rb-tablewrap tbody td {
355
+ padding: 11px 10px;
356
+ border-bottom: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.14));
357
+ vertical-align: middle;
358
+ }
359
+ .rb-tablewrap tbody tr:last-child td { border-bottom: 0; }
360
+ .rb-rank {
361
+ color: var(--body-text-color-subdued, #94A3B8);
362
+ font-size: 0.85em;
363
+ font-variant-numeric: tabular-nums;
364
+ text-align: center;
365
+ width: 30px;
366
+ }
367
+ .rb-model {
368
+ white-space: nowrap;
369
+ font-weight: 600;
370
+ color: var(--body-text-color, inherit);
371
+ }
372
+ .rb-dot {
373
+ display: inline-block;
374
+ width: 9px; height: 9px;
375
+ border-radius: 50%;
376
+ margin-right: 8px;
377
+ vertical-align: middle;
378
+ box-shadow: 0 0 0 1.5px var(--background-fill-primary, white);
379
+ }
380
+ .rb-org {
381
+ color: var(--body-text-color-subdued, #64748B);
382
+ font-size: 0.88em;
383
+ white-space: nowrap;
384
+ }
385
+ .rb-flag { text-align: center; font-size: 1.05em; }
386
+ .rb-note {
387
+ font-size: 0.72em;
388
+ color: var(--body-text-color-subdued, #94A3B8);
389
+ font-style: italic;
390
+ margin-left: 6px;
391
+ }
392
+ .rb-cell {
393
+ text-align: right;
394
+ font-variant-numeric: tabular-nums;
395
+ padding: 11px 12px !important;
396
+ min-width: 92px;
397
+ }
398
+ .rb-pct {
399
+ font-size: 1.05em;
400
+ font-weight: 700;
401
+ color: var(--body-text-color, inherit);
402
+ letter-spacing: -0.01em;
403
+ }
404
+ .rb-bar {
405
+ height: 5px;
406
+ border-radius: 3px;
407
+ margin-top: 5px;
408
+ background: var(--background-fill-secondary, rgba(148, 163, 184, 0.18));
409
+ overflow: hidden;
410
+ position: relative;
411
+ }
412
+ .rb-bar-fill {
413
+ display: block;
414
+ height: 100%;
415
+ border-radius: 3px;
416
+ }
417
+ .rb-na { color: var(--body-text-color-subdued, #94A3B8); font-weight: 500; }
418
+ .rb-intro {
419
+ color: var(--body-text-color-subdued, #64748B);
420
+ font-size: 0.88em;
421
+ margin: 4px 2px 14px;
422
+ line-height: 1.55;
423
+ }
424
+ .rb-footer {
425
+ margin-top: 14px;
426
+ padding: 12px 4px 0;
427
+ font-size: 0.78em;
428
+ color: var(--body-text-color-subdued, #64748B);
429
+ line-height: 1.7;
430
+ border-top: 1px solid var(--border-color-primary, rgba(148, 163, 184, 0.18));
431
+ }
432
+ .rb-footer strong { color: var(--body-text-color, inherit); font-weight: 600; }
433
+ .rb-footer code {
434
+ background: var(--background-fill-secondary, rgba(148, 163, 184, 0.12));
435
+ padding: 1px 5px;
436
+ border-radius: 4px;
437
+ font-size: 0.92em;
438
+ }
439
+ """
440
+
441
# Stylesheet handed to gr.Blocks(css=...): a few global Gradio overrides first,
# then the per-component rule sets, concatenated in cascade order.
CSS = "".join(
    (
        """
.gradio-container { max-width: 1240px !important; }
footer { display: none !important; }
/* hide gr.Plot's locale-translated floating label ("Diagramm"/"Plot") */
.block.auto-margin > label.float { display: none !important; }
""",
        _HEADER_CSS,
        _HERO_CSS,
        _PC_BADGE_CSS,
        _TABLE_CSS,
    )
)
453
+
454
+
455
  # ── Leaderboard HTML ──────────────────────────────────────────────────────────
456
 
457
+ _PC_BADGE = {
458
+ "A": '<span class="pc-badge pc-A" title="≥95% TPR on the should-refuse positive control">A</span>',
459
+ "B": '<span class="pc-badge pc-B" title="9–73% TPR on the should-refuse positive control">B</span>',
460
+ "C": '<span class="pc-badge pc-C" title="≤1.3% TPR on the should-refuse positive control">C</span>',
461
+ "—": '<span class="pc-badge pc-x" title="In the gap zone between formal tiers">—</span>',
462
  }
463
 
464
 
465
+ def _rate_cell(t: tuple | None, tier_color: str) -> str:
466
+ """Render a single rate cell: %, bar below, full Wilson CI on hover."""
467
+ if t is None:
468
+ return '<td class="rb-cell"><span class="rb-na">—</span></td>'
469
+ _rate, lo, hi, raw = t
470
+ pct = f"{raw:.0%}"
471
+ bar_w = f"{max(2, raw * 100):.1f}%" # min width so tiny rates still show
472
+ tooltip = f"Wilson 95% CI: {lo:.1%} – {hi:.1%} (raw = {raw:.1%})"
473
+ return (
474
+ f'<td class="rb-cell" title="{tooltip}">'
475
+ f'<div class="rb-pct">{pct}</div>'
476
+ f'<div class="rb-bar"><span class="rb-bar-fill" '
477
+ f'style="width:{bar_w};background:{tier_color};"></span></div>'
478
+ f'</td>'
479
+ )
480
+
481
+
482
  def build_leaderboard_html(
483
  stats: pd.DataFrame,
484
  overall: pd.DataFrame,
485
  jur_filter: str = "All",
486
  sort_by: str = "Overall",
487
  ) -> str:
488
+ # Pivot per-tier data keyed by model_id
 
489
  pivot: dict[str, dict] = {}
490
  for _, row in stats.iterrows():
491
  mid = row["model_id"]
492
  if mid not in pivot:
493
  pivot[mid] = {
494
+ "model_id": mid,
495
  "model": row["model"],
496
  "org": row["org"],
497
  "provider": row["provider"],
 
507
  )
508
 
509
  rows_data = list(pivot.values())
 
 
510
  if jur_filter != "All":
511
  rows_data = [r for r in rows_data if r["jurisdiction"] == jur_filter]
512
 
 
518
  }.get(sort_by, lambda r: r.get("overall", (0,))[0])
519
  rows_data.sort(key=sort_key, reverse=True)
520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
  intro = (
522
+ '<p class="rb-intro">'
523
+ 'Each cell shows the <strong>strict refusal rate</strong> '
524
+ '(direct + indirect refusal)hover for the Wilson 95 % confidence interval. '
525
+ 'Bars scale with magnitude. PC Tier reflects positive-control calibration on '
526
+ '15 clearly-dangerous prompts.'
527
  '</p>'
528
  )
529
 
530
+ header = f"""
531
+ <div class="rb-tablewrap">
532
+ <table>
533
  <thead>
534
+ <tr>
535
+ <th rowspan="2">#</th>
536
+ <th rowspan="2">Model</th>
537
+ <th rowspan="2">Org</th>
538
+ <th rowspan="2" class="center">Jur.</th>
539
+ <th colspan="4" class="center grp">Strict refusal rate</th>
540
+ <th rowspan="2" class="center">PC<br>Tier</th>
 
 
 
 
 
 
 
 
 
 
541
  </tr>
542
+ <tr>
543
+ <th class="center" style="color:{TIER_COLORS['benign']};">Benign</th>
544
+ <th class="center" style="color:{TIER_COLORS['borderline']};">Borderline</th>
545
+ <th class="center" style="color:{TIER_COLORS['dual_use']};">Dual-use</th>
546
+ <th class="center" style="color:{TIER_COLORS['overall']};">Overall</th>
 
 
 
 
 
547
  </tr>
548
  </thead>
549
  <tbody>
550
  """
551
 
 
 
 
 
 
552
  body = ""
553
  for i, r in enumerate(rows_data):
554
+ dot_color = PROVIDER_COLORS.get(r["provider"], "#94A3B8")
 
555
  jur_flag = JURS.get(r["jurisdiction"], r["jurisdiction"])
556
+ badge = _PC_BADGE.get(r["pc_tier"], _PC_BADGE["—"])
557
+ note = NOTE_FLAGS.get(r["model_id"])
558
+ note_html = f'<span class="rb-note">— {note}</span>' if note else ""
559
 
560
  body += f"""
561
+ <tr>
562
+ <td class="rb-rank">{i + 1}</td>
563
+ <td class="rb-model">
564
+ <span class="rb-dot" style="background:{dot_color};"></span>{r["model"]}{note_html}
 
 
 
565
  </td>
566
+ <td class="rb-org">{r["org"]}</td>
567
+ <td class="rb-flag">{jur_flag}</td>
568
+ {_rate_cell(r.get("benign"), TIER_COLORS["benign"])}
569
+ {_rate_cell(r.get("borderline"), TIER_COLORS["borderline"])}
570
+ {_rate_cell(r.get("dual_use"), TIER_COLORS["dual_use"])}
571
+ {_rate_cell(r.get("overall"), TIER_COLORS["overall"])}
572
+ <td class="rb-flag">{badge}</td>
 
573
  </tr>"""
574
 
575
  footer = """
576
  </tbody>
577
  </table>
578
+ </div>
579
+ <div class="rb-footer">
580
+ <strong>Strict refusal</strong> = <code>direct_refusal</code> + <code>indirect_refusal</code> across 3 tiers × 47 prompts × 5 trials.
581
+ &nbsp;·&nbsp; <strong>PC Tier</strong>: A ≥ 95 % TPR, B 9–73 % TPR on the 75-trial should-refuse positive control; — = gap zone.
582
+ &nbsp;·&nbsp; High <strong>benign</strong> refusal indicates over-refusal on safe prompts.
 
 
583
  </div>
584
  """
585
  return intro + header + body + footer
 
587
 
588
  # ── Figures ───────────────────────────────────────────────────────────────────
589
 
590
def _style_axes(ax: plt.Axes) -> None:
    """Give *ax* the shared look used by every figure.

    Adds faint horizontal grid lines, asks matplotlib to draw axis artists
    below the data, and recolours/thins all spines to a neutral slate grey.
    The axes object is modified in place.
    """
    spine_color = "#94A3B8"
    spine_width = 0.7

    ax.grid(axis="y", alpha=0.18, linestyle="-", linewidth=0.7, zorder=0)
    ax.set_axisbelow(True)
    for edge in ax.spines.values():
        edge.set_color(spine_color)
        edge.set_linewidth(spine_width)
597
+
598
+
599
def make_fig1(stats: pd.DataFrame) -> plt.Figure:
    """Provider gradient — benign tier, sorted by rate descending.

    Draws one bar per model from the ``benign`` rows of *stats*, coloured by
    provider, with asymmetric error bars running from ``ci_lo`` to ``ci_hi``.

    Args:
        stats: Per-model, per-tier table. Columns read here: ``tier``,
            ``raw_rate``, ``ci_lo``, ``ci_hi``, ``provider`` and ``model``.

    Returns:
        The matplotlib figure containing the bar chart.
    """
    sub = stats[stats["tier"] == "benign"].copy()
    sub = sub.sort_values("raw_rate", ascending=False).reset_index(drop=True)

    # Providers missing from PROVIDER_COLORS fall back to a neutral grey.
    colors = [PROVIDER_COLORS.get(p, "#94A3B8") for p in sub["provider"]]

    rates = np.asarray(sub["raw_rate"], dtype=float)
    # Matplotlib rejects negative error-bar lengths, so clip at 0 exactly as
    # make_fig5 does; this guards against a bound landing a hair on the wrong
    # side of the plotted rate.
    err_below = (rates - np.asarray(sub["ci_lo"], dtype=float)).clip(0)
    err_above = (np.asarray(sub["ci_hi"], dtype=float) - rates).clip(0)

    fig, ax = plt.subplots(figsize=(11, 4.8))
    x = np.arange(len(sub))
    ax.bar(x, rates, color=colors, alpha=0.92, width=0.72, zorder=3,
           edgecolor="none")
    ax.errorbar(
        x, rates,
        yerr=[err_below, err_above],
        fmt="none", color="#475569", capsize=3, linewidth=1.0, zorder=4, alpha=0.7,
    )
    ax.set_xticks(x)
    ax.set_xticklabels(sub["model"], rotation=38, ha="right", fontsize=8.5)
    ax.set_ylabel("Strict refusal rate (benign)", fontsize=10)
    ax.set_ylim(0, 1.06)  # small headroom above 100 % for error-bar caps

    yticks = np.arange(0, 1.01, 0.2)
    ax.set_yticks(yticks)
    ax.set_yticklabels([f"{v:.0%}" for v in yticks])
    _style_axes(ax)

    # One legend entry per provider, in order of first appearance on the axis.
    legend_colors = dict(zip(sub["provider"], colors))
    patches = [
        mpatches.Patch(color=c, label=p.title()) for p, c in legend_colors.items()
    ]
    ax.legend(handles=patches, loc="upper right", fontsize=8, ncol=2,
              frameon=False, labelcolor="#94A3B8")
    fig.tight_layout()
    return fig
631
 
 
644
  opus_stats["opus_label"] = opus_stats["model_id"].map(id_to_label)
645
 
646
  x = np.arange(len(opus_labels))
647
+ fig, ax = plt.subplots(figsize=(8.5, 4.6))
648
 
649
  for tier in ["benign", "borderline", "dual_use"]:
650
  sub = (
 
654
  )
655
  rates = np.asarray(sub["refusal_rate"], dtype=float)
656
  raw = np.asarray(sub["raw_rate"], dtype=float)
657
+ lo = np.asarray(sub["ci_lo"], dtype=float)
658
+ hi = np.asarray(sub["ci_hi"], dtype=float)
659
  color = TIER_COLORS[tier]
660
  label = TIER_LABELS[tier]
661
 
662
+ ax.plot(x, rates, marker="o", color=color, linewidth=2.3, label=label,
663
+ zorder=3, markersize=7, markeredgecolor="white", markeredgewidth=1.5)
664
  ax.fill_between(x, lo, hi, alpha=0.15, color=color, zorder=2)
665
  for xi, r, rr in zip(x, rates, raw):
666
  if not np.isnan(r):
667
  ax.annotate(
668
  f"{round(rr * 100):.0f}%",
669
  (xi, r),
670
+ textcoords="offset points", xytext=(0, 9),
671
+ ha="center", fontsize=8.5, color=color, fontweight="600",
672
  )
673
 
674
  ax.set_xticks(x)
675
+ ax.set_xticklabels(opus_labels, fontsize=10.5)
676
+ ax.set_ylabel("Strict refusal rate", fontsize=10)
677
  ax.set_ylim(0, 1.15)
678
+ ax.set_yticks(np.arange(0, 1.01, 0.2))
679
+ ax.set_yticklabels([f"{int(v*100)}%" for v in np.arange(0, 1.01, 0.2)])
680
+ _style_axes(ax)
681
+ leg = ax.legend(title="Tier", loc="center left", bbox_to_anchor=(1.01, 0.5),
682
+ frameon=False, labelcolor="#94A3B8", title_fontsize=9)
683
+ leg.get_title().set_color("#94A3B8")
684
  fig.tight_layout()
685
  return fig
686
 
 
691
  model_order = overall["model"].tolist()
692
 
693
  x = np.arange(len(model_order))
694
+ width = 0.24
695
  tiers = ["benign", "borderline", "dual_use"]
696
 
697
  fig, ax = plt.subplots(figsize=(13, 5))
 
701
  .set_index("model")
702
  .reindex(model_order)
703
  )
704
+ rates = np.asarray(sub["raw_rate"].fillna(0), dtype=float)
705
+ lo = np.asarray(sub["ci_lo"].fillna(0), dtype=float)
706
+ hi = np.asarray(sub["ci_hi"].fillna(0), dtype=float)
707
  offset = (i - 1) * width
708
  ax.bar(x + offset, rates, width, label=TIER_LABELS[tier],
709
+ color=TIER_COLORS[tier], alpha=0.92, edgecolor="none", zorder=3)
710
  ax.errorbar(
711
  x + offset, rates,
712
  yerr=[(rates - lo).clip(0), (hi - rates).clip(0)],
713
+ fmt="none", color="#475569", capsize=2, linewidth=0.8, alpha=0.65,
714
+ zorder=4,
715
  )
716
 
717
  ax.set_xticks(x)
718
  ax.set_xticklabels(model_order, rotation=35, ha="right", fontsize=8.5)
719
+ ax.set_ylabel("Strict refusal rate", fontsize=10)
720
+ ax.set_ylim(0, 1.10)
721
+ ax.set_yticks(np.arange(0, 1.01, 0.2))
722
+ ax.set_yticklabels([f"{int(v*100)}%" for v in np.arange(0, 1.01, 0.2)])
723
+ _style_axes(ax)
724
+ leg = ax.legend(title="Tier", fontsize=9, frameon=False, labelcolor="#94A3B8",
725
+ title_fontsize=9, loc="upper right")
726
+ leg.get_title().set_color("#94A3B8")
727
  fig.tight_layout()
728
  return fig
729
 
730
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  # ── App ───────────────────────────────────────────────────────────────────────
732
 
733
  try:
 
740
  except Exception as exc:
741
  raise SystemExit(f"[RefusalBench Space] Failed to load stats: {exc}") from exc
742
 
743
# Headline numbers computed once at import time; they are interpolated into
# HEADER_HTML / HERO_HTML and OVERALL_STATS is passed to the leaderboard builder.
OVERALL_STATS = overall_stats(STATS)
# NOTE(review): headline_spread is defined outside this view — presumably it
# returns (lowest rate, highest rate, lowest model name, highest model name)
# as fractions; confirm against its definition.
_LO, _HI, _LO_MODEL, _HI_MODEL = headline_spread(STATS)
_SPREAD_PP = round((_HI - _LO) * 100)  # high-minus-low gap, in whole percentage points
_N_TRIALS = int(STATS["n"].sum())  # sum of the "n" column over all STATS rows
_N_MODELS = int(STATS["model_id"].nunique())  # number of distinct model ids
748
+
749
# Title block rendered at the very top of the page via gr.HTML: project name,
# tagline, external links, the snapshot pill and the dataset counts. Only the
# model and trial counts are computed; the prompt count is fixed copy.
HEADER_HTML = f"""
<div class="rb-header">
<h1>RefusalBench</h1>
<div class="sub">Frontier-LLM refusal calibration on biological research prompts</div>
<div class="meta">
<a href="https://github.com/AppliedScientific/refusalbench" target="_blank">GitHub</a>
&nbsp;·&nbsp;
<a href="https://arxiv.org/abs/2605.21545" target="_blank">arXiv:2605.21545</a>
&nbsp;·&nbsp;
<span class="pill">v1.1-frozen · May 2026</span>
&nbsp;·&nbsp;
{_N_MODELS} models &nbsp;·&nbsp; 141 prompts &nbsp;·&nbsp; {_N_TRIALS:,} trials
</div>
</div>
"""
764
+
765
# Hero banner shown under the header: the spread (in percentage points) on the
# left, the one-sentence thesis and supporting copy on the right.
# NOTE(review): "the eight models" is fixed copy while the two rates and model
# names are computed from STATS — confirm the count stays in sync with the data.
HERO_HTML = f"""
<div class="rb-hero">
<div class="rb-hero-number">
<div class="big">{_SPREAD_PP} pp</div>
<div class="label">PC-Tier-A spread on benign</div>
</div>
<div class="rb-hero-text">
<span class="thesis">Refusal rate is not safety calibration.</span>
<span class="body">Across the eight models that reliably refuse clearly-dangerous prompts (PC&nbsp;Tier&nbsp;A,&nbsp;≥&nbsp;95&nbsp;%&nbsp;TPR), benign-prompt refusal ranges from <strong>{_LO:.0%} ({_LO_MODEL})</strong> to <strong>{_HI:.0%} ({_HI_MODEL})</strong>. The same headline number masks very different calibrations.</span>
</div>
</div>
"""
777
 
 
797
 
798
  | Tier | TPR threshold | Interpretation |
799
  |---|---|---|
800
+ | **A** | ≥ 95 % | Reliably refuses clearly dangerous prompts |
801
+ | **B** | 9–73 % | Intermediate calibration |
802
+ | **C** | ≤ 1.3 % | Effectively never refuses |
803
  | **—** | Gap zone | Between formal tiers |
804
 
805
  ---
 
807
  ## Snapshot
808
 
809
  - **Version:** v1.1-frozen (May 2026)
810
+ - **Main sweep:** 18 frontier models + 1 control (Llama 3.3 70B — non-frontier open-source)
811
+ - **v1.1 addition:** NVIDIA Nemotron 3 Super 120B
812
  - **Data:** `data/adjudicated.csv` (bundled in this Space) — compliance labels only; raw prompt text is not published. Full snapshot in the [GitHub repo](https://github.com/AppliedScientific/refusalbench).
813
 
814
  ---
 
841
 
842
  with gr.Blocks(
843
  theme=gr.themes.Soft(
844
+ primary_hue="indigo",
845
+ secondary_hue="red",
846
+ neutral_hue="slate",
847
+ font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
848
  ),
849
  title="RefusalBench",
850
+ css=CSS,
 
 
 
851
  ) as demo:
852
 
853
+ gr.HTML(HEADER_HTML)
854
+ gr.HTML(HERO_HTML)
855
 
856
  with gr.Tabs():
857
 
858
  # ── Tab 1: Leaderboard ─────────────────────────────────────────────
859
+ with gr.Tab("Leaderboard"):
860
  with gr.Row():
861
  jur_dd = gr.Dropdown(
862
  choices=["All", "US", "EU", "Asia"],
 
875
  value=build_leaderboard_html(STATS, OVERALL_STATS, "All", "Overall")
876
  )
877
 
878
+ jur_dd.change(fn=update_leaderboard,
879
+ inputs=[jur_dd, sort_dd], outputs=leaderboard_html)
880
+ sort_dd.change(fn=update_leaderboard,
881
+ inputs=[jur_dd, sort_dd], outputs=leaderboard_html)
 
 
 
 
 
 
882
 
883
  # ── Tab 2: Provider figures ────────────────────────────────────────
884
+ with gr.Tab("Provider Analysis"):
885
  gr.Markdown(
886
+ "**Figure 1.** Benign-tier strict refusal rate for all 19 models, "
887
+ "sorted descending, coloured by provider. Error bars = Wilson 95 % CI."
 
888
  )
889
  gr.Plot(value=make_fig1(STATS))
890
 
891
  gr.Markdown(
892
+ "**Figure 2.** Tier-stratified rates across all 19 models "
893
+ "benign / borderline / dual-use side-by-side."
 
894
  )
895
  gr.Plot(value=make_fig5(STATS))
896
 
897
  # ── Tab 3: Longitudinal ────────────────────────────────────────────
898
+ with gr.Tab("Opus Longitudinal"):
899
  gr.Markdown(
900
+ "**Figure 3.** Refusal trajectory across Opus 4.5 to 4.6 to 4.7 "
901
+ "by tier. Shaded bands = Wilson 95 % CI."
 
 
902
  )
903
  gr.Plot(value=make_fig3(STATS))
904
  gr.Markdown(
905
  """
906
+ **Key finding (H4).** Dual-use refusal is at ceiling (100 %) across all three Opus versions.
907
+ Benign-tier refusal is flat from Opus 4.5 → 4.6 (33 %), then jumps **+44 pp** to 77 % at Opus 4.7,
908
+ reducing Youden's J by 65 % (from +67 pp to +23 pp). The 4.6 → 4.7 McNemar test gives
909
  χ²(cc) = 107, p ≈ 0 on 703 matched triples, with 112 new benign refusals and 0 reversals.
910
  """
911
  )
912
 
913
  # ── Tab 4: About ───────────────────────────────────────────────────
914
+ with gr.Tab("About"):
915
  gr.Markdown(ABOUT_MD)
916
 
917
+
918
# Start the Gradio server only when this file is executed as a script, so that
# importing the module does not launch it.
if __name__ == "__main__":
    demo.launch()