lbartoszcze committed on
Commit
f24cb80
·
verified ·
1 Parent(s): 26802a6

Add Methods Comparison tab with delta from baseline

Browse files
Files changed (1) hide show
  1. app.py +275 -50
app.py CHANGED
@@ -15,6 +15,28 @@ server = app.server
15
  # Load leaderboard data
16
  DATA_FILE = "leaderboard.csv"
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def load_data():
19
  """Load leaderboard data from CSV."""
20
  if os.path.exists(DATA_FILE):
@@ -32,8 +54,47 @@ def load_data():
32
  "total_prompts", "timestamp", "submitter", "sample_responses_url"
33
  ])
34
 
35
- # Column definitions for AG Grid
36
- COLUMN_DEFS = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  {
38
  "field": "Rank",
39
  "headerName": "🏆",
@@ -112,6 +173,71 @@ COLUMN_DEFS = [
112
  },
113
  ]
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  # Topic-specific columns (added dynamically if present)
116
  TOPIC_COLUMNS = [
117
  "cybersecurity", "piracy", "weapons", "drugs", "fraud",
@@ -120,10 +246,11 @@ TOPIC_COLUMNS = [
120
  "evasion", "self_harm", "adult_content"
121
  ]
122
 
123
- def get_column_defs(df):
 
124
  """Get column definitions based on available data."""
125
- cols = COLUMN_DEFS.copy()
126
-
127
  # Add topic columns if they exist in the data
128
  for topic in TOPIC_COLUMNS:
129
  if topic in df.columns:
@@ -134,9 +261,10 @@ def get_column_defs(df):
134
  "sortable": True,
135
  "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
136
  })
137
-
138
  return cols
139
 
 
140
  # App layout
141
  app.layout = html.Div([
142
  # Header
@@ -147,7 +275,7 @@ app.layout = html.Div([
147
  style={"color": "#666", "marginTop": "0"}
148
  ),
149
  ], style={"textAlign": "center", "padding": "20px"}),
150
-
151
  # Info banner
152
  html.Div([
153
  html.Div([
@@ -179,7 +307,7 @@ app.layout = html.Div([
179
  "marginLeft": "20px",
180
  "marginRight": "20px",
181
  }),
182
-
183
  # Stats summary
184
  html.Div(id="stats-summary", style={
185
  "display": "flex",
@@ -187,35 +315,23 @@ app.layout = html.Div([
187
  "gap": "40px",
188
  "marginBottom": "20px",
189
  }),
190
-
191
- # Leaderboard table
192
- html.Div([
193
- dag.AgGrid(
194
- id="leaderboard-grid",
195
- columnDefs=COLUMN_DEFS,
196
- rowData=[],
197
- defaultColDef={
198
- "resizable": True,
199
- "sortable": True,
200
- },
201
- dashGridOptions={
202
- "pagination": True,
203
- "paginationPageSize": 50,
204
- "animateRows": True,
205
- "rowSelection": "single",
206
- },
207
- style={"height": "600px"},
208
- className="ag-theme-alpine",
209
- ),
210
- ], style={"padding": "0 20px 20px 20px"}),
211
-
212
  # Refresh interval
213
  dcc.Interval(
214
  id="refresh-interval",
215
  interval=60000, # Refresh every 60 seconds
216
  n_intervals=0
217
  ),
218
-
219
  # Footer
220
  html.Div([
221
  html.Hr(),
@@ -231,37 +347,54 @@ app.layout = html.Div([
231
  html.A("Submit your model", href="https://github.com/wisent-ai/uncensorbench#how-to-submit", target="_blank"),
232
  ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}),
233
  ], style={"padding": "20px"}),
234
-
235
  ], style={"fontFamily": "system-ui, -apple-system, sans-serif"})
236
 
237
 
238
  @callback(
239
- [Output("leaderboard-grid", "rowData"),
240
- Output("leaderboard-grid", "columnDefs"),
241
- Output("stats-summary", "children")],
242
  Input("refresh-interval", "n_intervals")
243
  )
244
- def update_leaderboard(n):
245
- """Update the leaderboard data."""
246
  df = load_data()
247
-
248
- # Get column definitions
249
- col_defs = get_column_defs(df)
250
-
251
- # Calculate stats
252
  if len(df) > 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  stats = [
254
  html.Div([
255
  html.Div(str(len(df)), style={"fontSize": "2em", "fontWeight": "bold", "color": "#2196F3"}),
256
  html.Div("Models", style={"color": "#666"}),
257
  ], style={"textAlign": "center"}),
258
  html.Div([
259
- html.Div(f"{df['uncensored_rate'].mean():.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#4CAF50"}),
260
- html.Div("Avg Uncensored Rate", style={"color": "#666"}),
261
  ], style={"textAlign": "center"}),
262
  html.Div([
263
  html.Div(f"{df['uncensored_rate'].max():.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#FF9800"}),
264
- html.Div("Best Uncensored Rate", style={"color": "#666"}),
 
 
 
 
 
 
 
265
  ], style={"textAlign": "center"}),
266
  ]
267
  else:
@@ -274,11 +407,103 @@ def update_leaderboard(n):
274
  html.P("No submissions yet. Be the first to submit!", style={"color": "#666"}),
275
  ], style={"textAlign": "center"}),
276
  ]
277
-
278
- # Convert to records for AG Grid
279
- row_data = df.to_dict("records") if len(df) > 0 else []
280
-
281
- return row_data, col_defs, stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
 
284
  if __name__ == "__main__":
 
15
  # Load leaderboard data
16
  DATA_FILE = "leaderboard.csv"
17
 
18
# Censorship-removal methods known to the leaderboard, one row per method:
# (identifier as it appears in the "method" column, human-readable
# description, display colour). The three public lookups below are all
# derived from this table so their keys cannot drift apart.
_METHOD_TABLE = (
    ("none", "Baseline (no modification)", "#9E9E9E"),
    ("abliteration", "Abliteration technique", "#E91E63"),
    ("steering", "Steering vectors", "#2196F3"),
    ("finetuning", "Fine-tuning based", "#4CAF50"),
    ("prompting", "Prompt-based jailbreaking", "#FF9800"),
    ("other", "Other methods", "#9C27B0"),
)

# Valid methods for censorship removal, in display order.
VALID_METHODS = [name for name, _description, _color in _METHOD_TABLE]

# Method identifier -> short description.
METHOD_DESCRIPTIONS = {name: description for name, description, _color in _METHOD_TABLE}

# Method identifier -> hex colour.
METHOD_COLORS = {name: color for name, _description, color in _METHOD_TABLE}
38
+
39
+
40
  def load_data():
41
  """Load leaderboard data from CSV."""
42
  if os.path.exists(DATA_FILE):
 
54
  "total_prompts", "timestamp", "submitter", "sample_responses_url"
55
  ])
56
 
57
+
58
def calculate_method_stats(df):
    """Summarise the leaderboard per censorship-removal method.

    Args:
        df: Leaderboard DataFrame. The "method", "uncensored_rate",
            "avg_compliance_score" and "model" columns are read.

    Returns:
        A DataFrame with one row per entry of VALID_METHODS that has at
        least one submission, with the columns method, description,
        num_models, avg/max/min_uncensored_rate, avg_compliance_score,
        delta_from_baseline and best_model. An empty DataFrame (no
        columns) is returned when ``df`` has no rows.
    """
    if len(df) == 0:
        return pd.DataFrame()

    # Baseline = mean uncensored rate of unmodified models (method == "none").
    # When no such model exists the baseline falls back to 0, so every delta
    # then equals the method's own average.
    unmodified = df[df["method"] == "none"]
    baseline_avg = 0
    if len(unmodified) > 0:
        baseline_avg = unmodified["uncensored_rate"].mean()

    rows = []
    for method in VALID_METHODS:
        group = df[df["method"] == method]
        if len(group) == 0:
            # Methods without submissions get no row at all.
            continue

        rates = group["uncensored_rate"]
        mean_rate = rates.mean()
        rows.append({
            "method": method,
            "description": METHOD_DESCRIPTIONS.get(method, method),
            "num_models": len(group),
            "avg_uncensored_rate": mean_rate,
            "max_uncensored_rate": rates.max(),
            "min_uncensored_rate": rates.min(),
            "avg_compliance_score": group["avg_compliance_score"].mean(),
            "delta_from_baseline": mean_rate - baseline_avg,
            # Model name on the row holding this method's highest rate.
            "best_model": group.loc[rates.idxmax(), "model"],
        })

    return pd.DataFrame(rows)
94
+
95
+
96
+ # Column definitions for Models AG Grid
97
+ MODEL_COLUMN_DEFS = [
98
  {
99
  "field": "Rank",
100
  "headerName": "🏆",
 
173
  },
174
  ]
175
 
176
def _js(expression):
    """Wrap a JavaScript expression string in a fresh {"function": ...} dict."""
    return {"function": expression}


def _method_column(field, header, width, pinned=None, **extra):
    """Build one sortable column definition for the Methods grid.

    Keys are inserted as field, headerName, width, [pinned], sortable and
    then any extra options in the order given.
    """
    column = {"field": field, "headerName": header, "width": width}
    if pinned is not None:
        column["pinned"] = pinned
    column["sortable"] = True
    column.update(extra)
    return column


# Column definitions for Methods AG Grid
METHOD_COLUMN_DEFS = [
    _method_column("method", "Method", 130, pinned="left"),
    _method_column("description", "Description", 200),
    _method_column("num_models", "# Models", 100),
    _method_column(
        "avg_uncensored_rate", "Avg Uncensored ⬆️", 150,
        valueFormatter=_js("d3.format('.1%')(params.value)"),
    ),
    # Delta column: explicit "+" prefix for non-negative values, green/bold
    # for gains, red for losses, default styling for exactly zero.
    _method_column(
        "delta_from_baseline", "Δ vs Baseline", 130,
        valueFormatter=_js("params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"),
        cellStyle=_js("params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"),
    ),
    _method_column(
        "max_uncensored_rate", "Best Rate", 110,
        valueFormatter=_js("d3.format('.1%')(params.value)"),
    ),
    _method_column(
        "min_uncensored_rate", "Worst Rate", 110,
        valueFormatter=_js("d3.format('.1%')(params.value)"),
    ),
    _method_column(
        "avg_compliance_score", "Avg Compliance", 140,
        valueFormatter=_js("d3.format('.3f')(params.value)"),
    ),
    _method_column("best_model", "Best Model", 280),
]
240
+
241
  # Topic-specific columns (added dynamically if present)
242
  TOPIC_COLUMNS = [
243
  "cybersecurity", "piracy", "weapons", "drugs", "fraud",
 
246
  "evasion", "self_harm", "adult_content"
247
  ]
248
 
249
+
250
+ def get_model_column_defs(df):
251
  """Get column definitions based on available data."""
252
+ cols = MODEL_COLUMN_DEFS.copy()
253
+
254
  # Add topic columns if they exist in the data
255
  for topic in TOPIC_COLUMNS:
256
  if topic in df.columns:
 
261
  "sortable": True,
262
  "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
263
  })
264
+
265
  return cols
266
 
267
+
268
  # App layout
269
  app.layout = html.Div([
270
  # Header
 
275
  style={"color": "#666", "marginTop": "0"}
276
  ),
277
  ], style={"textAlign": "center", "padding": "20px"}),
278
+
279
  # Info banner
280
  html.Div([
281
  html.Div([
 
307
  "marginLeft": "20px",
308
  "marginRight": "20px",
309
  }),
310
+
311
  # Stats summary
312
  html.Div(id="stats-summary", style={
313
  "display": "flex",
 
315
  "gap": "40px",
316
  "marginBottom": "20px",
317
  }),
318
+
319
+ # Tabs for Models and Methods views
320
+ dcc.Tabs(id="view-tabs", value="models", children=[
321
+ dcc.Tab(label="📋 Models Leaderboard", value="models", style={"fontWeight": "bold"}),
322
+ dcc.Tab(label="🔬 Methods Comparison", value="methods", style={"fontWeight": "bold"}),
323
+ ], style={"marginLeft": "20px", "marginRight": "20px"}),
324
+
325
+ # Tab content
326
+ html.Div(id="tab-content", style={"padding": "20px"}),
327
+
 
 
 
 
 
 
 
 
 
 
 
 
328
  # Refresh interval
329
  dcc.Interval(
330
  id="refresh-interval",
331
  interval=60000, # Refresh every 60 seconds
332
  n_intervals=0
333
  ),
334
+
335
  # Footer
336
  html.Div([
337
  html.Hr(),
 
347
  html.A("Submit your model", href="https://github.com/wisent-ai/uncensorbench#how-to-submit", target="_blank"),
348
  ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}),
349
  ], style={"padding": "20px"}),
350
+
351
  ], style={"fontFamily": "system-ui, -apple-system, sans-serif"})
352
 
353
 
354
  @callback(
355
+ Output("stats-summary", "children"),
 
 
356
  Input("refresh-interval", "n_intervals")
357
  )
358
+ def update_stats(n):
359
+ """Update the stats summary."""
360
  df = load_data()
361
+
 
 
 
 
362
  if len(df) > 0:
363
+ # Calculate method stats for the summary
364
+ baseline_df = df[df["method"] == "none"]
365
+ baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
366
+
367
+ # Find best non-baseline method
368
+ non_baseline = df[df["method"] != "none"]
369
+ best_method_avg = 0
370
+ best_method = "N/A"
371
+ if len(non_baseline) > 0:
372
+ method_avgs = non_baseline.groupby("method")["uncensored_rate"].mean()
373
+ if len(method_avgs) > 0:
374
+ best_method = method_avgs.idxmax()
375
+ best_method_avg = method_avgs.max()
376
+
377
+ best_delta = best_method_avg - baseline_avg if best_method_avg > 0 else 0
378
+
379
  stats = [
380
  html.Div([
381
  html.Div(str(len(df)), style={"fontSize": "2em", "fontWeight": "bold", "color": "#2196F3"}),
382
  html.Div("Models", style={"color": "#666"}),
383
  ], style={"textAlign": "center"}),
384
  html.Div([
385
+ html.Div(f"{baseline_avg:.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#9E9E9E"}),
386
+ html.Div("Baseline Avg", style={"color": "#666"}),
387
  ], style={"textAlign": "center"}),
388
  html.Div([
389
  html.Div(f"{df['uncensored_rate'].max():.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#FF9800"}),
390
+ html.Div("Best Rate", style={"color": "#666"}),
391
+ ], style={"textAlign": "center"}),
392
+ html.Div([
393
+ html.Div(
394
+ f"+{best_delta:.1%}" if best_delta > 0 else f"{best_delta:.1%}",
395
+ style={"fontSize": "2em", "fontWeight": "bold", "color": "#4CAF50" if best_delta > 0 else "#f44336"}
396
+ ),
397
+ html.Div(f"Best Method Δ ({best_method})", style={"color": "#666"}),
398
  ], style={"textAlign": "center"}),
399
  ]
400
  else:
 
407
  html.P("No submissions yet. Be the first to submit!", style={"color": "#666"}),
408
  ], style={"textAlign": "center"}),
409
  ]
410
+
411
+ return stats
412
+
413
+
414
def _render_models_tab(df):
    """Build the Models Leaderboard view: one AG Grid row per leaderboard entry."""
    col_defs = get_model_column_defs(df)
    row_data = df.to_dict("records") if len(df) > 0 else []

    return html.Div([
        dag.AgGrid(
            id="leaderboard-grid",
            columnDefs=col_defs,
            rowData=row_data,
            defaultColDef={
                "resizable": True,
                "sortable": True,
            },
            dashGridOptions={
                "pagination": True,
                "paginationPageSize": 50,
                "animateRows": True,
                "rowSelection": "single",
            },
            style={"height": "600px"},
            className="ag-theme-alpine",
        ),
    ])


def _render_methods_tab(df):
    """Build the Methods Comparison view: intro text, per-method grid and legend."""
    method_df = calculate_method_stats(df)

    # Convert to records exactly once, after sorting, so the grid opens with
    # the largest improvement over the baseline on top.
    if len(method_df) > 0:
        method_df = method_df.sort_values("delta_from_baseline", ascending=False)
        row_data = method_df.to_dict("records")
    else:
        row_data = []

    return html.Div([
        # Method comparison description
        html.Div([
            html.P([
                "Compare censorship removal methods. ",
                html.Strong("Δ vs Baseline"),
                " shows the improvement over unmodified models (method=none)."
            ], style={"color": "#666", "marginBottom": "15px"}),
        ]),

        # Methods grid
        dag.AgGrid(
            id="methods-grid",
            columnDefs=METHOD_COLUMN_DEFS,
            rowData=row_data,
            defaultColDef={
                "resizable": True,
                "sortable": True,
            },
            dashGridOptions={
                "animateRows": True,
                "rowSelection": "single",
            },
            style={"height": "400px"},
            className="ag-theme-alpine",
        ),

        # Method legend: every known method, whether or not it has submissions
        html.Div([
            html.H4("Method Definitions", style={"marginTop": "30px", "marginBottom": "15px"}),
            html.Div([
                html.Div([
                    html.Span(
                        f"● {method}",
                        style={"color": METHOD_COLORS.get(method, "#666"), "fontWeight": "bold", "marginRight": "10px"}
                    ),
                    html.Span(desc, style={"color": "#666"}),
                ], style={"marginBottom": "8px"})
                for method, desc in METHOD_DESCRIPTIONS.items()
            ], style={"columns": "2", "columnGap": "40px"}),
        ], style={
            "backgroundColor": "#f9f9f9",
            "padding": "20px",
            "borderRadius": "8px",
            "marginTop": "20px",
        }),
    ])


@callback(
    Output("tab-content", "children"),
    [Input("view-tabs", "value"),
     Input("refresh-interval", "n_intervals")]
)
def render_tab_content(tab, n):
    """Render the content of the currently selected tab.

    Triggered when the selected tab changes and on every tick of the
    refresh interval; the leaderboard data is reloaded on each call.

    Args:
        tab: Value of the "view-tabs" component ("models" or "methods").
        n: Tick count of "refresh-interval"; unused, present only so the
            interval triggers a re-render.

    Returns:
        The Dash component tree for the selected tab, or a placeholder
        Div when ``tab`` matches neither known value.
    """
    df = load_data()

    if tab == "models":
        return _render_models_tab(df)
    if tab == "methods":
        return _render_methods_tab(df)

    return html.Div("Select a tab")
507
 
508
 
509
  if __name__ == "__main__":