lbartoszcze committed on
Commit
e2129ad
·
verified ·
1 Parent(s): a55ede4

Add paired comparison logic for accurate method effectiveness calculation

Browse files
Files changed (1) hide show
  1. app.py +112 -15
app.py CHANGED
@@ -65,18 +65,42 @@ def get_method_color(method, method_index=0):
65
  return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  def calculate_method_stats(df):
69
- """Calculate statistics for each method including delta from baseline."""
 
 
 
 
 
 
 
 
 
70
  if len(df) == 0:
71
  return pd.DataFrame(), {}
72
 
73
  # Get all unique methods from the actual data
74
  all_methods = df["method"].dropna().unique().tolist()
75
 
76
- # Get baseline average (method = "none")
77
- baseline_df = df[df["method"] == "none"]
78
- baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
79
-
80
  # Build dynamic color mapping for any new methods
81
  dynamic_method_colors = {}
82
  dynamic_idx = 0
@@ -87,6 +111,20 @@ def calculate_method_stats(df):
87
  dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
88
  dynamic_idx += 1
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  # Group by method - iterate over actual methods in the data
91
  method_stats = []
92
  for method in all_methods:
@@ -96,11 +134,45 @@ def calculate_method_stats(df):
96
  max_rate = method_df["uncensored_rate"].max()
97
  min_rate = method_df["uncensored_rate"].min()
98
  avg_compliance = method_df["avg_compliance_score"].mean()
99
- delta = avg_rate - baseline_avg
100
 
101
  # Find best model for this method
102
  best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  # Get description - use predefined or just capitalize the method name
105
  description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
106
 
@@ -113,6 +185,8 @@ def calculate_method_stats(df):
113
  "min_uncensored_rate": min_rate,
114
  "avg_compliance_score": avg_compliance,
115
  "delta_from_baseline": delta,
 
 
116
  "best_model": best_model,
117
  })
118
 
@@ -211,13 +285,13 @@ METHOD_COLUMN_DEFS = [
211
  {
212
  "field": "description",
213
  "headerName": "Description",
214
- "width": 200,
215
  "sortable": True,
216
  },
217
  {
218
  "field": "num_models",
219
  "headerName": "# Models",
220
- "width": 100,
221
  "sortable": True,
222
  },
223
  {
@@ -230,36 +304,50 @@ METHOD_COLUMN_DEFS = [
230
  {
231
  "field": "delta_from_baseline",
232
  "headerName": "Δ vs Baseline",
233
- "width": 130,
234
  "sortable": True,
235
  "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
236
  "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
237
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  {
239
  "field": "max_uncensored_rate",
240
  "headerName": "Best Rate",
241
- "width": 110,
242
  "sortable": True,
243
  "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
244
  },
245
  {
246
  "field": "min_uncensored_rate",
247
  "headerName": "Worst Rate",
248
- "width": 110,
249
  "sortable": True,
250
  "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
251
  },
252
  {
253
  "field": "avg_compliance_score",
254
  "headerName": "Avg Compliance",
255
- "width": 140,
256
  "sortable": True,
257
  "valueFormatter": {"function": "d3.format('.3f')(params.value)"},
258
  },
259
  {
260
  "field": "best_model",
261
  "headerName": "Best Model",
262
- "width": 280,
263
  "sortable": True,
264
  },
265
  ]
@@ -503,8 +591,17 @@ def render_tab_content(tab, n):
503
  html.P([
504
  "Compare censorship removal methods. ",
505
  html.Strong("Δ vs Baseline"),
506
- " shows the improvement over unmodified models (method=none)."
507
- ], style={"color": "#666", "marginBottom": "15px"}),
 
 
 
 
 
 
 
 
 
508
  ]),
509
 
510
  # Methods grid
 
65
  return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
66
 
67
 
68
def extract_base_model_name(model_name):
    """
    Extract the base model name used to pair a modified model with its baseline.

    At most one known method suffix is stripped from the end of the name,
    matched case-insensitively. E.g.
    'meta-llama/Llama-3.2-1B-Instruct-abliterated' -> 'meta-llama/Llama-3.2-1B-Instruct'

    Args:
        model_name: Model identifier. Expected to be a string; any other value
            (e.g. None or a float NaN from a missing cell) is returned unchanged
            instead of raising AttributeError.

    Returns:
        The name with the first matching suffix removed, or the input unchanged
        when no suffix matches or the input is not a string.
    """
    # A missing "model" cell is not a string; there is nothing to strip.
    if not isinstance(model_name, str):
        return model_name

    # Common suffixes added by methods (all lowercase; matching is done
    # against the lowercased name).
    suffixes_to_remove = (
        "-abliterated", "-uncensored", "-steered", "-finetuned",
        "_abliterated", "_uncensored", "_steered", "_finetuned",
        "-ablation", "-steering", "-ft",
    )

    lowered = model_name.lower()
    for suffix in suffixes_to_remove:
        if lowered.endswith(suffix):
            # Only the first matching suffix is removed.
            return model_name[:-len(suffix)]
    return model_name
85
+
86
+
87
  def calculate_method_stats(df):
88
+ """
89
+ Calculate statistics for each method including delta from baseline.
90
+
91
+ Delta calculation:
92
+ 1. PAIRED: For models that have both baseline (none) and method versions,
93
+ calculate the actual improvement (method_rate - baseline_rate) for each pair,
94
+ then average across pairs.
95
+ 2. UNPAIRED: For methods without paired baselines, show the difference from
96
+ the global baseline average (less reliable).
97
+ """
98
  if len(df) == 0:
99
  return pd.DataFrame(), {}
100
 
101
  # Get all unique methods from the actual data
102
  all_methods = df["method"].dropna().unique().tolist()
103
 
 
 
 
 
104
  # Build dynamic color mapping for any new methods
105
  dynamic_method_colors = {}
106
  dynamic_idx = 0
 
111
  dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
112
  dynamic_idx += 1
113
 
114
+ # Get baseline data
115
+ baseline_df = df[df["method"] == "none"].copy()
116
+ global_baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
117
+
118
+ # Create lookup for baseline rates by model family + size (for pairing)
119
+ baseline_lookup = {}
120
+ if len(baseline_df) > 0:
121
+ for _, row in baseline_df.iterrows():
122
+ # Key by model_family + model_size for matching
123
+ key = (row.get("model_family", ""), row.get("model_size", ""))
124
+ base_model_key = extract_base_model_name(row.get("model", ""))
125
+ baseline_lookup[key] = row["uncensored_rate"]
126
+ baseline_lookup[base_model_key] = row["uncensored_rate"]
127
+
128
  # Group by method - iterate over actual methods in the data
129
  method_stats = []
130
  for method in all_methods:
 
134
  max_rate = method_df["uncensored_rate"].max()
135
  min_rate = method_df["uncensored_rate"].min()
136
  avg_compliance = method_df["avg_compliance_score"].mean()
 
137
 
138
  # Find best model for this method
139
  best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
140
 
141
+ # Calculate paired delta where possible
142
+ paired_deltas = []
143
+ unpaired_count = 0
144
+
145
+ if method != "none":
146
+ for _, row in method_df.iterrows():
147
+ # Try to find matching baseline
148
+ key = (row.get("model_family", ""), row.get("model_size", ""))
149
+ base_model_key = extract_base_model_name(row.get("model", ""))
150
+
151
+ baseline_rate = None
152
+ if base_model_key in baseline_lookup:
153
+ baseline_rate = baseline_lookup[base_model_key]
154
+ elif key in baseline_lookup:
155
+ baseline_rate = baseline_lookup[key]
156
+
157
+ if baseline_rate is not None:
158
+ paired_deltas.append(row["uncensored_rate"] - baseline_rate)
159
+ else:
160
+ unpaired_count += 1
161
+
162
+ # Calculate delta
163
+ if method == "none":
164
+ delta = 0.0
165
+ paired_count = len(method_df)
166
+ delta_type = "baseline"
167
+ elif len(paired_deltas) > 0:
168
+ delta = sum(paired_deltas) / len(paired_deltas)
169
+ paired_count = len(paired_deltas)
170
+ delta_type = "paired" if unpaired_count == 0 else "mixed"
171
+ else:
172
+ delta = avg_rate - global_baseline_avg
173
+ paired_count = 0
174
+ delta_type = "unpaired"
175
+
176
  # Get description - use predefined or just capitalize the method name
177
  description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
178
 
 
185
  "min_uncensored_rate": min_rate,
186
  "avg_compliance_score": avg_compliance,
187
  "delta_from_baseline": delta,
188
+ "paired_comparisons": paired_count,
189
+ "delta_type": delta_type,
190
  "best_model": best_model,
191
  })
192
 
 
285
  {
286
  "field": "description",
287
  "headerName": "Description",
288
+ "width": 180,
289
  "sortable": True,
290
  },
291
  {
292
  "field": "num_models",
293
  "headerName": "# Models",
294
+ "width": 90,
295
  "sortable": True,
296
  },
297
  {
 
304
  {
305
  "field": "delta_from_baseline",
306
  "headerName": "Δ vs Baseline",
307
+ "width": 120,
308
  "sortable": True,
309
  "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
310
  "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
311
  },
312
+ {
313
+ "field": "delta_type",
314
+ "headerName": "Δ Type",
315
+ "width": 100,
316
+ "sortable": True,
317
+ "cellStyle": {"function": "params.value === 'paired' ? {'color': '#4CAF50'} : params.value === 'unpaired' ? {'color': '#FF9800'} : {}"},
318
+ "tooltipField": "delta_type",
319
+ },
320
+ {
321
+ "field": "paired_comparisons",
322
+ "headerName": "# Pairs",
323
+ "width": 80,
324
+ "sortable": True,
325
+ },
326
  {
327
  "field": "max_uncensored_rate",
328
  "headerName": "Best Rate",
329
+ "width": 100,
330
  "sortable": True,
331
  "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
332
  },
333
  {
334
  "field": "min_uncensored_rate",
335
  "headerName": "Worst Rate",
336
+ "width": 100,
337
  "sortable": True,
338
  "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
339
  },
340
  {
341
  "field": "avg_compliance_score",
342
  "headerName": "Avg Compliance",
343
+ "width": 130,
344
  "sortable": True,
345
  "valueFormatter": {"function": "d3.format('.3f')(params.value)"},
346
  },
347
  {
348
  "field": "best_model",
349
  "headerName": "Best Model",
350
+ "width": 260,
351
  "sortable": True,
352
  },
353
  ]
 
591
  html.P([
592
  "Compare censorship removal methods. ",
593
  html.Strong("Δ vs Baseline"),
594
+ " shows the improvement over unmodified models."
595
+ ], style={"color": "#666", "marginBottom": "5px"}),
596
+ html.P([
597
+ html.Strong("Δ Type: ", style={"color": "#333"}),
598
+ html.Span("paired", style={"color": "#4CAF50", "fontWeight": "bold"}),
599
+ " = same model compared with/without method (reliable). ",
600
+ html.Span("unpaired", style={"color": "#FF9800", "fontWeight": "bold"}),
601
+ " = compared to global baseline avg (less reliable). ",
602
+ html.Span("mixed", style={"color": "#666"}),
603
+ " = some paired, some unpaired."
604
+ ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
605
  ]),
606
 
607
  # Methods grid