lbartoszcze committed on
Commit
605a4dd
·
verified ·
1 Parent(s): e2129ad

Update Methods tab to use paired comparisons only

Browse files
Files changed (1) hide show
  1. app.py +110 -129
app.py CHANGED
@@ -65,35 +65,15 @@ def get_method_color(method, method_index=0):
65
  return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
66
 
67
 
68
- def extract_base_model_name(model_name):
69
- """
70
- Extract the base model name for pairing.
71
- E.g., 'meta-llama/Llama-3.2-1B-Instruct-abliterated' -> 'meta-llama/Llama-3.2-1B-Instruct'
72
  """
73
- # Common suffixes added by methods
74
- suffixes_to_remove = [
75
- "-abliterated", "-uncensored", "-steered", "-finetuned",
76
- "_abliterated", "_uncensored", "_steered", "_finetuned",
77
- "-ablation", "-steering", "-ft",
78
- ]
79
- base_name = model_name
80
- for suffix in suffixes_to_remove:
81
- if base_name.lower().endswith(suffix.lower()):
82
- base_name = base_name[:-len(suffix)]
83
- break
84
- return base_name
85
 
 
 
 
86
 
87
- def calculate_method_stats(df):
88
- """
89
- Calculate statistics for each method including delta from baseline.
90
-
91
- Delta calculation:
92
- 1. PAIRED: For models that have both baseline (none) and method versions,
93
- calculate the actual improvement (method_rate - baseline_rate) for each pair,
94
- then average across pairs.
95
- 2. UNPAIRED: For methods without paired baselines, show the difference from
96
- the global baseline average (less reliable).
97
  """
98
  if len(df) == 0:
99
  return pd.DataFrame(), {}
@@ -111,84 +91,102 @@ def calculate_method_stats(df):
111
  dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
112
  dynamic_idx += 1
113
 
114
- # Get baseline data
115
  baseline_df = df[df["method"] == "none"].copy()
116
- global_baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
117
-
118
- # Create lookup for baseline rates by model family + size (for pairing)
119
  baseline_lookup = {}
120
  if len(baseline_df) > 0:
121
  for _, row in baseline_df.iterrows():
122
- # Key by model_family + model_size for matching
123
- key = (row.get("model_family", ""), row.get("model_size", ""))
124
- base_model_key = extract_base_model_name(row.get("model", ""))
125
- baseline_lookup[key] = row["uncensored_rate"]
126
- baseline_lookup[base_model_key] = row["uncensored_rate"]
127
 
128
- # Group by method - iterate over actual methods in the data
129
  method_stats = []
 
130
  for method in all_methods:
131
  method_df = df[df["method"] == method]
132
- if len(method_df) > 0:
133
- avg_rate = method_df["uncensored_rate"].mean()
134
- max_rate = method_df["uncensored_rate"].max()
135
- min_rate = method_df["uncensored_rate"].min()
136
- avg_compliance = method_df["avg_compliance_score"].mean()
137
-
138
- # Find best model for this method
139
- best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
140
-
141
- # Calculate paired delta where possible
142
- paired_deltas = []
143
- unpaired_count = 0
144
-
145
- if method != "none":
146
- for _, row in method_df.iterrows():
147
- # Try to find matching baseline
148
- key = (row.get("model_family", ""), row.get("model_size", ""))
149
- base_model_key = extract_base_model_name(row.get("model", ""))
150
-
151
- baseline_rate = None
152
- if base_model_key in baseline_lookup:
153
- baseline_rate = baseline_lookup[base_model_key]
154
- elif key in baseline_lookup:
155
- baseline_rate = baseline_lookup[key]
156
-
157
- if baseline_rate is not None:
158
- paired_deltas.append(row["uncensored_rate"] - baseline_rate)
159
- else:
160
- unpaired_count += 1
161
-
162
- # Calculate delta
163
- if method == "none":
164
- delta = 0.0
165
- paired_count = len(method_df)
166
- delta_type = "baseline"
167
- elif len(paired_deltas) > 0:
168
- delta = sum(paired_deltas) / len(paired_deltas)
169
- paired_count = len(paired_deltas)
170
- delta_type = "paired" if unpaired_count == 0 else "mixed"
171
- else:
172
- delta = avg_rate - global_baseline_avg
173
- paired_count = 0
174
- delta_type = "unpaired"
175
-
176
- # Get description - use predefined or just capitalize the method name
177
- description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
178
-
179
- method_stats.append({
180
- "method": method,
181
- "description": description,
182
- "num_models": len(method_df),
183
- "avg_uncensored_rate": avg_rate,
184
- "max_uncensored_rate": max_rate,
185
- "min_uncensored_rate": min_rate,
186
- "avg_compliance_score": avg_compliance,
187
- "delta_from_baseline": delta,
188
- "paired_comparisons": paired_count,
189
- "delta_type": delta_type,
190
- "best_model": best_model,
191
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  return pd.DataFrame(method_stats), dynamic_method_colors
194
 
@@ -273,7 +271,7 @@ MODEL_COLUMN_DEFS = [
273
  },
274
  ]
275
 
276
- # Column definitions for Methods AG Grid
277
  METHOD_COLUMN_DEFS = [
278
  {
279
  "field": "method",
@@ -289,39 +287,25 @@ METHOD_COLUMN_DEFS = [
289
  "sortable": True,
290
  },
291
  {
292
- "field": "num_models",
293
- "headerName": "# Models",
294
- "width": 90,
295
- "sortable": True,
296
- },
297
- {
298
- "field": "avg_uncensored_rate",
299
- "headerName": "Avg Uncensored ⬆️",
300
- "width": 150,
301
  "sortable": True,
302
- "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
303
  },
304
  {
305
  "field": "delta_from_baseline",
306
- "headerName": "Δ vs Baseline",
307
- "width": 120,
308
  "sortable": True,
309
  "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
310
  "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
311
  },
312
  {
313
- "field": "delta_type",
314
- "headerName": "Δ Type",
315
  "width": 100,
316
  "sortable": True,
317
- "cellStyle": {"function": "params.value === 'paired' ? {'color': '#4CAF50'} : params.value === 'unpaired' ? {'color': '#FF9800'} : {}"},
318
- "tooltipField": "delta_type",
319
- },
320
- {
321
- "field": "paired_comparisons",
322
- "headerName": "# Pairs",
323
- "width": 80,
324
- "sortable": True,
325
  },
326
  {
327
  "field": "max_uncensored_rate",
@@ -589,18 +573,15 @@ def render_tab_content(tab, n):
589
  # Method comparison description
590
  html.Div([
591
  html.P([
592
- "Compare censorship removal methods. ",
593
- html.Strong("Δ vs Baseline"),
594
- " shows the improvement over unmodified models."
 
 
595
  ], style={"color": "#666", "marginBottom": "5px"}),
596
  html.P([
597
- html.Strong("Δ Type: ", style={"color": "#333"}),
598
- html.Span("paired", style={"color": "#4CAF50", "fontWeight": "bold"}),
599
- " = same model compared with/without method (reliable). ",
600
- html.Span("unpaired", style={"color": "#FF9800", "fontWeight": "bold"}),
601
- " = compared to global baseline avg (less reliable). ",
602
- html.Span("mixed", style={"color": "#666"}),
603
- " = some paired, some unpaired."
604
  ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
605
  ]),
606
 
 
65
  return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
66
 
67
 
68
+ def calculate_method_stats(df):
 
 
 
69
  """
70
+ Calculate statistics for each method based on PAIRED comparisons only.
 
 
 
 
 
 
 
 
 
 
 
71
 
72
+ A paired comparison requires the exact same base model to have both:
73
+ - A baseline submission (method="none")
74
+ - A method-applied submission (method=X)
75
 
76
+ Only shows delta for methods where paired comparisons exist.
 
 
 
 
 
 
 
 
 
77
  """
78
  if len(df) == 0:
79
  return pd.DataFrame(), {}
 
91
  dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
92
  dynamic_idx += 1
93
 
94
+ # Get baseline data - create lookup by exact model name
95
  baseline_df = df[df["method"] == "none"].copy()
 
 
 
96
  baseline_lookup = {}
97
  if len(baseline_df) > 0:
98
  for _, row in baseline_df.iterrows():
99
+ model_name = row.get("model", "")
100
+ baseline_lookup[model_name] = {
101
+ "uncensored_rate": row["uncensored_rate"],
102
+ "avg_compliance_score": row.get("avg_compliance_score", 0),
103
+ }
104
 
105
+ # Calculate paired comparisons for each method
106
  method_stats = []
107
+
108
  for method in all_methods:
109
  method_df = df[df["method"] == method]
110
+
111
+ if method == "none":
112
+ # Baseline method - show stats but no delta
113
+ if len(method_df) > 0:
114
+ avg_rate = method_df["uncensored_rate"].mean()
115
+ max_rate = method_df["uncensored_rate"].max()
116
+ min_rate = method_df["uncensored_rate"].min()
117
+ avg_compliance = method_df["avg_compliance_score"].mean()
118
+ best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
119
+ description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
120
+
121
+ method_stats.append({
122
+ "method": method,
123
+ "description": description,
124
+ "num_models": len(method_df),
125
+ "num_pairs": len(method_df),
126
+ "avg_uncensored_rate": avg_rate,
127
+ "delta_from_baseline": 0.0,
128
+ "max_uncensored_rate": max_rate,
129
+ "min_uncensored_rate": min_rate,
130
+ "avg_compliance_score": avg_compliance,
131
+ "best_model": best_model,
132
+ })
133
+ else:
134
+ # Non-baseline method - only count paired comparisons
135
+ paired_data = []
136
+
137
+ for _, row in method_df.iterrows():
138
+ method_model = row.get("model", "")
139
+ method_rate = row["uncensored_rate"]
140
+ method_compliance = row.get("avg_compliance_score", 0)
141
+
142
+ # Find exact baseline match by model_family + model_size
143
+ model_family = row.get("model_family", "")
144
+ model_size = row.get("model_size", "")
145
+
146
+ # Look for baseline with same family and size
147
+ baseline_match = None
148
+ for baseline_model, baseline_data in baseline_lookup.items():
149
+ baseline_row = baseline_df[baseline_df["model"] == baseline_model].iloc[0]
150
+ if (baseline_row.get("model_family", "") == model_family and
151
+ baseline_row.get("model_size", "") == model_size):
152
+ baseline_match = baseline_data
153
+ break
154
+
155
+ if baseline_match is not None:
156
+ paired_data.append({
157
+ "model": method_model,
158
+ "method_rate": method_rate,
159
+ "baseline_rate": baseline_match["uncensored_rate"],
160
+ "delta": method_rate - baseline_match["uncensored_rate"],
161
+ "method_compliance": method_compliance,
162
+ })
163
+
164
+ # Only add method if it has paired comparisons
165
+ if len(paired_data) > 0:
166
+ avg_delta = sum(p["delta"] for p in paired_data) / len(paired_data)
167
+ avg_rate = sum(p["method_rate"] for p in paired_data) / len(paired_data)
168
+ max_rate = max(p["method_rate"] for p in paired_data)
169
+ min_rate = min(p["method_rate"] for p in paired_data)
170
+ avg_compliance = sum(p["method_compliance"] for p in paired_data) / len(paired_data)
171
+
172
+ # Best model is the one with highest delta
173
+ best_pair = max(paired_data, key=lambda x: x["delta"])
174
+ best_model = best_pair["model"]
175
+
176
+ description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
177
+
178
+ method_stats.append({
179
+ "method": method,
180
+ "description": description,
181
+ "num_models": len(method_df),
182
+ "num_pairs": len(paired_data),
183
+ "avg_uncensored_rate": avg_rate,
184
+ "delta_from_baseline": avg_delta,
185
+ "max_uncensored_rate": max_rate,
186
+ "min_uncensored_rate": min_rate,
187
+ "avg_compliance_score": avg_compliance,
188
+ "best_model": best_model,
189
+ })
190
 
191
  return pd.DataFrame(method_stats), dynamic_method_colors
192
 
 
271
  },
272
  ]
273
 
274
+ # Column definitions for Methods AG Grid (paired comparisons only)
275
  METHOD_COLUMN_DEFS = [
276
  {
277
  "field": "method",
 
287
  "sortable": True,
288
  },
289
  {
290
+ "field": "num_pairs",
291
+ "headerName": "# Pairs",
292
+ "width": 80,
 
 
 
 
 
 
293
  "sortable": True,
 
294
  },
295
  {
296
  "field": "delta_from_baseline",
297
+ "headerName": "Δ vs Baseline ⬆️",
298
+ "width": 140,
299
  "sortable": True,
300
  "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
301
  "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
302
  },
303
  {
304
+ "field": "avg_uncensored_rate",
305
+ "headerName": "Avg Rate",
306
  "width": 100,
307
  "sortable": True,
308
+ "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
 
 
 
 
 
 
 
309
  },
310
  {
311
  "field": "max_uncensored_rate",
 
573
  # Method comparison description
574
  html.Div([
575
  html.P([
576
+ "Compare censorship removal methods using ",
577
+ html.Strong("paired comparisons only"),
578
+ ". Delta (Δ) is calculated by comparing the ",
579
+ html.Strong("same base model"),
580
+ " with and without each method applied."
581
  ], style={"color": "#666", "marginBottom": "5px"}),
582
  html.P([
583
+ "Methods are only shown if they have at least one paired comparison ",
584
+ "(matching model_family + model_size with a baseline 'none' submission)."
 
 
 
 
 
585
  ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
586
  ]),
587