Spaces:

wisent-ai
/

UncensorBench-Leaderboard

Paused

App Files Files Community

lbartoszcze committed on Dec 1, 2025

Commit

605a4dd

verified ·

1 Parent(s): e2129ad

Update Methods tab to use paired comparisons only

Browse files

Files changed (1) hide show

app.py +110 -129

app.py CHANGED Viewed

@@ -65,35 +65,15 @@ def get_method_color(method, method_index=0):
     return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
-def extract_base_model_name(model_name):
-    """
-    Extract the base model name for pairing.
-    E.g., 'meta-llama/Llama-3.2-1B-Instruct-abliterated' -> 'meta-llama/Llama-3.2-1B-Instruct'
     """
-    # Common suffixes added by methods
-    suffixes_to_remove = [
-        "-abliterated", "-uncensored", "-steered", "-finetuned",
-        "_abliterated", "_uncensored", "_steered", "_finetuned",
-        "-ablation", "-steering", "-ft",
-    ]
-    base_name = model_name
-    for suffix in suffixes_to_remove:
-        if base_name.lower().endswith(suffix.lower()):
-            base_name = base_name[:-len(suffix)]
-            break
-    return base_name
-def calculate_method_stats(df):
-    """
-    Calculate statistics for each method including delta from baseline.
-    Delta calculation:
-    1. PAIRED: For models that have both baseline (none) and method versions,
-       calculate the actual improvement (method_rate - baseline_rate) for each pair,
-       then average across pairs.
-    2. UNPAIRED: For methods without paired baselines, show the difference from
-       the global baseline average (less reliable).
     """
     if len(df) == 0:
         return pd.DataFrame(), {}
@@ -111,84 +91,102 @@ def calculate_method_stats(df):
             dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
             dynamic_idx += 1
-    # Get baseline data
     baseline_df = df[df["method"] == "none"].copy()
-    global_baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
-    # Create lookup for baseline rates by model family + size (for pairing)
     baseline_lookup = {}
     if len(baseline_df) > 0:
         for _, row in baseline_df.iterrows():
-            # Key by model_family + model_size for matching
-            key = (row.get("model_family", ""), row.get("model_size", ""))
-            base_model_key = extract_base_model_name(row.get("model", ""))
-            baseline_lookup[key] = row["uncensored_rate"]
-            baseline_lookup[base_model_key] = row["uncensored_rate"]
-    # Group by method - iterate over actual methods in the data
     method_stats = []
     for method in all_methods:
         method_df = df[df["method"] == method]
-        if len(method_df) > 0:
-            avg_rate = method_df["uncensored_rate"].mean()
-            max_rate = method_df["uncensored_rate"].max()
-            min_rate = method_df["uncensored_rate"].min()
-            avg_compliance = method_df["avg_compliance_score"].mean()
-            # Find best model for this method
-            best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
-            # Calculate paired delta where possible
-            paired_deltas = []
-            unpaired_count = 0
-            if method != "none":
-                for _, row in method_df.iterrows():
-                    # Try to find matching baseline
-                    key = (row.get("model_family", ""), row.get("model_size", ""))
-                    base_model_key = extract_base_model_name(row.get("model", ""))
-                    baseline_rate = None
-                    if base_model_key in baseline_lookup:
-                        baseline_rate = baseline_lookup[base_model_key]
-                    elif key in baseline_lookup:
-                        baseline_rate = baseline_lookup[key]
-                    if baseline_rate is not None:
-                        paired_deltas.append(row["uncensored_rate"] - baseline_rate)
-                    else:
-                        unpaired_count += 1
-            # Calculate delta
-            if method == "none":
-                delta = 0.0
-                paired_count = len(method_df)
-                delta_type = "baseline"
-            elif len(paired_deltas) > 0:
-                delta = sum(paired_deltas) / len(paired_deltas)
-                paired_count = len(paired_deltas)
-                delta_type = "paired" if unpaired_count == 0 else "mixed"
-            else:
-                delta = avg_rate - global_baseline_avg
-                paired_count = 0
-                delta_type = "unpaired"
-            # Get description - use predefined or just capitalize the method name
-            description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
-            method_stats.append({
-                "method": method,
-                "description": description,
-                "num_models": len(method_df),
-                "avg_uncensored_rate": avg_rate,
-                "max_uncensored_rate": max_rate,
-                "min_uncensored_rate": min_rate,
-                "avg_compliance_score": avg_compliance,
-                "delta_from_baseline": delta,
-                "paired_comparisons": paired_count,
-                "delta_type": delta_type,
-                "best_model": best_model,
-            })
     return pd.DataFrame(method_stats), dynamic_method_colors
@@ -273,7 +271,7 @@ MODEL_COLUMN_DEFS = [
     },
 ]
-# Column definitions for Methods AG Grid
 METHOD_COLUMN_DEFS = [
     {
         "field": "method",
@@ -289,39 +287,25 @@ METHOD_COLUMN_DEFS = [
         "sortable": True,
     },
     {
-        "field": "num_models",
-        "headerName": "# Models",
-        "width": 90,
-        "sortable": True,
-    },
-    {
-        "field": "avg_uncensored_rate",
-        "headerName": "Avg Uncensored ⬆️",
-        "width": 150,
         "sortable": True,
-        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
     },
     {
         "field": "delta_from_baseline",
-        "headerName": "Δ vs Baseline",
-        "width": 120,
         "sortable": True,
         "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
         "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
     },
     {
-        "field": "delta_type",
-        "headerName": "Δ Type",
         "width": 100,
         "sortable": True,
-        "cellStyle": {"function": "params.value === 'paired' ? {'color': '#4CAF50'} : params.value === 'unpaired' ? {'color': '#FF9800'} : {}"},
-        "tooltipField": "delta_type",
-    },
-    {
-        "field": "paired_comparisons",
-        "headerName": "# Pairs",
-        "width": 80,
-        "sortable": True,
     },
     {
         "field": "max_uncensored_rate",
@@ -589,18 +573,15 @@ def render_tab_content(tab, n):
             # Method comparison description
             html.Div([
                 html.P([
-                    "Compare censorship removal methods. ",
-                    html.Strong("Δ vs Baseline"),
-                    " shows the improvement over unmodified models."
                 ], style={"color": "#666", "marginBottom": "5px"}),
                 html.P([
-                    html.Strong("Δ Type: ", style={"color": "#333"}),
-                    html.Span("paired", style={"color": "#4CAF50", "fontWeight": "bold"}),
-                    " = same model compared with/without method (reliable). ",
-                    html.Span("unpaired", style={"color": "#FF9800", "fontWeight": "bold"}),
-                    " = compared to global baseline avg (less reliable). ",
-                    html.Span("mixed", style={"color": "#666"}),
-                    " = some paired, some unpaired."
                 ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
             ]),

     return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
+def calculate_method_stats(df):
     """
+    Calculate statistics for each method based on PAIRED comparisons only.
+    A paired comparison requires the exact same base model to have both:
+    - A baseline submission (method="none")
+    - A method-applied submission (method=X)
+    Only shows delta for methods where paired comparisons exist.
     """
     if len(df) == 0:
         return pd.DataFrame(), {}
             dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
             dynamic_idx += 1
+    # Get baseline data - create lookup by exact model name
     baseline_df = df[df["method"] == "none"].copy()
     baseline_lookup = {}
     if len(baseline_df) > 0:
         for _, row in baseline_df.iterrows():
+            model_name = row.get("model", "")
+            baseline_lookup[model_name] = {
+                "uncensored_rate": row["uncensored_rate"],
+                "avg_compliance_score": row.get("avg_compliance_score", 0),
+            }
+    # Calculate paired comparisons for each method
     method_stats = []
     for method in all_methods:
         method_df = df[df["method"] == method]
+        if method == "none":
+            # Baseline method - show stats but no delta
+            if len(method_df) > 0:
+                avg_rate = method_df["uncensored_rate"].mean()
+                max_rate = method_df["uncensored_rate"].max()
+                min_rate = method_df["uncensored_rate"].min()
+                avg_compliance = method_df["avg_compliance_score"].mean()
+                best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
+                description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
+                method_stats.append({
+                    "method": method,
+                    "description": description,
+                    "num_models": len(method_df),
+                    "num_pairs": len(method_df),
+                    "avg_uncensored_rate": avg_rate,
+                    "delta_from_baseline": 0.0,
+                    "max_uncensored_rate": max_rate,
+                    "min_uncensored_rate": min_rate,
+                    "avg_compliance_score": avg_compliance,
+                    "best_model": best_model,
+                })
+        else:
+            # Non-baseline method - only count paired comparisons
+            paired_data = []
+            for _, row in method_df.iterrows():
+                method_model = row.get("model", "")
+                method_rate = row["uncensored_rate"]
+                method_compliance = row.get("avg_compliance_score", 0)
+                # Find exact baseline match by model_family + model_size
+                model_family = row.get("model_family", "")
+                model_size = row.get("model_size", "")
+                # Look for baseline with same family and size
+                baseline_match = None
+                for baseline_model, baseline_data in baseline_lookup.items():
+                    baseline_row = baseline_df[baseline_df["model"] == baseline_model].iloc[0]
+                    if (baseline_row.get("model_family", "") == model_family and
+                        baseline_row.get("model_size", "") == model_size):
+                        baseline_match = baseline_data
+                        break
+                if baseline_match is not None:
+                    paired_data.append({
+                        "model": method_model,
+                        "method_rate": method_rate,
+                        "baseline_rate": baseline_match["uncensored_rate"],
+                        "delta": method_rate - baseline_match["uncensored_rate"],
+                        "method_compliance": method_compliance,
+                    })
+            # Only add method if it has paired comparisons
+            if len(paired_data) > 0:
+                avg_delta = sum(p["delta"] for p in paired_data) / len(paired_data)
+                avg_rate = sum(p["method_rate"] for p in paired_data) / len(paired_data)
+                max_rate = max(p["method_rate"] for p in paired_data)
+                min_rate = min(p["method_rate"] for p in paired_data)
+                avg_compliance = sum(p["method_compliance"] for p in paired_data) / len(paired_data)
+                # Best model is the one with highest delta
+                best_pair = max(paired_data, key=lambda x: x["delta"])
+                best_model = best_pair["model"]
+                description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
+                method_stats.append({
+                    "method": method,
+                    "description": description,
+                    "num_models": len(method_df),
+                    "num_pairs": len(paired_data),
+                    "avg_uncensored_rate": avg_rate,
+                    "delta_from_baseline": avg_delta,
+                    "max_uncensored_rate": max_rate,
+                    "min_uncensored_rate": min_rate,
+                    "avg_compliance_score": avg_compliance,
+                    "best_model": best_model,
+                })
     return pd.DataFrame(method_stats), dynamic_method_colors
     },
 ]
+# Column definitions for Methods AG Grid (paired comparisons only)
 METHOD_COLUMN_DEFS = [
     {
         "field": "method",
         "sortable": True,
     },
     {
+        "field": "num_pairs",
+        "headerName": "# Pairs",
+        "width": 80,
         "sortable": True,
     },
     {
         "field": "delta_from_baseline",
+        "headerName": "Δ vs Baseline ⬆️",
+        "width": 140,
         "sortable": True,
         "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
         "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
     },
     {
+        "field": "avg_uncensored_rate",
+        "headerName": "Avg Rate",
         "width": 100,
         "sortable": True,
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
     },
     {
         "field": "max_uncensored_rate",
             # Method comparison description
             html.Div([
                 html.P([
+                    "Compare censorship removal methods using ",
+                    html.Strong("paired comparisons only"),
+                    ". Delta (Δ) is calculated by comparing the ",
+                    html.Strong("same base model"),
+                    " with and without each method applied."
                 ], style={"color": "#666", "marginBottom": "5px"}),
                 html.P([
+                    "Methods are only shown if they have at least one paired comparison ",
+                    "(matching model_family + model_size with a baseline 'none' submission)."
                 ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
             ]),