Spaces:

wisent-ai
/

UncensorBench-Leaderboard

Paused

App Files Files Community

lbartoszcze committed on Dec 1, 2025

Commit

e2129ad

verified ·

1 Parent(s): a55ede4

Add paired comparison logic for accurate method effectiveness calculation

Browse files

Files changed (1) hide show

app.py +112 -15

app.py CHANGED Viewed

@@ -65,18 +65,42 @@ def get_method_color(method, method_index=0):
     return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
 def calculate_method_stats(df):
-    """Calculate statistics for each method including delta from baseline."""
     if len(df) == 0:
         return pd.DataFrame(), {}
     # Get all unique methods from the actual data
     all_methods = df["method"].dropna().unique().tolist()
-    # Get baseline average (method = "none")
-    baseline_df = df[df["method"] == "none"]
-    baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
     # Build dynamic color mapping for any new methods
     dynamic_method_colors = {}
     dynamic_idx = 0
@@ -87,6 +111,20 @@ def calculate_method_stats(df):
             dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
             dynamic_idx += 1
     # Group by method - iterate over actual methods in the data
     method_stats = []
     for method in all_methods:
@@ -96,11 +134,45 @@ def calculate_method_stats(df):
             max_rate = method_df["uncensored_rate"].max()
             min_rate = method_df["uncensored_rate"].min()
             avg_compliance = method_df["avg_compliance_score"].mean()
-            delta = avg_rate - baseline_avg
             # Find best model for this method
             best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
             # Get description - use predefined or just capitalize the method name
             description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
@@ -113,6 +185,8 @@ def calculate_method_stats(df):
                 "min_uncensored_rate": min_rate,
                 "avg_compliance_score": avg_compliance,
                 "delta_from_baseline": delta,
                 "best_model": best_model,
             })
@@ -211,13 +285,13 @@ METHOD_COLUMN_DEFS = [
     {
         "field": "description",
         "headerName": "Description",
-        "width": 200,
         "sortable": True,
     },
     {
         "field": "num_models",
         "headerName": "# Models",
-        "width": 100,
         "sortable": True,
     },
     {
@@ -230,36 +304,50 @@ METHOD_COLUMN_DEFS = [
     {
         "field": "delta_from_baseline",
         "headerName": "Δ vs Baseline",
-        "width": 130,
         "sortable": True,
         "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
         "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
     },
     {
         "field": "max_uncensored_rate",
         "headerName": "Best Rate",
-        "width": 110,
         "sortable": True,
         "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
     },
     {
         "field": "min_uncensored_rate",
         "headerName": "Worst Rate",
-        "width": 110,
         "sortable": True,
         "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
     },
     {
         "field": "avg_compliance_score",
         "headerName": "Avg Compliance",
-        "width": 140,
         "sortable": True,
         "valueFormatter": {"function": "d3.format('.3f')(params.value)"},
     },
     {
         "field": "best_model",
         "headerName": "Best Model",
-        "width": 280,
         "sortable": True,
     },
 ]
@@ -503,8 +591,17 @@ def render_tab_content(tab, n):
                 html.P([
                     "Compare censorship removal methods. ",
                     html.Strong("Δ vs Baseline"),
-                    " shows the improvement over unmodified models (method=none)."
-                ], style={"color": "#666", "marginBottom": "15px"}),
             ]),
             # Methods grid

     return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)]
def extract_base_model_name(model_name):
    """
    Extract the base model name for pairing.

    At most one known method suffix is stripped from the end of the name;
    the comparison is case-insensitive and the rest of the name keeps its
    original casing.
    E.g., 'meta-llama/Llama-3.2-1B-Instruct-abliterated' -> 'meta-llama/Llama-3.2-1B-Instruct'

    Args:
        model_name: Model identifier. Non-string values (e.g. None or a
            float NaN coming from a DataFrame cell) are treated as a
            missing name.

    Returns:
        The name with one trailing method suffix removed, the name
        unchanged when no suffix matches, or "" when the input is not a
        string.
    """
    # A missing model cell is not a str; calling .lower() on it would raise
    # AttributeError. Fall back to the same empty default the caller passes
    # when the "model" key is absent.
    if not isinstance(model_name, str):
        return ""

    # Common suffixes added by methods (all lowercase; matched against the
    # lowercased name below). Order matters: the first match wins.
    suffixes_to_remove = (
        "-abliterated", "-uncensored", "-steered", "-finetuned",
        "_abliterated", "_uncensored", "_steered", "_finetuned",
        "-ablation", "-steering", "-ft",
    )

    # Lowercase once instead of on every loop iteration.
    lowered_name = model_name.lower()
    for suffix in suffixes_to_remove:
        if lowered_name.endswith(suffix):
            # Strip only a single suffix, then stop.
            return model_name[:-len(suffix)]
    return model_name
 def calculate_method_stats(df):
+    """
+    Calculate statistics for each method including delta from baseline.
+    Delta calculation:
+    1. PAIRED: For models that have both baseline (none) and method versions,
+       calculate the actual improvement (method_rate - baseline_rate) for each pair,
+       then average across pairs.
+    2. UNPAIRED: For methods without paired baselines, show the difference from
+       the global baseline average (less reliable).
+    """
     if len(df) == 0:
         return pd.DataFrame(), {}
     # Get all unique methods from the actual data
     all_methods = df["method"].dropna().unique().tolist()
     # Build dynamic color mapping for any new methods
     dynamic_method_colors = {}
     dynamic_idx = 0
             dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
             dynamic_idx += 1
+    # Get baseline data
+    baseline_df = df[df["method"] == "none"].copy()
+    global_baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
+    # Create lookup for baseline rates by model family + size (for pairing)
+    baseline_lookup = {}
+    if len(baseline_df) > 0:
+        for _, row in baseline_df.iterrows():
+            # Key by model_family + model_size for matching
+            key = (row.get("model_family", ""), row.get("model_size", ""))
+            base_model_key = extract_base_model_name(row.get("model", ""))
+            baseline_lookup[key] = row["uncensored_rate"]
+            baseline_lookup[base_model_key] = row["uncensored_rate"]
     # Group by method - iterate over actual methods in the data
     method_stats = []
     for method in all_methods:
             max_rate = method_df["uncensored_rate"].max()
             min_rate = method_df["uncensored_rate"].min()
             avg_compliance = method_df["avg_compliance_score"].mean()
             # Find best model for this method
             best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
+            # Calculate paired delta where possible
+            paired_deltas = []
+            unpaired_count = 0
+            if method != "none":
+                for _, row in method_df.iterrows():
+                    # Try to find matching baseline
+                    key = (row.get("model_family", ""), row.get("model_size", ""))
+                    base_model_key = extract_base_model_name(row.get("model", ""))
+                    baseline_rate = None
+                    if base_model_key in baseline_lookup:
+                        baseline_rate = baseline_lookup[base_model_key]
+                    elif key in baseline_lookup:
+                        baseline_rate = baseline_lookup[key]
+                    if baseline_rate is not None:
+                        paired_deltas.append(row["uncensored_rate"] - baseline_rate)
+                    else:
+                        unpaired_count += 1
+            # Calculate delta
+            if method == "none":
+                delta = 0.0
+                paired_count = len(method_df)
+                delta_type = "baseline"
+            elif len(paired_deltas) > 0:
+                delta = sum(paired_deltas) / len(paired_deltas)
+                paired_count = len(paired_deltas)
+                delta_type = "paired" if unpaired_count == 0 else "mixed"
+            else:
+                delta = avg_rate - global_baseline_avg
+                paired_count = 0
+                delta_type = "unpaired"
             # Get description - use predefined or just capitalize the method name
             description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title())
                 "min_uncensored_rate": min_rate,
                 "avg_compliance_score": avg_compliance,
                 "delta_from_baseline": delta,
+                "paired_comparisons": paired_count,
+                "delta_type": delta_type,
                 "best_model": best_model,
             })
     {
         "field": "description",
         "headerName": "Description",
+        "width": 180,
         "sortable": True,
     },
     {
         "field": "num_models",
         "headerName": "# Models",
+        "width": 90,
         "sortable": True,
     },
     {
     {
         "field": "delta_from_baseline",
         "headerName": "Δ vs Baseline",
+        "width": 120,
         "sortable": True,
         "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
         "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
     },
+    {
+        "field": "delta_type",
+        "headerName": "Δ Type",
+        "width": 100,
+        "sortable": True,
+        "cellStyle": {"function": "params.value === 'paired' ? {'color': '#4CAF50'} : params.value === 'unpaired' ? {'color': '#FF9800'} : {}"},
+        "tooltipField": "delta_type",
+    },
+    {
+        "field": "paired_comparisons",
+        "headerName": "# Pairs",
+        "width": 80,
+        "sortable": True,
+    },
     {
         "field": "max_uncensored_rate",
         "headerName": "Best Rate",
+        "width": 100,
         "sortable": True,
         "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
     },
     {
         "field": "min_uncensored_rate",
         "headerName": "Worst Rate",
+        "width": 100,
         "sortable": True,
         "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
     },
     {
         "field": "avg_compliance_score",
         "headerName": "Avg Compliance",
+        "width": 130,
         "sortable": True,
         "valueFormatter": {"function": "d3.format('.3f')(params.value)"},
     },
     {
         "field": "best_model",
         "headerName": "Best Model",
+        "width": 260,
         "sortable": True,
     },
 ]
                 html.P([
                     "Compare censorship removal methods. ",
                     html.Strong("Δ vs Baseline"),
+                    " shows the improvement over unmodified models."
+                ], style={"color": "#666", "marginBottom": "5px"}),
+                html.P([
+                    html.Strong("Δ Type: ", style={"color": "#333"}),
+                    html.Span("paired", style={"color": "#4CAF50", "fontWeight": "bold"}),
+                    " = same model compared with/without method (reliable). ",
+                    html.Span("unpaired", style={"color": "#FF9800", "fontWeight": "bold"}),
+                    " = compared to global baseline avg (less reliable). ",
+                    html.Span("mixed", style={"color": "#666"}),
+                    " = some paired, some unpaired."
+                ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
             ]),
             # Methods grid