Spaces:

wisent-ai
/

UncensorBench-Leaderboard

Paused

App Files Files Community

lbartoszcze committed on Dec 1, 2025

Commit

f24cb80

verified ·

1 Parent(s): 26802a6

Add Methods Comparison tab with delta from baseline

Browse files

Files changed (1) hide show

app.py +275 -50

app.py CHANGED Viewed

@@ -15,6 +15,28 @@ server = app.server
 # Load leaderboard data
 DATA_FILE = "leaderboard.csv"
 def load_data():
     """Load leaderboard data from CSV."""
     if os.path.exists(DATA_FILE):
@@ -32,8 +54,47 @@ def load_data():
             "total_prompts", "timestamp", "submitter", "sample_responses_url"
         ])
-# Column definitions for AG Grid
-COLUMN_DEFS = [
     {
         "field": "Rank",
         "headerName": "🏆",
@@ -112,6 +173,71 @@ COLUMN_DEFS = [
     },
 ]
 # Topic-specific columns (added dynamically if present)
 TOPIC_COLUMNS = [
     "cybersecurity", "piracy", "weapons", "drugs", "fraud",
@@ -120,10 +246,11 @@ TOPIC_COLUMNS = [
     "evasion", "self_harm", "adult_content"
 ]
-def get_column_defs(df):
     """Get column definitions based on available data."""
-    cols = COLUMN_DEFS.copy()
     # Add topic columns if they exist in the data
     for topic in TOPIC_COLUMNS:
         if topic in df.columns:
@@ -134,9 +261,10 @@ def get_column_defs(df):
                 "sortable": True,
                 "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
             })
     return cols
 # App layout
 app.layout = html.Div([
     # Header
@@ -147,7 +275,7 @@ app.layout = html.Div([
             style={"color": "#666", "marginTop": "0"}
         ),
     ], style={"textAlign": "center", "padding": "20px"}),
     # Info banner
     html.Div([
         html.Div([
@@ -179,7 +307,7 @@ app.layout = html.Div([
         "marginLeft": "20px",
         "marginRight": "20px",
     }),
     # Stats summary
     html.Div(id="stats-summary", style={
         "display": "flex",
@@ -187,35 +315,23 @@ app.layout = html.Div([
         "gap": "40px",
         "marginBottom": "20px",
     }),
-    # Leaderboard table
-    html.Div([
-        dag.AgGrid(
-            id="leaderboard-grid",
-            columnDefs=COLUMN_DEFS,
-            rowData=[],
-            defaultColDef={
-                "resizable": True,
-                "sortable": True,
-            },
-            dashGridOptions={
-                "pagination": True,
-                "paginationPageSize": 50,
-                "animateRows": True,
-                "rowSelection": "single",
-            },
-            style={"height": "600px"},
-            className="ag-theme-alpine",
-        ),
-    ], style={"padding": "0 20px 20px 20px"}),
     # Refresh interval
     dcc.Interval(
         id="refresh-interval",
         interval=60000,  # Refresh every 60 seconds
         n_intervals=0
     ),
     # Footer
     html.Div([
         html.Hr(),
@@ -231,37 +347,54 @@ app.layout = html.Div([
             html.A("Submit your model", href="https://github.com/wisent-ai/uncensorbench#how-to-submit", target="_blank"),
         ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}),
     ], style={"padding": "20px"}),
 ], style={"fontFamily": "system-ui, -apple-system, sans-serif"})
 @callback(
-    [Output("leaderboard-grid", "rowData"),
-     Output("leaderboard-grid", "columnDefs"),
-     Output("stats-summary", "children")],
     Input("refresh-interval", "n_intervals")
 )
-def update_leaderboard(n):
-    """Update the leaderboard data."""
     df = load_data()
-    # Get column definitions
-    col_defs = get_column_defs(df)
-    # Calculate stats
     if len(df) > 0:
         stats = [
             html.Div([
                 html.Div(str(len(df)), style={"fontSize": "2em", "fontWeight": "bold", "color": "#2196F3"}),
                 html.Div("Models", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
             html.Div([
-                html.Div(f"{df['uncensored_rate'].mean():.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#4CAF50"}),
-                html.Div("Avg Uncensored Rate", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
             html.Div([
                 html.Div(f"{df['uncensored_rate'].max():.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#FF9800"}),
-                html.Div("Best Uncensored Rate", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
         ]
     else:
@@ -274,11 +407,103 @@ def update_leaderboard(n):
                 html.P("No submissions yet. Be the first to submit!", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
         ]
-    # Convert to records for AG Grid
-    row_data = df.to_dict("records") if len(df) > 0 else []
-    return row_data, col_defs, stats
 if __name__ == "__main__":

 # Load leaderboard data
 DATA_FILE = "leaderboard.csv"
+# Valid methods for censorship removal.
+# calculate_method_stats() iterates this list, so rows whose "method" value is
+# not listed here are left out of the per-method statistics.
+VALID_METHODS = ["none", "abliteration", "steering", "finetuning", "prompting", "other"]
+# Human-readable label per method: fills the "description" field of the
+# methods grid rows and the text of the "Method Definitions" legend.
+METHOD_DESCRIPTIONS = {
+    "none": "Baseline (no modification)",
+    "abliteration": "Abliteration technique",
+    "steering": "Steering vectors",
+    "finetuning": "Fine-tuning based",
+    "prompting": "Prompt-based jailbreaking",
+    "other": "Other methods",
+}
+# Hex color of each method's bullet in the "Method Definitions" legend.
+METHOD_COLORS = {
+    "none": "#9E9E9E",
+    "abliteration": "#E91E63",
+    "steering": "#2196F3",
+    "finetuning": "#4CAF50",
+    "prompting": "#FF9800",
+    "other": "#9C27B0",
+}
 def load_data():
     """Load leaderboard data from CSV."""
     if os.path.exists(DATA_FILE):
             "total_prompts", "timestamp", "submitter", "sample_responses_url"
         ])
+def calculate_method_stats(df):
+    """Calculate statistics for each method including delta from baseline."""
+    if len(df) == 0:
+        return pd.DataFrame()
+    # Get baseline average (method = "none")
+    baseline_df = df[df["method"] == "none"]
+    baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
+    # Group by method
+    method_stats = []
+    for method in VALID_METHODS:
+        method_df = df[df["method"] == method]
+        if len(method_df) > 0:
+            avg_rate = method_df["uncensored_rate"].mean()
+            max_rate = method_df["uncensored_rate"].max()
+            min_rate = method_df["uncensored_rate"].min()
+            avg_compliance = method_df["avg_compliance_score"].mean()
+            delta = avg_rate - baseline_avg
+            # Find best model for this method
+            best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"]
+            method_stats.append({
+                "method": method,
+                "description": METHOD_DESCRIPTIONS.get(method, method),
+                "num_models": len(method_df),
+                "avg_uncensored_rate": avg_rate,
+                "max_uncensored_rate": max_rate,
+                "min_uncensored_rate": min_rate,
+                "avg_compliance_score": avg_compliance,
+                "delta_from_baseline": delta,
+                "best_model": best_model,
+            })
+    return pd.DataFrame(method_stats)
+# Column definitions for Models AG Grid
+MODEL_COLUMN_DEFS = [
     {
         "field": "Rank",
         "headerName": "🏆",
     },
 ]
+# Column definitions for Methods AG Grid.
+# Passed as columnDefs to the "methods-grid" AgGrid; each "field" matches a
+# key of the row dicts built by calculate_method_stats().
+# The "function" strings are JavaScript expressions — presumably evaluated
+# client-side by dash-ag-grid with `params` and `d3` in scope (confirm
+# against the dash-ag-grid docs) — so they must remain valid JS.
+METHOD_COLUMN_DEFS = [
+    {
+        "field": "method",
+        "headerName": "Method",
+        "width": 130,
+        "pinned": "left",
+        "sortable": True,
+    },
+    {
+        "field": "description",
+        "headerName": "Description",
+        "width": 200,
+        "sortable": True,
+    },
+    {
+        "field": "num_models",
+        "headerName": "# Models",
+        "width": 100,
+        "sortable": True,
+    },
+    {
+        "field": "avg_uncensored_rate",
+        "headerName": "Avg Uncensored ⬆️",
+        "width": 150,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
+    },
+    {
+        # Signed percentage: values >= 0 get an explicit "+" prefix; positive
+        # cells are styled bold green, negative cells red, zero unstyled.
+        "field": "delta_from_baseline",
+        "headerName": "Δ vs Baseline",
+        "width": 130,
+        "sortable": True,
+        "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
+        "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
+    },
+    {
+        "field": "max_uncensored_rate",
+        "headerName": "Best Rate",
+        "width": 110,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
+    },
+    {
+        "field": "min_uncensored_rate",
+        "headerName": "Worst Rate",
+        "width": 110,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
+    },
+    {
+        # Shown as a plain three-decimal number, not as a percentage.
+        "field": "avg_compliance_score",
+        "headerName": "Avg Compliance",
+        "width": 140,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.3f')(params.value)"},
+    },
+    {
+        "field": "best_model",
+        "headerName": "Best Model",
+        "width": 280,
+        "sortable": True,
+    },
+]
 # Topic-specific columns (added dynamically if present)
 TOPIC_COLUMNS = [
     "cybersecurity", "piracy", "weapons", "drugs", "fraud",
     "evasion", "self_harm", "adult_content"
 ]
+def get_model_column_defs(df):
     """Get column definitions based on available data."""
+    cols = MODEL_COLUMN_DEFS.copy()
     # Add topic columns if they exist in the data
     for topic in TOPIC_COLUMNS:
         if topic in df.columns:
                 "sortable": True,
                 "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
             })
     return cols
 # App layout
 app.layout = html.Div([
     # Header
             style={"color": "#666", "marginTop": "0"}
         ),
     ], style={"textAlign": "center", "padding": "20px"}),
     # Info banner
     html.Div([
         html.Div([
         "marginLeft": "20px",
         "marginRight": "20px",
     }),
     # Stats summary
     html.Div(id="stats-summary", style={
         "display": "flex",
         "gap": "40px",
         "marginBottom": "20px",
     }),
+    # Tabs for Models and Methods views
+    dcc.Tabs(id="view-tabs", value="models", children=[
+        dcc.Tab(label="📋 Models Leaderboard", value="models", style={"fontWeight": "bold"}),
+        dcc.Tab(label="🔬 Methods Comparison", value="methods", style={"fontWeight": "bold"}),
+    ], style={"marginLeft": "20px", "marginRight": "20px"}),
+    # Tab content
+    html.Div(id="tab-content", style={"padding": "20px"}),
     # Refresh interval
     dcc.Interval(
         id="refresh-interval",
         interval=60000,  # Refresh every 60 seconds
         n_intervals=0
     ),
     # Footer
     html.Div([
         html.Hr(),
             html.A("Submit your model", href="https://github.com/wisent-ai/uncensorbench#how-to-submit", target="_blank"),
         ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}),
     ], style={"padding": "20px"}),
 ], style={"fontFamily": "system-ui, -apple-system, sans-serif"})
 @callback(
+    Output("stats-summary", "children"),
     Input("refresh-interval", "n_intervals")
 )
+def update_stats(n):
+    """Update the stats summary."""
     df = load_data()
     if len(df) > 0:
+        # Calculate method stats for the summary
+        baseline_df = df[df["method"] == "none"]
+        baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
+        # Find best non-baseline method
+        non_baseline = df[df["method"] != "none"]
+        best_method_avg = 0
+        best_method = "N/A"
+        if len(non_baseline) > 0:
+            method_avgs = non_baseline.groupby("method")["uncensored_rate"].mean()
+            if len(method_avgs) > 0:
+                best_method = method_avgs.idxmax()
+                best_method_avg = method_avgs.max()
+        best_delta = best_method_avg - baseline_avg if best_method_avg > 0 else 0
         stats = [
             html.Div([
                 html.Div(str(len(df)), style={"fontSize": "2em", "fontWeight": "bold", "color": "#2196F3"}),
                 html.Div("Models", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
             html.Div([
+                html.Div(f"{baseline_avg:.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#9E9E9E"}),
+                html.Div("Baseline Avg", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
             html.Div([
                 html.Div(f"{df['uncensored_rate'].max():.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#FF9800"}),
+                html.Div("Best Rate", style={"color": "#666"}),
+            ], style={"textAlign": "center"}),
+            html.Div([
+                html.Div(
+                    f"+{best_delta:.1%}" if best_delta > 0 else f"{best_delta:.1%}",
+                    style={"fontSize": "2em", "fontWeight": "bold", "color": "#4CAF50" if best_delta > 0 else "#f44336"}
+                ),
+                html.Div(f"Best Method Δ ({best_method})", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
         ]
     else:
                 html.P("No submissions yet. Be the first to submit!", style={"color": "#666"}),
             ], style={"textAlign": "center"}),
         ]
+    return stats
+@callback(
+    Output("tab-content", "children"),
+    [Input("view-tabs", "value"),
+     Input("refresh-interval", "n_intervals")]
+)
+def render_tab_content(tab, n):
+    """Render content based on selected tab."""
+    df = load_data()
+    if tab == "models":
+        # Models leaderboard view
+        col_defs = get_model_column_defs(df)
+        row_data = df.to_dict("records") if len(df) > 0 else []
+        return html.Div([
+            dag.AgGrid(
+                id="leaderboard-grid",
+                columnDefs=col_defs,
+                rowData=row_data,
+                defaultColDef={
+                    "resizable": True,
+                    "sortable": True,
+                },
+                dashGridOptions={
+                    "pagination": True,
+                    "paginationPageSize": 50,
+                    "animateRows": True,
+                    "rowSelection": "single",
+                },
+                style={"height": "600px"},
+                className="ag-theme-alpine",
+            ),
+        ])
+    elif tab == "methods":
+        # Methods comparison view
+        method_df = calculate_method_stats(df)
+        row_data = method_df.to_dict("records") if len(method_df) > 0 else []
+        # Sort by delta from baseline descending
+        if len(method_df) > 0:
+            method_df = method_df.sort_values("delta_from_baseline", ascending=False)
+            row_data = method_df.to_dict("records")
+        return html.Div([
+            # Method comparison description
+            html.Div([
+                html.P([
+                    "Compare censorship removal methods. ",
+                    html.Strong("Δ vs Baseline"),
+                    " shows the improvement over unmodified models (method=none)."
+                ], style={"color": "#666", "marginBottom": "15px"}),
+            ]),
+            # Methods grid
+            dag.AgGrid(
+                id="methods-grid",
+                columnDefs=METHOD_COLUMN_DEFS,
+                rowData=row_data,
+                defaultColDef={
+                    "resizable": True,
+                    "sortable": True,
+                },
+                dashGridOptions={
+                    "animateRows": True,
+                    "rowSelection": "single",
+                },
+                style={"height": "400px"},
+                className="ag-theme-alpine",
+            ),
+            # Method legend
+            html.Div([
+                html.H4("Method Definitions", style={"marginTop": "30px", "marginBottom": "15px"}),
+                html.Div([
+                    html.Div([
+                        html.Span(
+                            f"● {method}",
+                            style={"color": METHOD_COLORS.get(method, "#666"), "fontWeight": "bold", "marginRight": "10px"}
+                        ),
+                        html.Span(desc, style={"color": "#666"}),
+                    ], style={"marginBottom": "8px"})
+                    for method, desc in METHOD_DESCRIPTIONS.items()
+                ], style={"columns": "2", "columnGap": "40px"}),
+            ], style={
+                "backgroundColor": "#f9f9f9",
+                "padding": "20px",
+                "borderRadius": "8px",
+                "marginTop": "20px",
+            }),
+        ])
+    return html.Div("Select a tab")
 if __name__ == "__main__":