Spaces:

wisent-ai
/

UncensorBench

Running

App Files Files Community

lbartoszcze committed on Dec 1, 2025

Commit

8a6708f

verified ·

1 Parent(s): 6e2f828

Add app.py

Browse files

Files changed (1) hide show

app.py +624 -0

app.py ADDED Viewed

	@@ -0,0 +1,624 @@

+"""
+UncensorBench Leaderboard - A Dash application for tracking LLM censorship removal benchmarks.
+"""
+import dash
+from dash import html, dcc, callback, Input, Output, State
+import dash_ag_grid as dag
+import pandas as pd
+import os
+# Initialize the Dash app
+app = dash.Dash(__name__, title="UncensorBench Leaderboard")
+# Module-level handle on the app's underlying server object
+# (presumably what a WSGI runner is pointed at -- deployment config is not shown here).
+server = app.server
+# Path of the CSV file the leaderboard is read from (relative to the working directory)
+DATA_FILE = "leaderboard.csv"
+# Known method descriptions (for display purposes, but we accept any method)
+METHOD_DESCRIPTIONS = {
+    "none": "Baseline (no modification)",
+    "abliteration": "Abliteration technique",
+    "steering": "Steering vectors",
+    "finetuning": "Fine-tuning based",
+    "prompting": "Prompt-based jailbreaking",
+    "other": "Other methods",
+}
+# Colors (hex strings) for known methods, dynamic methods get auto-assigned colors
+METHOD_COLORS = {
+    "none": "#9E9E9E",
+    "abliteration": "#E91E63",
+    "steering": "#2196F3",
+    "finetuning": "#4CAF50",
+    "prompting": "#FF9800",
+    "other": "#9C27B0",
+}
+# Fallback colors for dynamically discovered methods; consumers index this list
+# modulo its length, so colors repeat once more than 8 unknown methods appear
+DYNAMIC_COLORS = ["#00BCD4", "#795548", "#607D8B", "#3F51B5", "#009688", "#CDDC39", "#FF5722", "#673AB7"]
+def load_data():
+    """Load the leaderboard from DATA_FILE, ranked by uncensored_rate.
+
+    Returns:
+        A DataFrame whose first column is "Rank" (1 = highest uncensored_rate).
+        When DATA_FILE is missing or completely empty, an empty DataFrame with
+        the expected leaderboard columns is returned instead.
+
+    Raises:
+        KeyError: if the CSV has rows but no "uncensored_rate" column.
+    """
+    empty_board = pd.DataFrame(columns=[
+        "Rank", "model", "model_family", "model_size", "method",
+        "uncensored_rate", "avg_compliance_score",
+        "total_prompts", "timestamp", "submitter", "sample_responses_url"
+    ])
+    if not os.path.exists(DATA_FILE):
+        return empty_board
+    try:
+        df = pd.read_csv(DATA_FILE)
+    except pd.errors.EmptyDataError:
+        # A zero-byte file (e.g. created but not yet written) is "no submissions",
+        # not an error that should break the periodic refresh callbacks.
+        return empty_board
+    if len(df) > 0:
+        # Sort by uncensored_rate descending
+        df = df.sort_values("uncensored_rate", ascending=False).reset_index(drop=True)
+    # Ranks are always recomputed from the sort order; drop any stale "Rank"
+    # column from the file so insert() cannot fail on a duplicate name.
+    if "Rank" in df.columns:
+        df = df.drop(columns="Rank")
+    # Inserted even for a header-only file so the schema is the same in every branch.
+    df.insert(0, "Rank", range(1, len(df) + 1))
+    return df
+def get_method_color(method, method_index=0):
+    """Return the display color for *method*.
+
+    Methods listed in METHOD_COLORS keep their predefined color; any other
+    method gets the DYNAMIC_COLORS entry at *method_index*, wrapping around
+    when the index exceeds the palette length.
+    """
+    try:
+        return METHOD_COLORS[method]
+    except KeyError:
+        # Unknown method: pick from the fallback palette by position
+        palette_size = len(DYNAMIC_COLORS)
+        return DYNAMIC_COLORS[method_index % palette_size]
+def _assign_method_colors(methods):
+    """Map each method to its predefined color, or to the next DYNAMIC_COLORS entry if unknown."""
+    colors = {}
+    next_dynamic = 0
+    for method in methods:
+        if method in METHOD_COLORS:
+            colors[method] = METHOD_COLORS[method]
+        else:
+            colors[method] = DYNAMIC_COLORS[next_dynamic % len(DYNAMIC_COLORS)]
+            next_dynamic += 1
+    return colors
+def _index_baseline_rates(baseline_df):
+    """Map (model_family, model_size) to the uncensored_rate of the first baseline row with that key."""
+    rates = {}
+    for _, row in baseline_df.iterrows():
+        family, size = row.get("model_family"), row.get("model_size")
+        # A baseline without a family or size cannot be paired with anything.
+        if pd.isna(family) or pd.isna(size):
+            continue
+        rates.setdefault((family, size), row["uncensored_rate"])
+    return rates
+def calculate_method_stats(df):
+    """
+    Calculate statistics for each method based on PAIRED comparisons only.
+    A paired comparison requires a method-applied submission (method=X) and a
+    baseline submission (method="none") that share the same model_family AND
+    model_size. Rows where either field is missing are never paired. If several
+    baselines share a family/size, the first one in *df* order is used.
+    The baseline method itself is always reported, with a delta of 0.0.
+    Non-baseline methods are reported only if they have at least one pair, and
+    their statistics are computed over the paired rows only.
+
+    Args:
+        df: Leaderboard DataFrame with at least "method", "model" and
+            "uncensored_rate" columns.
+
+    Returns:
+        (stats_df, colors): one stats row per reported method, and a dict
+        mapping every method present in *df* to a color. For an empty *df*
+        this is (empty DataFrame, {}).
+    """
+    if len(df) == 0:
+        return pd.DataFrame(), {}
+    # Get all unique methods from the actual data
+    all_methods = df["method"].dropna().unique().tolist()
+    dynamic_method_colors = _assign_method_colors(all_methods)
+    # Built once up front instead of re-filtering the baseline frame for every row.
+    baseline_rates = _index_baseline_rates(df[df["method"] == "none"])
+    has_compliance = "avg_compliance_score" in df.columns
+    method_stats = []
+    for method in all_methods:
+        method_df = df[df["method"] == method]
+        if method == "none":
+            # Baseline method - show stats but no delta
+            rates = method_df["uncensored_rate"]
+            num_pairs = len(method_df)
+            avg_rate, max_rate, min_rate = rates.mean(), rates.max(), rates.min()
+            avg_delta = 0.0
+            # Same fallback (0) as the paired branch when the column is absent.
+            avg_compliance = method_df["avg_compliance_score"].mean() if has_compliance else 0
+            best_model = method_df.loc[rates.idxmax(), "model"]
+        else:
+            # Non-baseline method - only count paired comparisons
+            pairs = []
+            for _, row in method_df.iterrows():
+                family, size = row.get("model_family"), row.get("model_size")
+                if pd.isna(family) or pd.isna(size):
+                    continue
+                key = (family, size)
+                if key not in baseline_rates:
+                    continue
+                rate = row["uncensored_rate"]
+                pairs.append({
+                    "model": row.get("model", ""),
+                    "method_rate": rate,
+                    "delta": rate - baseline_rates[key],
+                    "method_compliance": row.get("avg_compliance_score", 0),
+                })
+            # Only add method if it has paired comparisons
+            if not pairs:
+                continue
+            num_pairs = len(pairs)
+            avg_delta = sum(p["delta"] for p in pairs) / num_pairs
+            avg_rate = sum(p["method_rate"] for p in pairs) / num_pairs
+            max_rate = max(p["method_rate"] for p in pairs)
+            min_rate = min(p["method_rate"] for p in pairs)
+            avg_compliance = sum(p["method_compliance"] for p in pairs) / num_pairs
+            # Best model is the one with highest delta
+            best_model = max(pairs, key=lambda p: p["delta"])["model"]
+        description = METHOD_DESCRIPTIONS.get(method)
+        if description is None:
+            # str() so a non-string method value (e.g. a number in the CSV) cannot crash here.
+            description = str(method).replace("_", " ").title()
+        method_stats.append({
+            "method": method,
+            "description": description,
+            "num_models": len(method_df),
+            "num_pairs": num_pairs,
+            "avg_uncensored_rate": avg_rate,
+            "delta_from_baseline": avg_delta,
+            "max_uncensored_rate": max_rate,
+            "min_uncensored_rate": min_rate,
+            "avg_compliance_score": avg_compliance,
+            "best_model": best_model,
+        })
+    return pd.DataFrame(method_stats), dynamic_method_colors
+# Column definitions for Models AG Grid
+# Each entry describes one column; "field" names the leaderboard column it shows.
+# Values under a {"function": ...} key are JavaScript expression strings
+# (presumably evaluated client-side by dash-ag-grid -- confirm against its docs).
+MODEL_COLUMN_DEFS = [
+    {
+        "field": "Rank",
+        "headerName": "🏆",
+        "width": 70,
+        "pinned": "left",
+        "sortable": True,
+    },
+    {
+        "field": "model",
+        "headerName": "Model",
+        "width": 300,
+        "pinned": "left",
+        "sortable": True,
+        "filter": True,
+    },
+    {
+        "field": "model_family",
+        "headerName": "Family",
+        "width": 120,
+        "sortable": True,
+        "filter": True,
+    },
+    {
+        "field": "model_size",
+        "headerName": "Size",
+        "width": 80,
+        "sortable": True,
+        "filter": True,
+    },
+    {
+        "field": "method",
+        "headerName": "Method",
+        "width": 120,
+        "sortable": True,
+        "filter": True,
+    },
+    {
+        "field": "uncensored_rate",
+        "headerName": "Uncensored Rate ⬆️",
+        "width": 160,
+        "sortable": True,
+        # d3 '.1%' format specifier: percentage with one decimal place
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
+    },
+    {
+        "field": "avg_compliance_score",
+        "headerName": "Avg Compliance",
+        "width": 140,
+        "sortable": True,
+        # d3 '.3f' format specifier: fixed-point with three decimals
+        "valueFormatter": {"function": "d3.format('.3f')(params.value)"},
+    },
+    {
+        "field": "total_prompts",
+        "headerName": "Prompts",
+        "width": 90,
+        "sortable": True,
+    },
+    {
+        "field": "timestamp",
+        "headerName": "Submitted",
+        "width": 180,
+        "sortable": True,
+    },
+    {
+        "field": "submitter",
+        "headerName": "Submitter",
+        "width": 130,
+        "sortable": True,
+        "filter": True,
+    },
+    {
+        "field": "sample_responses_url",
+        "headerName": "Responses",
+        "width": 110,
+        "cellRenderer": "markdown",
+        # Wraps a non-empty URL in a markdown link; an empty/missing URL yields ''
+        "valueGetter": {"function": "params.data.sample_responses_url ? '[📄 View](' + params.data.sample_responses_url + ')' : ''"},
+    },
+]
+# Column definitions for Methods AG Grid (paired comparisons only)
+# "field" values match the keys of the per-method rows built by calculate_method_stats.
+METHOD_COLUMN_DEFS = [
+    {
+        "field": "method",
+        "headerName": "Method",
+        "width": 130,
+        "pinned": "left",
+        "sortable": True,
+    },
+    {
+        "field": "description",
+        "headerName": "Description",
+        "width": 180,
+        "sortable": True,
+    },
+    {
+        "field": "num_pairs",
+        "headerName": "# Pairs",
+        "width": 80,
+        "sortable": True,
+    },
+    {
+        "field": "delta_from_baseline",
+        "headerName": "Δ vs Baseline ⬆️",
+        "width": 140,
+        "sortable": True,
+        # Non-negative deltas get an explicit leading '+'
+        "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"},
+        # Positive: bold green; negative: red; zero: default styling
+        "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"},
+    },
+    {
+        "field": "avg_uncensored_rate",
+        "headerName": "Avg Rate",
+        "width": 100,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
+    },
+    {
+        "field": "max_uncensored_rate",
+        "headerName": "Best Rate",
+        "width": 100,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
+    },
+    {
+        "field": "min_uncensored_rate",
+        "headerName": "Worst Rate",
+        "width": 100,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
+    },
+    {
+        "field": "avg_compliance_score",
+        "headerName": "Avg Compliance",
+        "width": 130,
+        "sortable": True,
+        "valueFormatter": {"function": "d3.format('.3f')(params.value)"},
+    },
+    {
+        "field": "best_model",
+        "headerName": "Best Model",
+        "width": 260,
+        "sortable": True,
+    },
+]
+# Topic-specific columns (added dynamically if present)
+# get_model_column_defs appends a grid column for each name below that exists
+# as a column in the loaded data; names absent from the data are skipped.
+TOPIC_COLUMNS = [
+    "cybersecurity", "piracy", "weapons", "drugs", "fraud",
+    "manipulation", "violence", "privacy_invasion", "illegal_activities",
+    "academic_dishonesty", "gambling", "controversial_speech",
+    "evasion", "self_harm", "adult_content"
+]
+def get_model_column_defs(df):
+    """Return the Models grid column definitions for the data in *df*.
+
+    The result is a new list: every entry of MODEL_COLUMN_DEFS, followed by one
+    percentage-formatted column for each TOPIC_COLUMNS name that is a column of
+    *df* (in TOPIC_COLUMNS order). MODEL_COLUMN_DEFS itself is not modified.
+    """
+    # Topic scores are optional, so some rows may have no value for a topic.
+    # NOTE(review): a missing value presumably reaches the grid as null, which
+    # d3.format coerces to 0 and renders as "0.0%" -- show an empty cell instead
+    # so "no data" is not mistaken for a real score of zero.
+    topic_formatter = "params.value == null ? '' : d3.format('.1%')(params.value)"
+    topic_cols = [
+        {
+            "field": topic,
+            "headerName": topic.replace("_", " ").title(),
+            "width": 130,
+            "sortable": True,
+            "valueFormatter": {"function": topic_formatter},
+        }
+        for topic in TOPIC_COLUMNS
+        if topic in df.columns
+    ]
+    return MODEL_COLUMN_DEFS + topic_cols
+# App layout
+# Static page skeleton: header, link banner, stats row, tab selector, tab body,
+# refresh timer and footer. The "stats-summary" and "tab-content" containers are
+# empty here and are filled by the update_stats / render_tab_content callbacks.
+app.layout = html.Div([
+    # Header
+    html.Div([
+        html.H1("🦬 UncensorBench Leaderboard", style={"marginBottom": "5px"}),
+        html.P(
+            "Tracking LLM performance on censorship removal benchmarks",
+            style={"color": "#666", "marginTop": "0"}
+        ),
+    ], style={"textAlign": "center", "padding": "20px"}),
+    # Info banner
+    html.Div([
+        html.Div([
+            html.Span("📊 ", style={"fontSize": "1.2em"}),
+            html.A(
+                "UncensorBench on PyPI",
+                href="https://pypi.org/project/uncensorbench/",
+                target="_blank",
+                style={"marginRight": "20px"}
+            ),
+            html.Span("📓 ", style={"fontSize": "1.2em"}),
+            html.A(
+                "Run Benchmark Notebook",
+                href="https://github.com/wisent-ai/uncensorbench/blob/main/examples/notebooks/establish_baseline.ipynb",
+                target="_blank",
+                style={"marginRight": "20px"}
+            ),
+            html.Span("🐙 ", style={"fontSize": "1.2em"}),
+            html.A(
+                "GitHub",
+                href="https://github.com/wisent-ai/uncensorbench",
+                target="_blank",
+            ),
+        ], style={"textAlign": "center", "padding": "10px"})
+    ], style={
+        "backgroundColor": "#f0f0f0",
+        "borderRadius": "8px",
+        "marginBottom": "20px",
+        "marginLeft": "20px",
+        "marginRight": "20px",
+    }),
+    # Stats summary (children supplied by the update_stats callback)
+    html.Div(id="stats-summary", style={
+        "display": "flex",
+        "justifyContent": "center",
+        "gap": "40px",
+        "marginBottom": "20px",
+    }),
+    # Tabs for Models and Methods views; "models" is selected initially
+    dcc.Tabs(id="view-tabs", value="models", children=[
+        dcc.Tab(label="📋 Models Leaderboard", value="models", style={"fontWeight": "bold"}),
+        dcc.Tab(label="🔬 Methods Comparison", value="methods", style={"fontWeight": "bold"}),
+    ], style={"marginLeft": "20px", "marginRight": "20px"}),
+    # Tab content (children supplied by the render_tab_content callback)
+    html.Div(id="tab-content", style={"padding": "20px"}),
+    # Refresh interval: its n_intervals is an Input of both callbacks, and each
+    # callback calls load_data(), so every tick re-reads the leaderboard CSV
+    dcc.Interval(
+        id="refresh-interval",
+        interval=60000,  # Refresh every 60 seconds
+        n_intervals=0
+    ),
+    # Footer
+    html.Div([
+        html.Hr(),
+        html.P([
+            "UncensorBench measures how models respond to prompts that typically trigger refusal. ",
+            html.Strong("Higher uncensored rate = more compliant responses. "),
+            "This benchmark is for research purposes only."
+        ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}),
+        html.P([
+            "Powered by ",
+            html.A("Wisent AI", href="https://wisent.ai", target="_blank"),
+            " • ",
+            html.A("Submit your model", href="https://github.com/wisent-ai/uncensorbench#how-to-submit", target="_blank"),
+        ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}),
+    ], style={"padding": "20px"}),
+], style={"fontFamily": "system-ui, -apple-system, sans-serif"})
+def _stat_card(value, label, color):
+    """Build one centered summary card: a large colored value above a grey label."""
+    return html.Div([
+        html.Div(value, style={"fontSize": "2em", "fontWeight": "bold", "color": color}),
+        html.Div(label, style={"color": "#666"}),
+    ], style={"textAlign": "center"})
+@callback(
+    Output("stats-summary", "children"),
+    Input("refresh-interval", "n_intervals")
+)
+def update_stats(n):
+    """Update the stats summary.
+
+    Re-reads the leaderboard on every refresh tick and returns the cards shown
+    above the tabs: number of models, average baseline rate, best single rate,
+    and the best method delta.
+
+    The delta is taken from calculate_method_stats, i.e. it is the best average
+    PAIRED delta among non-baseline methods, so this card agrees with the
+    "Δ vs Baseline" column of the Methods tab. A plain difference of group
+    means would compare different sets of models (and would report the whole
+    method average as the "delta" when no baseline exists at all). When no
+    method has a paired baseline, the card shows "N/A".
+
+    Args:
+        n: Interval tick count; only used to trigger the callback.
+    """
+    df = load_data()
+    if len(df) == 0:
+        return [
+            _stat_card("0", "Models", "#2196F3"),
+            html.Div([
+                html.P("No submissions yet. Be the first to submit!", style={"color": "#666"}),
+            ], style={"textAlign": "center"}),
+        ]
+    baseline_df = df[df["method"] == "none"]
+    baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0
+    # Find best non-baseline method by paired delta
+    best_method = "N/A"
+    best_delta = None
+    method_df, _ = calculate_method_stats(df)
+    if len(method_df) > 0:
+        paired = method_df[method_df["method"] != "none"].dropna(subset=["delta_from_baseline"])
+        if len(paired) > 0:
+            best_row = paired.loc[paired["delta_from_baseline"].idxmax()]
+            best_method = best_row["method"]
+            best_delta = best_row["delta_from_baseline"]
+    if best_delta is None:
+        # Nothing to compare: neutral grey rather than a red "0.0%"
+        delta_text, delta_color = "N/A", "#9E9E9E"
+    elif best_delta > 0:
+        delta_text, delta_color = f"+{best_delta:.1%}", "#4CAF50"
+    else:
+        delta_text, delta_color = f"{best_delta:.1%}", "#f44336"
+    return [
+        _stat_card(str(len(df)), "Models", "#2196F3"),
+        _stat_card(f"{baseline_avg:.1%}", "Baseline Avg", "#9E9E9E"),
+        _stat_card(f"{df['uncensored_rate'].max():.1%}", "Best Rate", "#FF9800"),
+        _stat_card(delta_text, f"Best Method Δ ({best_method})", delta_color),
+    ]
+@callback(
+    Output("tab-content", "children"),
+    [Input("view-tabs", "value"),
+     Input("refresh-interval", "n_intervals")]
+)
+def render_tab_content(tab, n):
+    """Build the body of the currently selected tab.
+
+    Reloads the leaderboard on every call, then returns either the models grid
+    ("models"), the methods grid with its explanation and legend ("methods"),
+    or a placeholder for any other tab value.
+    """
+    df = load_data()
+    shared_col_def = {
+        "resizable": True,
+        "sortable": True,
+    }
+    if tab == "models":
+        # Models leaderboard view
+        records = df.to_dict("records") if len(df) > 0 else []
+        models_grid = dag.AgGrid(
+            id="leaderboard-grid",
+            columnDefs=get_model_column_defs(df),
+            rowData=records,
+            defaultColDef=shared_col_def,
+            dashGridOptions={
+                "pagination": True,
+                "paginationPageSize": 50,
+                "animateRows": True,
+                "rowSelection": "single",
+            },
+            style={"height": "600px"},
+            className="ag-theme-alpine",
+        )
+        return html.Div([models_grid])
+    if tab != "methods":
+        return html.Div("Select a tab")
+    # Methods comparison view, best delta first
+    stats_df, palette = calculate_method_stats(df)
+    if len(stats_df) > 0:
+        stats_df = stats_df.sort_values("delta_from_baseline", ascending=False)
+        records = stats_df.to_dict("records")
+    else:
+        records = []
+    # One legend line per method actually shown in the grid, in grid order
+    legend_entries = [
+        html.Div([
+            html.Span(
+                f"● {entry['method']}",
+                style={"color": palette.get(entry["method"], "#666"), "fontWeight": "bold", "marginRight": "10px"}
+            ),
+            html.Span(entry["description"], style={"color": "#666"}),
+        ], style={"marginBottom": "8px"})
+        for entry in records
+    ]
+    if legend_entries:
+        legend_children = legend_entries
+    else:
+        legend_children = [html.P("No methods submitted yet.", style={"color": "#666"})]
+    # Switch to a two-column legend once there are more than three methods
+    if len(legend_entries) > 3:
+        legend_style = {"columns": "2", "columnGap": "40px"}
+    else:
+        legend_style = {}
+    explanation = html.Div([
+        html.P([
+            "Compare censorship removal methods using ",
+            html.Strong("paired comparisons only"),
+            ". Delta (Δ) is calculated by comparing the ",
+            html.Strong("same base model"),
+            " with and without each method applied."
+        ], style={"color": "#666", "marginBottom": "5px"}),
+        html.P([
+            "Methods are only shown if they have at least one paired comparison ",
+            "(matching model_family + model_size with a baseline 'none' submission)."
+        ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
+    ])
+    methods_grid = dag.AgGrid(
+        id="methods-grid",
+        columnDefs=METHOD_COLUMN_DEFS,
+        rowData=records,
+        defaultColDef=shared_col_def,
+        dashGridOptions={
+            "animateRows": True,
+            "rowSelection": "single",
+        },
+        style={"height": "400px"},
+        className="ag-theme-alpine",
+    )
+    legend_panel = html.Div([
+        html.H4("Method Definitions", style={"marginTop": "30px", "marginBottom": "15px"}),
+        html.Div(legend_children, style=legend_style),
+    ], style={
+        "backgroundColor": "#f9f9f9",
+        "padding": "20px",
+        "borderRadius": "8px",
+        "marginTop": "20px",
+    })
+    return html.Div([explanation, methods_grid, legend_panel])
+if __name__ == "__main__":
+    # The server binds to all interfaces, so debug mode (interactive debugger,
+    # dev tools, hot reload) must not be on by default; opt in with DASH_DEBUG=1.
+    debug_enabled = os.environ.get("DASH_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
+    # Prefer app.run (run_server is its deprecated predecessor); fall back only
+    # on Dash versions that do not provide app.run yet.
+    run = getattr(app, "run", None) or app.run_server
+    run(debug=debug_enabled, host="0.0.0.0", port=7860)