# Spaces: wisent-ai / UncensorBench
# Sleeping
#
# File size: 23,200 Bytes

"""
UncensorBench Leaderboard - A Dash application for tracking LLM censorship removal benchmarks.
"""

import dash
from dash import html, dcc, callback, Input, Output, State
import dash_ag_grid as dag
import pandas as pd
import os

# Initialize the Dash app
app = dash.Dash(__name__, title="UncensorBench Leaderboard")
# Module-level handle to the app's underlying server object — presumably so a
# WSGI host can import it as `<module>:server`; confirm against the deployment.
server = app.server

# CSV file (relative path) that backs the leaderboard.
DATA_FILE = "leaderboard.csv"

# Display labels for the methods we know about. Submissions may use any
# method name; unknown ones simply have no entry here.
METHOD_DESCRIPTIONS = dict(
    none="Baseline (no modification)",
    abliteration="Abliteration technique",
    steering="Steering vectors",
    finetuning="Fine-tuning based",
    prompting="Prompt-based jailbreaking",
    other="Other methods",
)

# Fixed colors for the known methods.
METHOD_COLORS = dict(
    none="#9E9E9E",
    abliteration="#E91E63",
    steering="#2196F3",
    finetuning="#4CAF50",
    prompting="#FF9800",
    other="#9C27B0",
)

# Palette cycled through for methods that have no fixed color above.
DYNAMIC_COLORS = [
    "#00BCD4",
    "#795548",
    "#607D8B",
    "#3F51B5",
    "#009688",
    "#CDDC39",
    "#FF5722",
    "#673AB7",
]


def _empty_leaderboard():
    """Return an empty frame carrying the columns the leaderboard views expect."""
    return pd.DataFrame(columns=[
        "Rank", "model", "model_family", "model_size", "method",
        "uncensored_rate", "avg_compliance_score",
        "total_prompts", "timestamp", "submitter", "sample_responses_url"
    ])


def load_data(path=None):
    """Load the leaderboard CSV, ranked by uncensored rate.

    Args:
        path: CSV file to read. Defaults to the module-level ``DATA_FILE``.

    Returns:
        A DataFrame sorted by ``uncensored_rate`` (highest first) whose first
        column is a 1-based ``Rank``. If the file does not exist or has no
        content at all, an empty DataFrame with the expected columns is
        returned instead of raising.
    """
    if path is None:
        path = DATA_FILE

    # Read first and handle failure, rather than checking for existence up
    # front: the file may be replaced or truncated between a check and the read.
    try:
        df = pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return _empty_leaderboard()

    # Ranks are always recomputed; a "Rank" column already present in the file
    # would otherwise make insert() raise ValueError.
    df = df.drop(columns="Rank", errors="ignore")

    if len(df) > 0:
        df = df.sort_values("uncensored_rate", ascending=False).reset_index(drop=True)

    # Insert even when there are no rows so every return path exposes "Rank".
    df.insert(0, "Rank", range(1, len(df) + 1))
    return df


def get_method_color(method, method_index=0):
    """Return the display color for ``method``.

    Known methods use their fixed entry in ``METHOD_COLORS``; any other method
    gets the ``DYNAMIC_COLORS`` entry selected by ``method_index``, wrapping
    around when the index exceeds the palette length.
    """
    try:
        return METHOD_COLORS[method]
    except KeyError:
        palette_slot = method_index % len(DYNAMIC_COLORS)
        return DYNAMIC_COLORS[palette_slot]


def _pairing_key(row):
    """Return the (model_family, model_size) key used to pair a row with a baseline.

    Returns None when either field is missing (NaN/None) so that such rows are
    never paired; a direct ``==`` comparison of two NaN values is likewise False.
    """
    family = row.get("model_family", "")
    size = row.get("model_size", "")
    if pd.isna(family) or pd.isna(size):
        return None
    return (family, size)


def calculate_method_stats(df):
    """
    Calculate statistics for each method based on PAIRED comparisons only.

    A submission is paired with a baseline when a ``method == "none"`` row has
    the same ``model_family`` and ``model_size``. If several baselines share a
    family/size, the first one in ``df`` order is used.

    Args:
        df: Leaderboard rows. Must contain ``method`` and ``uncensored_rate``;
            ``model``, ``model_family``, ``model_size`` and
            ``avg_compliance_score`` are read when present.

    Returns:
        A tuple ``(stats_df, method_colors)``:
        - ``stats_df`` has one row per method. The baseline method ("none") is
          summarised over all of its rows with a delta of 0.0. Every other
          method is summarised over its paired rows only and is omitted when it
          has no pairs. ``num_models`` always counts all rows of the method.
        - ``method_colors`` maps every method found in ``df`` to a color: the
          fixed ``METHOD_COLORS`` entry when there is one, otherwise the next
          ``DYNAMIC_COLORS`` entry (cycling).
        For an empty ``df`` the result is ``(pd.DataFrame(), {})``.
    """
    if len(df) == 0:
        return pd.DataFrame(), {}

    # Get all unique methods from the actual data
    all_methods = df["method"].dropna().unique().tolist()

    # Known methods keep their fixed color; unknown ones take palette entries
    # in order of first appearance.
    method_colors = {}
    dynamic_idx = 0
    for method in all_methods:
        if method in METHOD_COLORS:
            method_colors[method] = METHOD_COLORS[method]
        else:
            method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)]
            dynamic_idx += 1

    # Index baseline rates once by (family, size) so each method row is paired
    # with a single dict lookup. setdefault keeps the first baseline per key,
    # and the key and the rate always come from the same row.
    baseline_rates = {}
    for _, row in df[df["method"] == "none"].iterrows():
        key = _pairing_key(row)
        if key is not None:
            baseline_rates.setdefault(key, row["uncensored_rate"])

    has_compliance = "avg_compliance_score" in df.columns
    method_stats = []

    for method in all_methods:
        method_df = df[df["method"] == method]
        description = METHOD_DESCRIPTIONS.get(method, str(method).replace("_", " ").title())

        if method == "none":
            # Baseline method - show stats over all baseline rows, no delta
            rates = method_df["uncensored_rate"]
            method_stats.append({
                "method": method,
                "description": description,
                "num_models": len(method_df),
                "num_pairs": len(method_df),
                "avg_uncensored_rate": rates.mean(),
                "delta_from_baseline": 0.0,
                "max_uncensored_rate": rates.max(),
                "min_uncensored_rate": rates.min(),
                # Same default as the paired branch when the column is absent.
                "avg_compliance_score": (
                    method_df["avg_compliance_score"].mean() if has_compliance else 0
                ),
                "best_model": method_df.loc[rates.idxmax(), "model"],
            })
            continue

        # Non-baseline method - only rows with a matching baseline count
        pairs = []
        for _, row in method_df.iterrows():
            key = _pairing_key(row)
            if key is None or key not in baseline_rates:
                continue
            rate = row["uncensored_rate"]
            pairs.append({
                "model": row.get("model", ""),
                "rate": rate,
                "delta": rate - baseline_rates[key],
                "compliance": row.get("avg_compliance_score", 0),
            })

        # Only add method if it has paired comparisons
        if not pairs:
            continue

        num_pairs = len(pairs)
        # Best model is the one with the highest delta (first one on ties)
        best_pair = max(pairs, key=lambda p: p["delta"])

        method_stats.append({
            "method": method,
            "description": description,
            "num_models": len(method_df),
            "num_pairs": num_pairs,
            "avg_uncensored_rate": sum(p["rate"] for p in pairs) / num_pairs,
            "delta_from_baseline": sum(p["delta"] for p in pairs) / num_pairs,
            "max_uncensored_rate": max(p["rate"] for p in pairs),
            "min_uncensored_rate": min(p["rate"] for p in pairs),
            "avg_compliance_score": sum(p["compliance"] for p in pairs) / num_pairs,
            "best_model": best_pair["model"],
        })

    return pd.DataFrame(method_stats), method_colors


# AG Grid column definitions for the models leaderboard. Rank and model name
# are pinned to the left; rate and compliance values are formatted with d3.
MODEL_COLUMN_DEFS = [
    dict(field="Rank", headerName="🏆", width=70, pinned="left", sortable=True),
    dict(field="model", headerName="Model", width=300, pinned="left", sortable=True, filter=True),
    dict(field="model_family", headerName="Family", width=120, sortable=True, filter=True),
    dict(field="model_size", headerName="Size", width=80, sortable=True, filter=True),
    dict(field="method", headerName="Method", width=120, sortable=True, filter=True),
    dict(
        field="uncensored_rate",
        headerName="Uncensored Rate ⬆️",
        width=160,
        sortable=True,
        valueFormatter=dict(function="d3.format('.1%')(params.value)"),
    ),
    dict(
        field="avg_compliance_score",
        headerName="Avg Compliance",
        width=140,
        sortable=True,
        valueFormatter=dict(function="d3.format('.3f')(params.value)"),
    ),
    dict(field="total_prompts", headerName="Prompts", width=90, sortable=True),
    dict(field="timestamp", headerName="Submitted", width=180, sortable=True),
    dict(field="submitter", headerName="Submitter", width=130, sortable=True, filter=True),
]

# AG Grid column definitions for the methods comparison (paired comparisons
# only). The delta column is shown with an explicit sign and colored green
# for positive values and red for negative ones.
METHOD_COLUMN_DEFS = [
    dict(field="method", headerName="Method", width=130, pinned="left", sortable=True),
    dict(field="description", headerName="Description", width=180, sortable=True),
    dict(field="num_pairs", headerName="# Pairs", width=80, sortable=True),
    dict(
        field="delta_from_baseline",
        headerName="Δ vs Baseline ⬆️",
        width=140,
        sortable=True,
        valueFormatter=dict(
            function="params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"
        ),
        cellStyle=dict(
            function="params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"
        ),
    ),
    dict(
        field="avg_uncensored_rate",
        headerName="Avg Rate",
        width=100,
        sortable=True,
        valueFormatter=dict(function="d3.format('.1%')(params.value)"),
    ),
    dict(
        field="max_uncensored_rate",
        headerName="Best Rate",
        width=100,
        sortable=True,
        valueFormatter=dict(function="d3.format('.1%')(params.value)"),
    ),
    dict(
        field="min_uncensored_rate",
        headerName="Worst Rate",
        width=100,
        sortable=True,
        valueFormatter=dict(function="d3.format('.1%')(params.value)"),
    ),
    dict(
        field="avg_compliance_score",
        headerName="Avg Compliance",
        width=130,
        sortable=True,
        valueFormatter=dict(function="d3.format('.3f')(params.value)"),
    ),
    dict(field="best_model", headerName="Best Model", width=260, sortable=True),
]

# Per-topic score columns; each is shown only when the data contains it.
TOPIC_COLUMNS = """
    cybersecurity
    piracy
    weapons
    drugs
    fraud
    manipulation
    violence
    privacy_invasion
    illegal_activities
    academic_dishonesty
    gambling
    controversial_speech
    evasion
    self_harm
    adult_content
""".split()


def get_model_column_defs(df):
    """Return the models-grid columns, extended with the topic columns found in ``df``.

    The base definitions come first, followed by one percent-formatted column
    for every ``TOPIC_COLUMNS`` entry present in ``df.columns`` (in
    ``TOPIC_COLUMNS`` order). A new list is returned on every call, so the
    module-level ``MODEL_COLUMN_DEFS`` list is never extended.
    """
    topic_defs = [
        {
            "field": topic_name,
            "headerName": topic_name.replace("_", " ").title(),
            "width": 130,
            "sortable": True,
            "valueFormatter": {"function": "d3.format('.1%')(params.value)"},
        }
        for topic_name in TOPIC_COLUMNS
        if topic_name in df.columns
    ]
    return [*MODEL_COLUMN_DEFS, *topic_defs]


# App layout, assembled from named sections (top to bottom on the page).

def _external_link(label, href, **props):
    """Build an anchor that opens ``href`` in a new browser tab."""
    return html.A(label, href=href, target="_blank", **props)


def _banner_icon(glyph):
    """Build the slightly enlarged emoji shown before a banner link."""
    return html.Span(glyph, style={"fontSize": "1.2em"})


def _footnote(children):
    """Build a small, grey, centered footer paragraph."""
    return html.P(children, style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"})


# Title and tagline
_page_header = html.Div(
    [
        html.H1("🦬 UncensorBench Leaderboard", style={"marginBottom": "5px"}),
        html.P(
            "Tracking LLM performance on censorship removal benchmarks",
            style={"color": "#666", "marginTop": "0"},
        ),
    ],
    style={"textAlign": "center", "padding": "20px"},
)

# Links to the package, the benchmark notebook and the repository
_info_banner = html.Div(
    [
        html.Div(
            [
                _banner_icon("📊 "),
                _external_link(
                    "UncensorBench on PyPI",
                    "https://pypi.org/project/uncensorbench/",
                    style={"marginRight": "20px"},
                ),
                _banner_icon("📓 "),
                _external_link(
                    "Run Benchmark Notebook",
                    "https://github.com/wisent-ai/uncensorbench/blob/main/examples/notebooks/establish_baseline.ipynb",
                    style={"marginRight": "20px"},
                ),
                _banner_icon("🐙 "),
                _external_link("GitHub", "https://github.com/wisent-ai/uncensorbench"),
            ],
            style={"textAlign": "center", "padding": "10px"},
        )
    ],
    style={
        "backgroundColor": "#f0f0f0",
        "borderRadius": "8px",
        "marginBottom": "20px",
        "marginLeft": "20px",
        "marginRight": "20px",
    },
)

# Container the stats callback fills with summary cards
_stats_summary = html.Div(
    id="stats-summary",
    style={
        "display": "flex",
        "justifyContent": "center",
        "gap": "40px",
        "marginBottom": "20px",
    },
)

# Switch between the models and methods views
_view_tabs = dcc.Tabs(
    id="view-tabs",
    value="models",
    children=[
        dcc.Tab(label="📋 Models Leaderboard", value="models", style={"fontWeight": "bold"}),
        dcc.Tab(label="🔬 Methods Comparison", value="methods", style={"fontWeight": "bold"}),
    ],
    style={"marginLeft": "20px", "marginRight": "20px"},
)

# Container the tab callback fills with the selected view
_tab_content = html.Div(id="tab-content", style={"padding": "20px"})

# Timer whose ticks are inputs to both callbacks (60000 ms = every 60 seconds)
_refresh_timer = dcc.Interval(id="refresh-interval", interval=60000, n_intervals=0)

# Explanation of the metric plus credits
_page_footer = html.Div(
    [
        html.Hr(),
        _footnote([
            "UncensorBench measures how models respond to prompts that typically trigger refusal. ",
            html.Strong("Higher uncensored rate = more compliant responses. "),
            "This benchmark is for research purposes only.",
        ]),
        _footnote([
            "Powered by ",
            _external_link("Wisent AI", "https://wisent.ai"),
            " • ",
            _external_link("Submit your model", "https://github.com/wisent-ai/uncensorbench#how-to-submit"),
        ]),
    ],
    style={"padding": "20px"},
)

app.layout = html.Div(
    [
        _page_header,
        _info_banner,
        _stats_summary,
        _view_tabs,
        _tab_content,
        _refresh_timer,
        _page_footer,
    ],
    style={"fontFamily": "system-ui, -apple-system, sans-serif"},
)


def _stat_card(value, label, color):
    """Build one centered summary card: a large colored value above a grey label."""
    return html.Div([
        html.Div(value, style={"fontSize": "2em", "fontWeight": "bold", "color": color}),
        html.Div(label, style={"color": "#666"}),
    ], style={"textAlign": "center"})


@callback(
    Output("stats-summary", "children"),
    Input("refresh-interval", "n_intervals")
)
def update_stats(n):
    """Update the stats summary.

    Reloads the leaderboard on every refresh tick and returns the summary
    cards: number of rows, average baseline rate, best rate overall, and the
    gap between the best non-baseline method's average and the baseline
    average.

    NOTE: this delta compares plain per-method averages; it is not the paired
    delta computed for the methods tab, so the two figures can differ.

    Args:
        n: Tick count of the refresh interval; unused, it only triggers the
            callback.

    Returns:
        A list of components for the ``stats-summary`` container.
    """
    df = load_data()

    if len(df) == 0:
        return [
            _stat_card("0", "Models", "#2196F3"),
            html.Div([
                html.P("No submissions yet. Be the first to submit!", style={"color": "#666"}),
            ], style={"textAlign": "center"}),
        ]

    is_baseline = df["method"] == "none"

    # None means "no usable baseline". Falling back to 0 would display a fake
    # "0.0%" baseline and report the raw method average as a delta.
    baseline_avg = df.loc[is_baseline, "uncensored_rate"].mean()
    if pd.isna(baseline_avg):
        baseline_avg = None

    # Per-method averages over non-baseline rows; methods whose rates are all
    # missing are dropped so idxmax() only sees real numbers.
    method_avgs = df[~is_baseline].groupby("method")["uncensored_rate"].mean().dropna()

    best_method = "N/A"
    best_delta = None
    if len(method_avgs) > 0:
        best_method = method_avgs.idxmax()
        if baseline_avg is not None:
            # May legitimately be negative or exactly zero.
            best_delta = method_avgs.max() - baseline_avg

    # Same color convention as the methods grid: green for a gain, red for a
    # loss, neutral otherwise.
    if best_delta is None:
        delta_text, delta_color = "N/A", "#9E9E9E"
    elif best_delta > 0:
        delta_text, delta_color = f"+{best_delta:.1%}", "#4CAF50"
    elif best_delta < 0:
        delta_text, delta_color = f"{best_delta:.1%}", "#f44336"
    else:
        delta_text, delta_color = f"{best_delta:.1%}", "#9E9E9E"

    baseline_text = "N/A" if baseline_avg is None else f"{baseline_avg:.1%}"

    return [
        _stat_card(str(len(df)), "Models", "#2196F3"),
        _stat_card(baseline_text, "Baseline Avg", "#9E9E9E"),
        _stat_card(f"{df['uncensored_rate'].max():.1%}", "Best Rate", "#FF9800"),
        _stat_card(delta_text, f"Best Method Δ ({best_method})", delta_color),
    ]


def _render_models_tab(df):
    """Build the models view: the leaderboard grid plus, when any row has one, sample-response URLs."""
    # One list item per row that carries a non-empty sample_responses_url.
    # URLs are rendered as text (html.Code), not as clickable links.
    responses_links = []
    for _, row in df.iterrows():
        url = row.get("sample_responses_url")
        if pd.notna(url) and url:
            responses_links.append(
                html.Li([
                    html.Strong(row.get("model", "Unknown")),
                    html.Span(": "),
                    html.Code(url, style={"fontSize": "0.85em", "wordBreak": "break-all"}),
                ], style={"marginBottom": "5px"})
            )

    children = [
        dag.AgGrid(
            id="leaderboard-grid",
            columnDefs=get_model_column_defs(df),
            rowData=df.to_dict("records"),
            defaultColDef={
                "resizable": True,
                "sortable": True,
            },
            dashGridOptions={
                "pagination": True,
                "paginationPageSize": 50,
                "animateRows": True,
                "rowSelection": "single",
            },
            style={"height": "600px"},
            className="ag-theme-alpine",
        ),
    ]

    # The section is shown only when there is something to list, so no
    # "nothing available" placeholder is needed inside it.
    if responses_links:
        children.append(
            html.Div([
                html.H4("📄 Sample Responses", style={"marginTop": "20px", "marginBottom": "10px"}),
                html.P("Copy and paste these URLs to view detailed model responses:", style={"color": "#666", "fontSize": "0.9em"}),
                html.Ul(responses_links),
            ], style={
                "backgroundColor": "#f9f9f9",
                "padding": "15px",
                "borderRadius": "8px",
                "marginTop": "20px",
            })
        )

    return html.Div(children)


def _render_methods_tab(df):
    """Build the methods view: explanation, paired-comparison grid and a method legend."""
    method_df, method_colors = calculate_method_stats(df)

    # Sort by delta from baseline descending
    if len(method_df) > 0:
        method_df = method_df.sort_values("delta_from_baseline", ascending=False)
    row_data = method_df.to_dict("records")

    # Legend entries follow the grid's row order and use the same colors.
    method_legend_items = [
        html.Div([
            html.Span(
                f"● {row['method']}",
                style={"color": method_colors.get(row["method"], "#666"), "fontWeight": "bold", "marginRight": "10px"}
            ),
            html.Span(row["description"], style={"color": "#666"}),
        ], style={"marginBottom": "8px"})
        for _, row in method_df.iterrows()
    ]

    return html.Div([
        # Method comparison description
        html.Div([
            html.P([
                "Compare censorship removal methods using ",
                html.Strong("paired comparisons only"),
                ". Delta (Δ) is calculated by comparing the ",
                html.Strong("same base model"),
                " with and without each method applied."
            ], style={"color": "#666", "marginBottom": "5px"}),
            html.P([
                "Methods are only shown if they have at least one paired comparison ",
                "(matching model_family + model_size with a baseline 'none' submission)."
            ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}),
        ]),

        # Methods grid
        dag.AgGrid(
            id="methods-grid",
            columnDefs=METHOD_COLUMN_DEFS,
            rowData=row_data,
            defaultColDef={
                "resizable": True,
                "sortable": True,
            },
            dashGridOptions={
                "animateRows": True,
                "rowSelection": "single",
            },
            style={"height": "400px"},
            className="ag-theme-alpine",
        ),

        # Method legend - dynamically built from actual data
        html.Div([
            html.H4("Method Definitions", style={"marginTop": "30px", "marginBottom": "15px"}),
            html.Div(
                method_legend_items if method_legend_items else [html.P("No methods submitted yet.", style={"color": "#666"})],
                # Two columns once the legend gets longer than three entries.
                style={"columns": "2", "columnGap": "40px"} if len(method_legend_items) > 3 else {}
            ),
        ], style={
            "backgroundColor": "#f9f9f9",
            "padding": "20px",
            "borderRadius": "8px",
            "marginTop": "20px",
        }),
    ])


@callback(
    Output("tab-content", "children"),
    [Input("view-tabs", "value"),
     Input("refresh-interval", "n_intervals")]
)
def render_tab_content(tab, n):
    """Render content based on selected tab.

    Args:
        tab: Value of the selected tab, "models" or "methods".
        n: Tick count of the refresh interval; unused, it only triggers a
            re-render with freshly loaded data.

    Returns:
        The component tree for the selected view, or a short prompt for any
        other tab value.
    """
    df = load_data()

    if tab == "models":
        return _render_models_tab(df)
    if tab == "methods":
        return _render_methods_tab(df)
    return html.Div("Select a tab")


if __name__ == "__main__":
    # The server binds to all interfaces, so debug mode is opt-in: with debug
    # enabled the development server serves an interactive debugger, which must
    # not be reachable from the network. Set DASH_DEBUG=1 for local development.
    debug_enabled = os.environ.get("DASH_DEBUG", "").strip().lower() in ("1", "true", "yes")
    # Prefer `app.run` where the installed Dash provides it and fall back to the
    # older `run_server` name otherwise (NOTE(review): confirm against the
    # pinned Dash version).
    run_app = getattr(app, "run", None) or app.run_server
    run_app(debug=debug_enabled, host="0.0.0.0", port=7860)