""" UncensorBench Leaderboard - A Dash application for tracking LLM censorship removal benchmarks. """ import dash from dash import html, dcc, callback, Input, Output, State import dash_ag_grid as dag import pandas as pd import os # Initialize the Dash app app = dash.Dash(__name__, title="UncensorBench Leaderboard") server = app.server # Load leaderboard data DATA_FILE = "leaderboard.csv" # Known method descriptions (for display purposes, but we accept any method) METHOD_DESCRIPTIONS = { "none": "Baseline (no modification)", "abliteration": "Abliteration technique", "steering": "Steering vectors", "finetuning": "Fine-tuning based", "prompting": "Prompt-based jailbreaking", "other": "Other methods", } # Colors for known methods, dynamic methods get auto-assigned colors METHOD_COLORS = { "none": "#9E9E9E", "abliteration": "#E91E63", "steering": "#2196F3", "finetuning": "#4CAF50", "prompting": "#FF9800", "other": "#9C27B0", } # Fallback colors for dynamically discovered methods DYNAMIC_COLORS = ["#00BCD4", "#795548", "#607D8B", "#3F51B5", "#009688", "#CDDC39", "#FF5722", "#673AB7"] def load_data(): """Load leaderboard data from CSV.""" if os.path.exists(DATA_FILE): df = pd.read_csv(DATA_FILE) # Sort by uncensored_rate descending if len(df) > 0: df = df.sort_values("uncensored_rate", ascending=False).reset_index(drop=True) df.insert(0, "Rank", range(1, len(df) + 1)) return df else: # Return empty dataframe with expected columns return pd.DataFrame(columns=[ "Rank", "model", "model_family", "model_size", "method", "uncensored_rate", "avg_compliance_score", "total_prompts", "timestamp", "submitter", "sample_responses_url" ]) def get_method_color(method, method_index=0): """Get color for a method, using predefined or dynamic colors.""" if method in METHOD_COLORS: return METHOD_COLORS[method] # Assign a dynamic color based on index return DYNAMIC_COLORS[method_index % len(DYNAMIC_COLORS)] def calculate_method_stats(df): """ Calculate statistics for each method based on PAIRED comparisons only. A paired comparison requires the exact same base model to have both: - A baseline submission (method="none") - A method-applied submission (method=X) Only shows delta for methods where paired comparisons exist. """ if len(df) == 0: return pd.DataFrame(), {} # Get all unique methods from the actual data all_methods = df["method"].dropna().unique().tolist() # Build dynamic color mapping for any new methods dynamic_method_colors = {} dynamic_idx = 0 for method in all_methods: if method in METHOD_COLORS: dynamic_method_colors[method] = METHOD_COLORS[method] else: dynamic_method_colors[method] = DYNAMIC_COLORS[dynamic_idx % len(DYNAMIC_COLORS)] dynamic_idx += 1 # Get baseline data - create lookup by exact model name baseline_df = df[df["method"] == "none"].copy() baseline_lookup = {} if len(baseline_df) > 0: for _, row in baseline_df.iterrows(): model_name = row.get("model", "") baseline_lookup[model_name] = { "uncensored_rate": row["uncensored_rate"], "avg_compliance_score": row.get("avg_compliance_score", 0), } # Calculate paired comparisons for each method method_stats = [] for method in all_methods: method_df = df[df["method"] == method] if method == "none": # Baseline method - show stats but no delta if len(method_df) > 0: avg_rate = method_df["uncensored_rate"].mean() max_rate = method_df["uncensored_rate"].max() min_rate = method_df["uncensored_rate"].min() avg_compliance = method_df["avg_compliance_score"].mean() best_model = method_df.loc[method_df["uncensored_rate"].idxmax(), "model"] description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title()) method_stats.append({ "method": method, "description": description, "num_models": len(method_df), "num_pairs": len(method_df), "avg_uncensored_rate": avg_rate, "delta_from_baseline": 0.0, "max_uncensored_rate": max_rate, "min_uncensored_rate": min_rate, "avg_compliance_score": avg_compliance, "best_model": best_model, }) else: # Non-baseline method - only count paired comparisons paired_data = [] for _, row in method_df.iterrows(): method_model = row.get("model", "") method_rate = row["uncensored_rate"] method_compliance = row.get("avg_compliance_score", 0) # Find exact baseline match by model_family + model_size model_family = row.get("model_family", "") model_size = row.get("model_size", "") # Look for baseline with same family and size baseline_match = None for baseline_model, baseline_data in baseline_lookup.items(): baseline_row = baseline_df[baseline_df["model"] == baseline_model].iloc[0] if (baseline_row.get("model_family", "") == model_family and baseline_row.get("model_size", "") == model_size): baseline_match = baseline_data break if baseline_match is not None: paired_data.append({ "model": method_model, "method_rate": method_rate, "baseline_rate": baseline_match["uncensored_rate"], "delta": method_rate - baseline_match["uncensored_rate"], "method_compliance": method_compliance, }) # Only add method if it has paired comparisons if len(paired_data) > 0: avg_delta = sum(p["delta"] for p in paired_data) / len(paired_data) avg_rate = sum(p["method_rate"] for p in paired_data) / len(paired_data) max_rate = max(p["method_rate"] for p in paired_data) min_rate = min(p["method_rate"] for p in paired_data) avg_compliance = sum(p["method_compliance"] for p in paired_data) / len(paired_data) # Best model is the one with highest delta best_pair = max(paired_data, key=lambda x: x["delta"]) best_model = best_pair["model"] description = METHOD_DESCRIPTIONS.get(method, method.replace("_", " ").title()) method_stats.append({ "method": method, "description": description, "num_models": len(method_df), "num_pairs": len(paired_data), "avg_uncensored_rate": avg_rate, "delta_from_baseline": avg_delta, "max_uncensored_rate": max_rate, "min_uncensored_rate": min_rate, "avg_compliance_score": avg_compliance, "best_model": best_model, }) return pd.DataFrame(method_stats), dynamic_method_colors # Column definitions for Models AG Grid MODEL_COLUMN_DEFS = [ { "field": "Rank", "headerName": "🏆", "width": 70, "pinned": "left", "sortable": True, }, { "field": "model", "headerName": "Model", "width": 300, "pinned": "left", "sortable": True, "filter": True, }, { "field": "model_family", "headerName": "Family", "width": 120, "sortable": True, "filter": True, }, { "field": "model_size", "headerName": "Size", "width": 80, "sortable": True, "filter": True, }, { "field": "method", "headerName": "Method", "width": 120, "sortable": True, "filter": True, }, { "field": "uncensored_rate", "headerName": "Uncensored Rate ⬆️", "width": 160, "sortable": True, "valueFormatter": {"function": "d3.format('.1%')(params.value)"}, }, { "field": "avg_compliance_score", "headerName": "Avg Compliance", "width": 140, "sortable": True, "valueFormatter": {"function": "d3.format('.3f')(params.value)"}, }, { "field": "total_prompts", "headerName": "Prompts", "width": 90, "sortable": True, }, { "field": "timestamp", "headerName": "Submitted", "width": 180, "sortable": True, }, { "field": "submitter", "headerName": "Submitter", "width": 130, "sortable": True, "filter": True, }, ] # Column definitions for Methods AG Grid (paired comparisons only) METHOD_COLUMN_DEFS = [ { "field": "method", "headerName": "Method", "width": 130, "pinned": "left", "sortable": True, }, { "field": "description", "headerName": "Description", "width": 180, "sortable": True, }, { "field": "num_pairs", "headerName": "# Pairs", "width": 80, "sortable": True, }, { "field": "delta_from_baseline", "headerName": "Δ vs Baseline ⬆️", "width": 140, "sortable": True, "valueFormatter": {"function": "params.value >= 0 ? '+' + d3.format('.1%')(params.value) : d3.format('.1%')(params.value)"}, "cellStyle": {"function": "params.value > 0 ? {'color': '#4CAF50', 'fontWeight': 'bold'} : params.value < 0 ? {'color': '#f44336'} : {}"}, }, { "field": "avg_uncensored_rate", "headerName": "Avg Rate", "width": 100, "sortable": True, "valueFormatter": {"function": "d3.format('.1%')(params.value)"}, }, { "field": "max_uncensored_rate", "headerName": "Best Rate", "width": 100, "sortable": True, "valueFormatter": {"function": "d3.format('.1%')(params.value)"}, }, { "field": "min_uncensored_rate", "headerName": "Worst Rate", "width": 100, "sortable": True, "valueFormatter": {"function": "d3.format('.1%')(params.value)"}, }, { "field": "avg_compliance_score", "headerName": "Avg Compliance", "width": 130, "sortable": True, "valueFormatter": {"function": "d3.format('.3f')(params.value)"}, }, { "field": "best_model", "headerName": "Best Model", "width": 260, "sortable": True, }, ] # Topic-specific columns (added dynamically if present) TOPIC_COLUMNS = [ "cybersecurity", "piracy", "weapons", "drugs", "fraud", "manipulation", "violence", "privacy_invasion", "illegal_activities", "academic_dishonesty", "gambling", "controversial_speech", "evasion", "self_harm", "adult_content" ] def get_model_column_defs(df): """Get column definitions based on available data.""" cols = MODEL_COLUMN_DEFS.copy() # Add topic columns if they exist in the data for topic in TOPIC_COLUMNS: if topic in df.columns: cols.append({ "field": topic, "headerName": topic.replace("_", " ").title(), "width": 130, "sortable": True, "valueFormatter": {"function": "d3.format('.1%')(params.value)"}, }) return cols # App layout app.layout = html.Div([ # Header html.Div([ html.H1("🦬 UncensorBench Leaderboard", style={"marginBottom": "5px"}), html.P( "Tracking LLM performance on censorship removal benchmarks", style={"color": "#666", "marginTop": "0"} ), ], style={"textAlign": "center", "padding": "20px"}), # Info banner html.Div([ html.Div([ html.Span("📊 ", style={"fontSize": "1.2em"}), html.A( "UncensorBench on PyPI", href="https://pypi.org/project/uncensorbench/", target="_blank", style={"marginRight": "20px"} ), html.Span("📓 ", style={"fontSize": "1.2em"}), html.A( "Run Benchmark Notebook", href="https://github.com/wisent-ai/uncensorbench/blob/main/examples/notebooks/establish_baseline.ipynb", target="_blank", style={"marginRight": "20px"} ), html.Span("🐙 ", style={"fontSize": "1.2em"}), html.A( "GitHub", href="https://github.com/wisent-ai/uncensorbench", target="_blank", ), ], style={"textAlign": "center", "padding": "10px"}) ], style={ "backgroundColor": "#f0f0f0", "borderRadius": "8px", "marginBottom": "20px", "marginLeft": "20px", "marginRight": "20px", }), # Stats summary html.Div(id="stats-summary", style={ "display": "flex", "justifyContent": "center", "gap": "40px", "marginBottom": "20px", }), # Tabs for Models and Methods views dcc.Tabs(id="view-tabs", value="models", children=[ dcc.Tab(label="📋 Models Leaderboard", value="models", style={"fontWeight": "bold"}), dcc.Tab(label="🔬 Methods Comparison", value="methods", style={"fontWeight": "bold"}), ], style={"marginLeft": "20px", "marginRight": "20px"}), # Tab content html.Div(id="tab-content", style={"padding": "20px"}), # Refresh interval dcc.Interval( id="refresh-interval", interval=60000, # Refresh every 60 seconds n_intervals=0 ), # Footer html.Div([ html.Hr(), html.P([ "UncensorBench measures how models respond to prompts that typically trigger refusal. ", html.Strong("Higher uncensored rate = more compliant responses. "), "This benchmark is for research purposes only." ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}), html.P([ "Powered by ", html.A("Wisent AI", href="https://wisent.ai", target="_blank"), " • ", html.A("Submit your model", href="https://github.com/wisent-ai/uncensorbench#how-to-submit", target="_blank"), ], style={"color": "#888", "fontSize": "0.9em", "textAlign": "center"}), ], style={"padding": "20px"}), ], style={"fontFamily": "system-ui, -apple-system, sans-serif"}) @callback( Output("stats-summary", "children"), Input("refresh-interval", "n_intervals") ) def update_stats(n): """Update the stats summary.""" df = load_data() if len(df) > 0: # Calculate method stats for the summary baseline_df = df[df["method"] == "none"] baseline_avg = baseline_df["uncensored_rate"].mean() if len(baseline_df) > 0 else 0 # Find best non-baseline method non_baseline = df[df["method"] != "none"] best_method_avg = 0 best_method = "N/A" if len(non_baseline) > 0: method_avgs = non_baseline.groupby("method")["uncensored_rate"].mean() if len(method_avgs) > 0: best_method = method_avgs.idxmax() best_method_avg = method_avgs.max() best_delta = best_method_avg - baseline_avg if best_method_avg > 0 else 0 stats = [ html.Div([ html.Div(str(len(df)), style={"fontSize": "2em", "fontWeight": "bold", "color": "#2196F3"}), html.Div("Models", style={"color": "#666"}), ], style={"textAlign": "center"}), html.Div([ html.Div(f"{baseline_avg:.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#9E9E9E"}), html.Div("Baseline Avg", style={"color": "#666"}), ], style={"textAlign": "center"}), html.Div([ html.Div(f"{df['uncensored_rate'].max():.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#FF9800"}), html.Div("Best Rate", style={"color": "#666"}), ], style={"textAlign": "center"}), html.Div([ html.Div( f"+{best_delta:.1%}" if best_delta > 0 else f"{best_delta:.1%}", style={"fontSize": "2em", "fontWeight": "bold", "color": "#4CAF50" if best_delta > 0 else "#f44336"} ), html.Div(f"Best Method Δ ({best_method})", style={"color": "#666"}), ], style={"textAlign": "center"}), ] else: stats = [ html.Div([ html.Div("0", style={"fontSize": "2em", "fontWeight": "bold", "color": "#2196F3"}), html.Div("Models", style={"color": "#666"}), ], style={"textAlign": "center"}), html.Div([ html.P("No submissions yet. Be the first to submit!", style={"color": "#666"}), ], style={"textAlign": "center"}), ] return stats @callback( Output("tab-content", "children"), [Input("view-tabs", "value"), Input("refresh-interval", "n_intervals")] ) def render_tab_content(tab, n): """Render content based on selected tab.""" df = load_data() if tab == "models": # Models leaderboard view col_defs = get_model_column_defs(df) row_data = df.to_dict("records") if len(df) > 0 else [] # Build responses links section responses_links = [] if len(df) > 0: for _, row in df.iterrows(): url = row.get("sample_responses_url") if pd.notna(url) and url: model = row.get("model", "Unknown") responses_links.append( html.Li([ html.Strong(model), html.Span(": "), html.Code(url, style={"fontSize": "0.85em", "wordBreak": "break-all"}), ], style={"marginBottom": "5px"}) ) return html.Div([ dag.AgGrid( id="leaderboard-grid", columnDefs=col_defs, rowData=row_data, defaultColDef={ "resizable": True, "sortable": True, }, dashGridOptions={ "pagination": True, "paginationPageSize": 50, "animateRows": True, "rowSelection": "single", }, style={"height": "600px"}, className="ag-theme-alpine", ), # Sample responses section html.Div([ html.H4("📄 Sample Responses", style={"marginTop": "20px", "marginBottom": "10px"}), html.P("Copy and paste these URLs to view detailed model responses:", style={"color": "#666", "fontSize": "0.9em"}), html.Ul(responses_links) if responses_links else html.P("No sample responses available yet.", style={"color": "#999"}), ], style={ "backgroundColor": "#f9f9f9", "padding": "15px", "borderRadius": "8px", "marginTop": "20px", }) if responses_links else None, ]) elif tab == "methods": # Methods comparison view method_df, method_colors = calculate_method_stats(df) row_data = method_df.to_dict("records") if len(method_df) > 0 else [] # Sort by delta from baseline descending if len(method_df) > 0: method_df = method_df.sort_values("delta_from_baseline", ascending=False) row_data = method_df.to_dict("records") # Build method legend from actual data method_legend_items = [] for _, row in method_df.iterrows(): method = row["method"] desc = row["description"] color = method_colors.get(method, "#666") method_legend_items.append( html.Div([ html.Span( f"● {method}", style={"color": color, "fontWeight": "bold", "marginRight": "10px"} ), html.Span(desc, style={"color": "#666"}), ], style={"marginBottom": "8px"}) ) return html.Div([ # Method comparison description html.Div([ html.P([ "Compare censorship removal methods using ", html.Strong("paired comparisons only"), ". Delta (Δ) is calculated by comparing the ", html.Strong("same base model"), " with and without each method applied." ], style={"color": "#666", "marginBottom": "5px"}), html.P([ "Methods are only shown if they have at least one paired comparison ", "(matching model_family + model_size with a baseline 'none' submission)." ], style={"color": "#666", "fontSize": "0.9em", "marginBottom": "15px"}), ]), # Methods grid dag.AgGrid( id="methods-grid", columnDefs=METHOD_COLUMN_DEFS, rowData=row_data, defaultColDef={ "resizable": True, "sortable": True, }, dashGridOptions={ "animateRows": True, "rowSelection": "single", }, style={"height": "400px"}, className="ag-theme-alpine", ), # Method legend - dynamically built from actual data html.Div([ html.H4("Method Definitions", style={"marginTop": "30px", "marginBottom": "15px"}), html.Div( method_legend_items if method_legend_items else [html.P("No methods submitted yet.", style={"color": "#666"})], style={"columns": "2", "columnGap": "40px"} if len(method_legend_items) > 3 else {} ), ], style={ "backgroundColor": "#f9f9f9", "padding": "20px", "borderRadius": "8px", "marginTop": "20px", }), ]) return html.Div("Select a tab") if __name__ == "__main__": app.run_server(debug=True, host="0.0.0.0", port=7860)