"""
| Abliteration Methods Dashboard | |
| Visualizes results from "Comparative Analysis of LLM Abliteration Methods: | |
| A Cross-Architecture Evaluation" (arxiv:2512.13655) by Richard J. Young. | |
| """ | |
| import json | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| # --------------------------------------------------------------------------- | |
| # Theme & CSS | |
| # --------------------------------------------------------------------------- | |
# Gradio theme: red accent (matches the #e94560 used throughout the CSS) on a
# slate-grey base, rendered in the Inter font.
THEME = gr.themes.Base(
    primary_hue="red",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
)
| CSS = """ | |
| .header-banner { | |
| background: linear-gradient(135deg, #0a0a1a 0%, #1a1a2e 30%, #0f3460 70%, #16213e 100%); | |
| padding: 32px 40px; | |
| border-radius: 16px; | |
| margin-bottom: 20px; | |
| border: 1px solid rgba(233, 69, 96, 0.15); | |
| box-shadow: 0 4px 24px rgba(0, 0, 0, 0.3), 0 0 40px rgba(233, 69, 96, 0.05); | |
| } | |
| .community-table table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| font-size: 0.9em; | |
| } | |
| .community-table th { | |
| background: #1a1a2e; | |
| color: #e94560; | |
| padding: 10px 12px; | |
| text-align: left; | |
| border-bottom: 2px solid #333; | |
| } | |
| .community-table td { | |
| padding: 8px 12px; | |
| border-bottom: 1px solid #2a2a2a; | |
| color: #c4c4c4; | |
| } | |
| .community-table tr:hover { | |
| background: #16213e; | |
| } | |
| .community-table a { | |
| color: #53a8b6; | |
| text-decoration: none; | |
| } | |
| .community-table a:hover { | |
| text-decoration: underline; | |
| } | |
| .header-banner h1 { | |
| color: #e94560 !important; | |
| margin-bottom: 8px !important; | |
| font-size: 2.2em !important; | |
| font-weight: 800 !important; | |
| letter-spacing: -0.02em; | |
| } | |
| .header-banner p { | |
| color: #a0a0b0 !important; | |
| line-height: 1.6 !important; | |
| } | |
| .header-banner a { | |
| color: #53a8b6 !important; | |
| text-decoration: none; | |
| border-bottom: 1px solid rgba(83, 168, 182, 0.3); | |
| } | |
| .header-banner a:hover { | |
| border-bottom-color: #53a8b6; | |
| } | |
| .stat-box { | |
| background: linear-gradient(180deg, #1a1a2e 0%, #141428 100%); | |
| border: 1px solid #2a2a3e; | |
| border-radius: 12px; | |
| padding: 20px 16px; | |
| text-align: center; | |
| transition: border-color 0.2s; | |
| } | |
| .stat-box:hover { | |
| border-color: #e94560; | |
| } | |
| .stat-box h3 { | |
| color: #e94560 !important; | |
| font-size: 2em !important; | |
| margin: 0 !important; | |
| } | |
| .stat-box p { | |
| color: #aaa !important; | |
| margin: 4px 0 0 0 !important; | |
| font-size: 0.9em !important; | |
| } | |
| .finding-box { | |
| background: #0f3460; | |
| border-left: 4px solid #e94560; | |
| padding: 12px 16px; | |
| border-radius: 0 8px 8px 0; | |
| margin: 12px 0; | |
| } | |
| .finding-box p { | |
| color: #ddd !important; | |
| margin: 0 !important; | |
| } | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # Data (loaded from JSON files in data/ directory) | |
| # --------------------------------------------------------------------------- | |
# Directory holding the JSON data files bundled alongside this script.
DATA_DIR = Path(__file__).parent / "data"


def _load_json(name: str):
    """Load and parse one JSON data file from ``DATA_DIR``.

    Args:
        name: File name relative to ``DATA_DIR`` (e.g. ``"leaderboard.json"``).

    Returns:
        The deserialized payload (list or dict, depending on the file).
    """
    # Explicit encoding: JSON is UTF-8 by spec (RFC 8259). Relying on the
    # platform's default locale encoding breaks on e.g. Windows cp1252.
    with open(DATA_DIR / name, encoding="utf-8") as f:
        return json.load(f)
# Data tables, loaded once at import time from the bundled JSON files.
LEADERBOARD_ROWS = _load_json("leaderboard.json")  # main abliteration results
CAPABILITY_DATA = _load_json("capabilities.json")  # per-model benchmark scores by variant
MEAN_CAPABILITY_CHANGE = _load_json("mean_capability_change.json")  # avg delta per tool
_compat = _load_json("compatibility.json")
COMPATIBILITY_ROWS = _compat["rows"]  # tool-by-model support matrix
COVERAGE_TOTALS = _compat["totals"]  # tool -> "supported/total" summary strings
COMMUNITY_MODELS = _load_json("community_models.json")  # user-submitted models (some pending)
V1_VS_V2 = _load_json("v1_vs_v2.json")  # paired ASR results across study versions
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
# Headline statistics derived from the leaderboard, shown in the stat boxes.
AVG_ASR = round(
    sum(r["ASR (%)"] for r in LEADERBOARD_ROWS) / len(LEADERBOARD_ROWS), 1
)
KL_MIN = min(r["KL Divergence"] for r in LEADERBOARD_ROWS)
KL_MAX = max(r["KL Divergence"] for r in LEADERBOARD_ROWS)
def stat_box(value: str, label: str) -> str:
    """Render a small statistic card (big value over a caption) as HTML."""
    return f'<div class="stat-box"><h3>{value}</h3><p>{label}</p></div>'
def finding_box(text: str) -> str:
    """Wrap *text* (may contain HTML) in the highlighted 'finding-box' callout."""
    template = '<div class="finding-box"><p>{}</p></div>'
    return template.format(text)
def _color_cell(val: str) -> str:
    """Return inline CSS for a compatibility-matrix cell; '' for unknown values."""
    green = "background-color: #1b4332; color: #95d5b2;"
    yellow = "background-color: #3a3000; color: #ffd60a;"
    red = "background-color: #461220; color: #f5a0a0;"
    grey = "background-color: #2a2a2a; color: #888;"
    # "No" and "Incompatible" deliberately share the same red styling.
    styles = {
        "Yes": green,
        "Partial": yellow,
        "No": red,
        "Incompatible": red,
        "N/A": grey,
    }
    return styles.get(val, "")
| # --------------------------------------------------------------------------- | |
| # Chart builders | |
| # --------------------------------------------------------------------------- | |
def build_asr_chart(sort_by: str = "ASR (%)") -> go.Figure:
    """Bar chart of attack success rate per leaderboard model.

    Sorting by "ASR (%)" is descending (strongest result first); any other
    column sorts ascending (e.g. alphabetically by model name).
    """
    frame = pd.DataFrame(LEADERBOARD_ROWS).sort_values(
        sort_by, ascending=(sort_by != "ASR (%)")
    )
    fig = px.bar(
        frame,
        x="Model",
        y="ASR (%)",
        color="ASR (%)",
        color_continuous_scale=["#461220", "#e94560", "#53a8b6"],
        text="ASR (%)",
    )
    fig.update_traces(textposition="outside", textfont_size=13)
    fig.update_xaxes(tickangle=-30)
    fig.update_layout(
        title="Attack Success Rate by Model (Heretic)",
        xaxis_title="Model",
        yaxis_title="ASR (%)",
        # Headroom above 100 so outside-positioned bar labels are not clipped.
        yaxis_range=[0, 110],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        coloraxis_showscale=False,
        margin=dict(t=50, b=60),
    )
    return fig
def build_kl_vs_refusals_chart() -> go.Figure:
    """Scatter of KL divergence vs remaining refusals across leaderboard rows.

    Marker size tracks the refusal count and color tracks ASR; the title
    reports the correlation claimed in the paper.
    """
    frame = pd.DataFrame(LEADERBOARD_ROWS)
    fig = px.scatter(
        frame,
        x="KL Divergence",
        y="Refusals (n=100)",
        text="Model",
        size="Refusals (n=100)",
        color="ASR (%)",
        color_continuous_scale=["#461220", "#e94560", "#53a8b6"],
    )
    fig.update_traces(textposition="top center", textfont_size=10)
    layout = dict(
        title="KL Divergence vs Remaining Refusals (Pearson r=0.87, p=0.005)",
        xaxis_title="KL Divergence",
        yaxis_title="Refusals (n=100)",
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        margin=dict(t=50, b=40),
    )
    fig.update_layout(**layout)
    return fig
def build_capability_chart(model_name: str) -> go.Figure:
    """Grouped bar chart of benchmark scores for every variant of one model.

    Args:
        model_name: Key into CAPABILITY_DATA; unknown names yield an empty
            placeholder figure titled "No data available".
    """
    rows = CAPABILITY_DATA.get(model_name, [])
    if not rows:
        placeholder = go.Figure()
        placeholder.update_layout(
            title="No data available",
            plot_bgcolor="#0e1117",
            paper_bgcolor="#0e1117",
            font_color="#c4c4c4",
        )
        return placeholder

    benchmarks = ["MMLU", "GSM8K", "HellaSwag"]
    # One fixed color per variant so the same tool looks identical across models.
    variant_colors = {
        "Base": "#888888",
        "Heretic": "#e94560",
        "OBLITERATUS": "#ff6b35",
        "jwest33": "#7b68ee",
        "DECCP": "#53a8b6",
        "ErisForge": "#ffd60a",
    }
    fig = go.Figure()
    for record in pd.DataFrame(rows).to_dict("records"):
        variant = record["Variant"]
        scores = [record[b] for b in benchmarks]
        fig.add_trace(go.Bar(
            name=variant,
            x=benchmarks,
            y=scores,
            marker_color=variant_colors.get(variant, "#aaa"),
            text=[f"{s:.2f}" for s in scores],
            textposition="outside",
            textfont_size=11,
        ))
    fig.update_layout(
        barmode="group",
        title=f"Benchmark Scores \u2014 {model_name}",
        xaxis_title="Benchmark",
        yaxis_title="Score",
        yaxis_range=[0, 100],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(t=70, b=40),
    )
    return fig
def build_mean_delta_chart() -> go.Figure:
    """Grouped bars of mean benchmark change (percentage points) per tool."""
    delta_cols = [
        "Avg MMLU \u0394 (pp)",
        "Avg GSM8K \u0394 (pp)",
        "Avg HellaSwag \u0394 (pp)",
    ]
    tool_palette = {
        "Heretic": "#e94560",
        "OBLITERATUS": "#ff6b35",
        "jwest33": "#7b68ee",
        "DECCP": "#53a8b6",
        "ErisForge": "#ffd60a",
    }
    fig = go.Figure()
    for record in pd.DataFrame(MEAN_CAPABILITY_CHANGE).to_dict("records"):
        tool = record["Tool"]
        deltas = [record[c] for c in delta_cols]
        fig.add_trace(go.Bar(
            name=tool,
            x=["MMLU", "GSM8K", "HellaSwag"],
            y=deltas,
            marker_color=tool_palette.get(tool, "#aaa"),
            text=[f"{d:+.2f}" for d in deltas],  # signed labels, e.g. "+1.51"
            textposition="outside",
            textfont_size=11,
        ))
    fig.update_layout(
        barmode="group",
        title="Mean Capability Change by Tool (pp)",
        xaxis_title="Benchmark",
        yaxis_title="Change (percentage points)",
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(t=70, b=40),
    )
    return fig
def build_coverage_chart() -> go.Figure:
    """Bar chart summarising how many models each tool supports.

    COVERAGE_TOTALS maps tool name -> "supported/total" strings (e.g. "22/26");
    non-numeric or empty values count as zero coverage.
    """
    tools = list(COVERAGE_TOTALS.keys())
    totals_text = list(COVERAGE_TOTALS.values())

    def _supported(total: str) -> int:
        # "22/26" -> 22. Checking the whole numerator (not just total[0])
        # also avoids an IndexError on an empty string.
        head = total.split("/")[0]
        return int(head) if head.isdigit() else 0

    counts = [_supported(v) for v in totals_text]
    color_map = {
        "Heretic": "#e94560",
        "OBLITERATUS": "#ff6b35",
        "jwest33": "#7b68ee",
        "DECCP": "#53a8b6",
        "ErisForge": "#ffd60a",
    }
    colors = [color_map.get(t, "#95d5b2") for t in tools]
    fig = go.Figure(go.Bar(
        x=tools,
        y=counts,
        marker_color=colors,
        text=totals_text,
        textposition="outside",
        textfont_size=13,
    ))
    # Scale the y-axis to the data: the old fixed [0, 20] range clipped bars
    # once a tool supported more than 20 models (e.g. Heretic at 22/26). The
    # old title also hard-coded "out of 16", contradicting the "/26" totals.
    top = max(counts) if counts else 0
    fig.update_layout(
        title="Tool Compatibility (models supported)",
        xaxis_title="Tool",
        yaxis_title="Models Supported",
        yaxis_range=[0, max(20, int(top * 1.25) + 1)],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        margin=dict(t=50, b=40),
    )
    return fig
| # --------------------------------------------------------------------------- | |
| # Community models chart | |
| # --------------------------------------------------------------------------- | |
def build_community_chart() -> go.Figure:
    """Bar chart of ASR for evaluated community models, colored by method.

    Models without an ASR value (still awaiting evaluation) are excluded;
    if none have been evaluated yet, an empty placeholder figure is returned.
    """
    # NOTE: the previous version also built an unused `pending` list here;
    # pending models are only ever counted in the Community Models tab.
    evaluated = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None]
    if not evaluated:
        fig = go.Figure()
        fig.update_layout(
            title="No evaluated community models yet",
            plot_bgcolor="#0e1117",
            paper_bgcolor="#0e1117",
            font_color="#c4c4c4",
        )
        return fig
    df = pd.DataFrame(evaluated).sort_values("ASR (%)", ascending=False)
    fig = px.bar(
        df,
        x="Model",
        y="ASR (%)",
        color="Method",
        text="ASR (%)",
        color_discrete_map={
            "Orthogonal Reflection": "#95d5b2",
            "Heretic": "#e94560",
            "Abliteration": "#53a8b6",
            "Unknown": "#888",
        },
    )
    fig.update_traces(textposition="outside", textfont_size=13)
    fig.update_layout(
        title="Community Abliterated Models - Attack Success Rate",
        xaxis_title="Model",
        yaxis_title="ASR (%)",
        # Headroom above 100 so outside-positioned labels are not clipped.
        yaxis_range=[0, 110],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        margin=dict(t=50, b=60),
    )
    fig.update_xaxes(tickangle=-30)
    return fig
def _community_display_rows(rows):
    """Return copies of *rows* with Model/Creator rendered as HF hyperlinks.

    The "HF Link" column is consumed (popped) and folded into the Model cell;
    the input rows themselves are never mutated.
    """
    linked = []
    for source in rows:
        entry = dict(source)
        repo = entry.pop("HF Link", "")
        if repo:
            label = entry.get("Model", repo)
            entry["Model"] = (
                f'<a href="https://huggingface.co/{repo}" target="_blank">{label}</a>'
            )
        author = entry.get("Creator", "")
        if author:
            entry["Creator"] = (
                f'<a href="https://huggingface.co/{author}" target="_blank">{author}</a>'
            )
        linked.append(entry)
    return linked
def filter_community(status_filter: str) -> str:
    """Render the community-model table as HTML, filtered by evaluation status.

    Args:
        status_filter: "Evaluated", "Pending", or anything else for all rows.
    """
    def _is_evaluated(model) -> bool:
        return model.get("ASR (%)") is not None

    if status_filter == "Evaluated":
        rows = [m for m in COMMUNITY_MODELS if _is_evaluated(m)]
    elif status_filter == "Pending":
        rows = [m for m in COMMUNITY_MODELS if not _is_evaluated(m)]
    else:
        rows = COMMUNITY_MODELS
    table = pd.DataFrame(_community_display_rows(rows)).to_html(escape=False, index=False)
    return f'<div class="community-table">{table}</div>'
| # --------------------------------------------------------------------------- | |
| # v1 vs v2 chart | |
| # --------------------------------------------------------------------------- | |
def build_method_comparison(base_model: str) -> go.Figure:
    """Compare the ASR of every method/tool tested on a given base model.

    Merges the main leaderboard with evaluated community submissions and
    matches *base_model* by case-insensitive substring. Leaderboard bars are
    red, community bars green.
    """
    # Combine leaderboard + community data into one flat list of results.
    all_results = []
    for r in LEADERBOARD_ROWS:
        all_results.append({
            "Model": r["Model"],
            "Tool": r["Tool"],
            "ASR (%)": r["ASR (%)"],
            "KL Divergence": r.get("KL Divergence", 0),
            "Source": "Leaderboard",
        })
    for r in COMMUNITY_MODELS:
        if r.get("ASR (%)") is not None:
            # Community rows are user-submitted: tolerate missing keys with
            # .get defaults (the rest of this module does the same) instead
            # of raising KeyError on an incomplete submission.
            all_results.append({
                "Model": r.get("Base Model", r.get("Model", "")),
                "Tool": f"{r.get('Creator', '?')}/{r.get('Method', 'Unknown')}",
                "ASR (%)": r["ASR (%)"],
                "KL Divergence": r.get("KL Divergence", 0),
                "Source": "Community",
            })
    # Filter to the selected base model (fuzzy substring match).
    matched = [r for r in all_results if base_model.lower() in r["Model"].lower()]
    if not matched:
        fig = go.Figure()
        fig.update_layout(
            title=f"No results found for '{base_model}'",
            plot_bgcolor="#0e1117", paper_bgcolor="#0e1117", font_color="#c4c4c4",
        )
        return fig
    df = pd.DataFrame(matched).sort_values("ASR (%)", ascending=False)
    colors = ["#e94560" if s == "Leaderboard" else "#95d5b2" for s in df["Source"]]
    fig = go.Figure(go.Bar(
        x=df["Tool"],
        y=df["ASR (%)"],
        marker_color=colors,
        text=df["ASR (%)"].apply(lambda x: f"{x:.0f}%"),
        textposition="outside",
        textfont_size=12,
        hovertext=df.apply(lambda r: f"KL: {r['KL Divergence']:.4f}", axis=1),
    ))
    fig.update_layout(
        title=f"All Methods Tested on {base_model}",
        xaxis_title="Tool / Method",
        yaxis_title="ASR (%)",
        # Deliberately capped at 100: ASR is a percentage.
        yaxis_range=[0, 100],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        margin=dict(t=50, b=80),
    )
    fig.update_xaxes(tickangle=-30)
    return fig
def build_efficiency_scatter() -> go.Figure:
    """ASR vs KL scatter: which abliterations got the best result for the
    least damage.

    Combines leaderboard rows ("Our Tools") with community models that have
    both ASR and KL recorded, then shades the plane into quality quadrants
    (KL threshold 1.0, ASR threshold 50%).
    """
    all_results = []
    for r in LEADERBOARD_ROWS:
        all_results.append({
            "Label": f"{r['Model']} ({r['Tool']})",
            "ASR (%)": r["ASR (%)"],
            "KL": r.get("KL Divergence", 0),
            "Source": "Our Tools",
        })
    for r in COMMUNITY_MODELS:
        if r.get("ASR (%)") is not None and r.get("KL Divergence") is not None:
            all_results.append({
                "Label": f"{r['Model']} ({r['Creator']})",
                "ASR (%)": r["ASR (%)"],
                "KL": r.get("KL Divergence", 0),
                "Source": "Community",
            })
    df = pd.DataFrame(all_results)
    fig = go.Figure()
    if df.empty:
        # No plottable data at all: return a bare, styled placeholder rather
        # than crashing on df["KL"].max() below.
        fig.update_layout(
            title="Abliteration Efficiency: ASR vs KL Divergence",
            plot_bgcolor="#0e1117", paper_bgcolor="#0e1117", font_color="#c4c4c4",
        )
        return fig
    for source, color, symbol in [("Our Tools", "#e94560", "circle"), ("Community", "#95d5b2", "diamond")]:
        subset = df[df["Source"] == source]
        if subset.empty:
            continue
        fig.add_trace(go.Scatter(
            x=subset["KL"],
            y=subset["ASR (%)"],
            mode="markers",
            name=source,
            marker=dict(
                color=color,
                size=10,
                symbol=symbol,
                line=dict(width=1, color="#222"),
            ),
            text=subset["Label"],
            hovertemplate="<b>%{text}</b><br>ASR: %{y:.0f}%<br>KL: %{x:.4f}<extra></extra>",
        ))
    # Single x extent shared by the axis range, the shading, and the guide
    # lines. Previously the shapes were hard-coded to x=12 while the axis
    # range was data-driven, so shading could stop short of (or overshoot)
    # the visible plot edge.
    x_max = max(df["KL"].max() * 1.1, 2)
    # Quadrant shading (KL threshold 1.0, ASR threshold 50).
    fig.add_shape(type="rect", x0=0, x1=1.0, y0=50, y1=100,
                  fillcolor="rgba(149,213,178,0.06)", line=dict(width=0))
    fig.add_shape(type="rect", x0=1.0, x1=x_max, y0=50, y1=100,
                  fillcolor="rgba(255,214,10,0.04)", line=dict(width=0))
    fig.add_shape(type="rect", x0=0, x1=x_max, y0=0, y1=50,
                  fillcolor="rgba(233,69,96,0.04)", line=dict(width=0))
    # Quadrant guide lines.
    fig.add_shape(type="line", x0=1.0, x1=1.0, y0=0, y1=100,
                  line=dict(color="#333", width=1, dash="dot"))
    fig.add_shape(type="line", x0=0, x1=x_max, y0=50, y1=50,
                  line=dict(color="#333", width=1, dash="dot"))
    # Quadrant labels; the right-hand labels sit mid-way through the
    # high-KL region instead of at a fixed x=6.
    mid_right = (1.0 + x_max) / 2
    fig.add_annotation(x=0.3, y=97, text="Best: High ASR, Low KL",
                       showarrow=False, font=dict(color="#95d5b2", size=9))
    fig.add_annotation(x=mid_right, y=97, text="Effective but Damaged",
                       showarrow=False, font=dict(color="#ffd60a", size=9))
    fig.add_annotation(x=mid_right, y=5, text="Failed",
                       showarrow=False, font=dict(color="#666", size=9))
    fig.update_layout(
        title="Abliteration Efficiency: ASR vs KL Divergence",
        xaxis_title="KL Divergence (lower = less damage to model)",
        yaxis_title="ASR % (higher = more refusals removed)",
        yaxis_range=[0, 102],
        xaxis_range=[-0.2, x_max],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(t=70, b=40),
        hoverlabel=dict(bgcolor="#1a1a2e", font_size=12),
    )
    return fig
def build_v1_v2_chart() -> go.Figure:
    """Grouped bars comparing per-model ASR between the v1 and v2 studies."""
    df = pd.DataFrame(V1_VS_V2)

    def _label(value) -> str:
        # Blank text for models missing from one of the two evaluations.
        return f"{value:.0f}%" if pd.notna(value) else ""

    series = [
        ("v1 (Dec 2025)", "v1 ASR (%)", "#53a8b6"),
        ("v2 (Mar 2026)", "v2 ASR (%)", "#e94560"),
    ]
    fig = go.Figure()
    for name, column, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=df["Model"],
            y=df[column],
            marker_color=color,
            text=df[column].apply(_label),
            textposition="outside",
            textfont_size=11,
        ))
    fig.update_layout(
        barmode="group",
        title="Abliteration Effectiveness: v1 (2025) vs v2 (2026)",
        xaxis_title="Model",
        yaxis_title="ASR (%)",
        yaxis_range=[0, 110],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(t=70, b=80),
    )
    fig.update_xaxes(tickangle=-30)
    return fig
| # --------------------------------------------------------------------------- | |
| # Filter callbacks | |
| # --------------------------------------------------------------------------- | |
def filter_leaderboard(search: str, min_asr: float, max_kl: float) -> pd.DataFrame:
    """Filter the leaderboard by name substring, minimum ASR, and maximum KL.

    An empty *search* matches every model; matching is case-insensitive.
    """
    needle = search.lower()

    def _keep(row) -> bool:
        if row["ASR (%)"] < min_asr or row["KL Divergence"] > max_kl:
            return False
        return not needle or needle in row["Model"].lower()

    return pd.DataFrame([r for r in LEADERBOARD_ROWS if _keep(r)])
def filter_compatibility(tool_filter: str) -> pd.DataFrame:
    """Return the compatibility matrix, optionally restricted to models a
    given tool fully supports ("Yes" cells only)."""
    df = pd.DataFrame(COMPATIBILITY_ROWS)
    if not tool_filter or tool_filter == "All":
        return df
    return df[df[tool_filter] == "Yes"]
| # --------------------------------------------------------------------------- | |
| # App | |
| # --------------------------------------------------------------------------- | |
| def build_app() -> gr.Blocks: | |
| with gr.Blocks(theme=THEME, css=CSS, title="Abliteration Methods Dashboard") as app: | |
| # -- Header -- | |
| gr.HTML( | |
| '<div class="header-banner">' | |
| "<h1>Abliteration Methods Dashboard</h1>" | |
| "<p>Interactive visualisation of results from " | |
| '<a href="https://arxiv.org/abs/2512.13655" target="_blank">' | |
| "Comparative Analysis of LLM Abliteration Methods: " | |
| "A Cross-Architecture Evaluation</a> " | |
| "(arXiv: 2512.13655) by Richard J. Young</p>" | |
| f"<p style='font-size:0.8em; color:#666;'>Last updated: March 2026 " | |
| f"| {len(LEADERBOARD_ROWS)} leaderboard entries " | |
| f"| {len(COMMUNITY_MODELS)} community models tracked</p>" | |
| "</div>" | |
| ) | |
| # ================================================================ | |
| # TAB 1 \u2014 Leaderboard | |
| # ================================================================ | |
| with gr.Tab("Abliteration Leaderboard"): | |
| with gr.Row(): | |
| gr.HTML(stat_box(str(len(LEADERBOARD_ROWS)), "Results")) | |
| gr.HTML(stat_box("5", "Abliteration Tools")) | |
| gr.HTML(stat_box(f"{AVG_ASR}%", "Avg ASR")) | |
| gr.HTML(stat_box( | |
| f"{KL_MIN:.3f}--{KL_MAX:.3f}", "KL Divergence Range" | |
| )) | |
| gr.HTML(finding_box( | |
| "<strong>Key Finding:</strong> KL divergence and remaining " | |
| "refusals are strongly correlated (Pearson r=0.87, p=0.005)." | |
| )) | |
| with gr.Accordion("What do these metrics mean?", open=False): | |
| gr.Markdown( | |
| "- **ASR (Attack Success Rate)** - Percentage of harmful prompts the model responded to " | |
| "instead of refusing. Higher = more abliterated. 98% means nearly all refusals removed.\n" | |
| "- **KL Divergence** - How much the model changed from the original. Lower = less capability damage. " | |
| "Under 0.1 is excellent, over 1.0 means significant degradation.\n" | |
| "- **Refusals (n=100)** - Out of 100 harmful prompts, how many still refused. Lower = more abliterated.\n" | |
| "- **The ideal result** has high ASR and low KL." | |
| ) | |
| with gr.Row(): | |
| model_search = gr.Textbox( | |
| value="", | |
| label="Search model name", | |
| placeholder="e.g. Gemma, Llama, Qwen...", | |
| ) | |
| min_asr_slider = gr.Slider( | |
| minimum=0, maximum=100, value=0, step=1, | |
| label="Minimum ASR (%)", | |
| ) | |
| max_kl_slider = gr.Slider( | |
| minimum=0.0, maximum=12.0, value=12.0, step=0.01, | |
| label="Maximum KL Divergence", | |
| ) | |
| leaderboard_df = gr.Dataframe( | |
| value=pd.DataFrame(LEADERBOARD_ROWS), | |
| label="Abliteration Results (ranked by ASR)", | |
| interactive=False, | |
| ) | |
| _lb_inputs = [model_search, min_asr_slider, max_kl_slider] | |
| model_search.change(filter_leaderboard, inputs=_lb_inputs, outputs=leaderboard_df) | |
| min_asr_slider.change(filter_leaderboard, inputs=_lb_inputs, outputs=leaderboard_df) | |
| max_kl_slider.change(filter_leaderboard, inputs=_lb_inputs, outputs=leaderboard_df) | |
| with gr.Row(): | |
| asr_chart = gr.Plot( | |
| value=build_asr_chart(), label="ASR by Model" | |
| ) | |
| kl_chart = gr.Plot( | |
| value=build_kl_vs_refusals_chart(), | |
| label="KL Divergence vs Refusals", | |
| ) | |
| gr.Markdown("### Efficiency Quadrant") | |
| gr.Markdown( | |
| "Where does each abliteration land? Top-left is ideal (high ASR, low KL). " | |
| "Hover over points for details." | |
| ) | |
| gr.Plot(value=build_efficiency_scatter(), label="ASR vs KL") | |
| gr.Markdown("### Compare Methods on a Base Model") | |
| base_model_picker = gr.Dropdown( | |
| choices=sorted(set( | |
| [r["Model"].replace(" (v2)", "") for r in COMPATIBILITY_ROWS] | |
| + [m.get("Base Model", "") for m in COMMUNITY_MODELS if m.get("Base Model")] | |
| )), | |
| value="Gemma-3-12B-it", | |
| label="Select base model", | |
| ) | |
| method_chart = gr.Plot( | |
| value=build_method_comparison("Gemma-3-12B-it"), | |
| label="Method Comparison", | |
| ) | |
| base_model_picker.change( | |
| build_method_comparison, | |
| inputs=[base_model_picker], | |
| outputs=method_chart, | |
| ) | |
| gr.HTML( | |
| '<div class="finding-box">' | |
| "<p><strong>Want your model tested?</strong> " | |
| '<a href="https://huggingface.co/spaces/richardyoung/' | |
| 'abliteration-methods-dashboard/discussions" ' | |
| 'target="_blank" style="color: #53a8b6;">' | |
| "Open a discussion on this Space</a> with the HF model " | |
| "link and which abliteration tool you want evaluated. " | |
| "Results will be added to the dashboard.</p></div>" | |
| ) | |
| # ================================================================ | |
| # TAB 2 \u2014 Capability Preservation | |
| # ================================================================ | |
| with gr.Tab("Capability Preservation"): | |
| with gr.Row(): | |
| gr.HTML(stat_box("3", "Benchmarks")) | |
| gr.HTML(stat_box("4", "Models Evaluated")) | |
| gr.HTML(stat_box("3", "Tools Compared")) | |
| gr.HTML(stat_box("-0.08 pp", "Best HellaSwag \u0394 (ErisForge)")) | |
| gr.HTML(finding_box( | |
| "<strong>Key Finding:</strong> ErisForge exhibited the lowest " | |
| "average benchmark degradation. Mathematical reasoning (GSM8K) " | |
| "shows highest sensitivity, ranging from +1.51 pp to " | |
| "\u221218.81 pp." | |
| )) | |
| gr.Markdown("### Per-Model Benchmark Comparison") | |
| model_dropdown = gr.Dropdown( | |
| choices=list(CAPABILITY_DATA.keys()), | |
| value="DeepSeek-7B", | |
| label="Select Model", | |
| ) | |
| cap_chart = gr.Plot( | |
| value=build_capability_chart("DeepSeek-7B"), | |
| label="Benchmarks", | |
| ) | |
| model_dropdown.change( | |
| build_capability_chart, | |
| inputs=[model_dropdown], | |
| outputs=cap_chart, | |
| ) | |
| cap_table = gr.Dataframe( | |
| value=pd.DataFrame(CAPABILITY_DATA["DeepSeek-7B"]), | |
| label="Benchmark Scores", | |
| interactive=False, | |
| ) | |
| def _update_cap_table(model_name: str) -> pd.DataFrame: | |
| return pd.DataFrame( | |
| CAPABILITY_DATA.get(model_name, []) | |
| ) | |
| model_dropdown.change( | |
| _update_cap_table, | |
| inputs=[model_dropdown], | |
| outputs=cap_table, | |
| ) | |
| gr.Markdown("### Mean Capability Change by Tool") | |
| gr.Dataframe( | |
| value=pd.DataFrame(MEAN_CAPABILITY_CHANGE), | |
| label="Average benchmark delta (percentage points)", | |
| interactive=False, | |
| ) | |
| gr.Plot( | |
| value=build_mean_delta_chart(), | |
| label="Mean Capability Change", | |
| ) | |
| # ================================================================ | |
| # TAB 3 \u2014 Tool Compatibility | |
| # ================================================================ | |
| with gr.Tab("Tool Compatibility"): | |
| with gr.Row(): | |
| gr.HTML(stat_box("22/26", "Heretic (85%)")) | |
| gr.HTML(stat_box("5/10", "OBLITERATUS (50%)")) | |
| gr.HTML(stat_box("0/10", "jwest33 (0%)")) | |
| gr.HTML(stat_box("11/26", "DECCP (42%)")) | |
| gr.HTML(stat_box("9/26", "ErisForge (35%)")) | |
| gr.HTML(finding_box( | |
| "<strong>Key Finding (v2):</strong> Heretic remains the most compatible tool (85%). " | |
| "OBLITERATUS works on standard dense architectures (50%) but fails on Gemma 3 and MoE models. " | |
| "jwest33, DECCP, and ErisForge failed on all v2 models due to API/architecture incompatibilities. " | |
| "All 4 novel MoE architectures (Nemotron, GLM, GigaChat, LFM2) are incompatible with every tool." | |
| )) | |
| tool_filter = gr.Dropdown( | |
| choices=["All", "Heretic", "OBLITERATUS", "jwest33", "DECCP", "ErisForge"], | |
| value="All", | |
| label="Filter: show models fully supported by tool", | |
| ) | |
| compat_df = gr.Dataframe( | |
| value=pd.DataFrame(COMPATIBILITY_ROWS), | |
| label="Tool Compatibility Matrix", | |
| interactive=False, | |
| ) | |
| tool_filter.change( | |
| filter_compatibility, | |
| inputs=[tool_filter], | |
| outputs=compat_df, | |
| ) | |
| gr.Plot( | |
| value=build_coverage_chart(), | |
| label="Coverage Summary", | |
| ) | |
| gr.Markdown( | |
| "**Legend:** " | |
| "Yes = fully supported | " | |
| "Partial = runs with limitations | " | |
| "No = explicitly unsupported | " | |
| "Incompatible = architecture mismatch | " | |
| "N/A = not tested" | |
| ) | |
| # ================================================================ | |
| # TAB 4 -- Community Models | |
| # ================================================================ | |
| with gr.Tab("Community Models"): | |
| n_evaluated = len([m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None]) | |
| n_pending = len([m for m in COMMUNITY_MODELS if m.get("ASR (%)") is None]) | |
| with gr.Row(): | |
| gr.HTML(stat_box(str(len(COMMUNITY_MODELS)), "Models Tracked")) | |
| gr.HTML(stat_box(str(n_evaluated), "Evaluated")) | |
| gr.HTML(stat_box(str(n_pending), "Awaiting Evaluation")) | |
| gr.HTML(stat_box(str(len(set(m["Creator"] for m in COMMUNITY_MODELS))), "Community Contributors")) | |
| gr.HTML(finding_box( | |
| "<strong>Community Impact:</strong> Independent researchers are producing " | |
| "abliterated models using diverse methods. grimjim's Orthogonal Reflection " | |
| "achieved 97% ASR on Gemma-3-12B-it, far surpassing our Heretic result (3% ASR) " | |
| "on the same base model -- demonstrating that method choice matters more than tool popularity." | |
| )) | |
| status_filter = gr.Dropdown( | |
| choices=["All", "Evaluated", "Pending"], | |
| value="All", | |
| label="Filter by evaluation status", | |
| ) | |
| initial_html = pd.DataFrame( | |
| _community_display_rows(COMMUNITY_MODELS) | |
| ).to_html(escape=False, index=False) | |
| community_df = gr.HTML(value=f'<div class="community-table">{initial_html}</div>') | |
| status_filter.change( | |
| filter_community, | |
| inputs=[status_filter], | |
| outputs=community_df, | |
| ) | |
| gr.Plot( | |
| value=build_community_chart(), | |
| label="Community Model ASR", | |
| ) | |
| gr.HTML( | |
| '<div class="finding-box">' | |
| "<p><strong>Submit your model for evaluation!</strong> " | |
| '<a href="https://huggingface.co/spaces/richardyoung/' | |
| 'abliteration-methods-dashboard/discussions" ' | |
| 'target="_blank" style="color: #53a8b6;">' | |
| "Open a discussion</a> with your HF model link and abliteration method. " | |
| "We will evaluate it using Heretic's standardized refusal/KL metrics " | |
| "and add it to this tab.</p></div>" | |
| ) | |
| # ================================================================ | |
| # TAB 5 -- v1 vs v2 Comparison | |
| # ================================================================ | |
| with gr.Tab("v1 vs v2"): | |
| v1_models = [m for m in V1_VS_V2 if m.get("v1 ASR (%)") is not None] | |
| v2_models = [m for m in V1_VS_V2 if m.get("v2 ASR (%)") is not None] | |
| v1_avg = round(sum(m["v1 ASR (%)"] for m in v1_models) / len(v1_models), 1) if v1_models else 0 | |
| v2_avg = round(sum(m["v2 ASR (%)"] for m in v2_models) / len(v2_models), 1) if v2_models else 0 | |
| with gr.Row(): | |
| gr.HTML(stat_box(f"{v1_avg}%", "v1 Avg ASR (Dec 2025)")) | |
| gr.HTML(stat_box(f"{v2_avg}%", "v2 Avg ASR (Mar 2026)")) | |
| gr.HTML(stat_box(f"{v1_avg - v2_avg:+.1f} pp", "ASR Change")) | |
| gr.HTML(stat_box(f"{len(v1_models)}+{len(v2_models)}", "Models Tested")) | |
| gr.HTML(finding_box( | |
| "<strong>Key Finding:</strong> Safety alignment has improved dramatically. " | |
| f"Average abliteration success dropped from {v1_avg}% (v1, late 2025) to " | |
| f"{v2_avg}% (v2, early 2026), a {v1_avg - v2_avg:.1f} percentage point decline. " | |
| "2026-era instruct models with stacked RLHF+DPO are significantly more resistant " | |
| "to weight-level safety removal than their 2024-2025 predecessors." | |
| )) | |
| gr.Plot( | |
| value=build_v1_v2_chart(), | |
| label="v1 vs v2 ASR Comparison", | |
| ) | |
| gr.Dataframe( | |
| value=pd.DataFrame(V1_VS_V2), | |
| label="v1 vs v2 Detailed Comparison", | |
| interactive=False, | |
| ) | |
| gr.Markdown( | |
| "**Notes:** v1 and v2 tested slightly different model versions " | |
| "(e.g., Mistral v0.3 vs v0.2, Llama base vs Instruct). " | |
| "Direct comparisons should account for these differences. " | |
| "Models marked 'Not retested' were only evaluated in v1." | |
| ) | |
| # ================================================================ | |
    # ================================================================
    # TAB 6 -- About
| # ================================================================ | |
| with gr.Tab("About"): | |
| gr.HTML( | |
| '<div class="header-banner">' | |
| "<h1>About This Dashboard</h1>" | |
| "<p>This interactive dashboard accompanies the paper " | |
| '<a href="https://arxiv.org/abs/2512.13655" target="_blank">' | |
| "Comparative Analysis of LLM Abliteration Methods: " | |
| "A Cross-Architecture Evaluation</a>.</p>" | |
| "</div>" | |
| ) | |
| gr.Markdown( | |
| """ | |
| ### TL;DR | |
| **Can you remove safety guardrails from an LLM by editing its weights?** That is what abliteration does -- it finds the internal "refusal direction" in a model's activations and surgically removes it, no retraining needed. But does it actually work? Which tools are best? And do newer models resist it? | |
| This dashboard tracks the answers. We test every major open-source abliteration tool against a growing set of models and report exactly what happens: how many prompts still get refused, how much the model's general capabilities degrade, and which tool-model combinations are even compatible. | |
| **Key findings so far:** | |
| - **2026 models are much harder to abliterate** than 2024-2025 models (2-17% success vs 46-98%) | |
| - **Knowledge distillation creates weaker safety** -- DeepSeek R1 Distill is 10x more vulnerable than directly-trained models | |
| - **MoE architectures are largely un-abliterable** -- 4/4 novel MoE models failed all tools | |
| - **Only 2 of 5 tools work on modern models** -- Heretic and OBLITERATUS; the other 3 are broken or abandoned | |
| - **Heretic consistently outperforms** other tools through Bayesian optimization | |
| ### Why This Matters | |
| Abliteration sits at the intersection of AI safety, interpretability, and open-source model governance. Understanding which models can be uncensored -- and how easily -- is critical for: | |
| - **Safety researchers** evaluating the robustness of alignment techniques | |
| - **Red teamers** testing whether deployed models resist weight-level attacks | |
| - **Model developers** understanding how their alignment choices affect vulnerability | |
| - **Policymakers** assessing the effectiveness of safety training requirements | |
| If safety alignment can be trivially removed, then alignment alone is not sufficient protection. If it cannot, that tells us something important about how deeply safety representations are embedded in modern architectures. | |
| ### Paper | |
| **Comparative Analysis of LLM Abliteration Methods: A Cross-Architecture Evaluation** | |
| arXiv: [2512.13655](https://arxiv.org/abs/2512.13655) | |
| **v2 (in progress):** Scaling to MoE Architectures and Modern Tools -- extends the evaluation to 26 models, 5 tools, and Mixture-of-Experts architectures. | |
| ### Author | |
| **Richard J. Young** | |
| - Email: [richard@deepneuro.ai](mailto:richard@deepneuro.ai) | \ | |
| [ryoung@unlv.edu](mailto:ryoung@unlv.edu) | |
| - ORCID: [0000-0002-1109-7552](https://orcid.org/0000-0002-1109-7552) | |
| - Hugging Face: [richardyoung](https://huggingface.co/richardyoung) | |
| - Abliterated models: [Collection](https://huggingface.co/collections/richardyoung/uncensored-and-abliterated-llms) | |
| ### Related Papers | |
| - **Refusal in Language Models Is Mediated by a Single Direction** \u2014 \ | |
| Arditi et al. (2024) \u2014 \ | |
| [arXiv: 2406.11717](https://arxiv.org/abs/2406.11717) | |
| - **The Geometry of Refusal: Concept Cones** \u2014 \ | |
| Wollschlager et al. (2025) \u2014 \ | |
| [arXiv: 2502.17420](https://arxiv.org/abs/2502.17420) | |
| - **SOM Directions Are Better Than One** \u2014 \ | |
| Piras et al. (2025) \u2014 \ | |
| [arXiv: 2511.08379](https://arxiv.org/abs/2511.08379) | |
| - **Differentiated Bi-Directional Intervention** \u2014 \ | |
| Li et al. (2025) \u2014 \ | |
| [arXiv: 2511.06852](https://arxiv.org/abs/2511.06852) | |
| - **Embarrassingly Simple Defense Against Abliteration** \u2014 \ | |
| Abu Shairah et al. (2025) \u2014 \ | |
| [arXiv: 2505.19056](https://arxiv.org/abs/2505.19056) | |
| - **Measuring Faithfulness of Chains of Thought** \u2014 \ | |
| Young (2025) \u2014 \ | |
| [arXiv: 2502.14829](https://arxiv.org/abs/2502.14829) | |
| - **Lie to Me: LLM Deception Under Forced Dissent** \u2014 \ | |
| Young (2025) \u2014 \ | |
| [arXiv: 2504.08530](https://arxiv.org/abs/2504.08530) | |
| ### Citation | |
| ```bibtex | |
@article{young2025abliteration,
  title={Comparative Analysis of LLM Abliteration Methods:
         A Cross-Architecture Evaluation},
  author={Young, Richard J.},
  journal={arXiv preprint arXiv:2512.13655},
  year={2025}
}
| } | |
| ``` | |
| ### Want Your Model Tested? | |
| Have an abliterated model you want benchmarked, or want to suggest a \ | |
| model for abliteration testing? \ | |
| **[Open a discussion on this Space](https://huggingface.co/spaces/richardyoung/abliteration-methods-dashboard/discussions)** \ | |
| with: | |
| - The Hugging Face model link | |
| - Which abliteration tool was used (or which you want tested) | |
| - Any notes on the method (e.g. Householder reflection, custom intervention vector) | |
| Results will be added to the dashboard. | |
| ### Contact | |
| richard@deepneuro.ai | ryoung@unlv.edu | \ | |
| [ORCID: 0000-0002-1109-7552](https://orcid.org/0000-0002-1109-7552) | |
| """ | |
| ) | |
| return app | |
if __name__ == "__main__":
    # Script entry point: build the dashboard UI and start the Gradio server.
    build_app().launch()