Spaces:

richardyoung
/

abliteration-methods-dashboard

Running

Ric committed 30 days ago

Commit

8649dad

1 Parent(s): bfe9ac6

feat: add Community Models tab and v1 vs v2 comparison tab

- Community Models tab: tracks 10 community-produced abliterated models
with grimjim's Orthogonal Reflection (97% ASR) as the standout result.
Includes evaluated/pending filter, ASR chart, and submission instructions.
- v1 vs v2 tab: side-by-side comparison showing safety alignment improvement
over 3 months. Average ASR dropped dramatically from v1 to v2.
- Dashboard now has 6 tabs total.

Files changed (3) hide show

app.py +204 -1
data/community_models.json +12 -0
data/v1_vs_v2.json +13 -0

app.py CHANGED Viewed

@@ -90,6 +90,9 @@ _compat = _load_json("compatibility.json")
 COMPATIBILITY_ROWS = _compat["rows"]
 COVERAGE_TOTALS = _compat["totals"]
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -306,6 +309,111 @@ def build_coverage_chart() -> go.Figure:
     return fig
 # ---------------------------------------------------------------------------
 # Filter callbacks
 # ---------------------------------------------------------------------------
@@ -521,7 +629,102 @@ def build_app() -> gr.Blocks:
             )
         # ================================================================
-        # TAB 4 — About
         # ================================================================
         with gr.Tab("About"):
             gr.HTML(

 COMPATIBILITY_ROWS = _compat["rows"]
 COVERAGE_TOTALS = _compat["totals"]
+COMMUNITY_MODELS = _load_json("community_models.json")  # community model records; iterated as a list of dicts below
+V1_VS_V2 = _load_json("v1_vs_v2.json")  # per-model v1/v2 ASR records; iterated as a list of dicts below
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
     return fig
+# ---------------------------------------------------------------------------
+# Community models chart
+# ---------------------------------------------------------------------------
+def build_community_chart() -> go.Figure:
+    evaluated = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None]
+    pending = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is None]
+    if not evaluated:
+        fig = go.Figure()
+        fig.update_layout(
+            title="No evaluated community models yet",
+            plot_bgcolor="#0e1117",
+            paper_bgcolor="#0e1117",
+            font_color="#c4c4c4",
+        )
+        return fig
+    df = pd.DataFrame(evaluated).sort_values("ASR (%)", ascending=False)
+    fig = px.bar(
+        df,
+        x="Model",
+        y="ASR (%)",
+        color="Method",
+        text="ASR (%)",
+        color_discrete_map={
+            "Orthogonal Reflection": "#95d5b2",
+            "Heretic": "#e94560",
+            "Abliteration": "#53a8b6",
+            "Unknown": "#888",
+        },
+    )
+    fig.update_traces(textposition="outside", textfont_size=13)
+    fig.update_layout(
+        title="Community Abliterated Models - Attack Success Rate",
+        xaxis_title="Model",
+        yaxis_title="ASR (%)",
+        yaxis_range=[0, 110],
+        plot_bgcolor="#0e1117",
+        paper_bgcolor="#0e1117",
+        font_color="#c4c4c4",
+        margin=dict(t=50, b=60),
+    )
+    fig.update_xaxes(tickangle=-30)
+    return fig
+def filter_community(status_filter: str) -> pd.DataFrame:
+    if status_filter == "Evaluated":
+        rows = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None]
+    elif status_filter == "Pending":
+        rows = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is None]
+    else:
+        rows = COMMUNITY_MODELS
+    return pd.DataFrame(rows)
+# ---------------------------------------------------------------------------
+# v1 vs v2 chart
+# ---------------------------------------------------------------------------
+def build_v1_v2_chart() -> go.Figure:
+    df = pd.DataFrame(V1_VS_V2)
+    fig = go.Figure()
+    # v1 bars
+    fig.add_trace(go.Bar(
+        name="v1 (Dec 2025)",
+        x=df["Model"],
+        y=df["v1 ASR (%)"],
+        marker_color="#53a8b6",
+        text=df["v1 ASR (%)"].apply(lambda x: f"{x:.0f}%" if pd.notna(x) else ""),
+        textposition="outside",
+        textfont_size=11,
+    ))
+    # v2 bars
+    fig.add_trace(go.Bar(
+        name="v2 (Mar 2026)",
+        x=df["Model"],
+        y=df["v2 ASR (%)"],
+        marker_color="#e94560",
+        text=df["v2 ASR (%)"].apply(lambda x: f"{x:.0f}%" if pd.notna(x) else ""),
+        textposition="outside",
+        textfont_size=11,
+    ))
+    fig.update_layout(
+        barmode="group",
+        title="Abliteration Effectiveness: v1 (2025) vs v2 (2026)",
+        xaxis_title="Model",
+        yaxis_title="ASR (%)",
+        yaxis_range=[0, 110],
+        plot_bgcolor="#0e1117",
+        paper_bgcolor="#0e1117",
+        font_color="#c4c4c4",
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
+        margin=dict(t=70, b=80),
+    )
+    fig.update_xaxes(tickangle=-30)
+    return fig
 # ---------------------------------------------------------------------------
 # Filter callbacks
 # ---------------------------------------------------------------------------
             )
         # ================================================================
+        # TAB 4 -- Community Models
+        # ================================================================
+        with gr.Tab("Community Models"):
+            n_evaluated = len([m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None])
+            n_pending = len([m for m in COMMUNITY_MODELS if m.get("ASR (%)") is None])
+            with gr.Row():
+                gr.HTML(stat_box(str(len(COMMUNITY_MODELS)), "Models Tracked"))
+                gr.HTML(stat_box(str(n_evaluated), "Evaluated"))
+                gr.HTML(stat_box(str(n_pending), "Awaiting Evaluation"))
+                gr.HTML(stat_box(str(len(set(m["Creator"] for m in COMMUNITY_MODELS))), "Community Contributors"))
+            gr.HTML(finding_box(
+                "<strong>Community Impact:</strong> Independent researchers are producing "
+                "abliterated models using diverse methods. grimjim's Orthogonal Reflection "
+                "achieved 97% ASR on Gemma-3-12B-it, far surpassing our Heretic result (3% ASR) "
+                "on the same base model -- demonstrating that method choice matters more than tool popularity."
+            ))
+            status_filter = gr.Dropdown(
+                choices=["All", "Evaluated", "Pending"],
+                value="All",
+                label="Filter by evaluation status",
+            )
+            community_df = gr.Dataframe(
+                value=pd.DataFrame(COMMUNITY_MODELS),
+                label="Community Abliterated Models",
+                interactive=False,
+            )
+            status_filter.change(
+                filter_community,
+                inputs=[status_filter],
+                outputs=community_df,
+            )
+            gr.Plot(
+                value=build_community_chart(),
+                label="Community Model ASR",
+            )
+            gr.HTML(
+                '<div class="finding-box">'
+                "<p><strong>Submit your model for evaluation!</strong> "
+                '<a href="https://huggingface.co/spaces/richardyoung/'
+                'abliteration-methods-dashboard/discussions" '
+                'target="_blank" style="color: #53a8b6;">'
+                "Open a discussion</a> with your HF model link and abliteration method. "
+                "We will evaluate it using Heretic's standardized refusal/KL metrics "
+                "and add it to this tab.</p></div>"
+            )
+        # ================================================================
+        # TAB 5 -- v1 vs v2 Comparison
+        # ================================================================
+        with gr.Tab("v1 vs v2"):
+            v1_models = [m for m in V1_VS_V2 if m.get("v1 ASR (%)") is not None]
+            v2_models = [m for m in V1_VS_V2 if m.get("v2 ASR (%)") is not None]
+            v1_avg = round(sum(m["v1 ASR (%)"] for m in v1_models) / len(v1_models), 1) if v1_models else 0
+            v2_avg = round(sum(m["v2 ASR (%)"] for m in v2_models) / len(v2_models), 1) if v2_models else 0
+            with gr.Row():
+                gr.HTML(stat_box(f"{v1_avg}%", "v1 Avg ASR (Dec 2025)"))
+                gr.HTML(stat_box(f"{v2_avg}%", "v2 Avg ASR (Mar 2026)"))
+                gr.HTML(stat_box(f"{v1_avg - v2_avg:+.1f} pp", "ASR Change"))
+                gr.HTML(stat_box(f"{len(v1_models)}+{len(v2_models)}", "Models Tested"))
+            gr.HTML(finding_box(
+                "<strong>Key Finding:</strong> Safety alignment has improved dramatically. "
+                f"Average abliteration success dropped from {v1_avg}% (v1, late 2025) to "
+                f"{v2_avg}% (v2, early 2026), a {v1_avg - v2_avg:.1f} percentage point decline. "
+                "2026-era instruct models with stacked RLHF+DPO are significantly more resistant "
+                "to weight-level safety removal than their 2024-2025 predecessors."
+            ))
+            gr.Plot(
+                value=build_v1_v2_chart(),
+                label="v1 vs v2 ASR Comparison",
+            )
+            gr.Dataframe(
+                value=pd.DataFrame(V1_VS_V2),
+                label="v1 vs v2 Detailed Comparison",
+                interactive=False,
+            )
+            gr.Markdown(
+                "**Notes:** v1 and v2 tested slightly different model versions "
+                "(e.g., Mistral v0.3 vs v0.2, Llama base vs Instruct). "
+                "Direct comparisons should account for these differences. "
+                "Models marked 'Not retested' were only evaluated in v1."
+            )
+        # ================================================================
+        # TAB 6 — About
         # ================================================================
         with gr.Tab("About"):
             gr.HTML(

data/community_models.json ADDED Viewed

	@@ -0,0 +1,12 @@

+[
+  {"Model": "gemma-3-12b-it-orthogonal-reflection-v3", "Base Model": "Gemma-3-12B-it", "Creator": "grimjim", "Method": "Orthogonal Reflection", "Refusals (n=100)": 3, "ASR (%)": 97, "KL Divergence": 0.048, "HF Link": "grimjim/gemma-3-12b-it-orthogonal-reflection-bounded-ablation-v3-12B", "Notes": "Norm-preserving biprojected abliteration"},
+  {"Model": "gemma-3-12b-it-heretic-v2", "Base Model": "Gemma-3-12B-it", "Creator": "DreamFast", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "DreamFast/gemma-3-12b-it-heretic-v2", "Notes": "Awaiting evaluation"},
+  {"Model": "gemma-3-12b-it-heretic", "Base Model": "Gemma-3-12B-it", "Creator": "DreamFast", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "DreamFast/gemma-3-12b-it-heretic", "Notes": "Awaiting evaluation"},
+  {"Model": "gpt-oss-20b-heretic", "Base Model": "GPT-oss-20b", "Creator": "p-e-w", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "p-e-w/gpt-oss-20b-heretic", "Notes": "Awaiting evaluation"},
+  {"Model": "Qwen3.5-9B-Uncensored", "Base Model": "Qwen3.5-9B", "Creator": "LEONW24", "Method": "Unknown", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "LEONW24/Qwen3.5-9B-Uncensored", "Notes": "Awaiting evaluation"},
+  {"Model": "Llama-3.1-8B-Lexi-Uncensored-V2", "Base Model": "Llama-3.1-8B", "Creator": "Orenguteng", "Method": "Unknown", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2", "Notes": "Awaiting evaluation"},
+  {"Model": "Huihui-Qwen3-4B-abliterated-v2", "Base Model": "Qwen3-4B", "Creator": "huihui-ai", "Method": "Abliteration", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "huihui-ai/Huihui-Qwen3-4B-abliterated-v2", "Notes": "Awaiting evaluation"},
+  {"Model": "Qwen3.5-27B-Derestricted", "Base Model": "Qwen3.5-27B", "Creator": "ArliAI", "Method": "Derestriction", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "ArliAI/Qwen3.5-27B-Derestricted", "Notes": "Awaiting evaluation"},
+  {"Model": "Dolphin-Mistral-24B-Venice", "Base Model": "Mistral-24B", "Creator": "dphn", "Method": "Venice uncensored", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "dphn/Dolphin-Mistral-24B-Venice-Edition", "Notes": "Awaiting evaluation"},
+  {"Model": "GLM-4.7-Flash-Uncensored-Heretic", "Base Model": "GLM-4.7-Flash", "Creator": "DavidAU", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "DavidAU/GLM-4.7-Flash-Uncensored-Heretic-NEO-CODE-Imatrix-MAX-GGUF", "Notes": "GGUF only - awaiting evaluation"}
+]

data/v1_vs_v2.json ADDED Viewed

	@@ -0,0 +1,13 @@

+[
+  {"Model": "Zephyr-7B-beta", "v1 ASR (%)": 98, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
+  {"Model": "DeepSeek-7B-chat", "v1 ASR (%)": 84, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
+  {"Model": "Mistral-7B", "v1 ASR (%)": 84, "v2 ASR (%)": 17, "v1 Tool": "Heretic v1.1 (v0.3)", "v2 Tool": "Heretic v1.2 (v0.2)", "Category": "Both"},
+  {"Model": "Llama-3.1-8B", "v1 ASR (%)": 76, "v2 ASR (%)": 4, "v1 Tool": "Heretic v1.1 (base)", "v2 Tool": "Heretic v1.2 (Instruct)", "Category": "Both"},
+  {"Model": "Qwen3-8B", "v1 ASR (%)": 75, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
+  {"Model": "Qwen2.5-7B", "v1 ASR (%)": 58, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
+  {"Model": "StableLM-2-12B", "v1 ASR (%)": 46, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
+  {"Model": "DeepSeek-R1-Distill", "v1 ASR (%)": null, "v2 ASR (%)": 55, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"},
+  {"Model": "SmolLM3-3B", "v1 ASR (%)": null, "v2 ASR (%)": 16, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"},
+  {"Model": "Gemma-3-12B-it", "v1 ASR (%)": null, "v2 ASR (%)": 3, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"},
+  {"Model": "GPT-oss-20b", "v1 ASR (%)": null, "v2 ASR (%)": 2, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"}
+]