Ric committed on
Commit
8649dad
·
1 Parent(s): bfe9ac6

feat: add Community Models tab and v1 vs v2 comparison tab

Browse files

- Community Models tab: tracks 10 community-produced abliterated models
with grimjim's Orthogonal Reflection (97% ASR) as the standout result.
Includes evaluated/pending filter, ASR chart, and submission instructions.
- v1 vs v2 tab: side-by-side comparison showing safety alignment improvement
over 3 months. Average ASR dropped dramatically from v1 to v2.
- Dashboard now has 6 tabs total.

Files changed (3) hide show
  1. app.py +204 -1
  2. data/community_models.json +12 -0
  3. data/v1_vs_v2.json +13 -0
app.py CHANGED
@@ -90,6 +90,9 @@ _compat = _load_json("compatibility.json")
90
  COMPATIBILITY_ROWS = _compat["rows"]
91
  COVERAGE_TOTALS = _compat["totals"]
92
 
 
 
 
93
  # ---------------------------------------------------------------------------
94
  # Helpers
95
  # ---------------------------------------------------------------------------
@@ -306,6 +309,111 @@ def build_coverage_chart() -> go.Figure:
306
  return fig
307
 
308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  # ---------------------------------------------------------------------------
310
  # Filter callbacks
311
  # ---------------------------------------------------------------------------
@@ -521,7 +629,102 @@ def build_app() -> gr.Blocks:
521
  )
522
 
523
  # ================================================================
524
- # TAB 4 \u2014 About
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  # ================================================================
526
  with gr.Tab("About"):
527
  gr.HTML(
 
90
  COMPATIBILITY_ROWS = _compat["rows"]
91
  COVERAGE_TOTALS = _compat["totals"]
92
 
93
+ COMMUNITY_MODELS = _load_json("community_models.json")
94
+ V1_VS_V2 = _load_json("v1_vs_v2.json")
95
+
96
  # ---------------------------------------------------------------------------
97
  # Helpers
98
  # ---------------------------------------------------------------------------
 
309
  return fig
310
 
311
 
312
+ # ---------------------------------------------------------------------------
313
+ # Community models chart
314
+ # ---------------------------------------------------------------------------
315
+
316
+
317
def build_community_chart() -> go.Figure:
    """Build a bar chart of ASR for the evaluated community models.

    A model counts as evaluated when its ``"ASR (%)"`` entry in
    ``COMMUNITY_MODELS`` is not ``None``. Models still awaiting evaluation
    are left out of the chart entirely.

    Returns:
        A Plotly figure with one bar per evaluated model, ordered by ASR
        from highest to lowest and coloured by abliteration method. When no
        model has been evaluated, an empty figure carrying a placeholder
        title is returned instead.
    """
    evaluated = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None]

    if not evaluated:
        # Nothing to plot: return a themed, empty figure so the tab still
        # renders something meaningful.
        fig = go.Figure()
        fig.update_layout(
            title="No evaluated community models yet",
            plot_bgcolor="#0e1117",
            paper_bgcolor="#0e1117",
            font_color="#c4c4c4",
        )
        return fig

    df = pd.DataFrame(evaluated).sort_values("ASR (%)", ascending=False)
    fig = px.bar(
        df,
        x="Model",
        y="ASR (%)",
        color="Method",
        text="ASR (%)",
        # Methods not listed here fall back to Plotly's default colour
        # sequence.
        color_discrete_map={
            "Orthogonal Reflection": "#95d5b2",
            "Heretic": "#e94560",
            "Abliteration": "#53a8b6",
            "Unknown": "#888",
        },
    )
    fig.update_traces(textposition="outside", textfont_size=13)
    fig.update_layout(
        title="Community Abliterated Models - Attack Success Rate",
        xaxis_title="Model",
        yaxis_title="ASR (%)",
        # Upper bound above 100 leaves headroom for the "outside" bar labels.
        yaxis_range=[0, 110],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        margin=dict(t=50, b=60),
    )
    fig.update_xaxes(tickangle=-30)
    return fig
358
+
359
+
360
def filter_community(status_filter: str) -> pd.DataFrame:
    """Return the community-models table restricted by evaluation status.

    Args:
        status_filter: ``"Evaluated"`` keeps models whose ``"ASR (%)"`` is
            not ``None``; ``"Pending"`` keeps models whose ``"ASR (%)"`` is
            ``None``; any other value returns every model.

    Returns:
        A DataFrame with one row per matching entry of ``COMMUNITY_MODELS``.
        When the filter matches nothing, the frame is empty but keeps the
        column headers of the first tracked model.
    """
    if status_filter == "Evaluated":
        rows = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None]
    elif status_filter == "Pending":
        rows = [m for m in COMMUNITY_MODELS if m.get("ASR (%)") is None]
    else:
        rows = COMMUNITY_MODELS

    if not rows and COMMUNITY_MODELS:
        # pd.DataFrame([]) has no columns at all, which would wipe the table
        # headers in the UI; reuse the keys of the first record instead.
        return pd.DataFrame(columns=list(COMMUNITY_MODELS[0]))
    return pd.DataFrame(rows)
368
+
369
+
370
+ # ---------------------------------------------------------------------------
371
+ # v1 vs v2 chart
372
+ # ---------------------------------------------------------------------------
373
+
374
+
375
def build_v1_v2_chart() -> go.Figure:
    """Build a grouped bar chart comparing v1 and v2 ASR for each model.

    The chart is drawn from ``V1_VS_V2``: one bar trace for the
    ``"v1 ASR (%)"`` column and one for ``"v2 ASR (%)"``, both keyed by the
    ``"Model"`` column. A model with no value for a given version gets an
    empty text label for that version.

    Returns:
        The themed Plotly figure containing both traces.
    """

    def _percent_label(value) -> str:
        # Missing measurements get no label rather than a "nan%" string.
        return "" if pd.isna(value) else f"{value:.0f}%"

    frame = pd.DataFrame(V1_VS_V2)

    # (legend name, source column, bar colour) for each version, in draw order.
    trace_specs = (
        ("v1 (Dec 2025)", "v1 ASR (%)", "#53a8b6"),
        ("v2 (Mar 2026)", "v2 ASR (%)", "#e94560"),
    )

    chart = go.Figure()
    for legend_name, column, colour in trace_specs:
        values = frame[column]
        chart.add_trace(
            go.Bar(
                name=legend_name,
                x=frame["Model"],
                y=values,
                marker_color=colour,
                text=values.apply(_percent_label),
                textposition="outside",
                textfont_size=11,
            )
        )

    legend_style = dict(
        orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
    )
    chart.update_layout(
        barmode="group",
        title="Abliteration Effectiveness: v1 (2025) vs v2 (2026)",
        xaxis_title="Model",
        yaxis_title="ASR (%)",
        yaxis_range=[0, 110],
        plot_bgcolor="#0e1117",
        paper_bgcolor="#0e1117",
        font_color="#c4c4c4",
        legend=legend_style,
        margin=dict(t=70, b=80),
    )
    chart.update_xaxes(tickangle=-30)
    return chart
415
+
416
+
417
  # ---------------------------------------------------------------------------
418
  # Filter callbacks
419
  # ---------------------------------------------------------------------------
 
629
  )
630
 
631
  # ================================================================
632
+ # TAB 4 -- Community Models
633
+ # ================================================================
634
+ with gr.Tab("Community Models"):
635
+ n_evaluated = len([m for m in COMMUNITY_MODELS if m.get("ASR (%)") is not None])
636
+ n_pending = len([m for m in COMMUNITY_MODELS if m.get("ASR (%)") is None])
637
+
638
+ with gr.Row():
639
+ gr.HTML(stat_box(str(len(COMMUNITY_MODELS)), "Models Tracked"))
640
+ gr.HTML(stat_box(str(n_evaluated), "Evaluated"))
641
+ gr.HTML(stat_box(str(n_pending), "Awaiting Evaluation"))
642
+ gr.HTML(stat_box(str(len(set(m["Creator"] for m in COMMUNITY_MODELS))), "Community Contributors"))
643
+
644
+ gr.HTML(finding_box(
645
+ "<strong>Community Impact:</strong> Independent researchers are producing "
646
+ "abliterated models using diverse methods. grimjim's Orthogonal Reflection "
647
+ "achieved 97% ASR on Gemma-3-12B-it, far surpassing our Heretic result (3% ASR) "
648
+ "on the same base model -- demonstrating that method choice matters more than tool popularity."
649
+ ))
650
+
651
+ status_filter = gr.Dropdown(
652
+ choices=["All", "Evaluated", "Pending"],
653
+ value="All",
654
+ label="Filter by evaluation status",
655
+ )
656
+
657
+ community_df = gr.Dataframe(
658
+ value=pd.DataFrame(COMMUNITY_MODELS),
659
+ label="Community Abliterated Models",
660
+ interactive=False,
661
+ )
662
+
663
+ status_filter.change(
664
+ filter_community,
665
+ inputs=[status_filter],
666
+ outputs=community_df,
667
+ )
668
+
669
+ gr.Plot(
670
+ value=build_community_chart(),
671
+ label="Community Model ASR",
672
+ )
673
+
674
+ gr.HTML(
675
+ '<div class="finding-box">'
676
+ "<p><strong>Submit your model for evaluation!</strong> "
677
+ '<a href="https://huggingface.co/spaces/richardyoung/'
678
+ 'abliteration-methods-dashboard/discussions" '
679
+ 'target="_blank" style="color: #53a8b6;">'
680
+ "Open a discussion</a> with your HF model link and abliteration method. "
681
+ "We will evaluate it using Heretic's standardized refusal/KL metrics "
682
+ "and add it to this tab.</p></div>"
683
+ )
684
+
685
+ # ================================================================
686
+ # TAB 5 -- v1 vs v2 Comparison
687
+ # ================================================================
688
+ with gr.Tab("v1 vs v2"):
689
+ v1_models = [m for m in V1_VS_V2 if m.get("v1 ASR (%)") is not None]
690
+ v2_models = [m for m in V1_VS_V2 if m.get("v2 ASR (%)") is not None]
691
+ v1_avg = round(sum(m["v1 ASR (%)"] for m in v1_models) / len(v1_models), 1) if v1_models else 0
692
+ v2_avg = round(sum(m["v2 ASR (%)"] for m in v2_models) / len(v2_models), 1) if v2_models else 0
693
+
694
+ with gr.Row():
695
+ gr.HTML(stat_box(f"{v1_avg}%", "v1 Avg ASR (Dec 2025)"))
696
+ gr.HTML(stat_box(f"{v2_avg}%", "v2 Avg ASR (Mar 2026)"))
697
+ gr.HTML(stat_box(f"{v1_avg - v2_avg:+.1f} pp", "ASR Change"))
698
+ gr.HTML(stat_box(f"{len(v1_models)}+{len(v2_models)}", "Models Tested"))
699
+
700
+ gr.HTML(finding_box(
701
+ "<strong>Key Finding:</strong> Safety alignment has improved dramatically. "
702
+ f"Average abliteration success dropped from {v1_avg}% (v1, late 2025) to "
703
+ f"{v2_avg}% (v2, early 2026), a {v1_avg - v2_avg:.1f} percentage point decline. "
704
+ "2026-era instruct models with stacked RLHF+DPO are significantly more resistant "
705
+ "to weight-level safety removal than their 2024-2025 predecessors."
706
+ ))
707
+
708
+ gr.Plot(
709
+ value=build_v1_v2_chart(),
710
+ label="v1 vs v2 ASR Comparison",
711
+ )
712
+
713
+ gr.Dataframe(
714
+ value=pd.DataFrame(V1_VS_V2),
715
+ label="v1 vs v2 Detailed Comparison",
716
+ interactive=False,
717
+ )
718
+
719
+ gr.Markdown(
720
+ "**Notes:** v1 and v2 tested slightly different model versions "
721
+ "(e.g., Mistral v0.3 vs v0.2, Llama base vs Instruct). "
722
+ "Direct comparisons should account for these differences. "
723
+ "Models marked 'Not retested' were only evaluated in v1."
724
+ )
725
+
726
+ # ================================================================
727
+ # TAB 6 \u2014 About
728
  # ================================================================
729
  with gr.Tab("About"):
730
  gr.HTML(
data/community_models.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {"Model": "gemma-3-12b-it-orthogonal-reflection-v3", "Base Model": "Gemma-3-12B-it", "Creator": "grimjim", "Method": "Orthogonal Reflection", "Refusals (n=100)": 3, "ASR (%)": 97, "KL Divergence": 0.048, "HF Link": "grimjim/gemma-3-12b-it-orthogonal-reflection-bounded-ablation-v3-12B", "Notes": "Norm-preserving biprojected abliteration"},
3
+ {"Model": "gemma-3-12b-it-heretic-v2", "Base Model": "Gemma-3-12B-it", "Creator": "DreamFast", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "DreamFast/gemma-3-12b-it-heretic-v2", "Notes": "Awaiting evaluation"},
4
+ {"Model": "gemma-3-12b-it-heretic", "Base Model": "Gemma-3-12B-it", "Creator": "DreamFast", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "DreamFast/gemma-3-12b-it-heretic", "Notes": "Awaiting evaluation"},
5
+ {"Model": "gpt-oss-20b-heretic", "Base Model": "GPT-oss-20b", "Creator": "p-e-w", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "p-e-w/gpt-oss-20b-heretic", "Notes": "Awaiting evaluation"},
6
+ {"Model": "Qwen3.5-9B-Uncensored", "Base Model": "Qwen3.5-9B", "Creator": "LEONW24", "Method": "Unknown", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "LEONW24/Qwen3.5-9B-Uncensored", "Notes": "Awaiting evaluation"},
7
+ {"Model": "Llama-3.1-8B-Lexi-Uncensored-V2", "Base Model": "Llama-3.1-8B", "Creator": "Orenguteng", "Method": "Unknown", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2", "Notes": "Awaiting evaluation"},
8
+ {"Model": "Huihui-Qwen3-4B-abliterated-v2", "Base Model": "Qwen3-4B", "Creator": "huihui-ai", "Method": "Abliteration", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "huihui-ai/Huihui-Qwen3-4B-abliterated-v2", "Notes": "Awaiting evaluation"},
9
+ {"Model": "Qwen3.5-27B-Derestricted", "Base Model": "Qwen3.5-27B", "Creator": "ArliAI", "Method": "Derestriction", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "ArliAI/Qwen3.5-27B-Derestricted", "Notes": "Awaiting evaluation"},
10
+ {"Model": "Dolphin-Mistral-24B-Venice", "Base Model": "Mistral-24B", "Creator": "dphn", "Method": "Venice uncensored", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "dphn/Dolphin-Mistral-24B-Venice-Edition", "Notes": "Awaiting evaluation"},
11
+ {"Model": "GLM-4.7-Flash-Uncensored-Heretic", "Base Model": "GLM-4.7-Flash", "Creator": "DavidAU", "Method": "Heretic", "Refusals (n=100)": null, "ASR (%)": null, "KL Divergence": null, "HF Link": "DavidAU/GLM-4.7-Flash-Uncensored-Heretic-NEO-CODE-Imatrix-MAX-GGUF", "Notes": "GGUF only - awaiting evaluation"}
12
+ ]
data/v1_vs_v2.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {"Model": "Zephyr-7B-beta", "v1 ASR (%)": 98, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
3
+ {"Model": "DeepSeek-7B-chat", "v1 ASR (%)": 84, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
4
+ {"Model": "Mistral-7B", "v1 ASR (%)": 84, "v2 ASR (%)": 17, "v1 Tool": "Heretic v1.1 (v0.3)", "v2 Tool": "Heretic v1.2 (v0.2)", "Category": "Both"},
5
+ {"Model": "Llama-3.1-8B", "v1 ASR (%)": 76, "v2 ASR (%)": 4, "v1 Tool": "Heretic v1.1 (base)", "v2 Tool": "Heretic v1.2 (Instruct)", "Category": "Both"},
6
+ {"Model": "Qwen3-8B", "v1 ASR (%)": 75, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
7
+ {"Model": "Qwen2.5-7B", "v1 ASR (%)": 58, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
8
+ {"Model": "StableLM-2-12B", "v1 ASR (%)": 46, "v2 ASR (%)": null, "v1 Tool": "Heretic v1.1", "v2 Tool": "Not retested", "Category": "v1 Only"},
9
+ {"Model": "DeepSeek-R1-Distill", "v1 ASR (%)": null, "v2 ASR (%)": 55, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"},
10
+ {"Model": "SmolLM3-3B", "v1 ASR (%)": null, "v2 ASR (%)": 16, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"},
11
+ {"Model": "Gemma-3-12B-it", "v1 ASR (%)": null, "v2 ASR (%)": 3, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"},
12
+ {"Model": "GPT-oss-20b", "v1 ASR (%)": null, "v2 ASR (%)": 2, "v1 Tool": "N/A", "v2 Tool": "Heretic v1.2", "Category": "v2 Only"}
13
+ ]