Ric committed on
Commit
f35fdee
·
1 Parent(s): d55e7bb

refactor: extract hardcoded data to JSON files

Browse files

Move all dashboard data (leaderboard, capabilities, compatibility,
mean capability change) from hardcoded Python dicts to data/*.json.
Makes it easier to add new model results without editing app.py.

app.py CHANGED
@@ -4,6 +4,9 @@ Visualizes results from "Comparative Analysis of LLM Abliteration Methods:
4
  A Cross-Architecture Evaluation" (arxiv:2512.13655) by Richard J. Young.
5
  """
6
 
 
 
 
7
  import gradio as gr
8
  import pandas as pd
9
  import plotly.express as px
@@ -68,103 +71,24 @@ CSS = """
68
  """
69
 
70
  # ---------------------------------------------------------------------------
71
- # Hard-coded data
72
  # ---------------------------------------------------------------------------
73
 
74
- LEADERBOARD_ROWS = [
75
- {"Model": "Zephyr-7B-beta", "Parameters": "7B", "Refusals (n=100)": 2,
76
- "KL Divergence": 0.076, "ASR (%)": 98, "ASR 95% CI": "93.0\u201399.4", "Time": "40m"},
77
- {"Model": "DeepSeek-7B-chat", "Parameters": "7B", "Refusals (n=100)": 16,
78
- "KL Divergence": 0.043, "ASR (%)": 84, "ASR 95% CI": "75.6\u201389.9", "Time": "59m"},
79
- {"Model": "Mistral-7B-v0.3", "Parameters": "7B", "Refusals (n=100)": 16,
80
- "KL Divergence": 0.317, "ASR (%)": 84, "ASR 95% CI": "75.6\u201389.9", "Time": "39m"},
81
- {"Model": "Llama-3.1-8B", "Parameters": "8B", "Refusals (n=100)": 24,
82
- "KL Divergence": 0.056, "ASR (%)": 76, "ASR 95% CI": "66.8\u201383.3", "Time": "33m"},
83
- {"Model": "Qwen3-8B", "Parameters": "8B", "Refusals (n=100)": 25,
84
- "KL Divergence": 0.210, "ASR (%)": 75, "ASR 95% CI": "65.7\u201382.5", "Time": "56m"},
85
- {"Model": "Yi-1.5-9B", "Parameters": "9B", "Refusals (n=100)": 25,
86
- "KL Divergence": 0.248, "ASR (%)": 75, "ASR 95% CI": "65.7\u201382.5", "Time": "57m"},
87
- {"Model": "Qwen2.5-7B", "Parameters": "7B", "Refusals (n=100)": 42,
88
- "KL Divergence": 1.646, "ASR (%)": 58, "ASR 95% CI": "48.2\u201367.2", "Time": "41m"},
89
- {"Model": "StableLM-2-12B", "Parameters": "12B", "Refusals (n=100)": 54,
90
- "KL Divergence": 1.605, "ASR (%)": 46, "ASR 95% CI": "36.6\u201355.7", "Time": "109m"},
91
- ]
92
-
93
- CAPABILITY_DATA = {
94
- "DeepSeek-7B": [
95
- {"Variant": "Base", "MMLU": 49.44, "GSM8K": 44.58, "HellaSwag": 77.84},
96
- {"Variant": "Heretic", "MMLU": 48.95, "GSM8K": 40.11, "HellaSwag": 77.62},
97
- {"Variant": "DECCP", "MMLU": 49.05, "GSM8K": 43.59, "HellaSwag": 77.99},
98
- {"Variant": "ErisForge", "MMLU": 49.43, "GSM8K": 44.35, "HellaSwag": 77.69},
99
- ],
100
- "Mistral-7B": [
101
- {"Variant": "Base", "MMLU": 59.74, "GSM8K": 48.52, "HellaSwag": 83.28},
102
- {"Variant": "Heretic", "MMLU": 59.46, "GSM8K": 48.37, "HellaSwag": 83.36},
103
- {"Variant": "DECCP", "MMLU": 58.98, "GSM8K": 47.61, "HellaSwag": 83.12},
104
- {"Variant": "ErisForge", "MMLU": 59.42, "GSM8K": 48.29, "HellaSwag": 83.35},
105
- ],
106
- "Yi-1.5-9B": [
107
- {"Variant": "Base", "MMLU": 68.02, "GSM8K": 70.89, "HellaSwag": 78.62},
108
- {"Variant": "Heretic", "MMLU": 66.46, "GSM8K": 52.08, "HellaSwag": 77.08},
109
- {"Variant": "DECCP", "MMLU": 67.33, "GSM8K": 72.40, "HellaSwag": 77.87},
110
- {"Variant": "ErisForge", "MMLU": 67.99, "GSM8K": 70.51, "HellaSwag": 78.46},
111
- ],
112
- "Zephyr-7B": [
113
- {"Variant": "Heretic", "MMLU": 58.50, "GSM8K": 33.36, "HellaSwag": 82.90},
114
- {"Variant": "DECCP", "MMLU": 58.28, "GSM8K": 33.21, "HellaSwag": 82.05},
115
- ],
116
- }
117
 
118
- MEAN_CAPABILITY_CHANGE = [
119
- {"Tool": "Heretic", "Avg MMLU \u0394 (pp)": -0.78,
120
- "Avg GSM8K \u0394 (pp)": -7.81, "Avg HellaSwag \u0394 (pp)": -0.56},
121
- {"Tool": "DECCP", "Avg MMLU \u0394 (pp)": -0.61,
122
- "Avg GSM8K \u0394 (pp)": -0.13, "Avg HellaSwag \u0394 (pp)": -0.25},
123
- {"Tool": "ErisForge", "Avg MMLU \u0394 (pp)": -0.12,
124
- "Avg GSM8K \u0394 (pp)": -0.28, "Avg HellaSwag \u0394 (pp)": -0.08},
125
- ]
126
-
127
- COMPATIBILITY_ROWS = [
128
- {"Model": "Llama-3.1-8B", "Heretic": "Yes", "DECCP": "Yes",
129
- "FailSpy": "Yes", "ErisForge": "Yes"},
130
- {"Model": "Mistral-7B-v0.3", "Heretic": "Yes", "DECCP": "Yes",
131
- "FailSpy": "Yes", "ErisForge": "Yes"},
132
- {"Model": "Qwen2.5-7B", "Heretic": "Yes", "DECCP": "Yes",
133
- "FailSpy": "Yes", "ErisForge": "Yes"},
134
- {"Model": "Gemma-2-9B", "Heretic": "Yes", "DECCP": "Yes",
135
- "FailSpy": "Yes", "ErisForge": "Yes"},
136
- {"Model": "Gemma-7B", "Heretic": "Yes", "DECCP": "Yes",
137
- "FailSpy": "Yes", "ErisForge": "Yes"},
138
- {"Model": "StableLM-2-12B", "Heretic": "Yes", "DECCP": "Yes",
139
- "FailSpy": "Partial", "ErisForge": "Yes"},
140
- {"Model": "Yi-1.5-9B", "Heretic": "Yes", "DECCP": "Yes",
141
- "FailSpy": "Partial", "ErisForge": "Yes"},
142
- {"Model": "Zephyr-7B-beta", "Heretic": "Yes", "DECCP": "Yes",
143
- "FailSpy": "Partial", "ErisForge": "Yes"},
144
- {"Model": "DeepSeek-7B", "Heretic": "Yes", "DECCP": "Yes",
145
- "FailSpy": "Partial", "ErisForge": "Yes"},
146
- {"Model": "OpenChat-3.5", "Heretic": "Yes", "DECCP": "Yes",
147
- "FailSpy": "Partial", "ErisForge": "No"},
148
- {"Model": "Qwen3-8B", "Heretic": "Yes", "DECCP": "Yes",
149
- "FailSpy": "Partial", "ErisForge": "N/A"},
150
- {"Model": "Vicuna-7B", "Heretic": "Yes", "DECCP": "N/A",
151
- "FailSpy": "Partial", "ErisForge": "No"},
152
- {"Model": "InternLM2.5-7B", "Heretic": "Yes", "DECCP": "N/A",
153
- "FailSpy": "Partial", "ErisForge": "No"},
154
- {"Model": "Falcon-Mamba-7B", "Heretic": "Yes", "DECCP": "Incompatible",
155
- "FailSpy": "Incompatible", "ErisForge": "Incompatible"},
156
- {"Model": "Phi-3-small-8k", "Heretic": "Yes", "DECCP": "N/A",
157
- "FailSpy": "Partial", "ErisForge": "N/A"},
158
- {"Model": "Qwen3-14B", "Heretic": "Yes", "DECCP": "N/A",
159
- "FailSpy": "Partial", "ErisForge": "N/A"},
160
- ]
161
-
162
- COVERAGE_TOTALS = {
163
- "Heretic": "16/16 (100%)",
164
- "DECCP": "11/16 (69%)",
165
- "FailSpy": "5/16 (31%)",
166
- "ErisForge": "9/16 (56%)",
167
- }
168
 
169
  # ---------------------------------------------------------------------------
170
  # Helpers
 
4
  A Cross-Architecture Evaluation" (arxiv:2512.13655) by Richard J. Young.
5
  """
6
 
7
+ import json
8
+ from pathlib import Path
9
+
10
  import gradio as gr
11
  import pandas as pd
12
  import plotly.express as px
 
71
  """
72
 
73
  # ---------------------------------------------------------------------------
74
+ # Data (loaded from JSON files in data/ directory)
75
  # ---------------------------------------------------------------------------
76
 
77
+ DATA_DIR = Path(__file__).parent / "data"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
+
80
def _load_json(name: str):
    """Parse and return the JSON document stored at ``DATA_DIR / name``.

    Args:
        name: File name (relative to ``DATA_DIR``) of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Pin the encoding: without it, open() falls back to the platform's
    # locale encoding, which would mis-decode non-ASCII characters
    # (e.g. a literal "Δ" or en dash) on non-UTF-8 systems.
    with open(DATA_DIR / name, encoding="utf-8") as f:
        return json.load(f)
83
+
84
+
85
# Each of these dashboard tables is read from its own file under DATA_DIR.
LEADERBOARD_ROWS = _load_json("leaderboard.json")
CAPABILITY_DATA = _load_json("capabilities.json")
MEAN_CAPABILITY_CHANGE = _load_json("mean_capability_change.json")

# compatibility.json carries two tables in one document: the per-model
# "rows" and the per-tool coverage "totals".
_compat = _load_json("compatibility.json")
COMPATIBILITY_ROWS, COVERAGE_TOTALS = _compat["rows"], _compat["totals"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # ---------------------------------------------------------------------------
94
  # Helpers
data/capabilities.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "DeepSeek-7B": [
3
+ {"Variant": "Base", "MMLU": 49.44, "GSM8K": 44.58, "HellaSwag": 77.84},
4
+ {"Variant": "Heretic", "MMLU": 48.95, "GSM8K": 40.11, "HellaSwag": 77.62},
5
+ {"Variant": "DECCP", "MMLU": 49.05, "GSM8K": 43.59, "HellaSwag": 77.99},
6
+ {"Variant": "ErisForge", "MMLU": 49.43, "GSM8K": 44.35, "HellaSwag": 77.69}
7
+ ],
8
+ "Mistral-7B": [
9
+ {"Variant": "Base", "MMLU": 59.74, "GSM8K": 48.52, "HellaSwag": 83.28},
10
+ {"Variant": "Heretic", "MMLU": 59.46, "GSM8K": 48.37, "HellaSwag": 83.36},
11
+ {"Variant": "DECCP", "MMLU": 58.98, "GSM8K": 47.61, "HellaSwag": 83.12},
12
+ {"Variant": "ErisForge", "MMLU": 59.42, "GSM8K": 48.29, "HellaSwag": 83.35}
13
+ ],
14
+ "Yi-1.5-9B": [
15
+ {"Variant": "Base", "MMLU": 68.02, "GSM8K": 70.89, "HellaSwag": 78.62},
16
+ {"Variant": "Heretic", "MMLU": 66.46, "GSM8K": 52.08, "HellaSwag": 77.08},
17
+ {"Variant": "DECCP", "MMLU": 67.33, "GSM8K": 72.40, "HellaSwag": 77.87},
18
+ {"Variant": "ErisForge", "MMLU": 67.99, "GSM8K": 70.51, "HellaSwag": 78.46}
19
+ ],
20
+ "Zephyr-7B": [
21
+ {"Variant": "Heretic", "MMLU": 58.50, "GSM8K": 33.36, "HellaSwag": 82.90},
22
+ {"Variant": "DECCP", "MMLU": 58.28, "GSM8K": 33.21, "HellaSwag": 82.05}
23
+ ]
24
+ }
data/compatibility.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "rows": [
3
+ {"Model": "Llama-3.1-8B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Yes", "ErisForge": "Yes"},
4
+ {"Model": "Mistral-7B-v0.3", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Yes", "ErisForge": "Yes"},
5
+ {"Model": "Qwen2.5-7B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Yes", "ErisForge": "Yes"},
6
+ {"Model": "Gemma-2-9B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Yes", "ErisForge": "Yes"},
7
+ {"Model": "Gemma-7B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Yes", "ErisForge": "Yes"},
8
+ {"Model": "StableLM-2-12B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Partial", "ErisForge": "Yes"},
9
+ {"Model": "Yi-1.5-9B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Partial", "ErisForge": "Yes"},
10
+ {"Model": "Zephyr-7B-beta", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Partial", "ErisForge": "Yes"},
11
+ {"Model": "DeepSeek-7B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Partial", "ErisForge": "Yes"},
12
+ {"Model": "OpenChat-3.5", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Partial", "ErisForge": "No"},
13
+ {"Model": "Qwen3-8B", "Heretic": "Yes", "DECCP": "Yes", "FailSpy": "Partial", "ErisForge": "N/A"},
14
+ {"Model": "Vicuna-7B", "Heretic": "Yes", "DECCP": "N/A", "FailSpy": "Partial", "ErisForge": "No"},
15
+ {"Model": "InternLM2.5-7B", "Heretic": "Yes", "DECCP": "N/A", "FailSpy": "Partial", "ErisForge": "No"},
16
+ {"Model": "Falcon-Mamba-7B", "Heretic": "Yes", "DECCP": "Incompatible", "FailSpy": "Incompatible", "ErisForge": "Incompatible"},
17
+ {"Model": "Phi-3-small-8k", "Heretic": "Yes", "DECCP": "N/A", "FailSpy": "Partial", "ErisForge": "N/A"},
18
+ {"Model": "Qwen3-14B", "Heretic": "Yes", "DECCP": "N/A", "FailSpy": "Partial", "ErisForge": "N/A"}
19
+ ],
20
+ "totals": {
21
+ "Heretic": "16/16 (100%)",
22
+ "DECCP": "11/16 (69%)",
23
+ "FailSpy": "5/16 (31%)",
24
+ "ErisForge": "9/16 (56%)"
25
+ }
26
+ }
data/leaderboard.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {"Model": "Zephyr-7B-beta", "Parameters": "7B", "Refusals (n=100)": 2, "KL Divergence": 0.076, "ASR (%)": 98, "ASR 95% CI": "93.0\u201399.4", "Time": "40m"},
3
+ {"Model": "DeepSeek-7B-chat", "Parameters": "7B", "Refusals (n=100)": 16, "KL Divergence": 0.043, "ASR (%)": 84, "ASR 95% CI": "75.6\u201389.9", "Time": "59m"},
4
+ {"Model": "Mistral-7B-v0.3", "Parameters": "7B", "Refusals (n=100)": 16, "KL Divergence": 0.317, "ASR (%)": 84, "ASR 95% CI": "75.6\u201389.9", "Time": "39m"},
5
+ {"Model": "Llama-3.1-8B", "Parameters": "8B", "Refusals (n=100)": 24, "KL Divergence": 0.056, "ASR (%)": 76, "ASR 95% CI": "66.8\u201383.3", "Time": "33m"},
6
+ {"Model": "Qwen3-8B", "Parameters": "8B", "Refusals (n=100)": 25, "KL Divergence": 0.210, "ASR (%)": 75, "ASR 95% CI": "65.7\u201382.5", "Time": "56m"},
7
+ {"Model": "Yi-1.5-9B", "Parameters": "9B", "Refusals (n=100)": 25, "KL Divergence": 0.248, "ASR (%)": 75, "ASR 95% CI": "65.7\u201382.5", "Time": "57m"},
8
+ {"Model": "Qwen2.5-7B", "Parameters": "7B", "Refusals (n=100)": 42, "KL Divergence": 1.646, "ASR (%)": 58, "ASR 95% CI": "48.2\u201367.2", "Time": "41m"},
9
+ {"Model": "StableLM-2-12B", "Parameters": "12B", "Refusals (n=100)": 54, "KL Divergence": 1.605, "ASR (%)": 46, "ASR 95% CI": "36.6\u201355.7", "Time": "109m"}
10
+ ]
data/mean_capability_change.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [
2
+ {"Tool": "Heretic", "Avg MMLU \u0394 (pp)": -0.78, "Avg GSM8K \u0394 (pp)": -7.81, "Avg HellaSwag \u0394 (pp)": -0.56},
3
+ {"Tool": "DECCP", "Avg MMLU \u0394 (pp)": -0.61, "Avg GSM8K \u0394 (pp)": -0.13, "Avg HellaSwag \u0394 (pp)": -0.25},
4
+ {"Tool": "ErisForge", "Avg MMLU \u0394 (pp)": -0.12, "Avg GSM8K \u0394 (pp)": -0.28, "Avg HellaSwag \u0394 (pp)": -0.08}
5
+ ]