narcolepticchicken committed on
Commit
1d13104
·
verified ·
1 Parent(s): 8501479

Upload jobs/analyze_collapse.py

Browse files
Files changed (1) hide show
  1. jobs/analyze_collapse.py +355 -0
jobs/analyze_collapse.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCC Collapse Mechanism — Post-Run Analysis Harness
4
+ ===================================================
5
+ Reads the mechanism isolation JSON output and produces:
6
+ - condition_summary.csv
7
+ - per_topic_outcomes.csv (referenced by H7; not written by main() in this version)
8
+ - round_flip_matrix.csv
9
+ - honest_retention_by_round.csv
10
+ - adversary_induced_flips.csv
11
+ - fig_accuracy_by_condition.png
12
+ - fig_honest_retention.png
13
+ - fig_flip_rate.png
14
+ - fig_adversary_skill.png
15
+ - hypothesis_verdicts.json
16
+
17
+ Uses PRE-REGISTERED win conditions defined BEFORE seeing results.
18
+
19
+ Usage:
20
+ python analyze_collapse.py /path/to/debate_collapse_mechanism_results.json
21
+ """
22
+
23
+ import json, csv, sys
24
+ from pathlib import Path
25
+
26
# ─── PRE-REGISTERED WIN CONDITIONS ───
# Defined BEFORE seeing results to prevent post-hoc storytelling.
#
# Maps a hypothesis id to its description and its confirm/refute rules written
# out as text. Within this file, evaluate_hypotheses() reads only the
# "description" field; the "confirmed_if" / "refuted_if" strings are not parsed
# or executed, and the same thresholds are re-implemented by hand there.
#
# NOTE(review): the rule strings abbreviate condition names (e.g.
# "equal_3round", "baseline_1round") while the summary keys looked up by
# evaluate_hypotheses() carry a "_traced" suffix ("equal_3round_traced",
# "baseline_1round_traced"), as the *_condition fields below show.

HYPOTHESIS_RULES = {
    "H1_volume_amplification": {
        "description": "Collapse is driven by adversarial compute volume, not mere presence.",
        "confirmed_if": "equal_token_unequal_turn.mean_accuracy > baseline_1round.mean_accuracy - 0.10",
        "refuted_if": "equal_token_unequal_turn.mean_accuracy <= equal_3round.mean_accuracy + 0.05",
        "test_condition": "equal_token_unequal_turn",
        "baseline_condition": "equal_3round_traced",
        "ref_baseline": "baseline_1round_traced",
    },
    "H2_turn_order": {
        "description": "Collapse depends on adversary speaking last/late.",
        "confirmed_if": "randomized_order_3round.mean_accuracy > equal_3round.mean_accuracy + 0.10",
        "refuted_if": "abs(randomized_order_3round.mean_accuracy - equal_3round.mean_accuracy) < 0.05",
        "test_condition": "randomized_order_3round",
        "baseline_condition": "equal_3round_traced",
    },
    "H3_voting_vulnerability": {
        "description": "Majority voting is fragile; judge or confidence-weighted voting resists collapse.",
        "confirmed_if": "max(judge_vote_3round.mean, confidence_weighted_3round.mean) > equal_3round.mean + 0.10",
        "refuted_if": "abs(judge_vote_3round.mean - equal_3round.mean) < 0.05 and abs(confidence_weighted_3round.mean - equal_3round.mean) < 0.05",
        "test_conditions": ["judge_vote_3round", "confidence_weighted_3round"],
        "baseline_condition": "equal_3round_traced",
    },
    "H4_contamination": {
        "description": "Honest agents copy adversary answers after exposure.",
        "confirmed_if": "honest_retention_rate_round3 < 0.5",
        "refuted_if": "honest_retention_rate_round3 > 0.7",
        # NOTE(review): evaluate_hypotheses() computes the rate as
        # stayed / (stayed + flipped_away) from the first seed only, which is
        # not the same formula as the one written here — confirm which is
        # the registered definition.
        "source": "equal_3round_traced.honest_retention_round3 / (n_topics * 3)",
    },
    "H5_confidence_distortion": {
        "description": "Adversary causes honest agents to become uncertain, not persuaded.",
        "confirmed_if": "confidence_weighted_3round.mean > equal_3round.mean + 0.10",
        "refuted_if": "abs(confidence_weighted_3round.mean - equal_3round.mean) < 0.05",
        "test_condition": "confidence_weighted_3round",
        "baseline_condition": "equal_3round_traced",
    },
    "H6_skill_dependency": {
        "description": "Collapse depends on adversary skill; weak adversary does not collapse.",
        "confirmed_if": "adversary_weak.mean > equal_3round.mean + 0.10 and adversary_oracle.mean <= equal_3round.mean + 0.05",
        "refuted_if": "adversary_weak.mean <= equal_3round.mean + 0.05",
        "test_weak": "adversary_weak",
        # adversary_strong is reported by evaluate_hypotheses() but does not
        # take part in either rule above.
        "test_strong": "adversary_strong",
        "test_oracle": "adversary_oracle",
        "baseline_condition": "equal_3round_traced",
    },
    "H7_topic_vulnerability": {
        "description": "Collapse clusters by question difficulty or ambiguity.",
        # Free-text rules: evaluate_hypotheses() does not compute a verdict
        # for H7 and reports NEEDS_PER_TOPIC_DATA instead.
        "confirmed_if": "Some topics show collapse while others are robust in equal_3round",
        "refuted_if": "All topics show similar collapse magnitude",
        "needs": "per_topic analysis",
    },
}
81
+
82
+
83
def load_data(path):
    """Load the mechanism-isolation results file and return the parsed JSON.

    Args:
        path: Filesystem path (``str`` or ``Path``) of the results JSON.

    Returns:
        The decoded JSON document (a dict for the files this script expects).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the platform
    # locale encoding, which can mis-decode non-ASCII text in the results.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
86
+
87
+
88
def make_summary_table(data):
    """Build one row per condition for ``condition_summary.csv``.

    Args:
        data: Parsed results document. Its ``"summary"`` entry maps a
            condition name to a dict with ``"mean"``, ``"min"`` and ``"max"``.

    Returns:
        A list of dicts with keys ``condition``, ``mean_accuracy``,
        ``min_accuracy``, ``max_accuracy`` and ``range`` (max - min), all
        numbers rounded to 4 decimals. Empty when there is no summary.

    Raises:
        KeyError: If a condition lacks ``"mean"``, ``"min"`` or ``"max"``.
    """
    # ``or {}`` also covers an explicit JSON null for "summary".
    summary = data.get("summary") or {}
    rows = []
    for name, stats in summary.items():
        lowest, highest = stats["min"], stats["max"]
        rows.append({
            "condition": name,
            "mean_accuracy": round(stats["mean"], 4),
            "min_accuracy": round(lowest, 4),
            "max_accuracy": round(highest, 4),
            "range": round(highest - lowest, 4),
        })
    return rows
96
+
97
+
98
def make_retention_table(data):
    """Build the per-seed, per-round rows for ``honest_retention_by_round.csv``.

    For every seed that has a non-empty ``"equal_3round_traced"`` entry, two
    rows are produced (rounds 2 and 3). Counters that are absent from the
    trace are reported as 0.

    Args:
        data: Parsed results document with a ``"seeds"`` mapping of
            seed id (string) -> per-condition results.

    Returns:
        A list of dicts with keys ``seed``, ``round``, ``stayed``,
        ``flipped_away``, ``flipped_toward`` and ``adversary_flips``.

    Raises:
        ValueError: If a seed key of a traced seed is not an integer string.
    """
    rows = []
    for seed_str, seed_data in data.get("seeds", {}).items():
        # ``or {}`` also covers an explicit JSON null for the trace.
        traced = seed_data.get("equal_3round_traced") or {}
        if not traced:
            continue
        seed = int(seed_str)
        # "adversary_flips" is a per-seed total, repeated on both round rows.
        adversary_flips = traced.get("adversary_flips", 0)
        for rd in (2, 3):
            rows.append({
                "seed": seed,
                "round": rd,
                "stayed": traced.get(f"honest_retention_round{rd}", 0),
                "flipped_away": traced.get(f"flipped_away_round{rd}", 0),
                "flipped_toward": traced.get(f"flipped_toward_round{rd}", 0),
                "adversary_flips": adversary_flips,
            })
    return rows
113
+
114
+
115
def make_flip_matrix(data):
    """Build the rows for ``round_flip_matrix.csv`` and ``adversary_induced_flips.csv``.

    Transition keys in each seed's ``equal_3round_traced["transitions"]`` are
    expected to look like ``"h<int>_h<int>"``; the two integers are reported
    as ``round1_correct`` and ``round3_correct``. Keys that do not match that
    shape are skipped.

    Args:
        data: Parsed results document with a ``"seeds"`` mapping of
            seed id (string) -> per-condition results.

    Returns:
        A ``(flip_rows, adv_rows)`` tuple. ``flip_rows`` has one dict per
        recognised transition (``seed``, ``transition``, ``round1_correct``,
        ``round3_correct``, ``count``, ``flip_direction``). ``adv_rows`` has
        one dict per seed (``seed``, ``total_adversary_flips``), including
        seeds without a trace, which report 0.

    Raises:
        ValueError: If a seed key is not an integer string.
    """
    flip_rows, adv_rows = [], []
    for seed_str, seed_data in data.get("seeds", {}).items():
        seed = int(seed_str)
        traced = seed_data.get("equal_3round_traced", {})
        for tkey, count in traced.get("transitions", {}).items():
            parts = tkey.split("_")
            if len(parts) != 2 or not all(p.startswith("h") for p in parts):
                continue
            # Guard only the parsing: a non-numeric suffix means the key is
            # not a transition counter, so it is skipped rather than fatal.
            try:
                r1, r3 = int(parts[0][1:]), int(parts[1][1:])
            except ValueError:
                continue
            if r1 == r3:
                direction = "stable"
            elif r1 > r3:
                direction = "degraded"
            else:
                direction = "improved"
            flip_rows.append({
                "seed": seed,
                "transition": tkey,
                "round1_correct": r1,
                "round3_correct": r3,
                "count": count,
                "flip_direction": direction,
            })
        adv_rows.append({
            "seed": seed,
            "total_adversary_flips": traced.get("adversary_flips", 0),
        })
    return flip_rows, adv_rows
134
+
135
+
136
# Accuracy gain (absolute) that counts as recovery from the collapse condition.
_CONFIRM_MARGIN = 0.10
# Accuracy difference (absolute) below which two conditions count as equal.
_REFUTE_MARGIN = 0.05


def _condition_mean(summary, condition):
    """Return the mean accuracy recorded for *condition*, or None if absent."""
    stats = summary.get(condition) or {}
    return stats.get("mean")


def _round4(value):
    """Round to 4 decimals for reporting; None (missing) passes through."""
    return None if value is None else round(value, 4)


def _lacks_data(verdict, **needed):
    """Note any missing inputs in ``verdict["reason"]``; return True if some are missing."""
    absent = [name for name, value in needed.items() if value is None]
    if absent:
        verdict["reason"] = "Missing data for: " + ", ".join(absent) + "."
    return bool(absent)


def _evaluate_one(h_name, rules, summary, retention_rate):
    """Return the verdict dict for a single hypothesis."""
    v = {"hypothesis": h_name, "description": rules["description"],
         "verdict": "INCONCLUSIVE", "reason": "", "values": {}}
    collapse = _condition_mean(summary, "equal_3round_traced")

    if h_name == "H1_volume_amplification":
        test = _condition_mean(summary, "equal_token_unequal_turn")
        base1 = _condition_mean(summary, "baseline_1round_traced")
        v["values"] = {"baseline_1round": _round4(base1), "collapse": _round4(collapse),
                       "equal_token": _round4(test)}
        if _lacks_data(v, equal_token_unequal_turn=test, equal_3round_traced=collapse,
                       baseline_1round_traced=base1):
            return v
        if test > base1 - _CONFIRM_MARGIN:
            v["verdict"] = "CONFIRMED"
            v["reason"] = f"Equal-token recovered to {test:.3f}, within 10pp of baseline {base1:.3f}. Volume amplification is primary mechanism."
        elif test <= collapse + _REFUTE_MARGIN:
            v["verdict"] = "REFUTED"
            v["reason"] = f"Equal-token at {test:.3f} barely above collapse {collapse:.3f}. Volume alone insufficient."
        else:
            v["verdict"] = "PARTIAL"
            v["reason"] = f"Partial recovery to {test:.3f} from {collapse:.3f}. Volume is a factor but not the only one."

    elif h_name == "H2_turn_order":
        test = _condition_mean(summary, "randomized_order_3round")
        v["values"] = {"randomized": _round4(test), "collapse": _round4(collapse)}
        if _lacks_data(v, randomized_order_3round=test, equal_3round_traced=collapse):
            return v
        if test > collapse + _CONFIRM_MARGIN:
            v["verdict"] = "CONFIRMED"
            v["reason"] = f"Randomized order recovered to {test:.3f}."
        elif abs(test - collapse) < _REFUTE_MARGIN:
            v["verdict"] = "REFUTED"
            v["reason"] = f"No difference ({test:.3f} vs {collapse:.3f})."
        else:
            v["verdict"] = "PARTIAL"
            v["reason"] = f"Some recovery ({test:.3f} vs {collapse:.3f})."

    elif h_name == "H3_voting_vulnerability":
        judge = _condition_mean(summary, "judge_vote_3round")
        conf = _condition_mean(summary, "confidence_weighted_3round")
        v["values"] = {"judge_vote": _round4(judge), "confidence_weighted": _round4(conf),
                       "collapse": _round4(collapse)}
        if _lacks_data(v, judge_vote_3round=judge, confidence_weighted_3round=conf,
                       equal_3round_traced=collapse):
            return v
        best = max(judge, conf)
        if best > collapse + _CONFIRM_MARGIN:
            v["verdict"] = "CONFIRMED"
            v["reason"] = f"Alt voting recovered to {best:.3f}. Protocol matters."
        elif abs(judge - collapse) < _REFUTE_MARGIN and abs(conf - collapse) < _REFUTE_MARGIN:
            v["verdict"] = "REFUTED"
            v["reason"] = "No voting protocol helps."
        else:
            v["verdict"] = "PARTIAL"

    elif h_name == "H4_contamination":
        v["values"] = {"retention_rate_round3": _round4(retention_rate)}
        if retention_rate is None:
            v["reason"] = "No round-3 retention data for equal_3round_traced in the first seed."
            return v
        if retention_rate < 0.5:
            v["verdict"] = "CONFIRMED"
            v["reason"] = f"Only {retention_rate:.1%} retained answer. Contamination present."
        elif retention_rate > 0.7:
            v["verdict"] = "REFUTED"
            v["reason"] = f"{retention_rate:.1%} retention — honest agents resist corruption."
        else:
            v["verdict"] = "PARTIAL"
            v["reason"] = f"{retention_rate:.1%} retention — some contamination."

    elif h_name == "H5_confidence_distortion":
        test = _condition_mean(summary, "confidence_weighted_3round")
        v["values"] = {"confidence_weighted": _round4(test), "collapse": _round4(collapse)}
        if _lacks_data(v, confidence_weighted_3round=test, equal_3round_traced=collapse):
            return v
        if test > collapse + _CONFIRM_MARGIN:
            v["verdict"] = "CONFIRMED"
            v["reason"] = "Confidence weighting recovers accuracy."
        elif abs(test - collapse) < _REFUTE_MARGIN:
            v["verdict"] = "REFUTED"
            v["reason"] = "Confidence weighting does not help."
        else:
            v["verdict"] = "PARTIAL"

    elif h_name == "H6_skill_dependency":
        weak = _condition_mean(summary, "adversary_weak")
        oracle = _condition_mean(summary, "adversary_oracle")
        # "normal" and "strong" are reported only; they do not affect the verdict.
        v["values"] = {"weak": _round4(weak),
                       "normal": _round4(_condition_mean(summary, "adversary_normal")),
                       "strong": _round4(_condition_mean(summary, "adversary_strong")),
                       "oracle": _round4(oracle), "collapse": _round4(collapse)}
        if _lacks_data(v, adversary_weak=weak, adversary_oracle=oracle,
                       equal_3round_traced=collapse):
            return v
        if weak > collapse + _CONFIRM_MARGIN and oracle <= collapse + _REFUTE_MARGIN:
            v["verdict"] = "CONFIRMED"
            v["reason"] = f"Weak={weak:.3f} avoids collapse, oracle={oracle:.3f} does not. Skill matters."
        elif weak <= collapse + _REFUTE_MARGIN:
            v["verdict"] = "REFUTED"
            v["reason"] = f"Even weak adversary collapses to {weak:.3f}."
        else:
            v["verdict"] = "PARTIAL"
            v["reason"] = f"Skill gradient: weak={weak:.3f}, oracle={oracle:.3f}."

    elif h_name == "H7_topic_vulnerability":
        v["verdict"] = "NEEDS_PER_TOPIC_DATA"
        v["reason"] = "Requires per-topic accuracy breakdown. Check per_topic_outcomes.csv."

    return v


def evaluate_hypotheses(data):
    """Judge every hypothesis in ``HYPOTHESIS_RULES`` against the results.

    Verdicts are CONFIRMED, REFUTED or PARTIAL according to the thresholds
    coded here (0.10 recovery margin, 0.05 "no effect" margin, 0.5 / 0.7
    retention bounds). When an input a hypothesis needs is absent from
    ``data["summary"]`` — or, for H4, when the first seed has no round-3
    retention counts — the verdict stays INCONCLUSIVE and the reason names
    what is missing, instead of judging against a defaulted value.

    Args:
        data: Parsed results document with ``"summary"`` (condition ->
            stats containing ``"mean"``) and ``"seeds"`` mappings.

    Returns:
        A dict mapping hypothesis id to a verdict dict with keys
        ``hypothesis``, ``description``, ``verdict``, ``reason`` and
        ``values``. If evaluating a hypothesis raises, its entry holds only
        ``hypothesis``, ``verdict`` ("ERROR") and ``reason``.
    """
    summary = data.get("summary") or {}
    seeds = data.get("seeds") or {}

    # H4 retention comes from the first seed only (in file order).
    first_seed = next(iter(seeds.values()), None) or {}
    traced = first_seed.get("equal_3round_traced") or {}
    retention_r3 = traced.get("honest_retention_round3", 0)
    flipped_r3 = traced.get("flipped_away_round3", 0)
    total_r3 = retention_r3 + flipped_r3
    retention_rate = retention_r3 / total_r3 if total_r3 > 0 else None

    verdicts = {}
    for h_name, rules in HYPOTHESIS_RULES.items():
        # Deliberate per-hypothesis boundary: one malformed entry is recorded
        # as ERROR rather than aborting the remaining hypotheses.
        try:
            verdicts[h_name] = _evaluate_one(h_name, rules, summary, retention_rate)
        except Exception as e:
            verdicts[h_name] = {"hypothesis": h_name, "verdict": "ERROR", "reason": str(e)}

    return verdicts
233
+
234
+
235
def make_charts(summary_rows, retention_rows, flip_rows, out_dir):
    """Render the PNG figures into *out_dir*, if matplotlib is installed.

    Figures written: ``fig_accuracy_by_condition.png`` (always),
    ``fig_honest_retention.png`` (when *retention_rows* is non-empty),
    ``fig_flip_rate.png`` (when *flip_rows* has a positive total count) and
    ``fig_adversary_skill.png`` (when any condition starts with
    ``adversary_``).

    Args:
        summary_rows: Rows as produced by ``make_summary_table``.
        retention_rows: Rows as produced by ``make_retention_table``.
        flip_rows: Flip rows as produced by ``make_flip_matrix``.
        out_dir: Output directory; created if missing.

    Returns:
        True when charts were written, False when matplotlib is unavailable
        (in which case nothing is created).
    """
    # Only the imports are guarded, so a genuine plotting error is not
    # mistaken for a missing dependency.
    try:
        import matplotlib
        matplotlib.use("Agg")  # select the non-interactive backend before pyplot loads
        import matplotlib.pyplot as plt
    except ImportError:
        print(" matplotlib not available — skipping charts")
        return False

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1. Accuracy by condition
    conds = [r["condition"] for r in summary_rows]
    means = [r["mean_accuracy"] for r in summary_rows]
    mins_ = [r["min_accuracy"] for r in summary_rows]
    maxs_ = [r["max_accuracy"] for r in summary_rows]
    # Find the 1-round baseline by name; fall back to the first row (then to
    # 0.85) when that condition is absent.
    base1_acc = next(
        (r["mean_accuracy"] for r in summary_rows
         if r["condition"] == "baseline_1round_traced"),
        means[0] if means else 0.85,
    )

    fig, ax = plt.subplots(figsize=(14, 5))
    x = list(range(len(conds)))
    # Asymmetric error bars: distance down to the min and up to the max.
    errs = [[m - lo for m, lo in zip(means, mins_)],
            [hi - m for m, hi in zip(means, maxs_)]]
    ax.bar(x, means, yerr=errs, capsize=4, color="steelblue", edgecolor="navy")
    ax.axhline(y=base1_acc, color="green", linestyle="--", alpha=0.5, linewidth=2,
               label=f"1-round baseline ({base1_acc:.3f})")
    ax.set_xticks(x)
    ax.set_xticklabels(conds, rotation=45, ha="right", fontsize=7)
    ax.set_ylabel("Accuracy")
    ax.set_title("Collapse Mechanism Isolation: Accuracy by Condition")
    ax.legend()
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(out_dir / "fig_accuracy_by_condition.png", dpi=150)
    plt.close()

    # 2. Honest retention
    if retention_rows:
        fig, ax = plt.subplots(figsize=(7, 4))
        for s in sorted({r["seed"] for r in retention_rows}):
            seed_rows = sorted((r for r in retention_rows if r["seed"] == s),
                               key=lambda r: r["round"])
            # Take x from the rows themselves so a seed with a missing round
            # cannot cause an x/y length mismatch.
            ax.plot([r["round"] for r in seed_rows], [r["stayed"] for r in seed_rows],
                    "o-", label=f"Seed {s}")
        ax.set_xlabel("Round")
        ax.set_ylabel("Honest agents staying with original answer")
        ax.set_title("Honest Answer Retention Across Rounds")
        ax.legend()
        ax.set_xticks([2, 3])
        plt.tight_layout()
        plt.savefig(out_dir / "fig_honest_retention.png", dpi=150)
        plt.close()

    # 3. Flip pie
    if flip_rows:
        totals = {"stable": 0, "degraded": 0, "improved": 0}
        for r in flip_rows:
            if r["flip_direction"] in totals:
                totals[r["flip_direction"]] += r["count"]
        # A pie of all-zero counts has no defined proportions; skip it.
        if sum(totals.values()) > 0:
            fig, ax = plt.subplots(figsize=(6, 6))
            ax.pie([totals["stable"], totals["degraded"], totals["improved"]],
                   labels=["Stable", "Degraded", "Improved"],
                   colors=["gray", "crimson", "forestgreen"],
                   autopct="%1.1f%%", startangle=90)
            ax.set_title("Honest Agent Answer Transitions (R1→R3)")
            plt.tight_layout()
            plt.savefig(out_dir / "fig_flip_rate.png", dpi=150)
            plt.close()

    # 4. Adversary skill gradient
    skill_rows = [r for r in summary_rows if r["condition"].startswith("adversary_")]
    if skill_rows:
        fig, ax = plt.subplots(figsize=(7, 4))
        ax.bar([r["condition"].replace("adversary_", "") for r in skill_rows],
               [r["mean_accuracy"] for r in skill_rows],
               color=["lightgreen", "steelblue", "darkorange", "crimson"])
        ax.set_ylabel("Accuracy")
        ax.set_title("Adversary Skill Ablation")
        ax.set_ylim(0, 1.05)
        plt.xticks(rotation=20, ha="right", fontsize=9)
        plt.tight_layout()
        plt.savefig(out_dir / "fig_adversary_skill.png", dpi=150)
        plt.close()

    print(f" Charts saved to {out_dir}/")
    return True
302
+
303
+
304
def _write_csv(path, fieldnames, rows):
    """Write *rows* (dicts) to *path* as CSV with a header, then report the row count."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f" {path.name}: {len(rows)} rows")


def main():
    """Command-line entry point: analyse one results JSON file.

    Reads the path from ``sys.argv[1]`` and writes every output into an
    ``analysis`` directory next to that file: the summary, retention, flip
    and adversary-flip CSVs, ``hypothesis_verdicts.json``, and the charts
    (when matplotlib is available). ``round_flip_matrix.csv`` is written only
    when at least one transition row exists. Exits with status 1 and a usage
    message when no path is given.
    """
    if len(sys.argv) < 2:
        print("Usage: python analyze_collapse.py <results.json>")
        sys.exit(1)

    path = Path(sys.argv[1])
    out_dir = path.parent / "analysis"

    # Load before creating the output directory so that an unreadable or
    # invalid input does not leave an empty analysis/ folder behind.
    print(f"Loading {path}...")
    data = load_data(path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Write CSVs
    summary_rows = make_summary_table(data)
    _write_csv(out_dir / "condition_summary.csv",
               ["condition", "mean_accuracy", "min_accuracy", "max_accuracy", "range"],
               summary_rows)

    retention_rows = make_retention_table(data)
    _write_csv(out_dir / "honest_retention_by_round.csv",
               ["seed", "round", "stayed", "flipped_away", "flipped_toward", "adversary_flips"],
               retention_rows)

    flip_rows, adv_rows = make_flip_matrix(data)
    if flip_rows:
        _write_csv(out_dir / "round_flip_matrix.csv",
                   ["seed", "transition", "round1_correct", "round3_correct",
                    "count", "flip_direction"],
                   flip_rows)
    _write_csv(out_dir / "adversary_induced_flips.csv",
               ["seed", "total_adversary_flips"],
               adv_rows)

    # Hypothesis verdicts
    verdicts = evaluate_hypotheses(data)
    with open(out_dir / "hypothesis_verdicts.json", "w", encoding="utf-8") as f:
        json.dump(verdicts, f, indent=2)
    print("\n Hypothesis verdicts:")
    for h, v in verdicts.items():
        print(f" {h}: {v['verdict']} — {v.get('reason','')[:120]}")

    # Charts
    make_charts(summary_rows, retention_rows, flip_rows, out_dir)

    print(f"\nDone. Outputs in {out_dir}/")


if __name__ == "__main__":
    main()