narcolepticchicken committed on
Commit
34a5c7f
·
verified ·
1 Parent(s): 26a56c7

Upload jobs/analyze_collapse.py

Browse files
Files changed (1) hide show
  1. jobs/analyze_collapse.py +201 -287
jobs/analyze_collapse.py CHANGED
@@ -1,355 +1,269 @@
1
  #!/usr/bin/env python3
2
  """
3
- OCC Collapse Mechanism — Post-Run Analysis Harness
4
- ===================================================
5
- Reads the mechanism isolation JSON output and produces:
6
- - condition_summary.csv
7
- - per_topic_outcomes.csv
8
- - round_flip_matrix.csv
9
- - honest_retention_by_round.csv
10
- - adversary_induced_flips.csv
11
- - fig_accuracy_by_condition.png
12
- - fig_honest_retention.png
13
- - fig_flip_rate.png
14
- - fig_adversary_skill.png
15
- - hypothesis_verdicts.json
16
-
17
- Uses PRE-REGISTERED win conditions defined BEFORE seeing results.
18
 
19
  Usage:
20
- python analyze_collapse.py /path/to/debate_collapse_mechanism_results.json
21
  """
22
 
23
  import json, csv, sys
24
  from pathlib import Path
25
 
26
- # ─── PRE-REGISTERED WIN CONDITIONS ───
27
- # Defined BEFORE seeing results to prevent post-hoc storytelling.
28
-
29
  HYPOTHESIS_RULES = {
30
  "H1_volume_amplification": {
31
  "description": "Collapse is driven by adversarial compute volume, not mere presence.",
32
- "confirmed_if": "equal_token_unequal_turn.mean_accuracy > baseline_1round.mean_accuracy - 0.10",
33
- "refuted_if": "equal_token_unequal_turn.mean_accuracy <= equal_3round.mean_accuracy + 0.05",
34
- "test_condition": "equal_token_unequal_turn",
35
- "baseline_condition": "equal_3round_traced",
36
- "ref_baseline": "baseline_1round_traced",
37
  },
38
  "H2_turn_order": {
39
  "description": "Collapse depends on adversary speaking last/late.",
40
- "confirmed_if": "randomized_order_3round.mean_accuracy > equal_3round.mean_accuracy + 0.10",
41
- "refuted_if": "abs(randomized_order_3round.mean_accuracy - equal_3round.mean_accuracy) < 0.05",
42
- "test_condition": "randomized_order_3round",
43
- "baseline_condition": "equal_3round_traced",
44
  },
45
  "H3_voting_vulnerability": {
46
- "description": "Majority voting is fragile; judge or confidence-weighted voting resists collapse.",
47
- "confirmed_if": "max(judge_vote_3round.mean, confidence_weighted_3round.mean) > equal_3round.mean + 0.10",
48
- "refuted_if": "abs(judge_vote_3round.mean - equal_3round.mean) < 0.05 and abs(confidence_weighted_3round.mean - equal_3round.mean) < 0.05",
49
- "test_conditions": ["judge_vote_3round", "confidence_weighted_3round"],
50
- "baseline_condition": "equal_3round_traced",
51
  },
52
  "H4_contamination": {
53
  "description": "Honest agents copy adversary answers after exposure.",
54
  "confirmed_if": "honest_retention_rate_round3 < 0.5",
55
  "refuted_if": "honest_retention_rate_round3 > 0.7",
56
- "source": "equal_3round_traced.honest_retention_round3 / (n_topics * 3)",
57
  },
58
  "H5_confidence_distortion": {
59
  "description": "Adversary causes honest agents to become uncertain, not persuaded.",
60
- "confirmed_if": "confidence_weighted_3round.mean > equal_3round.mean + 0.10",
61
- "refuted_if": "abs(confidence_weighted_3round.mean - equal_3round.mean) < 0.05",
62
- "test_condition": "confidence_weighted_3round",
63
- "baseline_condition": "equal_3round_traced",
64
  },
65
  "H6_skill_dependency": {
66
- "description": "Collapse depends on adversary skill; weak adversary does not collapse.",
67
- "confirmed_if": "adversary_weak.mean > equal_3round.mean + 0.10 and adversary_oracle.mean <= equal_3round.mean + 0.05",
68
  "refuted_if": "adversary_weak.mean <= equal_3round.mean + 0.05",
69
- "test_weak": "adversary_weak",
70
- "test_strong": "adversary_strong",
71
- "test_oracle": "adversary_oracle",
72
- "baseline_condition": "equal_3round_traced",
73
  },
74
  "H7_topic_vulnerability": {
75
  "description": "Collapse clusters by question difficulty or ambiguity.",
76
- "confirmed_if": "Some topics show collapse while others are robust in equal_3round",
77
- "refuted_if": "All topics show similar collapse magnitude",
78
- "needs": "per_topic analysis",
79
  },
80
  }
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- def load_data(path):
84
- with open(path) as f:
85
- return json.load(f)
86
-
87
-
88
- def make_summary_table(data):
89
- summary = data.get("summary", {})
90
- return [
91
- {"condition": name, "mean_accuracy": round(s["mean"], 4),
92
- "min_accuracy": round(s["min"], 4), "max_accuracy": round(s["max"], 4),
93
- "range": round(s["max"] - s["min"], 4)}
94
- for name, s in summary.items()
95
- ]
96
-
97
-
98
- def make_retention_table(data):
99
  rows = []
100
- for seed_str, seed_data in data.get("seeds", {}).items():
101
- traced = seed_data.get("equal_3round_traced", {})
102
- if not traced:
103
- continue
104
- for rd in [2, 3]:
105
  rows.append({
106
- "seed": int(seed_str), "round": rd,
107
- "stayed": traced.get(f"honest_retention_round{rd}", 0),
108
- "flipped_away": traced.get(f"flipped_away_round{rd}", 0),
109
- "flipped_toward": traced.get(f"flipped_toward_round{rd}", 0),
110
- "adversary_flips": traced.get("adversary_flips", 0),
111
- })
112
  return rows
113
 
114
-
115
- def make_flip_matrix(data):
116
- flip_rows, adv_rows = [], []
117
- for seed_str, seed_data in data.get("seeds", {}).items():
118
- traced = seed_data.get("equal_3round_traced", {})
119
- for tkey, count in traced.get("transitions", {}).items():
120
- parts = tkey.split("_")
121
- if len(parts) == 2 and parts[0].startswith("h") and parts[1].startswith("h"):
122
- try:
123
- r1, r3 = int(parts[0][1:]), int(parts[1][1:])
124
- flip_rows.append({
125
- "seed": int(seed_str), "transition": tkey,
126
- "round1_correct": r1, "round3_correct": r3,
127
- "count": count,
128
- "flip_direction": "stable" if r1 == r3 else ("degraded" if r1 > r3 else "improved"),
129
- })
130
- except ValueError:
131
- pass
132
- adv_rows.append({"seed": int(seed_str), "total_adversary_flips": traced.get("adversary_flips", 0)})
133
- return flip_rows, adv_rows
134
-
135
-
136
- def evaluate_hypotheses(data):
137
- summary = data.get("summary", {})
138
  verdicts = {}
139
- # Extract retention data from first seed
140
- first_seed = list(data.get("seeds", {}).values())[0] if data.get("seeds") else {}
141
- traced = first_seed.get("equal_3round_traced", {})
142
- retention_r3 = traced.get("honest_retention_round3", 0)
143
- flipped_r3 = traced.get("flipped_away_round3", 0)
144
- total_r3 = retention_r3 + flipped_r3
145
- retention_rate = retention_r3 / total_r3 if total_r3 > 0 else 1.0
146
-
147
- for h_name, rules in HYPOTHESIS_RULES.items():
148
  try:
149
- v = {"hypothesis": h_name, "description": rules["description"], "verdict": "INCONCLUSIVE", "reason": "", "values": {}}
150
-
151
- if h_name == "H1_volume_amplification":
152
- test = summary.get("equal_token_unequal_turn", {}).get("mean", 0)
153
- collapse = summary.get("equal_3round_traced", {}).get("mean", 0)
154
- base1 = summary.get("baseline_1round_traced", {}).get("mean", 1)
155
- v["values"] = {"baseline_1round": round(base1,4), "collapse": round(collapse,4), "equal_token": round(test,4)}
156
- if test > base1 - 0.10:
157
- v["verdict"] = "CONFIRMED"
158
- v["reason"] = f"Equal-token recovered to {test:.3f}, within 10pp of baseline {base1:.3f}. Volume amplification is primary mechanism."
159
- elif test <= collapse + 0.05:
160
- v["verdict"] = "REFUTED"
161
- v["reason"] = f"Equal-token at {test:.3f} barely above collapse {collapse:.3f}. Volume alone insufficient."
162
- else:
163
- v["verdict"] = "PARTIAL"
164
- v["reason"] = f"Partial recovery to {test:.3f} from {collapse:.3f}. Volume is a factor but not the only one."
165
-
166
- elif h_name == "H2_turn_order":
167
- test = summary.get("randomized_order_3round", {}).get("mean", 0)
168
- collapse = summary.get("equal_3round_traced", {}).get("mean", 0)
169
- v["values"] = {"randomized": round(test,4), "collapse": round(collapse,4)}
170
- if test > collapse + 0.10:
171
- v["verdict"] = "CONFIRMED"; v["reason"] = f"Randomized order recovered to {test:.3f}."
172
- elif abs(test - collapse) < 0.05:
173
- v["verdict"] = "REFUTED"; v["reason"] = f"No difference ({test:.3f} vs {collapse:.3f})."
174
- else:
175
- v["verdict"] = "PARTIAL"; v["reason"] = f"Some recovery ({test:.3f} vs {collapse:.3f})."
176
-
177
- elif h_name == "H3_voting_vulnerability":
178
- judge = summary.get("judge_vote_3round", {}).get("mean", 0)
179
- conf = summary.get("confidence_weighted_3round", {}).get("mean", 0)
180
- collapse = summary.get("equal_3round_traced", {}).get("mean", 0)
181
- best = max(judge, conf)
182
- v["values"] = {"judge_vote": round(judge,4), "confidence_weighted": round(conf,4), "collapse": round(collapse,4)}
183
- if best > collapse + 0.10:
184
- v["verdict"] = "CONFIRMED"; v["reason"] = f"Alt voting recovered to {best:.3f}. Protocol matters."
185
- elif abs(judge - collapse) < 0.05 and abs(conf - collapse) < 0.05:
186
- v["verdict"] = "REFUTED"; v["reason"] = "No voting protocol helps."
187
- else:
188
- v["verdict"] = "PARTIAL"
189
-
190
- elif h_name == "H4_contamination":
191
- v["values"] = {"retention_rate_round3": round(retention_rate, 4)}
192
- if retention_rate < 0.5:
193
- v["verdict"] = "CONFIRMED"; v["reason"] = f"Only {retention_rate:.1%} retained answer. Contamination present."
194
- elif retention_rate > 0.7:
195
- v["verdict"] = "REFUTED"; v["reason"] = f"{retention_rate:.1%} retention — honest agents resist corruption."
196
- else:
197
- v["verdict"] = "PARTIAL"; v["reason"] = f"{retention_rate:.1%} retention — some contamination."
198
-
199
- elif h_name == "H5_confidence_distortion":
200
- test = summary.get("confidence_weighted_3round", {}).get("mean", 0)
201
- collapse = summary.get("equal_3round_traced", {}).get("mean", 0)
202
- v["values"] = {"confidence_weighted": round(test,4), "collapse": round(collapse,4)}
203
- if test > collapse + 0.10:
204
- v["verdict"] = "CONFIRMED"; v["reason"] = "Confidence weighting recovers accuracy."
205
- elif abs(test - collapse) < 0.05:
206
- v["verdict"] = "REFUTED"; v["reason"] = "Confidence weighting does not help."
207
- else:
208
- v["verdict"] = "PARTIAL"
209
-
210
- elif h_name == "H6_skill_dependency":
211
- weak = summary.get("adversary_weak", {}).get("mean", 0)
212
- normal = summary.get("adversary_normal", {}).get("mean", 0)
213
- strong = summary.get("adversary_strong", {}).get("mean", 0)
214
- oracle = summary.get("adversary_oracle", {}).get("mean", 0)
215
- collapse = summary.get("equal_3round_traced", {}).get("mean", 0)
216
- v["values"] = {"weak": round(weak,4), "normal": round(normal,4), "strong": round(strong,4), "oracle": round(oracle,4), "collapse": round(collapse,4)}
217
- if weak > collapse + 0.10 and oracle <= collapse + 0.05:
218
- v["verdict"] = "CONFIRMED"; v["reason"] = f"Weak={weak:.3f} avoids collapse, oracle={oracle:.3f} does not. Skill matters."
219
- elif weak <= collapse + 0.05:
220
- v["verdict"] = "REFUTED"; v["reason"] = f"Even weak adversary collapses to {weak:.3f}."
221
- else:
222
- v["verdict"] = "PARTIAL"; v["reason"] = f"Skill gradient: weak={weak:.3f}, oracle={oracle:.3f}."
223
-
224
- elif h_name == "H7_topic_vulnerability":
225
- v["verdict"] = "NEEDS_PER_TOPIC_DATA"
226
- v["reason"] = "Requires per-topic accuracy breakdown. Check per_topic_outcomes.csv."
227
-
228
- verdicts[h_name] = v
229
  except Exception as e:
230
- verdicts[h_name] = {"hypothesis": h_name, "verdict": "ERROR", "reason": str(e)}
231
-
232
  return verdicts
233
 
234
-
235
  def make_charts(summary_rows, retention_rows, flip_rows, out_dir):
236
  try:
237
- import matplotlib
238
- matplotlib.use("Agg")
239
- import matplotlib.pyplot as plt
240
- import numpy as np
241
-
242
- out_dir = Path(out_dir)
243
- out_dir.mkdir(parents=True, exist_ok=True)
244
-
245
- # 1. Accuracy by condition
246
- conds = [r["condition"] for r in summary_rows]
247
- means = [r["mean_accuracy"] for r in summary_rows]
248
- mins_ = [r["min_accuracy"] for r in summary_rows]
249
- maxs_ = [r["max_accuracy"] for r in summary_rows]
250
- base1_acc = means[0] if means else 0.85
251
-
252
- fig, ax = plt.subplots(figsize=(14, 5))
253
- x = np.arange(len(conds))
254
- errs = [[means[i] - mins_[i] for i in range(len(means))],
255
- [maxs_[i] - means[i] for i in range(len(means))]]
256
- ax.bar(x, means, yerr=errs, capsize=4, color="steelblue", edgecolor="navy")
257
- ax.axhline(y=base1_acc, color="green", linestyle="--", alpha=0.5, linewidth=2, label=f"1-round baseline ({base1_acc:.3f})")
258
- ax.set_xticks(x)
259
- ax.set_xticklabels(conds, rotation=45, ha="right", fontsize=7)
260
- ax.set_ylabel("Accuracy"); ax.set_title("Collapse Mechanism Isolation: Accuracy by Condition")
261
- ax.legend(); ax.set_ylim(0, 1.05)
262
- plt.tight_layout(); plt.savefig(out_dir / "fig_accuracy_by_condition.png", dpi=150); plt.close()
263
-
264
- # 2. Honest retention
265
  if retention_rows:
266
- fig, ax = plt.subplots(figsize=(7, 4))
267
- seeds = sorted(set(r["seed"] for r in retention_rows))
268
- for s in seeds:
269
- sr = [r for r in retention_rows if r["seed"] == s]
270
- ax.plot([2,3], [r["stayed"] for r in sr], "o-", label=f"Seed {s}")
271
- ax.set_xlabel("Round"); ax.set_ylabel("Honest agents staying with original answer")
272
- ax.set_title("Honest Answer Retention Across Rounds"); ax.legend(); ax.set_xticks([2,3])
273
- plt.tight_layout(); plt.savefig(out_dir / "fig_honest_retention.png", dpi=150); plt.close()
274
-
275
- # 3. Flip pie
276
  if flip_rows:
277
- degraded = sum(r["count"] for r in flip_rows if r["flip_direction"] == "degraded")
278
- improved = sum(r["count"] for r in flip_rows if r["flip_direction"] == "improved")
279
- stable = sum(r["count"] for r in flip_rows if r["flip_direction"] == "stable")
280
- fig, ax = plt.subplots(figsize=(6, 6))
281
- ax.pie([stable, degraded, improved], labels=["Stable", "Degraded", "Improved"],
282
- colors=["gray","crimson","forestgreen"], autopct="%1.1f%%", startangle=90)
283
- ax.set_title("Honest Agent Answer Transitions (R1→R3)")
284
- plt.tight_layout(); plt.savefig(out_dir / "fig_flip_rate.png", dpi=150); plt.close()
285
-
286
- # 4. Adversary skill gradient
287
- skill_conds = [c for c in conds if c.startswith("adversary_")]
288
- if skill_conds:
289
- skill_accs = [next(r["mean_accuracy"] for r in summary_rows if r["condition"] == c) for c in skill_conds]
290
- fig, ax = plt.subplots(figsize=(7, 4))
291
- ax.bar([c.replace("adversary_","") for c in skill_conds], skill_accs,
292
- color=["lightgreen","steelblue","darkorange","crimson"])
293
  ax.set_ylabel("Accuracy"); ax.set_title("Adversary Skill Ablation"); ax.set_ylim(0,1.05)
294
- plt.xticks(rotation=20, ha="right", fontsize=9)
295
- plt.tight_layout(); plt.savefig(out_dir / "fig_adversary_skill.png", dpi=150); plt.close()
296
-
297
- print(f" Charts saved to {out_dir}/")
298
- return True
299
  except ImportError:
300
- print(" matplotlib not available — skipping charts")
301
- return False
302
-
303
 
304
  def main():
305
- if len(sys.argv) < 2:
306
- print("Usage: python analyze_collapse.py <results.json>")
307
- sys.exit(1)
308
-
309
- path = Path(sys.argv[1])
310
- out_dir = path.parent / "analysis"
311
- out_dir.mkdir(parents=True, exist_ok=True)
312
 
313
- print(f"Loading {path}...")
314
- data = load_data(path)
315
-
316
- # Write CSVs
317
- summary_rows = make_summary_table(data)
318
- with open(out_dir / "condition_summary.csv", "w", newline="") as f:
319
- w = csv.DictWriter(f, fieldnames=["condition","mean_accuracy","min_accuracy","max_accuracy","range"])
320
  w.writeheader(); w.writerows(summary_rows)
321
  print(f" condition_summary.csv: {len(summary_rows)} rows")
322
 
323
- retention_rows = make_retention_table(data)
324
- with open(out_dir / "honest_retention_by_round.csv", "w", newline="") as f:
325
- w = csv.DictWriter(f, fieldnames=["seed","round","stayed","flipped_away","flipped_toward","adversary_flips"])
326
  w.writeheader(); w.writerows(retention_rows)
327
  print(f" honest_retention_by_round.csv: {len(retention_rows)} rows")
328
 
329
- flip_rows, adv_rows = make_flip_matrix(data)
 
 
 
 
 
 
 
330
  if flip_rows:
331
- with open(out_dir / "round_flip_matrix.csv", "w", newline="") as f:
332
- w = csv.DictWriter(f, fieldnames=["seed","transition","round1_correct","round3_correct","count","flip_direction"])
333
  w.writeheader(); w.writerows(flip_rows)
334
  print(f" round_flip_matrix.csv: {len(flip_rows)} rows")
335
- with open(out_dir / "adversary_induced_flips.csv", "w", newline="") as f:
336
- w = csv.DictWriter(f, fieldnames=["seed","total_adversary_flips"])
337
- w.writeheader(); w.writerows(adv_rows)
338
  print(f" adversary_induced_flips.csv: {len(adv_rows)} rows")
339
 
340
- # Hypothesis verdicts
341
- verdicts = evaluate_hypotheses(data)
342
- with open(out_dir / "hypothesis_verdicts.json", "w") as f:
343
- json.dump(verdicts, f, indent=2)
344
  print(f"\n Hypothesis verdicts:")
345
- for h, v in verdicts.items():
346
- print(f" {h}: {v['verdict']} — {v.get('reason','')[:120]}")
347
-
348
- # Charts
349
- make_charts(summary_rows, retention_rows, flip_rows, out_dir)
350
 
 
351
  print(f"\nDone. Outputs in {out_dir}/")
352
 
353
-
354
- if __name__ == "__main__":
355
- main()
 
1
  #!/usr/bin/env python3
2
  """
3
+ OCC Collapse Mechanism — Post-Run Analysis Harness v2
4
+ =======================================================
5
+ Handles both v2 (summary-only) and v3 (per-topic traces) output formats.
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  Usage:
8
+ python analyze_collapse.py <results.json>
9
  """
10
 
11
  import json, csv, sys
12
  from pathlib import Path
13
 
 
 
 
14
# Pre-registered verdict rules, keyed by hypothesis id.
#
# Only "description" is read by the code (evaluate copies it into each
# verdict). "confirmed_if" / "refuted_if" are human-readable statements of the
# thresholds that evaluate applies, so they name the summary conditions
# exactly as evaluate looks them up (e.g. "equal_3round_traced").
HYPOTHESIS_RULES = {
    "H1_volume_amplification": {
        "description": "Collapse is driven by adversarial compute volume, not mere presence.",
        "confirmed_if": "equal_token_unequal_turn.mean > baseline_1round_traced.mean - 0.10",
        "refuted_if": "equal_token_unequal_turn.mean <= equal_3round_traced.mean + 0.05",
    },
    "H2_turn_order": {
        "description": "Collapse depends on adversary speaking last/late.",
        "confirmed_if": "randomized_order_3round.mean > equal_3round_traced.mean + 0.10",
        "refuted_if": "abs(randomized_order_3round.mean - equal_3round_traced.mean) < 0.05",
    },
    "H3_voting_vulnerability": {
        "description": "Majority voting is fragile; judge or conf-weighted vote resists.",
        "confirmed_if": (
            "max(judge_vote_3round.mean, confidence_weighted_3round.mean)"
            " > equal_3round_traced.mean + 0.10"
        ),
        "refuted_if": (
            "abs(judge_vote_3round.mean - equal_3round_traced.mean) < 0.05"
            " and abs(confidence_weighted_3round.mean - equal_3round_traced.mean) < 0.05"
        ),
    },
    "H4_contamination": {
        "description": "Honest agents copy adversary answers after exposure.",
        "confirmed_if": "honest_retention_rate_round3 < 0.5",
        "refuted_if": "honest_retention_rate_round3 > 0.7",
    },
    "H5_confidence_distortion": {
        "description": "Adversary causes honest agents to become uncertain, not persuaded.",
        "confirmed_if": "confidence_weighted_3round.mean > equal_3round_traced.mean + 0.10",
        "refuted_if": "abs(confidence_weighted_3round.mean - equal_3round_traced.mean) < 0.05",
    },
    "H6_skill_dependency": {
        "description": "Collapse depends on adversary skill.",
        "confirmed_if": (
            "adversary_weak.mean > equal_3round_traced.mean + 0.10"
            " and adversary_oracle.mean <= equal_3round_traced.mean + 0.05"
        ),
        "refuted_if": "adversary_weak.mean <= equal_3round_traced.mean + 0.05",
    },
    # NOTE(review): evaluate does not compute per-topic variance; it always
    # reports NEEDS_PER_TOPIC_DATA for H7, so these thresholds are applied by
    # hand from per_topic_outcomes.csv.
    "H7_topic_vulnerability": {
        "description": "Collapse clusters by question difficulty or ambiguity.",
        "confirmed_if": "per-topic variance in collapse > 0.05",
        "refuted_if": "per-topic variance < 0.02",
    },
}
51
 
52
def load(path):
    """Read the results file at *path* and return the decoded JSON object.

    The file is opened explicitly as UTF-8 so that non-ASCII text in the
    results decodes the same way on every platform instead of depending on
    the locale's default encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
54
+
55
def compute_summary(data):
    """Return per-condition accuracy statistics for a results object.

    If ``data["summary"]`` is present and non-empty it is returned unchanged.
    Otherwise the summary is rebuilt from ``data["seeds"]``: for every
    condition under every seed, the ``"accuracy"`` values that are not None
    are collected and reduced to ``{"mean", "min", "max"}``.

    Entries under a seed that are not dicts (for example scalar metadata) are
    skipped rather than raising AttributeError.

    Returns:
        dict mapping condition name -> {"mean": ..., "min": ..., "max": ...}
        (or the stored summary as-is); empty when nothing is available.
    """
    stored = data.get("summary", {})
    if stored:
        return stored

    from collections import defaultdict

    accuracies = defaultdict(list)
    for seed_data in data.get("seeds", {}).values():
        if not isinstance(seed_data, dict):
            continue
        for condition, info in seed_data.items():
            # Only condition records carry an accuracy; ignore anything else.
            if isinstance(info, dict) and info.get("accuracy") is not None:
                accuracies[condition].append(info["accuracy"])

    return {
        condition: {
            "mean": sum(values) / len(values),
            "min": min(values),
            "max": max(values),
        }
        for condition, values in accuracies.items()
    }
65
+
66
def make_summary_rows(summary):
    """Flatten a summary mapping into one row dict per condition.

    Each row holds the condition name plus its mean, min and max accuracy and
    the max-min range, all rounded to four decimal places. Rows follow the
    iteration order of *summary*.
    """
    rows = []
    for condition, stats in summary.items():
        mean, lowest, highest = stats["mean"], stats["min"], stats["max"]
        row = {
            "condition": condition,
            "mean_accuracy": round(mean, 4),
            "min_accuracy": round(lowest, 4),
            "max_accuracy": round(highest, 4),
            "range": round(highest - lowest, 4),
        }
        rows.append(row)
    return rows
70
+
71
def make_retention_rows(data):
    """Build per-seed, per-round retention rows from the traced 3-round run.

    For every seed whose ``equal_3round_traced`` record is non-empty and has
    a non-None ``accuracy``, two rows are emitted (rounds 2 and 3) carrying
    the stayed / flipped-away / flipped-toward counters for that round and
    the seed's ``adversary_flips``. Absent counters are reported as 0.
    """
    rows = []
    for seed_id, seed_data in data.get("seeds", {}).items():
        trace = seed_data.get("equal_3round_traced", {})
        if trace and trace.get("accuracy") is not None:
            for round_no in (2, 3):
                suffix = f"round{round_no}"
                row = {"seed": int(seed_id), "round": round_no}
                row["stayed"] = trace.get(f"honest_retention_{suffix}", 0)
                row["flipped_away"] = trace.get(f"flipped_away_{suffix}", 0)
                row["flipped_toward"] = trace.get(f"flipped_toward_{suffix}", 0)
                row["adversary_flips"] = trace.get("adversary_flips", 0)
                rows.append(row)
    return rows
83
 
84
def _retention_count(retention, index, key):
    """Return ``retention[index][key]``, or 0 if the entry or key is absent."""
    if len(retention) <= index:
        return 0
    entry = retention[index]
    return entry.get(key, 0) if isinstance(entry, dict) else 0


def make_per_topic_rows(data):
    """Build one row per (seed, topic) from the traced 3-round run.

    Reads ``seeds[*]["equal_3round_traced"]["per_topic_rounds"]``. For each
    topic record the row carries the topic text (truncated to 80 characters),
    the ``stayed`` counter from retention entries 1 and 2, the
    ``flipped_away`` counter from entry 2, and the topic's
    ``adversary_flips``.

    Missing retention entries or counters are reported as 0 instead of
    raising, so partially traced topics still produce a row.

    Returns:
        list of row dicts; empty when no per-topic traces exist.
    """
    rows = []
    for seed_id, seed_data in data.get("seeds", {}).items():
        topics = seed_data.get("equal_3round_traced", {}).get("per_topic_rounds", [])
        for record in topics:
            retention = record.get("retention", [])
            rows.append({
                "seed": int(seed_id),
                "topic": record.get("topic", "")[:80],
                # NOTE(review): entries 1 and 2 are presumably rounds 2 and 3
                # (entry 0 being round 1) - confirm against the trace writer.
                "stayed_r2": _retention_count(retention, 1, "stayed"),
                "stayed_r3": _retention_count(retention, 2, "stayed"),
                "flipped_away_r3": _retention_count(retention, 2, "flipped_away"),
                "adversary_flips": record.get("adversary_flips", 0),
            })
    return rows
98
 
99
def make_flip_rows(data, n_honest=3):
    """Build per-agent round-1 -> round-3 transition rows and per-seed totals.

    Args:
        data: decoded results object; ``seeds[*]["equal_3round_traced"]`` is
            read for ``adversary_flips`` and ``per_topic_rounds``.
        n_honest: how many leading agent slots of each round to compare
            (default 3, the value previously hard-coded).

    Returns:
        ``(flips, advs)`` where *flips* has one row per (seed, topic, agent)
        for topics with at least three recorded rounds, and *advs* has one
        row per seed with that seed's ``adversary_flips`` (0 when absent).

    Rounds that list fewer than *n_honest* answers are compared over the
    slots that exist in both round 1 and round 3 rather than raising
    IndexError.
    """
    flips, advs = [], []
    for seed_id, seed_data in data.get("seeds", {}).items():
        seed = int(seed_id)
        trace = seed_data.get("equal_3round_traced", {})
        advs.append({"seed": seed,
                     "total_adversary_flips": trace.get("adversary_flips", 0)})
        for record in trace.get("per_topic_rounds", []):
            rounds = record.get("rounds", [])
            if len(rounds) < 3:
                continue
            first, last = rounds[0], rounds[2]
            topic = record.get("topic", "")[:60]
            for agent in range(min(n_honest, len(first), len(last))):
                r1, r3 = first[agent], last[agent]
                if r1 == r3:
                    direction = "stable"
                else:
                    # NOTE(review): a change away from "yes" is labelled
                    # "degraded" and any other change "improved"; this
                    # presumably assumes "yes" is the correct answer for
                    # every topic - confirm against the topic set.
                    direction = "degraded" if r1 == "yes" else "improved"
                flips.append({"seed": seed, "topic": topic, "agent": agent,
                              "round1": r1, "round3": r3,
                              "flip_direction": direction})
    return flips, advs
113
+
114
# Summary conditions each hypothesis needs before its thresholds can be
# applied. If any is absent the verdict stays INCONCLUSIVE instead of being
# computed from a made-up default.
_REQUIRED_CONDITIONS = {
    "H1_volume_amplification": ("equal_token_unequal_turn", "equal_3round_traced",
                                "baseline_1round_traced"),
    "H2_turn_order": ("randomized_order_3round", "equal_3round_traced"),
    "H3_voting_vulnerability": ("judge_vote_3round", "confidence_weighted_3round",
                                "equal_3round_traced"),
    "H5_confidence_distortion": ("confidence_weighted_3round", "equal_3round_traced"),
    "H6_skill_dependency": ("adversary_weak", "adversary_oracle", "equal_3round_traced"),
}


def _pooled_round3_retention(data):
    """Sum round-3 stayed / flipped-away counts over every seed's traced run."""
    stayed = flipped = 0
    for seed_data in data.get("seeds", {}).values():
        trace = seed_data.get("equal_3round_traced") if isinstance(seed_data, dict) else None
        if not isinstance(trace, dict):
            continue
        stayed += trace.get("honest_retention_round3", 0)
        flipped += trace.get("flipped_away_round3", 0)
    return stayed, flipped


def _recovery_verdict(candidates, collapse):
    """Classify recovery of *candidates* relative to the collapse accuracy.

    CONFIRMED if the best candidate beats collapse by more than 0.10,
    REFUTED if every candidate is within 0.05 of collapse, else PARTIAL.
    """
    if max(candidates) > collapse + 0.10:
        return "CONFIRMED"
    if all(abs(value - collapse) < 0.05 for value in candidates):
        return "REFUTED"
    return "PARTIAL"


def _rounded_or_none(value):
    """Round *value* to 3 places for reporting; pass None through."""
    return None if value is None else round(value, 3)


def evaluate(summary_rows, data, rules=None):
    """Apply the pre-registered thresholds and return one verdict per hypothesis.

    Args:
        summary_rows: row dicts with ``"condition"`` and ``"mean_accuracy"``.
        data: decoded results object; ``seeds[*]["equal_3round_traced"]`` is
            read for the round-3 retention counters used by H4.
        rules: mapping of hypothesis id -> rule dict (only ``"description"``
            is read). Defaults to the module-level ``HYPOTHESIS_RULES``.

    Returns:
        dict mapping hypothesis id -> ``{"hypothesis", "description",
        "verdict", "reason", "values"}``. ``verdict`` is one of CONFIRMED,
        REFUTED, PARTIAL, INCONCLUSIVE, NEEDS_PER_TOPIC_DATA or ERROR (ERROR
        entries carry only hypothesis / verdict / reason).

    A hypothesis whose required conditions are missing from *summary_rows*
    (or, for H4, with no retention counts at all) is reported INCONCLUSIVE
    with the reason stating what is missing; absent inputs are never treated
    as an accuracy of 0. H4 pools the round-3 counters across all seeds, so
    with a single seed the rate equals that seed's own rate.
    """
    if rules is None:
        rules = HYPOTHESIS_RULES
    means = {r["condition"]: r.get("mean_accuracy") for r in summary_rows}
    stayed, flipped = _pooled_round3_retention(data)

    verdicts = {}
    for hn, rl in rules.items():
        try:
            v = {"hypothesis": hn, "description": rl["description"],
                 "verdict": "INCONCLUSIVE", "reason": "", "values": {}}
            missing = [c for c in _REQUIRED_CONDITIONS.get(hn, ()) if means.get(c) is None]

            if missing:
                v["reason"] = "Missing condition(s) in summary: " + ", ".join(missing) + "."

            elif hn == "H1_volume_amplification":
                test = means["equal_token_unequal_turn"]
                col = means["equal_3round_traced"]
                b1 = means["baseline_1round_traced"]
                v["values"] = {"b1": round(b1, 3), "collapse": round(col, 3),
                               "equal_token": round(test, 3)}
                if test > b1 - 0.10:
                    v["verdict"] = "CONFIRMED"
                    v["reason"] = f"Recovered to {test:.3f} vs baseline {b1:.3f}."
                elif test <= col + 0.05:
                    v["verdict"] = "REFUTED"
                    v["reason"] = f"No meaningful recovery ({test:.3f} vs {col:.3f})."
                else:
                    v["verdict"] = "PARTIAL"
                    v["reason"] = f"Partial recovery to {test:.3f}."

            elif hn == "H2_turn_order":
                test = means["randomized_order_3round"]
                col = means["equal_3round_traced"]
                v["values"] = {"randomized": round(test, 3), "collapse": round(col, 3)}
                v["verdict"] = _recovery_verdict([test], col)
                if v["verdict"] == "CONFIRMED":
                    v["reason"] = f"Recovered to {test:.3f}."

            elif hn == "H3_voting_vulnerability":
                j = means["judge_vote_3round"]
                c = means["confidence_weighted_3round"]
                col = means["equal_3round_traced"]
                v["values"] = {"judge": round(j, 3), "conf": round(c, 3),
                               "collapse": round(col, 3)}
                v["verdict"] = _recovery_verdict([j, c], col)
                if v["verdict"] == "CONFIRMED":
                    v["reason"] = f"Alt voting recovered to {max(j, c):.3f}."

            elif hn == "H4_contamination":
                if stayed + flipped <= 0:
                    # No counters at all: the retention rate is undefined.
                    v["reason"] = "No round-3 retention counts found."
                else:
                    rate = stayed / (stayed + flipped)
                    v["values"] = {"retention_rate": round(rate, 3),
                                   "stayed": stayed, "flipped": flipped}
                    if rate < 0.5:
                        v["verdict"] = "CONFIRMED"
                        v["reason"] = f"Only {rate:.1%} retained."
                    elif rate > 0.7:
                        v["verdict"] = "REFUTED"
                        v["reason"] = f"{rate:.1%} retention."
                    else:
                        v["verdict"] = "PARTIAL"

            elif hn == "H5_confidence_distortion":
                test = means["confidence_weighted_3round"]
                col = means["equal_3round_traced"]
                v["values"] = {"conf": round(test, 3), "collapse": round(col, 3)}
                v["verdict"] = _recovery_verdict([test], col)
                if v["verdict"] == "CONFIRMED":
                    v["reason"] = f"Confidence weighting recovers to {test:.3f}."

            elif hn == "H6_skill_dependency":
                w = means["adversary_weak"]
                o = means["adversary_oracle"]
                col = means["equal_3round_traced"]
                # normal / strong are reported only; they do not gate the verdict.
                v["values"] = {"weak": round(w, 3),
                               "normal": _rounded_or_none(means.get("adversary_normal")),
                               "strong": _rounded_or_none(means.get("adversary_strong")),
                               "oracle": round(o, 3)}
                if w > col + 0.10 and o <= col + 0.05:
                    v["verdict"] = "CONFIRMED"
                    v["reason"] = f"Weak={w:.3f} avoids, oracle={o:.3f} collapses."
                elif w <= col + 0.05:
                    v["verdict"] = "REFUTED"
                    v["reason"] = "Even weak collapses."
                else:
                    v["verdict"] = "PARTIAL"

            elif hn == "H7_topic_vulnerability":
                v["verdict"] = "NEEDS_PER_TOPIC_DATA"
                v["reason"] = "Check per_topic_outcomes.csv."

            verdicts[hn] = v
        except Exception as e:
            # Best-effort by design: one malformed rule or value must not
            # prevent the remaining hypotheses from being reported.
            verdicts[hn] = {"hypothesis": hn, "verdict": "ERROR", "reason": str(e)}
    return verdicts
178
 
 
179
def make_charts(summary_rows, retention_rows, flip_rows, out_dir):
    """Render the analysis figures as PNG files in *out_dir*.

    Args:
        summary_rows: rows with ``condition`` / ``mean_accuracy`` /
            ``min_accuracy`` / ``max_accuracy``.
        retention_rows: rows with ``seed`` / ``round`` / ``stayed``.
        flip_rows: rows with ``flip_direction``.
        out_dir: output directory; created if it does not exist.

    Returns:
        True once the figures are written; False if matplotlib or numpy
        cannot be imported (a notice is printed and nothing is drawn).

    Figures: ``fig_accuracy_by_condition.png`` always;
    ``fig_honest_retention.png`` when *retention_rows* is non-empty;
    ``fig_flip_rate.png`` when *flip_rows* is non-empty;
    ``fig_adversary_skill.png`` when any condition starts with ``adversary_``.
    """
    try:
        import matplotlib
        matplotlib.use("Agg")  # select the Agg backend before pyplot is imported
        import matplotlib.pyplot as plt
        import numpy as np
    except ImportError:
        print(" matplotlib not available — skipping charts")
        return False

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Accuracy bars, with min/max shown as asymmetric error bars.
    conds = [r["condition"] for r in summary_rows]
    means = [r["mean_accuracy"] for r in summary_rows]
    mins = [r["min_accuracy"] for r in summary_rows]
    maxs = [r["max_accuracy"] for r in summary_rows]
    mean_by_condition = dict(zip(conds, means))

    fig, ax = plt.subplots(figsize=(14, 5))
    x = np.arange(len(conds))
    errs = [[m - lo for m, lo in zip(means, mins)],
            [hi - m for m, hi in zip(means, maxs)]]
    ax.bar(x, means, yerr=errs, capsize=4, color="steelblue", edgecolor="navy")
    if means:
        # Use the 1-round baseline condition by name so the reference line
        # does not depend on row order; fall back to the first row when that
        # condition is absent.
        baseline = mean_by_condition.get("baseline_1round_traced", means[0])
        ax.axhline(y=baseline, color="green", ls="--", alpha=0.5, lw=2,
                   label=f"1-round baseline ({baseline:.3f})")
        ax.legend()  # only when there is a labelled artist to show
    ax.set_xticks(x)
    ax.set_xticklabels(conds, rotation=45, ha="right", fontsize=7)
    ax.set_ylabel("Accuracy")
    ax.set_title("Collapse Mechanism Isolation")
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(out_dir / "fig_accuracy_by_condition.png", dpi=150)
    plt.close()

    # Retention: one line per seed, x taken from each row's own round number
    # so a seed with an unexpected number of rows still plots.
    if retention_rows:
        fig, ax = plt.subplots(figsize=(7, 4))
        for seed in sorted({r["seed"] for r in retention_rows}):
            seed_rows = [r for r in retention_rows if r["seed"] == seed]
            ax.plot([r["round"] for r in seed_rows],
                    [r["stayed"] for r in seed_rows],
                    "o-", label=f"Seed {seed}")
        ax.set_xlabel("Round")
        ax.set_ylabel("Honest agents staying")
        ax.set_title("Honest Answer Retention")
        ax.legend()
        ax.set_xticks([2, 3])
        plt.tight_layout()
        plt.savefig(out_dir / "fig_honest_retention.png", dpi=150)
        plt.close()

    # Flip pie: share of stable / degraded / improved transitions.
    if flip_rows:
        counts = {"stable": 0, "degraded": 0, "improved": 0}
        for r in flip_rows:
            if r["flip_direction"] in counts:
                counts[r["flip_direction"]] += 1
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.pie([counts["stable"], counts["degraded"], counts["improved"]],
               labels=["Stable", "Degraded", "Improved"],
               colors=["gray", "crimson", "forestgreen"],
               autopct="%1.1f%%", startangle=90)
        ax.set_title("Honest Agent Transitions (R1→R3)")
        plt.tight_layout()
        plt.savefig(out_dir / "fig_flip_rate.png", dpi=150)
        plt.close()

    # Skill gradient: accuracy of every adversary_* condition.
    skill_conds = [c for c in conds if c.startswith("adversary_")]
    if skill_conds:
        skill_accs = [mean_by_condition[c] for c in skill_conds]
        fig, ax = plt.subplots(figsize=(7, 4))
        ax.bar([c.replace("adversary_", "") for c in skill_conds], skill_accs,
               color=["lightgreen", "steelblue", "darkorange", "crimson"])
        ax.set_ylabel("Accuracy"); ax.set_title("Adversary Skill Ablation"); ax.set_ylim(0, 1.05)
        plt.xticks(rotation=20, ha="right", fontsize=9)
        plt.tight_layout()
        plt.savefig(out_dir / "fig_adversary_skill.png", dpi=150)
        plt.close()

    print(f" Charts saved to {out_dir}/")
    return True
 
 
226
 
227
def _write_csv(path, fieldnames, rows):
    """Write *rows* (a list of dicts) to *path* as UTF-8 CSV with a header."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Command-line entry point: analyse one results JSON file.

    Expects the results path as the first argument and writes every output
    into an ``analysis`` directory next to it: the condition summary,
    retention, per-topic, flip-matrix and adversary-flip CSVs,
    ``hypothesis_verdicts.json``, and the charts. Exits with status 1 after
    printing a usage line when no path is given.

    All text outputs are written as UTF-8 so topic text containing non-ASCII
    characters is stored identically on every platform.
    """
    if len(sys.argv) < 2:
        print("Usage: python analyze_collapse.py <results.json>")
        sys.exit(1)

    path = Path(sys.argv[1])
    out_dir = path.parent / "analysis"
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"Loading {path}...")
    data = load(path)

    summary_rows = make_summary_rows(compute_summary(data))
    _write_csv(out_dir / "condition_summary.csv",
               ["condition", "mean_accuracy", "min_accuracy", "max_accuracy", "range"],
               summary_rows)
    print(f" condition_summary.csv: {len(summary_rows)} rows")

    retention_rows = make_retention_rows(data)
    _write_csv(out_dir / "honest_retention_by_round.csv",
               ["seed", "round", "stayed", "flipped_away", "flipped_toward", "adversary_flips"],
               retention_rows)
    print(f" honest_retention_by_round.csv: {len(retention_rows)} rows")

    # Per-topic rows exist only when the results carry per-topic traces.
    topic_rows = make_per_topic_rows(data)
    if topic_rows:
        _write_csv(out_dir / "per_topic_outcomes.csv", list(topic_rows[0].keys()), topic_rows)
        print(f" per_topic_outcomes.csv: {len(topic_rows)} rows")
    else:
        print(" per_topic_outcomes.csv: NO DATA")

    flip_rows, adv_rows = make_flip_rows(data)
    if flip_rows:
        _write_csv(out_dir / "round_flip_matrix.csv",
                   ["seed", "topic", "agent", "round1", "round3", "flip_direction"],
                   flip_rows)
        print(f" round_flip_matrix.csv: {len(flip_rows)} rows")
    # adv_rows has one row per seed whether or not per-topic flips exist, so
    # this file is written even when flip_rows is empty.
    _write_csv(out_dir / "adversary_induced_flips.csv",
               ["seed", "total_adversary_flips"], adv_rows)
    print(f" adversary_induced_flips.csv: {len(adv_rows)} rows")

    verdicts = evaluate(summary_rows, data)
    with open(out_dir / "hypothesis_verdicts.json", "w", encoding="utf-8") as f:
        json.dump(verdicts, f, indent=2)
    print(f"\n Hypothesis verdicts:")
    for h, v in verdicts.items():
        print(f" {h}: {v['verdict']} — {v.get('reason','')[:120]}")

    make_charts(summary_rows, retention_rows, flip_rows, out_dir)
    print(f"\nDone. Outputs in {out_dir}/")


if __name__ == "__main__":
    main()