narcolepticchicken committed on
Commit
0755d5f
·
verified ·
1 Parent(s): ef70478

Upload reports/debate_extended_baselines_2seed.json

Browse files
reports/debate_extended_baselines_2seed.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "platform": "H200",
3
+ "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
4
+ "pytorch": "2.11.0+cu130",
5
+ "n_topics": 30,
6
+ "n_agents": 4,
7
+ "agent_mix": "3 honest + 1 adversarial",
8
+ "seeds_completed": [42, 123],
9
+ "seeds_running": [456],
10
+ "date": "2026-05-08",
11
+ "per_seed": {
12
+ "42": {
13
+ "equal_1round": {"accuracy": 0.867, "correct": 26, "tokens": 41812},
14
+ "equal_3round": {"accuracy": 0.567, "correct": 17, "tokens": 150099},
15
+ "random_drop": {"accuracy": 0.833, "correct": 25, "tokens": 34181, "denied": 33},
16
+ "occ_240_5": {"accuracy": 0.800, "correct": 24, "tokens": 40780, "denied": 6},
17
+ "occ_180_3": {"accuracy": 0.867, "correct": 26, "tokens": 39952, "denied": 0},
18
+ "occ_120_3": {"accuracy": 0.833, "correct": 25, "tokens": 42423, "denied": 0}
19
+ },
20
+ "123": {
21
+ "equal_1round": {"accuracy": 0.900, "correct": 27, "tokens": 41875},
22
+ "equal_3round": {"accuracy": 0.567, "correct": 17, "tokens": 149544},
23
+ "random_drop": {"accuracy": 0.867, "correct": 26, "tokens": 27200, "denied": 35},
24
+ "occ_240_5": {"accuracy": 0.767, "correct": 23, "tokens": 32071, "denied": 15},
25
+ "occ_180_3": {"accuracy": 0.800, "correct": 24, "tokens": 42086, "denied": 0},
26
+ "occ_120_3": {"accuracy": 0.867, "correct": 26, "tokens": 42902, "denied": 0}
27
+ }
28
+ },
29
+ "aggregate_seeds_42_123": {
30
+ "equal_1round": {"mean_accuracy": 0.883, "mean_tokens": 41844},
31
+ "equal_3round": {"mean_accuracy": 0.567, "mean_tokens": 149822},
32
+ "random_drop": {"mean_accuracy": 0.850, "mean_tokens": 30691},
33
+ "occ_240_5": {"mean_accuracy": 0.783, "mean_tokens": 36426},
34
+ "occ_180_3": {"mean_accuracy": 0.833, "mean_tokens": 41019},
35
+ "occ_120_3": {"mean_accuracy": 0.850, "mean_tokens": 42663}
36
+ },
37
+ "key_findings": [
38
+ "Equal 3-round collapses to 56.7% on BOTH seeds — 32pp below 1-round baseline (88.3%)",
39
+ "Adversarial agent given 3x speaking time floods the vote pool",
40
+ "Random 25% drop achieves 85.0% with 26.7% token savings — effective but undiscriminating",
41
+ "OCC 180/3 achieves 83.3% at iso-compute (41k vs 42k baseline tokens)",
42
+ "OCC 240/5 is too aggressive (78.3%); OCC 120/3 matches random drop's accuracy (85.0%)",
43
+ "OCC credit allocation prevents catastrophic failure but doesn't beat random gating at moderate budgets"
44
+ ]
45
+ }