Upload reports/debate_extended_baselines_2seed.json
Browse files
reports/debate_extended_baselines_2seed.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"platform": "H200",
|
| 3 |
+
"model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
| 4 |
+
"pytorch": "2.11.0+cu130",
|
| 5 |
+
"n_topics": 30,
|
| 6 |
+
"n_agents": 4,
|
| 7 |
+
"agent_mix": "3 honest + 1 adversarial",
|
| 8 |
+
"seeds_completed": [42, 123],
|
| 9 |
+
"seeds_running": [456],
|
| 10 |
+
"date": "2026-05-08",
|
| 11 |
+
"per_seed": {
|
| 12 |
+
"42": {
|
| 13 |
+
"equal_1round": {"accuracy": 0.867, "correct": 26, "tokens": 41812},
|
| 14 |
+
"equal_3round": {"accuracy": 0.567, "correct": 17, "tokens": 150099},
|
| 15 |
+
"random_drop": {"accuracy": 0.833, "correct": 25, "tokens": 34181, "denied": 33},
|
| 16 |
+
"occ_240_5": {"accuracy": 0.800, "correct": 24, "tokens": 40780, "denied": 6},
|
| 17 |
+
"occ_180_3": {"accuracy": 0.867, "correct": 26, "tokens": 39952, "denied": 0},
|
| 18 |
+
"occ_120_3": {"accuracy": 0.833, "correct": 25, "tokens": 42423, "denied": 0}
|
| 19 |
+
},
|
| 20 |
+
"123": {
|
| 21 |
+
"equal_1round": {"accuracy": 0.900, "correct": 27, "tokens": 41875},
|
| 22 |
+
"equal_3round": {"accuracy": 0.567, "correct": 17, "tokens": 149544},
|
| 23 |
+
"random_drop": {"accuracy": 0.867, "correct": 26, "tokens": 27200, "denied": 35},
|
| 24 |
+
"occ_240_5": {"accuracy": 0.767, "correct": 23, "tokens": 32071, "denied": 15},
|
| 25 |
+
"occ_180_3": {"accuracy": 0.800, "correct": 24, "tokens": 42086, "denied": 0},
|
| 26 |
+
"occ_120_3": {"accuracy": 0.867, "correct": 26, "tokens": 42902, "denied": 0}
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"aggregate_seeds_42_123": {
|
| 30 |
+
"equal_1round": {"mean_accuracy": 0.883, "mean_tokens": 41844},
|
| 31 |
+
"equal_3round": {"mean_accuracy": 0.567, "mean_tokens": 149822},
|
| 32 |
+
"random_drop": {"mean_accuracy": 0.850, "mean_tokens": 30691},
|
| 33 |
+
"occ_240_5": {"mean_accuracy": 0.783, "mean_tokens": 36426},
|
| 34 |
+
"occ_180_3": {"mean_accuracy": 0.833, "mean_tokens": 41019},
|
| 35 |
+
"occ_120_3": {"mean_accuracy": 0.850, "mean_tokens": 42663}
|
| 36 |
+
},
|
| 37 |
+
"key_findings": [
|
| 38 |
+
"Equal 3-round collapses to 56.7% on BOTH seeds — 32pp below 1-round baseline (88.3%)",
|
| 39 |
+
"Adversarial agent given 3x speaking time floods the vote pool",
|
| 40 |
+
"Random 25% drop achieves 85.0% with 26.7% token savings — effective but undiscriminating",
|
| 41 |
+
"OCC 180/3 achieves 83.3% at iso-compute (41k vs 42k baseline tokens)",
|
| 42 |
+
"OCC 240/5 is too aggressive (78.3%); OCC 120/3 matches random-drop accuracy (85.0%) but saves no tokens (42.7k vs 30.7k)",
|
| 43 |
+
"OCC credit allocation prevents catastrophic failure but doesn't beat random gating at moderate budgets"
|
| 44 |
+
]
|
| 45 |
+
}
|