Upload reports/debate_real_results.json
Browse files
reports/debate_real_results.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
|
| 3 |
+
"date": "2026-05-07",
|
| 4 |
+
"num_topics": 30,
|
| 5 |
+
"equal_turns": {
|
| 6 |
+
"accuracy": 0.533,
|
| 7 |
+
"correct": 16,
|
| 8 |
+
"total_tokens": 61440,
|
| 9 |
+
"decision_quality_per_1k_tokens": 0.0087,
|
| 10 |
+
"notes": "Single round, 4 agents (3 honest + 1 adversarial), majority vote with unclear-filtered positions. High 'unclear' rate weakens this baseline."
|
| 11 |
+
},
|
| 12 |
+
"occ": {
|
| 13 |
+
"accuracy": 0.833,
|
| 14 |
+
"correct": 25,
|
| 15 |
+
"total_tokens": 138752,
|
| 16 |
+
"decision_quality_per_1k_tokens": 0.0060,
|
| 17 |
+
"rounds": 3,
|
| 18 |
+
"denied_agent_turns": 12,
|
| 19 |
+
"notes": "3 rounds with credit decay (-2 per 2 rounds). Broker denies agents below credit threshold 5. 12 agent-turns denied across all topics. Position extraction still noisy."
|
| 20 |
+
},
|
| 21 |
+
"caveats": {
|
| 22 |
+
"not_iso_compute": "OCC ran 3 rounds vs 1 round for equal turns. The 2.3x token increase is expected. For iso-compute comparison, need a 3-round equal-turns baseline.",
|
| 23 |
+
"position_extraction": "The extract_position() heuristic is too simplistic for nuanced model responses. Many positions classified as 'unclear'.",
|
| 24 |
+
"credit_scoring": "The score_arg() heuristic is crude (rewards presence of words like 'because'). A proper verifier-based scorer would improve OCC allocation decisions."
|
| 25 |
+
}
|
| 26 |
+
}
|