narcolepticchicken committed on
Commit
799bf90
·
verified ·
1 Parent(s): 9731829

Upload reports/debate_real_results.json

Browse files
Files changed (1) hide show
  1. reports/debate_real_results.json +26 -0
reports/debate_real_results.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
3
+ "date": "2026-05-07",
4
+ "num_topics": 30,
5
+ "equal_turns": {
6
+ "accuracy": 0.533,
7
+ "correct": 16,
8
+ "total_tokens": 61440,
9
+ "decision_quality_per_1k_tokens": 0.0087,
10
+ "notes": "Single round, 4 agents (3 honest + 1 adversarial), majority vote over positions after filtering out 'unclear' ones. The high 'unclear' rate weakens this baseline."
11
+ },
12
+ "occ": {
13
+ "accuracy": 0.833,
14
+ "correct": 25,
15
+ "total_tokens": 138752,
16
+ "decision_quality_per_1k_tokens": 0.0060,
17
+ "rounds": 3,
18
+ "denied_agent_turns": 12,
19
+ "notes": "3 rounds with credit decay (-2 per 2 rounds). Broker denies agents below credit threshold 5. 12 agent-turns denied across all topics. Position extraction still noisy."
20
+ },
21
+ "caveats": {
22
+ "not_iso_compute": "OCC ran 3 rounds vs 1 round for equal turns. The 2.3x token increase is expected. For iso-compute comparison, need a 3-round equal-turns baseline.",
23
+ "position_extraction": "The extract_position() heuristic is too simplistic for nuanced model responses. Many positions classified as 'unclear'.",
24
+ "credit_scoring": "The score_arg() heuristic is crude (rewards presence of words like 'because'). A proper verifier-based scorer would improve OCC allocation decisions."
25
+ }
26
+ }