narcolepticchicken commited on
Commit
81ca5aa
·
verified ·
1 Parent(s): 58df7e3

Upload reports/debate_v2_results.json

Browse files
Files changed (1) hide show
  1. reports/debate_v2_results.json +66 -0
reports/debate_v2_results.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "benchmark": "debate_v2",
3
+ "config": {
4
+ "n_topics": 100,
5
+ "n_agents": 5,
6
+ "adversarial_fraction": 0.4,
7
+ "budget_per_topic": 2000,
8
+ "seed": 42
9
+ },
10
+ "agents": [
11
+ {"id": "agent_fast", "accuracy": 0.70, "cost_per_turn": 50, "adversarial": false},
12
+ {"id": "agent_medium", "accuracy": 0.65, "cost_per_turn": 200, "adversarial": false},
13
+ {"id": "agent_expensive", "accuracy": 0.72, "cost_per_turn": 500, "adversarial": false},
14
+ {"id": "agent_adv_1", "accuracy": 0.446, "cost_per_turn": 312, "adversarial": true},
15
+ {"id": "agent_adv_2", "accuracy": 0.461, "cost_per_turn": 425, "adversarial": true}
16
+ ],
17
+ "results": {
18
+ "A_equal_turns": {
19
+ "accuracy": 0.930,
20
+ "mean_compute_per_topic": 5087,
21
+ "mean_turns": 10.0,
22
+ "mean_adv_turns": 4.0,
23
+ "bad_agent_tokens": 324857,
24
+ "containment": 1.00
25
+ },
26
+ "B_majority_vote": {
27
+ "accuracy": 0.820,
28
+ "mean_compute_per_topic": 2521,
29
+ "mean_turns": 5.0,
30
+ "mean_adv_turns": 2.0,
31
+ "bad_agent_tokens": 157301,
32
+ "containment": 1.00
33
+ },
34
+ "C_confidence_weighted": {
35
+ "accuracy": 0.910,
36
+ "mean_compute_per_topic": 2421,
37
+ "mean_turns": 5.0,
38
+ "mean_adv_turns": 2.0,
39
+ "bad_agent_tokens": 151612,
40
+ "containment": 1.00
41
+ },
42
+ "E_occ": {
43
+ "accuracy": 0.930,
44
+ "mean_compute_per_topic": 2890,
45
+ "mean_turns": 6.5,
46
+ "mean_adv_turns": 2.0,
47
+ "bad_agent_tokens": 177431,
48
+ "containment": 0.00
49
+ },
50
+ "F_occ_no_decay": {
51
+ "accuracy": 0.870,
52
+ "mean_compute_per_topic": 2727,
53
+ "mean_turns": 6.2,
54
+ "mean_adv_turns": 2.0,
55
+ "bad_agent_tokens": 166407,
56
+ "containment": 0.00
57
+ }
58
+ },
59
+ "key_comparisons": {
60
+ "occ_vs_equal_turns_compute_savings": "43.2%",
61
+ "occ_accuracy": 0.930,
62
+ "best_baseline_accuracy": 0.930,
63
+ "occ_matches_best_accuracy": true,
64
+ "confidence_weighted_risk": "With adversarial agents, confidence-weighted voting can amplify wrong overconfident answers"
65
+ }
66
+ }