narcolepticchicken committed on
Commit
b2c7131
·
verified ·
1 Parent(s): 3a8b0c3

Upload benchmarks/benchmark_debate.py

Browse files
Files changed (1) hide show
  1. benchmarks/benchmark_debate.py +419 -0
benchmarks/benchmark_debate.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark 3: Multi-Agent Debate Under Shared Compute
3
+
4
+ Compares:
5
+ A. equal turns
6
+ B. majority vote
7
+ C. confidence-weighted vote
8
+ D. verifier-only allocation
9
+ E. OCC credit allocation
10
+ F. OCC with decay and non-transferability
11
+
12
+ Uses simulated factual disputes and code debates.
13
+ """
14
+
15
+ import json
16
+ import random
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Dict, List, Optional
20
+
21
+ import numpy as np
22
+
23
+ import sys
24
+ sys.path.insert(0, str(Path(__file__).parent.parent))
25
+ from oracle.oracle import ImpactOracle, OracleResult
26
+ from ledger.ledger import CreditLedger
27
+ from broker.broker import ResourceBroker, Decision
28
+
29
+
30
@dataclass
class DebateTopic:
    """One debate question together with its answer key.

    Plain data container; nothing is validated on construction.
    """

    # Prompt text for the debate.
    question: str
    # Answer that a proposal must equal to count as correct.
    correct_answer: str
    # Pool of wrong answers an agent samples from when it misses.
    distractors: List[str]
35
+
36
+
37
class SimulatedDebateAgent:
    """Simulated debate participant with configurable accuracy and confidence.

    Every call to :meth:`propose` draws from the module-level ``random``
    generator, so results are reproducible only when the caller seeds it.
    Running totals (``tokens_used``, ``turns_taken``, ``influence_score``)
    accumulate across calls until the caller resets them.
    """

    def __init__(
        self,
        agent_id: str,
        accuracy: float = 0.6,
        confidence_bias: float = 0.1,
        verbose_prob: float = 0.0,
        collude_with: Optional[str] = None,
    ):
        """Store the agent's profile and zero its running totals.

        Args:
            agent_id: Identifier echoed back in every proposal.
            accuracy: Probability that a proposal is the correct answer.
            confidence_bias: Constant added to the sampled confidence
                before it is clamped to ``[0, 1]``.
            verbose_prob: Probability that a turn's token cost is
                multiplied by four.
            collude_with: Stored as given; not read by this class.
        """
        self.agent_id = agent_id
        self.accuracy = accuracy
        self.confidence_bias = confidence_bias
        self.verbose_prob = verbose_prob
        self.collude_with = collude_with
        self.tokens_used = 0
        self.turns_taken = 0
        self.influence_score = 0.0

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Propose an answer with a confidence and an influence score.

        Args:
            topic: Object exposing ``correct_answer`` and ``distractors``.
            prior_proposals: Earlier proposals in this debate; only each
                entry's ``"answer"`` key is read.

        Returns:
            Dict with keys ``agent_id``, ``answer``, ``confidence``,
            ``correct``, ``tokens`` and ``influence``.

        Raises:
            IndexError: If the agent misses and ``topic.distractors`` is
                empty (raised by ``random.choice``).
        """
        self.turns_taken += 1

        # Token cost of the turn: 50-100, quadrupled on a verbose turn.
        tokens = 50 + random.randint(0, 50)
        if random.random() < self.verbose_prob:
            tokens *= 4  # verbose padding
        self.tokens_used += tokens

        # Accuracy: hit the key or fall back to a random distractor.
        correct = random.random() < self.accuracy
        answer = topic.correct_answer if correct else random.choice(topic.distractors)

        # Confidence calibration: correct answers draw from a higher band.
        base, spread = (0.7, 0.3) if correct else (0.4, 0.4)
        confidence = base + random.random() * spread + self.confidence_bias
        confidence = max(0.0, min(1.0, confidence))

        # Influence: agreeing with the current majority counts for less.
        if prior_proposals:
            # Tally in proposal order. dicts keep insertion order and max()
            # returns the first maximal key, so ties resolve to the answer
            # proposed earliest. Iterating a set of strings instead would
            # make the tie-break depend on per-process hash randomization
            # and defeat the seeded RNG.
            tally: Dict[str, int] = {}
            for earlier in prior_proposals:
                tally[earlier["answer"]] = tally.get(earlier["answer"], 0) + 1
            majority = max(tally, key=tally.get)
            influence = 0.1 if answer == majority else 0.5
        else:
            influence = 0.3

        self.influence_score += influence

        return {
            "agent_id": self.agent_id,
            "answer": answer,
            "confidence": confidence,
            "correct": correct,
            "tokens": tokens,
            "influence": influence,
        }
101
+
102
+
103
class DebateBenchmark:
    """Benchmark multi-agent debate strategies under a shared compute budget.

    ``run_all`` executes four strategies over the same topic list:
    ``equal_turns``, ``majority_vote``, ``confidence_weighted`` and
    ``occ_allocation``. All randomness comes from the module-level
    ``random`` generator, which ``generate_topics`` seeds.

    NOTE: ``n_agents`` is stored but ``run_all`` always builds a fixed
    roster of four agents.
    """

    def __init__(
        self,
        n_topics: int = 50,
        n_agents: int = 4,
        budget_per_topic: float = 500.0,
        seed: int = 42,
    ):
        """Store the configuration and create the scoring oracle.

        Args:
            n_topics: Number of topics ``generate_topics`` produces.
            n_agents: Stored only (see class note).
            budget_per_topic: Token ceiling checked by the OCC strategy
                before granting an extra turn; also passed to the oracle
                as ``compute_budget``.
            seed: Seed applied to ``random`` and ``numpy.random`` by
                ``generate_topics``.
        """
        self.n_topics = n_topics
        self.n_agents = n_agents
        self.budget_per_topic = budget_per_topic
        self.seed = seed
        self.topics: List[DebateTopic] = []
        self.oracle = ImpactOracle(compute_budget=budget_per_topic)

    def generate_topics(self):
        """Seed the RNGs and (re)build ``self.topics`` from the fixed pool.

        The pool is cycled when ``n_topics`` exceeds its length. The list
        is rebuilt on every call, so calling this twice does not duplicate
        topics.
        """
        random.seed(self.seed)
        np.random.seed(self.seed)

        topic_pool = [
            ("What is 15 * 17?", "255", ["245", "265", "225", "275"]),
            ("Capital of Australia?", "Canberra", ["Sydney", "Melbourne", "Perth", "Brisbane"]),
            ("Author of '1984'?", "George Orwell", ["Aldous Huxley", "Ray Bradbury", "H.G. Wells", "Kurt Vonnegut"]),
            ("Square root of 256?", "16", ["14", "18", "12", "20"]),
            ("Element with symbol Au?", "Gold", ["Silver", "Aluminum", "Argon", "Astatine"]),
            ("Year WWI ended?", "1918", ["1919", "1917", "1920", "1916"]),
            ("Smallest prime number?", "2", ["1", "3", "0", "-1"]),
            ("Largest planet?", "Jupiter", ["Saturn", "Neptune", "Uranus", "Earth"]),
            ("Speed of light (m/s)?", "299792458", ["300000000", "299000000", "310000000", "280000000"]),
            ("First US president?", "George Washington", ["Thomas Jefferson", "John Adams", "Abraham Lincoln", "Benjamin Franklin"]),
        ]

        topics = []
        for i in range(self.n_topics):
            question, answer, distractors = topic_pool[i % len(topic_pool)]
            topics.append(DebateTopic(question=question, correct_answer=answer, distractors=distractors))
        self.topics = topics

    @staticmethod
    def _plurality_answer(answers: List[str]) -> str:
        """Return the most frequent answer; ties go to the earliest one seen.

        Tallying into an insertion-ordered dict keeps the tie-break
        independent of string hash randomization, so seeded runs stay
        reproducible. Raises ``ValueError`` on an empty list.
        """
        tally: Dict[str, int] = {}
        for answer in answers:
            tally[answer] = tally.get(answer, 0) + 1
        return max(tally, key=tally.get)

    @staticmethod
    def _collect_proposals(agents, topic, rounds: int = 1):
        """Let each agent propose ``rounds`` times in a row; return (proposals, tokens)."""
        proposals: List[Dict] = []
        compute_used = 0.0
        for agent in agents:
            for _ in range(rounds):
                prop = agent.propose(topic, proposals)
                proposals.append(prop)
                compute_used += prop["tokens"]
        return proposals, compute_used

    def _score_proposal(self, prop: Dict, agent_id: str, turns: int, previous_correct: bool):
        """Score one proposal with the oracle in ``"debate"`` mode."""
        return self.oracle.score(
            mode="debate",
            action={"tokens_used": prop["tokens"]},
            context={"previous_correct": previous_correct},
            result={
                "final_correct": prop["correct"],
                "agent_contribution": prop["influence"],
                "compute_cost": prop["tokens"],
                "tokens_used": prop["tokens"],
                "total_turns": turns,
            },
            agent_id=agent_id,
        )

    def _resolve_equal_turns(self, agents: List[SimulatedDebateAgent], topic: DebateTopic, turns_per_agent: int = 2) -> Dict:
        """Strategy A: equal turns, then majority vote."""
        proposals, compute_used = self._collect_proposals(agents, topic, turns_per_agent)

        # Majority vote (all proposals equal weight)
        final_answer = self._plurality_answer([p["answer"] for p in proposals])

        return {
            "strategy": "equal_turns",
            "correct": final_answer == topic.correct_answer,
            "final_answer": final_answer,
            "compute_used": compute_used,
            "n_turns": len(proposals),
            "proposals": proposals,
        }

    def _resolve_majority_vote(self, agents: List[SimulatedDebateAgent], topic: DebateTopic, turns_per_agent: int = 2) -> Dict:
        """Strategy B: majority vote on first proposal per agent.

        ``turns_per_agent`` is accepted for signature parity with the
        other strategies but is not used: each agent proposes once.
        """
        proposals, compute_used = self._collect_proposals(agents, topic)
        final_answer = self._plurality_answer([p["answer"] for p in proposals])

        return {
            "strategy": "majority_vote",
            "correct": final_answer == topic.correct_answer,
            "final_answer": final_answer,
            "compute_used": compute_used,
            "n_turns": len(proposals),
            "proposals": proposals,
        }

    def _resolve_confidence_weighted(self, agents: List[SimulatedDebateAgent], topic: DebateTopic, turns_per_agent: int = 2) -> Dict:
        """Strategy C: confidence-weighted vote.

        ``turns_per_agent`` is not used: each agent proposes once.
        """
        proposals, compute_used = self._collect_proposals(agents, topic)

        # Weighted vote by confidence; ties go to the earliest answer.
        vote_scores: Dict[str, float] = {}
        for p in proposals:
            vote_scores[p["answer"]] = vote_scores.get(p["answer"], 0.0) + p["confidence"]
        final_answer = max(vote_scores, key=vote_scores.get)

        return {
            "strategy": "confidence_weighted",
            "correct": final_answer == topic.correct_answer,
            "final_answer": final_answer,
            "compute_used": compute_used,
            "n_turns": len(proposals),
            "proposals": proposals,
        }

    def _resolve_occ_allocation(
        self,
        agents: List[SimulatedDebateAgent],
        topic: DebateTopic,
        max_turns: int = 12,
        use_decay: bool = True,
    ) -> Dict:
        """Strategy E/F: OCC allocates turns based on marginal contribution.

        Each agent proposes once. Extra turns are then offered to the
        agent with the highest ledger balance (falling back once to the
        runner-up when the broker denies the request) until ``max_turns``
        or ``budget_per_topic`` is reached. The final answer is a vote
        weighted by each proposer's ledger balance, floored at 0.1.

        Args:
            agents: Debate participants.
            topic: Topic under debate.
            max_turns: Upper bound on total proposals for this topic.
            use_decay: When true the ledger is built with
                ``decay_lambda=0.1``, otherwise ``0.0``.
        """
        ledger = CreditLedger(decay_lambda=0.1 if use_decay else 0.0)
        broker = ResourceBroker()
        proposals: List[Dict] = []
        compute_used = 0.0
        turns = 0
        task_id = topic.question[:30]

        # One initial proposal from each agent
        for agent in agents:
            prop = agent.propose(topic, proposals)
            proposals.append(prop)
            compute_used += prop["tokens"]
            turns += 1

            # Score the proposal (scored even when wrong; credit is only
            # granted for correct ones).
            oracle_res = self._score_proposal(prop, agent.agent_id, turns, previous_correct=False)

            if prop["correct"]:
                ledger.earn(
                    agent_id=agent.agent_id,
                    task_id=task_id,
                    action_id=f"turn_{turns}",
                    amount=oracle_res.reward_value * 5.0,
                    oracle_score=oracle_res.raw_score,
                    compute_cost=prop["tokens"],
                    reason="correct_proposal",
                )

        # Iteratively allocate additional turns to best performers
        while turns < max_turns and compute_used < self.budget_per_topic:
            # Sort agents by credit balance (stable sort: roster order breaks ties)
            balances = [(a, ledger.balance(a.agent_id, "general", "global")) for a in agents]
            balances.sort(key=lambda x: x[1], reverse=True)

            # Try to give a turn to the top agent
            top_agent, top_balance = balances[0]
            dec = broker.request(
                "debate_turn",
                top_agent.agent_id,
                top_balance,
                task_state={"progress": sum(1 for p in proposals if p["correct"]) / len(proposals)},
            )

            if dec.decision == Decision.DENY:
                # Try next agent; stop the debate if there is none or it
                # is denied as well.
                if len(balances) <= 1:
                    break
                top_agent, top_balance = balances[1]
                dec = broker.request("debate_turn", top_agent.agent_id, top_balance, task_state={})
                if dec.decision == Decision.DENY:
                    break

            prop = top_agent.propose(topic, proposals)
            proposals.append(prop)
            compute_used += prop["tokens"]
            turns += 1

            # Update ledger
            oracle_res = self._score_proposal(
                prop,
                top_agent.agent_id,
                turns,
                previous_correct=any(p["correct"] for p in proposals[:-1]),
            )

            if prop["correct"]:
                ledger.earn(
                    agent_id=top_agent.agent_id,
                    task_id=task_id,
                    action_id=f"turn_{turns}",
                    amount=oracle_res.reward_value * 3.0,
                    oracle_score=oracle_res.raw_score,
                    compute_cost=prop["tokens"],
                    reason="correct_proposal",
                )
            else:
                # Small spend for wrong turn
                ledger.spend(
                    agent_id=top_agent.agent_id,
                    task_id=task_id,
                    action_id=f"turn_{turns}",
                    amount=0.3,
                    reason="wrong_proposal_cost",
                )

        # Weighted vote using final credit balances as weights
        vote_scores: Dict[str, float] = {}
        for p in proposals:
            weight = max(0.1, ledger.balance(p["agent_id"], "general", "global"))
            vote_scores[p["answer"]] = vote_scores.get(p["answer"], 0.0) + weight
        final_answer = max(vote_scores, key=vote_scores.get)

        return {
            "strategy": "occ_allocation",
            "correct": final_answer == topic.correct_answer,
            "final_answer": final_answer,
            "compute_used": compute_used,
            "n_turns": turns,
            "proposals": proposals,
        }

    def _summarize(self, results: List[Dict], label: str) -> Dict:
        """Aggregate per-topic results into accuracy and compute statistics.

        Every ratio falls back to ``0.0`` when ``results`` is empty (or,
        for ``quality_per_compute``, when no compute was used).
        """
        n = len(results)
        correct = sum(1 for r in results if r["correct"])
        total_compute = sum(r["compute_used"] for r in results)
        total_turns = sum(r["n_turns"] for r in results)

        return {
            "label": label,
            "n_topics": n,
            "accuracy": correct / n if n else 0.0,
            "total_compute": float(total_compute),
            "mean_compute_per_topic": float(total_compute / n) if n else 0.0,
            "total_turns": total_turns,
            "mean_turns_per_topic": float(total_turns / n) if n else 0.0,
            "quality_per_compute": (correct / n) / (total_compute / n) if total_compute else 0.0,
            "results": results,
        }

    def run_all(self) -> Dict[str, Dict]:
        """Run every strategy over all topics and return summaries by name.

        Topics are generated on first use. The strategies run one after
        another on the same agent objects and the same global RNG stream,
        so each strategy sees different random draws.
        """
        if not self.topics:
            self.generate_topics()

        # Create agents with varied abilities
        agents = [
            SimulatedDebateAgent("agent_1", accuracy=0.75, confidence_bias=0.05),
            SimulatedDebateAgent("agent_2", accuracy=0.60, confidence_bias=0.15),
            SimulatedDebateAgent("agent_3", accuracy=0.55, confidence_bias=-0.05),
            SimulatedDebateAgent("agent_4", accuracy=0.50, confidence_bias=0.20),
        ]

        strategies = [
            ("equal_turns", lambda topic: self._resolve_equal_turns(agents, topic)),
            ("majority_vote", lambda topic: self._resolve_majority_vote(agents, topic)),
            ("confidence_weighted", lambda topic: self._resolve_confidence_weighted(agents, topic)),
            ("occ_allocation", lambda topic: self._resolve_occ_allocation(agents, topic)),
        ]

        results = {}
        for name, fn in strategies:
            # Reset agents between strategies
            for a in agents:
                a.tokens_used = 0
                a.turns_taken = 0
                a.influence_score = 0.0

            topic_results = [fn(topic) for topic in self.topics]
            results[name] = self._summarize(topic_results, name)

        return results
395
+
396
+
397
def main():
    """Run the debate benchmark, print per-strategy metrics, and save JSON.

    The full results dict is written to
    ``/app/occ/reports/benchmark_debate_results.json``; the directory is
    created if missing. Values ``json`` cannot encode are stringified.
    """
    bench = DebateBenchmark(n_topics=50, n_agents=4, seed=42)
    bench.generate_topics()
    results = bench.run_all()

    banner = "=" * 60
    print(banner)
    print("MULTI-AGENT DEBATE BENCHMARK")
    print(banner)
    for label in results:
        res = results[label]
        print(f"\n{label}")
        print(f" accuracy: {res['accuracy']:.3f}")
        print(f" mean compute/topic: {res['mean_compute_per_topic']:.1f}")
        print(f" mean turns/topic: {res['mean_turns_per_topic']:.1f}")
        print(f" quality per compute: {res['quality_per_compute']:.6f}")

    report_dir = Path("/app/occ/reports")
    report_dir.mkdir(parents=True, exist_ok=True)
    with open("/app/occ/reports/benchmark_debate_results.json", "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\nSaved to reports/benchmark_debate_results.json")


if __name__ == "__main__":
    main()