narcolepticchicken committed on
Commit
71e5b0f
·
verified ·
1 Parent(s): c130a96

Upload benchmarks/benchmark_debate_v2.py

Browse files
Files changed (1) hide show
  1. benchmarks/benchmark_debate_v2.py +475 -0
benchmarks/benchmark_debate_v2.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark 3 v2: Multi-Agent Debate with Variable Token Costs and Adversarial Agents
3
+
4
+ Key improvements over v1:
5
+ - Agents have variable cost_per_turn (50 vs 500 tokens) — exposes OCC's advantage
6
+ - Adversarial overconfident agents (high verbosity, low accuracy)
7
+ - Tracks influence efficiency (correct flips per token)
8
+ - Measures bad-agent containment
9
+
10
+ From v1: all agents had similar token costs, limiting compute savings to ~12%.
11
+ With variable costs, OCC should show >>30% savings by denying expensive wrong agents.
12
+ """
13
+
14
+ import json
15
+ import random
16
+ from dataclasses import dataclass, field
17
+ from pathlib import Path
18
+ from typing import Dict, List, Optional, Any
19
+
20
+ import numpy as np
21
+
22
+ import sys
23
+ sys.path.insert(0, str(Path(__file__).parent.parent))
24
+ from oracle.oracle import ImpactOracle, OracleResult
25
+ from ledger.ledger import CreditLedger
26
+ from broker.broker import ResourceBroker, Decision
27
+
28
+
29
@dataclass
class DebateTopic:
    """One multiple-choice question put to the debating agents.

    ``correct_answer`` is the ground truth a proposal is compared against;
    ``distractors`` are the wrong answers an agent picks from when it errs.
    """

    question: str
    correct_answer: str
    distractors: List[str]
34
+
35
+
36
@dataclass
class AgentConfig:
    """Static behaviour profile of one simulated debate participant."""

    agent_id: str
    accuracy: float  # probability that a proposal is the correct answer
    cost_per_turn: int  # baseline token cost of a single debate turn
    confidence_bias: float  # additive shift on the self-reported confidence
    verbose_prob: float  # chance that a turn is padded to 4x the baseline cost
    is_adversarial: bool = False  # overconfident whenever its answer is wrong
44
+
45
+
46
class DebateAgent:
    """Simulated debate participant with configurable cost and behavior.

    Every call to :meth:`propose` draws from the module-level ``random``
    generator, so results are reproducible only when the caller seeds it.
    The instance keeps running totals (tokens, turns, influence and dissent
    counters) across every topic it takes part in.
    """

    def __init__(self, config: "AgentConfig"):
        self.config = config
        self.tokens_used = 0
        self.turns_taken = 0
        self.influence_score = 0.0
        # Dissent counters: proposals that disagreed with the running majority,
        # split by whether the dissenting answer was right or wrong.
        self.correct_flips = 0
        self.wrong_flips = 0

    @staticmethod
    def _majority_answer(prior_proposals: List[Dict]) -> str:
        """Return the most frequent prior answer; ties go to the earliest one.

        A dict keeps insertion order and ``max`` returns the first maximal
        key, so the result does not depend on string hash randomisation
        (unlike taking ``max`` over a ``set`` of the answers).
        """
        tally: Dict[str, int] = {}
        for earlier in prior_proposals:
            tally[earlier["answer"]] = tally.get(earlier["answer"], 0) + 1
        return max(tally, key=tally.get)

    def propose(self, topic: "DebateTopic", prior_proposals: List[Dict]) -> Dict:
        """Take one debate turn and return the proposal record.

        Args:
            topic: The question being debated.
            prior_proposals: Proposals made so far on this topic (each a dict
                with at least an ``"answer"`` key); may be empty.

        Returns:
            A dict with keys ``agent_id``, ``answer``, ``confidence``,
            ``correct``, ``tokens``, ``influence`` and ``is_adversarial``.
        """
        cfg = self.config
        self.turns_taken += 1

        # Token cost: a verbose turn costs 4x the baseline, otherwise the
        # baseline plus a small jitter. Never below 10 tokens.
        if random.random() < cfg.verbose_prob:
            tokens = cfg.cost_per_turn * 4
        else:
            tokens = cfg.cost_per_turn + random.randint(-10, 20)
        tokens = max(10, tokens)
        self.tokens_used += tokens

        # Answer: correct with probability `accuracy`, else a random distractor.
        correct = random.random() < cfg.accuracy
        answer = topic.correct_answer if correct else random.choice(topic.distractors)

        # Self-reported confidence, clamped to [0, 1].
        if correct:
            confidence = 0.7 + random.random() * 0.3 + cfg.confidence_bias
        elif cfg.is_adversarial:
            # Adversarial agents are overconfident about wrong answers.
            confidence = 0.8 + random.random() * 0.2
        else:
            confidence = 0.4 + random.random() * 0.4 + cfg.confidence_bias
        confidence = max(0.0, min(1.0, confidence))

        # Influence: disagreeing with the current majority counts for more.
        if not prior_proposals:
            influence = 0.3
        elif answer == self._majority_answer(prior_proposals):
            influence = 0.1
        else:
            influence = 0.5
            if correct:
                self.correct_flips += 1
            else:
                self.wrong_flips += 1
        self.influence_score += influence

        return {
            "agent_id": cfg.agent_id,
            "answer": answer,
            "confidence": confidence,
            "correct": correct,
            "tokens": tokens,
            "influence": influence,
            "is_adversarial": cfg.is_adversarial,
        }
113
+
114
+
115
class DebateBenchmarkV2:
    """v2: Variable-cost agents + adversarial scenarios.

    Runs one shared topic list through several resolution strategies
    (equal turns, majority vote, confidence-weighted vote, and OCC credit
    allocation with and without decay) and summarises accuracy, token use
    and how many turns/tokens went to adversarial agents.

    All randomness comes from the module-level ``random`` generator, which
    :meth:`generate_topics` seeds from ``seed``.
    """

    def __init__(
        self,
        n_topics: int = 50,
        n_agents: int = 5,
        budget_per_topic: float = 2000.0,
        adversarial_fraction: float = 0.4,  # 40% of agents are adversarial
        seed: int = 42,
    ):
        self.n_topics = n_topics
        self.n_agents = n_agents
        self.budget_per_topic = budget_per_topic
        self.adversarial_fraction = adversarial_fraction
        self.seed = seed
        self.topics: List["DebateTopic"] = []
        self.oracle = ImpactOracle(compute_budget=budget_per_topic)

    def create_agents(self) -> List["AgentConfig"]:
        """Create agents with variable costs and adversarial mix.

        ``int(n_agents * adversarial_fraction)`` agents are adversarial; the
        rest are built from three normal templates. When more than three
        normal agents are needed the templates are reused with a numeric
        suffix (``agent_fast_2``, ...) so the returned list always has
        ``n_agents`` entries with unique ids. The list is shuffled.
        """
        n_adversarial = int(self.n_agents * self.adversarial_fraction)
        n_normal = self.n_agents - n_adversarial

        # (agent_id, accuracy, cost_per_turn, confidence_bias, verbose_prob)
        templates = [
            ("agent_fast", 0.70, 50, 0.05, 0.05),
            ("agent_medium", 0.65, 200, 0.10, 0.10),
            ("agent_expensive", 0.72, 500, 0.02, 0.05),
        ]

        configs = []
        for i in range(n_normal):
            name, accuracy, cost, bias, verbose = templates[i % len(templates)]
            cycle = i // len(templates)
            configs.append(AgentConfig(
                name if cycle == 0 else f"{name}_{cycle + 1}",
                accuracy=accuracy,
                cost_per_turn=cost,
                confidence_bias=bias,
                verbose_prob=verbose,
            ))

        # Adversarial agents: high cost, low accuracy, overconfident
        for i in range(n_adversarial):
            configs.append(AgentConfig(
                agent_id=f"agent_adversarial_{i+1}",
                accuracy=0.35 + random.random() * 0.15,  # 35-50% accuracy
                cost_per_turn=300 + random.randint(0, 300),  # Expensive
                confidence_bias=0.30,  # Overconfident
                verbose_prob=0.40,  # Verbose
                is_adversarial=True,
            ))

        random.shuffle(configs)
        return configs

    def generate_topics(self):
        """Seed the RNGs and (re)build ``self.topics`` with ``n_topics`` entries.

        Topics are taken round-robin from a fixed pool. The list is rebuilt
        from scratch so that calling this method twice does not duplicate
        topics.
        """
        random.seed(self.seed)
        np.random.seed(self.seed)

        topic_pool = [
            ("What is 15 * 17?", "255", ["245", "265", "225", "275"]),
            ("Capital of Australia?", "Canberra", ["Sydney", "Melbourne", "Perth", "Brisbane"]),
            ("Author of '1984'?", "George Orwell", ["Aldous Huxley", "Ray Bradbury", "H.G. Wells", "Kurt Vonnegut"]),
            ("Square root of 256?", "16", ["14", "18", "12", "20"]),
            ("Element with symbol Au?", "Gold", ["Silver", "Aluminum", "Argon", "Astatine"]),
            ("Year WWI ended?", "1918", ["1919", "1917", "1920", "1916"]),
            ("Smallest prime number?", "2", ["1", "3", "0", "-1"]),
            ("Largest planet?", "Jupiter", ["Saturn", "Neptune", "Uranus", "Earth"]),
            ("Speed of light (m/s)?", "299792458", ["300000000", "299000000", "310000000", "280000000"]),
            ("First US president?", "George Washington", ["Thomas Jefferson", "John Adams", "Abraham Lincoln", "Benjamin Franklin"]),
            ("Chemical formula of water?", "H2O", ["HO2", "H2O2", "HO", "OH"]),
            ("Number of continents?", "7", ["5", "6", "8", "4"]),
            ("Distance from Earth to Sun (km)?", "149600000", ["150000000", "148000000", "151000000", "147000000"]),
            ("Primary language of Brazil?", "Portuguese", ["Spanish", "English", "French", "Italian"]),
            ("Formula for area of circle?", "pi*r^2", ["2*pi*r", "pi*d", "r^2*pi/2", "pi*r"]),
        ]

        self.topics = []
        for i in range(self.n_topics):
            question, answer, distractors = topic_pool[i % len(topic_pool)]
            self.topics.append(
                DebateTopic(question=question, correct_answer=answer, distractors=distractors)
            )

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _top_scored(scores: Dict[str, float]) -> Optional[str]:
        """Return the highest-scoring answer, or ``None`` when nothing was proposed.

        ``scores`` is filled in proposal order and ``max`` returns the first
        maximal key, so ties go to the answer proposed earliest. This keeps
        seeded runs reproducible regardless of string hash randomisation.
        """
        return max(scores, key=scores.get) if scores else None

    @staticmethod
    def _collect(agents: List["DebateAgent"], topic: "DebateTopic", turns_per: int = 1):
        """Let every agent speak ``turns_per`` times in a row; return (proposals, tokens)."""
        proposals: List[Dict] = []
        compute_used = 0.0
        for agent in agents:
            for _ in range(turns_per):
                prop = agent.propose(topic, proposals)
                proposals.append(prop)
                compute_used += prop["tokens"]
        return proposals, compute_used

    @staticmethod
    def _package(strategy: str, topic: "DebateTopic", final: Optional[str],
                 proposals: List[Dict], compute_used: float, n_turns: int) -> Dict:
        """Build the per-topic result record shared by all strategies."""
        adversarial = [p for p in proposals if p.get("is_adversarial")]
        return {
            "strategy": strategy,
            "correct": final == topic.correct_answer, "final_answer": final,
            "compute_used": compute_used, "n_turns": n_turns,
            "proposals": proposals,
            "adversarial_turns": len(adversarial),
            "bad_agent_tokens": sum(p["tokens"] for p in adversarial),
        }

    def _settle_turn(self, ledger, agent: "DebateAgent", topic: "DebateTopic", prop: Dict,
                     turn_no: int, previous_correct: bool, reward_scale: float) -> None:
        """Score one proposal with the oracle and credit or debit the agent.

        The oracle is consulted for every proposal. A correct proposal earns
        ``reward_value * reward_scale`` credits; a wrong one is charged in
        proportion to its token cost (``tokens / 500``).
        """
        agent_id = agent.config.agent_id
        task_key = topic.question[:30]
        oracle_res = self.oracle.score(
            mode="debate",
            action={"tokens_used": prop["tokens"]},
            context={"previous_correct": previous_correct},
            result={
                "final_correct": prop["correct"],
                "agent_contribution": prop["influence"],
                "compute_cost": prop["tokens"],
                "tokens_used": prop["tokens"],
                "total_turns": turn_no,
            },
            agent_id=agent_id,
        )
        if prop["correct"]:
            ledger.earn(agent_id, task_key, f"turn_{turn_no}",
                        oracle_res.reward_value * reward_scale, oracle_res.raw_score,
                        prop["tokens"], "correct")
        else:
            # Wrong cost: proportional to token cost
            ledger.spend(agent_id, task_key, f"turn_{turn_no}",
                         prop["tokens"] / 500.0, reason="wrong_proposal")

    # ------------------------------------------------------------------
    # Strategies
    # ------------------------------------------------------------------

    def _resolve_equal_turns(self, agents: List["DebateAgent"], topic: "DebateTopic", turns_per: int = 2) -> Dict:
        """Baseline: every agent gets ``turns_per`` turns; plain plurality vote."""
        proposals, compute_used = self._collect(agents, topic, turns_per)
        counts: Dict[str, float] = {}
        for p in proposals:
            counts[p["answer"]] = counts.get(p["answer"], 0) + 1
        return self._package("equal_turns", topic, self._top_scored(counts),
                             proposals, compute_used, len(proposals))

    def _resolve_majority_vote(self, agents: List["DebateAgent"], topic: "DebateTopic") -> Dict:
        """Baseline: one turn per agent; plain plurality vote."""
        proposals, compute_used = self._collect(agents, topic)
        counts: Dict[str, float] = {}
        for p in proposals:
            counts[p["answer"]] = counts.get(p["answer"], 0) + 1
        return self._package("majority_vote", topic, self._top_scored(counts),
                             proposals, compute_used, len(proposals))

    def _resolve_confidence_weighted(self, agents: List["DebateAgent"], topic: "DebateTopic") -> Dict:
        """Baseline: one turn per agent; votes weighted by self-reported confidence."""
        proposals, compute_used = self._collect(agents, topic)
        vote_scores: Dict[str, float] = {}
        for p in proposals:
            vote_scores[p["answer"]] = vote_scores.get(p["answer"], 0.0) + p["confidence"]
        return self._package("confidence_weighted", topic, self._top_scored(vote_scores),
                             proposals, compute_used, len(proposals))

    def _resolve_occ(self, agents: List["DebateAgent"], topic: "DebateTopic",
                     use_decay: bool = True, max_turns: int = 15) -> Dict:
        """OCC with credit allocation and broker gating.

        Every agent is seeded with credits and makes one opening proposal.
        After that, one extra turn per round goes to the highest-balance
        agent the broker allows, until ``max_turns`` is reached, the token
        budget is spent, or no agent is allowed. The final answer is a vote
        weighted by each proposer's closing credit balance.

        ``adversarial_contained`` is true when adversarial agents received no
        turns beyond their mandatory opening proposals.
        """
        ledger = CreditLedger(decay_lambda=0.1 if use_decay else 0.0)
        broker = ResourceBroker()
        proposals: List[Dict] = []
        compute_used = 0.0
        turns = 0

        # Seed each agent with initial credits
        for agent in agents:
            ledger.earn(agent.config.agent_id, topic.question[:30], "seed", 10.0, 0.0, 0.0, "initial_seed")

        # One initial proposal from each agent
        for agent in agents:
            prop = agent.propose(topic, proposals)
            proposals.append(prop)
            compute_used += prop["tokens"]
            turns += 1
            self._settle_turn(ledger, agent, topic, prop, turns,
                              previous_correct=False, reward_scale=5.0)

        # Iterative allocation: best agents get more turns
        while turns < max_turns and compute_used < self.budget_per_topic:
            # Balances are read once per round and reused for the broker call.
            ranked = sorted(
                [(a, ledger.balance(a.config.agent_id)) for a in agents],
                key=lambda pair: pair[1], reverse=True,
            )

            granted = False
            for agent, balance in ranked:
                dec = broker.request(
                    "debate_turn", agent.config.agent_id, balance,
                    task_state={
                        "correct_so_far": any(p["correct"] for p in proposals),
                        "n_adversarial": sum(1 for p in proposals if p.get("is_adversarial")),
                    },
                    gaming_flags=["adversarial_agent"] if agent.config.is_adversarial else [],
                )
                if dec.decision != Decision.ALLOW:
                    continue

                # Whether anyone was right *before* this turn.
                previous_correct = any(p["correct"] for p in proposals)
                prop = agent.propose(topic, proposals)
                proposals.append(prop)
                compute_used += prop["tokens"]
                turns += 1
                self._settle_turn(ledger, agent, topic, prop, turns,
                                  previous_correct=previous_correct, reward_scale=3.0)
                granted = True
                break  # One turn per round

            if not granted:
                break

        # Weighted vote using credit balances
        vote_scores: Dict[str, float] = {}
        for p in proposals:
            weight = max(0.1, ledger.balance(p["agent_id"]))
            vote_scores[p["answer"]] = vote_scores.get(p["answer"], 0.0) + weight

        outcome = self._package("occ_allocation", topic, self._top_scored(vote_scores),
                                proposals, compute_used, turns)
        # Each adversarial agent always takes one opening turn, so containment
        # means "no adversarial turns beyond those openers".
        n_adversarial_agents = sum(1 for a in agents if a.config.is_adversarial)
        outcome["adversarial_contained"] = outcome["adversarial_turns"] <= n_adversarial_agents
        return outcome

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------

    def _summarize(self, results: List[Dict], label: str) -> Dict:
        """Aggregate per-topic results into one summary record.

        Results without an ``adversarial_contained`` key (the baseline
        strategies) are counted as contained.
        """
        n = len(results)
        correct = sum(1 for r in results if r["correct"])
        total_compute = sum(r["compute_used"] for r in results)
        total_turns = sum(r["n_turns"] for r in results)
        total_adv_turns = sum(r.get("adversarial_turns", 0) for r in results)
        total_bad_tokens = sum(r.get("bad_agent_tokens", 0) for r in results)
        contained = sum(1 for r in results if r.get("adversarial_contained", True))

        return {
            "label": label,
            "n_topics": n,
            "accuracy": correct / n if n else 0.0,
            "total_compute": float(total_compute),
            "mean_compute_per_topic": float(total_compute / n) if n else 0.0,
            "mean_turns": float(total_turns / n) if n else 0.0,
            "mean_adv_turns": float(total_adv_turns / n) if n else 0.0,
            "bad_agent_tokens": float(total_bad_tokens),
            "bad_agent_containment": contained / n if n else 0.0,
            "quality_per_1k_tokens": (correct / n) / (total_compute / 1000) if total_compute else 0.0,
            "results": results,
        }

    def run_all(self) -> Dict[str, Dict]:
        """Run every strategy over all topics and return summaries keyed by strategy.

        Topics are generated on demand. Each strategy gets fresh
        ``DebateAgent`` instances built from the same agent configs, and the
        strategies run in a fixed order because they share one RNG stream.
        """
        if not self.topics:
            self.generate_topics()

        agent_configs = self.create_agents()
        print(f"Agents: {[(c.agent_id, c.accuracy, c.cost_per_turn, c.is_adversarial) for c in agent_configs]}")

        plan = [
            ("A_equal_turns", "A. Equal turns",
             lambda ag, t: self._resolve_equal_turns(ag, t)),
            ("B_majority_vote", "B. Majority vote",
             lambda ag, t: self._resolve_majority_vote(ag, t)),
            ("C_confidence_weighted", "C. Confidence-weighted",
             lambda ag, t: self._resolve_confidence_weighted(ag, t)),
            ("E_occ", "E. OCC allocation",
             lambda ag, t: self._resolve_occ(ag, t, use_decay=True)),
            # Ablation: same allocation without credit decay.
            ("F_occ_no_decay", "F. OCC (no decay)",
             lambda ag, t: self._resolve_occ(ag, t, use_decay=False)),
        ]

        strategies = {}
        for key, label, resolve in plan:
            agents = [DebateAgent(c) for c in agent_configs]
            strategies[key] = self._summarize(
                [resolve(agents, t) for t in self.topics], label
            )
        return strategies
436
+
437
+
438
def main():
    """Run the v2 debate benchmark, print a comparison table and save the results.

    The JSON report is written to ``<project root>/reports/``, where the
    project root is the parent of this file's directory (the same root the
    module adds to ``sys.path``).
    """
    bench = DebateBenchmarkV2(n_topics=50, n_agents=5, adversarial_fraction=0.4, seed=42)
    bench.generate_topics()
    results = bench.run_all()

    print("\n" + "=" * 70)
    print("MULTI-AGENT DEBATE BENCHMARK v2 (Variable Costs + Adversarial)")
    print("=" * 70)
    print(f"{'Strategy':<25} {'Acc':>6} {'Comp':>8} {'Turns':>6} {'AdvT':>6} {'BadTok':>8} {'Contain':>8} {'Qual/K':>8}")
    print("-" * 70)
    for key in ["A_equal_turns", "B_majority_vote", "C_confidence_weighted", "E_occ", "F_occ_no_decay"]:
        r = results[key]
        print(f"{r['label']:<25} {r['accuracy']:.3f} {r['mean_compute_per_topic']:>7.0f} {r['mean_turns']:>5.1f} {r['mean_adv_turns']:>5.1f} {r['bad_agent_tokens']:>7.0f} {r['bad_agent_containment']:.2f} {r['quality_per_1k_tokens']:>8.4f}")

    # Best accuracy among the three non-OCC baselines
    baseline_acc = max(
        results[key]["accuracy"]
        for key in ("A_equal_turns", "B_majority_vote", "C_confidence_weighted")
    )

    occ = results["E_occ"]
    equal_compute = results["A_equal_turns"]["mean_compute_per_topic"]
    # Guard the ratio so an empty run reports 0% instead of dividing by zero.
    saving_pct = (1 - occ["mean_compute_per_topic"] / equal_compute) * 100 if equal_compute else 0.0

    print("\n--- Key Comparisons ---")
    print(f"Best baseline accuracy: {baseline_acc:.3f}")
    print(f"OCC accuracy: {occ['accuracy']:.3f}")
    print(f"OCC compute saving vs equal_turns: {saving_pct:.1f}%")
    print(f"OCC bad-agent containment: {occ['bad_agent_containment']:.1%}")
    print(f"Confidence-weighted bad-agent containment: {results['C_confidence_weighted']['bad_agent_containment']:.1%}")

    # Write next to the project instead of a hard-coded absolute path, so the
    # script also works outside the original container layout.
    # NOTE(review): the previous target was /app/occ/reports; this resolves to
    # the same directory only if the project is checked out at /app/occ.
    reports_dir = Path(__file__).resolve().parent.parent / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    out_path = reports_dir / "benchmark_debate_v2_results.json"
    with open(out_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(results, f, indent=2, default=str)
    print(f"\nSaved to {out_path}")


if __name__ == "__main__":
    main()