narcolepticchicken committed on
Commit
e56f288
·
verified ·
1 Parent(s): 4e06751

Upload benchmarks/benchmark_debate_adversarial.py

Browse files
benchmarks/benchmark_debate_adversarial.py CHANGED
@@ -1 +1,100 @@
1
- See /app/occ/benchmarks/benchmark_debate_adversarial.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Overcome Limitation B: Multi-Agent Debate with Adversarial/Bad Agents.
3
+ Shows OCC credit-based filtering when some agents are noisy or adversarial.
4
+ """
5
+ import json
6
+ import random
7
+ from pathlib import Path
8
+ from typing import Dict, List
9
+
10
+ import sys
11
+ sys.path.insert(0, str(Path(__file__).parent.parent))
12
+ from benchmarks.benchmark_debate import DebateBenchmark, DebateTopic, SimulatedDebateAgent
13
+
14
+
15
class AdversarialDebateAgent(SimulatedDebateAgent):
    """Simulated debate agent that is mostly wrong yet reports inflated confidence.

    The defaults configure a low accuracy (0.2) together with a positive
    confidence bias (0.3); on top of that, every proposal's confidence is
    raised by a further 0.2 and capped at 1.0.
    """

    def __init__(self, agent_id: str, accuracy: float = 0.2, confidence_bias: float = 0.3, verbose_prob: float = 0.5):
        """Forward all settings unchanged to ``SimulatedDebateAgent``."""
        super().__init__(
            agent_id,
            accuracy=accuracy,
            confidence_bias=confidence_bias,
            verbose_prob=verbose_prob,
        )

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the parent's proposal with its confidence boosted by 0.2.

        The boosted value never exceeds 1.0. The proposal dict produced by
        the parent class is modified in place and returned.
        """
        proposal = super().propose(topic, prior_proposals)
        inflated = proposal["confidence"] + 0.2
        # Cap at 1.0 so the overconfident agent still reports a value no larger than 1.
        proposal["confidence"] = inflated if inflated < 1.0 else 1.0
        return proposal
24
+
25
+
26
class LazyDebateAgent(SimulatedDebateAgent):
    """Simulated debate agent that contributes minimal effort.

    It is constructed with ``verbose_prob=0.0`` and every proposal it makes
    is overwritten to report 10 tokens and a fixed confidence of 0.3.
    """

    def __init__(self, agent_id: str, accuracy: float = 0.3, confidence_bias: float = -0.2):
        """Forward the settings to ``SimulatedDebateAgent`` with verbosity disabled."""
        super().__init__(
            agent_id,
            accuracy=accuracy,
            confidence_bias=confidence_bias,
            verbose_prob=0.0,
        )

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the parent's proposal with fixed ``tokens`` and ``confidence``.

        The proposal dict produced by the parent class is modified in place
        and returned.
        """
        proposal = super().propose(topic, prior_proposals)
        # Whatever the parent computed, a lazy agent always reports the same
        # small token count and the same low confidence.
        for field, fixed_value in (("tokens", 10), ("confidence", 0.3)):
            proposal[field] = fixed_value
        return proposal
36
+
37
+
38
class DebateAdversarialBenchmark(DebateBenchmark):
    """Debate benchmark whose agent pool mixes good agents with bad ones.

    A configurable fraction of the agents is replaced by
    ``AdversarialDebateAgent`` / ``LazyDebateAgent`` instances, and each topic
    is resolved with one of the resolver methods inherited from
    ``DebateBenchmark`` (defined in another module, not shown here).
    """

    def __init__(self, n_topics: int = 50, n_agents: int = 4, bad_agent_ratio: float = 0.25, seed: int = 42):
        """Initialise the base benchmark and remember the bad-agent fraction.

        Args:
            n_topics: Forwarded to ``DebateBenchmark``.
            n_agents: Forwarded to ``DebateBenchmark``; total size of the pool.
            bad_agent_ratio: Fraction of the pool made up of bad agents.
            seed: Forwarded to ``DebateBenchmark``.
        """
        super().__init__(n_topics=n_topics, n_agents=n_agents, seed=seed)
        self.bad_agent_ratio = bad_agent_ratio

    def create_mixed_agents(self) -> List[SimulatedDebateAgent]:
        """Build the agent pool: good agents first, then the bad ones.

        A ratio of zero (or less) yields no bad agents, so a ``0.0`` run is a
        genuine all-good baseline. Any positive ratio yields at least one bad
        agent (the product is truncated, then floored at 1) and never more
        than ``n_agents``. Bad agents alternate between adversarial (even
        index) and lazy (odd index).

        Returns:
            A list of ``n_agents`` agents when ``n_agents`` is non-negative.
        """
        if self.bad_agent_ratio <= 0:
            # Previously max(1, ...) forced one bad agent even at ratio 0.0,
            # which made the "no bad agents" baseline impossible to run.
            n_bad = 0
        else:
            n_bad = max(1, int(self.n_agents * self.bad_agent_ratio))
            # Never create more bad agents than the pool can hold.
            n_bad = min(n_bad, max(0, self.n_agents))
        n_good = self.n_agents - n_bad

        agents: List[SimulatedDebateAgent] = [
            SimulatedDebateAgent(f"good_{i}", accuracy=0.75, confidence_bias=0.05)
            for i in range(n_good)
        ]
        for i in range(n_bad):
            if i % 2 == 0:
                agents.append(AdversarialDebateAgent(f"bad_adv_{i}", accuracy=0.2, confidence_bias=0.3))
            else:
                agents.append(LazyDebateAgent(f"bad_lazy_{i}", accuracy=0.3))
        return agents

    def run_with_bad_agents(self, strategy: str = "occ") -> Dict:
        """Resolve every topic with a freshly built mixed pool and summarise.

        Args:
            strategy: One of ``"equal_turns"``, ``"occ"``,
                ``"confidence_weighted"`` or ``"majority_vote"``.

        Returns:
            Whatever ``self._summarize`` returns for the per-topic results.

        Raises:
            ValueError: If ``strategy`` is not one of the supported names
                (previously such a call silently summarised an empty list).
        """
        # Resolver names are looked up lazily so that only the method for the
        # requested strategy has to exist on the base class.
        resolver_names = {
            "equal_turns": "_resolve_equal_turns",
            "occ": "_resolve_occ_allocation",
            "confidence_weighted": "_resolve_confidence_weighted",
            "majority_vote": "_resolve_majority_vote",
        }
        if strategy not in resolver_names:
            raise ValueError(
                f"Unknown strategy {strategy!r}; expected one of {sorted(resolver_names)}"
            )

        agents = self.create_mixed_agents()
        topic_results = []
        for topic in self.topics:
            resolve = getattr(self, resolver_names[strategy])
            topic_results.append(resolve(agents, topic))
        return self._summarize(topic_results, strategy)

    def run_all_varying_bad_ratios(self) -> Dict:
        """Run three strategies at bad-agent ratios 0.0, 0.25 and 0.5.

        Progress is printed for every (ratio, strategy) pair. The instance's
        ``bad_agent_ratio`` is changed temporarily during the sweep and
        restored to its previous value afterwards, even if a run fails.

        Returns:
            A dict keyed ``"ratio_<ratio>_<strategy>"`` holding each summary.
        """
        results = {}
        original_ratio = self.bad_agent_ratio
        try:
            for ratio in [0.0, 0.25, 0.5]:
                self.bad_agent_ratio = ratio
                print(f"\n--- Bad agent ratio: {ratio} ---")
                for strategy in ["equal_turns", "confidence_weighted", "occ"]:
                    res = self.run_with_bad_agents(strategy)
                    results[f"ratio_{ratio}_{strategy}"] = res
                    print(f" {strategy}: acc={res['accuracy']:.3f}, compute={res['mean_compute_per_topic']:.1f}, "
                          f"quality_per_compute={res['quality_per_compute']:.6f}")
        finally:
            # Do not leave the benchmark configured with the last sweep value.
            self.bad_agent_ratio = original_ratio
        return results
81
+
82
+
83
def main():
    """Run the bad-agent ratio sweep, print a summary table and save it as JSON.

    The results are written to
    ``/app/occ/reports/benchmark_debate_adversarial_results.json``; the
    directory is created if it does not exist.
    """
    benchmark = DebateAdversarialBenchmark(n_topics=50, n_agents=4, bad_agent_ratio=0.25, seed=42)
    benchmark.generate_topics()
    sweep = benchmark.run_all_varying_bad_ratios()

    rule = "=" * 60
    print("\n" + rule)
    print("ADVERSARIAL DEBATE BENCHMARK")
    print(rule)
    for run_label, summary in sweep.items():
        print(f"{run_label:35s}: acc={summary['accuracy']:.3f}, compute={summary['mean_compute_per_topic']:.1f}, "
              f"quality/compute={summary['quality_per_compute']:.6f}")

    reports_dir = Path("/app/occ/reports")
    reports_dir.mkdir(parents=True, exist_ok=True)
    with open("/app/occ/reports/benchmark_debate_adversarial_results.json", "w") as report_file:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(sweep, report_file, indent=2, default=str)
    print("\nSaved to reports/benchmark_debate_adversarial_results.json")


# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()