Upload benchmarks/benchmark_debate_adversarial.py
Browse files
benchmarks/benchmark_debate_adversarial.py
CHANGED
|
@@ -1 +1,100 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Overcome Limitation B: Multi-Agent Debate with Adversarial/Bad Agents.
|
| 3 |
+
Shows OCC credit-based filtering when some agents are noisy or adversarial.
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import random
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, List
|
| 9 |
+
|
| 10 |
+
import sys
|
| 11 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 12 |
+
from benchmarks.benchmark_debate import DebateBenchmark, DebateTopic, SimulatedDebateAgent
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class AdversarialDebateAgent(SimulatedDebateAgent):
    """Simulated debate agent that overstates its confidence.

    The constructor defaults to a low ``accuracy`` (0.2) and a positive
    ``confidence_bias`` (0.3); both are forwarded unchanged to
    ``SimulatedDebateAgent``. On top of whatever the base class produces,
    every proposal's confidence is raised by 0.2 and capped at 1.0.
    """

    def __init__(self, agent_id: str, accuracy: float = 0.2, confidence_bias: float = 0.3, verbose_prob: float = 0.5):
        """Forward all settings to the base simulated agent."""
        super().__init__(
            agent_id,
            accuracy=accuracy,
            confidence_bias=confidence_bias,
            verbose_prob=verbose_prob,
        )

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the base proposal with its confidence inflated.

        Args:
            topic: The topic being debated.
            prior_proposals: Proposals made earlier in the debate.

        Returns:
            The proposal dict from the base class, with ``"confidence"``
            increased by 0.2 and never exceeding 1.0.
        """
        proposal = super().propose(topic, prior_proposals)
        inflated = proposal["confidence"] + 0.2
        # Cap at full certainty so the value stays a valid confidence.
        proposal["confidence"] = inflated if inflated < 1.0 else 1.0
        return proposal
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class LazyDebateAgent(SimulatedDebateAgent):
    """Simulated debate agent that contributes minimal effort.

    The base agent is configured with ``verbose_prob=0.0``, and every
    proposal is rewritten to report a fixed 10 tokens and a fixed
    confidence of 0.3.
    """

    def __init__(self, agent_id: str, accuracy: float = 0.3, confidence_bias: float = -0.2):
        """Forward settings to the base agent, pinning ``verbose_prob`` to 0.0."""
        super().__init__(
            agent_id,
            accuracy=accuracy,
            confidence_bias=confidence_bias,
            verbose_prob=0.0,
        )

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the base proposal with fixed token count and confidence.

        Args:
            topic: The topic being debated.
            prior_proposals: Proposals made earlier in the debate.

        Returns:
            The proposal dict from the base class, with ``"tokens"`` set to
            10 and ``"confidence"`` set to 0.3.
        """
        proposal = super().propose(topic, prior_proposals)
        # Overwrite whatever the base class computed for these two fields.
        proposal.update(tokens=10, confidence=0.3)
        return proposal
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class DebateAdversarialBenchmark(DebateBenchmark):
    """Debate benchmark run with a pool that mixes good and bad agents.

    A fraction ``bad_agent_ratio`` of the ``n_agents`` pool is replaced by
    ``AdversarialDebateAgent`` / ``LazyDebateAgent`` instances; the rest are
    plain ``SimulatedDebateAgent`` instances. Topic resolution and result
    summarization are delegated to methods inherited from ``DebateBenchmark``.
    """

    # Maps a public strategy name to the inherited resolver method that
    # handles it. Resolvers are looked up by name so a missing resolver only
    # fails when that strategy is actually requested.
    _STRATEGY_RESOLVERS = {
        "equal_turns": "_resolve_equal_turns",
        "occ": "_resolve_occ_allocation",
        "confidence_weighted": "_resolve_confidence_weighted",
        "majority_vote": "_resolve_majority_vote",
    }

    def __init__(self, n_topics: int = 50, n_agents: int = 4, bad_agent_ratio: float = 0.25, seed: int = 42):
        """Initialize the base benchmark and record the bad-agent ratio.

        Args:
            n_topics: Number of debate topics, forwarded to the base class.
            n_agents: Size of the agent pool, forwarded to the base class.
            bad_agent_ratio: Fraction of the pool made up of bad agents.
            seed: Random seed, forwarded to the base class.
        """
        super().__init__(n_topics=n_topics, n_agents=n_agents, seed=seed)
        self.bad_agent_ratio = bad_agent_ratio

    def create_mixed_agents(self) -> List[SimulatedDebateAgent]:
        """Build the agent pool for the current ``bad_agent_ratio``.

        A ratio of zero (or below) yields no bad agents, so it can serve as a
        clean baseline. Any positive ratio yields at least one bad agent, and
        the bad-agent count never exceeds ``n_agents``.

        Returns:
            Good agents first, followed by bad agents that alternate between
            adversarial (even index) and lazy (odd index).
        """
        if self.bad_agent_ratio <= 0:
            # A zero ratio must mean "no bad agents"; otherwise the baseline
            # is indistinguishable from the smallest non-zero ratio.
            n_bad = 0
        else:
            n_bad = max(1, int(self.n_agents * self.bad_agent_ratio))
            # Never create more bad agents than the pool can hold.
            n_bad = min(n_bad, self.n_agents)
        n_good = self.n_agents - n_bad

        agents: List[SimulatedDebateAgent] = [
            SimulatedDebateAgent(f"good_{i}", accuracy=0.75, confidence_bias=0.05)
            for i in range(n_good)
        ]
        for i in range(n_bad):
            if i % 2 == 0:
                agents.append(AdversarialDebateAgent(f"bad_adv_{i}", accuracy=0.2, confidence_bias=0.3))
            else:
                agents.append(LazyDebateAgent(f"bad_lazy_{i}", accuracy=0.3))
        return agents

    def run_with_bad_agents(self, strategy: str = "occ") -> Dict:
        """Resolve every topic with a mixed pool using one strategy.

        Args:
            strategy: One of ``"equal_turns"``, ``"occ"``,
                ``"confidence_weighted"`` or ``"majority_vote"``.

        Returns:
            The summary produced by the inherited ``_summarize`` for the
            per-topic results.

        Raises:
            ValueError: If ``strategy`` is not a recognized name. Previously
                an unknown name was ignored and an empty result list was
                summarized.
        """
        try:
            resolver_name = self._STRATEGY_RESOLVERS[strategy]
        except KeyError:
            known = ", ".join(sorted(self._STRATEGY_RESOLVERS))
            raise ValueError(f"Unknown strategy {strategy!r}; expected one of: {known}") from None

        agents = self.create_mixed_agents()
        resolve = getattr(self, resolver_name)
        topic_results = [resolve(agents, topic) for topic in self.topics]
        return self._summarize(topic_results, strategy)

    def run_all_varying_bad_ratios(self) -> Dict:
        """Run three strategies at bad-agent ratios 0.0, 0.25 and 0.5.

        Prints one progress line per (ratio, strategy) pair. The instance's
        ``bad_agent_ratio`` is changed temporarily for the sweep and restored
        before returning, even if a run raises.

        Returns:
            Dict keyed ``"ratio_{ratio}_{strategy}"`` holding each run's
            summary.
        """
        results = {}
        configured_ratio = self.bad_agent_ratio
        try:
            for ratio in [0.0, 0.25, 0.5]:
                self.bad_agent_ratio = ratio
                print(f"\n--- Bad agent ratio: {ratio} ---")
                for strategy in ["equal_turns", "confidence_weighted", "occ"]:
                    res = self.run_with_bad_agents(strategy)
                    results[f"ratio_{ratio}_{strategy}"] = res
                    print(f" {strategy}: acc={res['accuracy']:.3f}, compute={res['mean_compute_per_topic']:.1f}, "
                          f"quality_per_compute={res['quality_per_compute']:.6f}")
        finally:
            # Do not leave the benchmark stuck on the last swept ratio.
            self.bad_agent_ratio = configured_ratio
        return results
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def main(output_dir: str = "/app/occ/reports"):
    """Run the adversarial debate benchmark and save the results as JSON.

    Builds a 50-topic, 4-agent benchmark, sweeps the bad-agent ratios,
    prints a summary table, and writes all results to
    ``benchmark_debate_adversarial_results.json`` inside ``output_dir``.

    Args:
        output_dir: Directory for the JSON report; created if missing.
            Defaults to the original hard-coded location.
    """
    bench = DebateAdversarialBenchmark(n_topics=50, n_agents=4, bad_agent_ratio=0.25, seed=42)
    bench.generate_topics()
    results = bench.run_all_varying_bad_ratios()

    print("\n" + "=" * 60)
    print("ADVERSARIAL DEBATE BENCHMARK")
    print("=" * 60)
    for label, res in results.items():
        print(f"{label:35s}: acc={res['accuracy']:.3f}, compute={res['mean_compute_per_topic']:.1f}, "
              f"quality/compute={res['quality_per_compute']:.6f}")

    report_dir = Path(output_dir)
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = report_dir / "benchmark_debate_adversarial_results.json"
    with open(report_path, "w", encoding="utf-8") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(results, f, indent=2, default=str)
    # Report the real destination rather than a relative path that may not
    # match where the file was written.
    print(f"\nSaved to {report_path}")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Script entry point: run the benchmark only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|