"""
Overcome Limitation B: Multi-Agent Debate with Adversarial/Bad Agents.

Shows OCC credit-based filtering when some agents are noisy or adversarial.
"""
import json
import random
import sys
from pathlib import Path
from typing import Dict, List

# Put the grandparent directory of this file on the import path before the
# project import below (presumably so the ``benchmarks`` package resolves when
# this file is run directly as a script -- confirm against the repo layout).
sys.path.insert(0, str(Path(__file__).parent.parent))

from benchmarks.benchmark_debate import DebateBenchmark, DebateTopic, SimulatedDebateAgent

# Maps each supported strategy name to the name of the resolver method that
# handles it. Methods are looked up by name at call time, so a resolver that
# the base class does not provide only fails when that strategy is actually used.
_STRATEGY_RESOLVERS = {
    "equal_turns": "_resolve_equal_turns",
    "occ": "_resolve_occ_allocation",
    "confidence_weighted": "_resolve_confidence_weighted",
    "majority_vote": "_resolve_majority_vote",
}


class AdversarialDebateAgent(SimulatedDebateAgent):
    """Agent that deliberately gives wrong answers with high confidence.

    Defaults to a low ``accuracy`` (0.2) and a positive ``confidence_bias``
    (0.3); both are forwarded unchanged to ``SimulatedDebateAgent``, whose
    interpretation of them is defined in ``benchmarks.benchmark_debate``.
    """

    def __init__(self, agent_id: str, accuracy: float = 0.2,
                 confidence_bias: float = 0.3, verbose_prob: float = 0.5):
        super().__init__(agent_id, accuracy=accuracy,
                         confidence_bias=confidence_bias,
                         verbose_prob=verbose_prob)

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the base proposal with its confidence raised by 0.2.

        The boosted confidence is capped at 1.0.
        """
        prop = super().propose(topic, prior_proposals)
        prop["confidence"] = min(1.0, prop["confidence"] + 0.2)
        return prop


class LazyDebateAgent(SimulatedDebateAgent):
    """Agent that barely participates.

    Always constructed with ``verbose_prob=0.0`` and overrides every proposal
    with a fixed token count and a fixed confidence.
    """

    def __init__(self, agent_id: str, accuracy: float = 0.3,
                 confidence_bias: float = -0.2):
        super().__init__(agent_id, accuracy=accuracy,
                         confidence_bias=confidence_bias, verbose_prob=0.0)

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the base proposal with ``tokens`` forced to 10 and
        ``confidence`` forced to 0.3."""
        prop = super().propose(topic, prior_proposals)
        prop["tokens"] = 10
        prop["confidence"] = 0.3
        return prop


class DebateAdversarialBenchmark(DebateBenchmark):
    """Debate benchmark run against a mix of good and bad agents.

    ``bad_agent_ratio`` controls what fraction of the ``n_agents`` agents are
    bad (adversarial or lazy). The remaining constructor arguments are
    forwarded to ``DebateBenchmark``.
    """

    def __init__(self, n_topics: int = 50, n_agents: int = 4,
                 bad_agent_ratio: float = 0.25, seed: int = 42):
        super().__init__(n_topics=n_topics, n_agents=n_agents, seed=seed)
        self.bad_agent_ratio = bad_agent_ratio

    def create_mixed_agents(self) -> List[SimulatedDebateAgent]:
        """Build the agent pool: good agents first, then bad agents.

        Returns:
            A list of ``self.n_agents`` agents. Bad agents alternate between
            ``AdversarialDebateAgent`` (even index) and ``LazyDebateAgent``
            (odd index).
        """
        if self.bad_agent_ratio <= 0:
            # A zero ratio must give a clean baseline with no bad agents;
            # forcing a minimum of one here would make the 0.0 run
            # indistinguishable from a small positive ratio.
            n_bad = 0
        else:
            # Any positive ratio yields at least one bad agent, and never
            # more than the pool size so the total stays at n_agents.
            n_bad = min(self.n_agents,
                        max(1, int(self.n_agents * self.bad_agent_ratio)))
        n_good = self.n_agents - n_bad

        agents: List[SimulatedDebateAgent] = [
            SimulatedDebateAgent(f"good_{i}", accuracy=0.75, confidence_bias=0.05)
            for i in range(n_good)
        ]
        for i in range(n_bad):
            if i % 2 == 0:
                agents.append(AdversarialDebateAgent(
                    f"bad_adv_{i}", accuracy=0.2, confidence_bias=0.3))
            else:
                agents.append(LazyDebateAgent(f"bad_lazy_{i}", accuracy=0.3))
        return agents

    def run_with_bad_agents(self, strategy: str = "occ") -> Dict:
        """Resolve every topic with a freshly built mixed pool and summarize.

        Args:
            strategy: One of ``"equal_turns"``, ``"occ"``,
                ``"confidence_weighted"`` or ``"majority_vote"``.

        Returns:
            Whatever ``self._summarize`` returns for the per-topic results.

        Raises:
            ValueError: If ``strategy`` is not a supported name. Previously an
                unknown name was silently ignored and an empty result list
                was summarized.
        """
        try:
            resolver_name = _STRATEGY_RESOLVERS[strategy]
        except KeyError:
            raise ValueError(
                f"Unknown strategy {strategy!r}; "
                f"expected one of {sorted(_STRATEGY_RESOLVERS)}"
            ) from None

        agents = self.create_mixed_agents()
        topic_results = [
            getattr(self, resolver_name)(agents, topic) for topic in self.topics
        ]
        return self._summarize(topic_results, strategy)

    def run_all_varying_bad_ratios(self) -> Dict:
        """Run three strategies at bad-agent ratios 0.0, 0.25 and 0.5.

        Prints one progress line per (ratio, strategy) pair.

        Returns:
            A dict keyed ``"ratio_{ratio}_{strategy}"`` holding each run's
            summary.
        """
        results = {}
        # The sweep temporarily overrides the configured ratio; remember it
        # so the instance is left as the caller configured it.
        configured_ratio = self.bad_agent_ratio
        try:
            for ratio in (0.0, 0.25, 0.5):
                self.bad_agent_ratio = ratio
                print(f"\n--- Bad agent ratio: {ratio} ---")
                for strategy in ("equal_turns", "confidence_weighted", "occ"):
                    res = self.run_with_bad_agents(strategy)
                    results[f"ratio_{ratio}_{strategy}"] = res
                    print(f" {strategy}: acc={res['accuracy']:.3f}, compute={res['mean_compute_per_topic']:.1f}, "
                          f"quality_per_compute={res['quality_per_compute']:.6f}")
        finally:
            self.bad_agent_ratio = configured_ratio
        return results


def main():
    """Run the ratio sweep, print a summary table and save the results as JSON."""
    bench = DebateAdversarialBenchmark(n_topics=50, n_agents=4,
                                       bad_agent_ratio=0.25, seed=42)
    bench.generate_topics()
    results = bench.run_all_varying_bad_ratios()

    print("\n" + "=" * 60)
    print("ADVERSARIAL DEBATE BENCHMARK")
    print("=" * 60)
    for label, res in results.items():
        print(f"{label:35s}: acc={res['accuracy']:.3f}, compute={res['mean_compute_per_topic']:.1f}, "
              f"quality/compute={res['quality_per_compute']:.6f}")

    reports_dir = Path("/app/occ/reports")
    reports_dir.mkdir(parents=True, exist_ok=True)
    with open(reports_dir / "benchmark_debate_adversarial_results.json", "w") as f:
        # default=str stringifies any value json cannot serialize natively.
        json.dump(results, f, indent=2, default=str)
    print("\nSaved to reports/benchmark_debate_adversarial_results.json")


if __name__ == "__main__":
    main()