File size: 4,515 Bytes
e56f288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Overcome Limitation B: Multi-Agent Debate with Adversarial/Bad Agents.
Shows OCC credit-based filtering when some agents are noisy or adversarial.
"""
import json
import random
from pathlib import Path
from typing import Dict, List

import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from benchmarks.benchmark_debate import DebateBenchmark, DebateTopic, SimulatedDebateAgent


class AdversarialDebateAgent(SimulatedDebateAgent):
    """Simulated agent with low default accuracy and inflated confidence.

    Defaults to ``accuracy=0.2`` and ``confidence_bias=0.3``; how those values
    (and ``verbose_prob``) are interpreted is defined by
    ``SimulatedDebateAgent``, which is not part of this module.
    """

    def __init__(
        self,
        agent_id: str,
        accuracy: float = 0.2,
        confidence_bias: float = 0.3,
        verbose_prob: float = 0.5,
    ):
        """Forward every setting unchanged to the base agent constructor."""
        super().__init__(
            agent_id,
            accuracy=accuracy,
            confidence_bias=confidence_bias,
            verbose_prob=verbose_prob,
        )

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the base-class proposal with its confidence raised by 0.2.

        The boosted confidence is capped at 1.0. All other fields of the
        proposal are returned exactly as the base class produced them.
        """
        proposal = super().propose(topic, prior_proposals)
        boosted_confidence = proposal["confidence"] + 0.2
        proposal["confidence"] = min(1.0, boosted_confidence)
        return proposal


class LazyDebateAgent(SimulatedDebateAgent):
    """Simulated agent that contributes minimal, low-confidence proposals.

    The base agent is always constructed with ``verbose_prob=0.0``, and every
    proposal is overwritten to report 10 tokens and a confidence of 0.3.
    """

    def __init__(
        self,
        agent_id: str,
        accuracy: float = 0.3,
        confidence_bias: float = -0.2,
    ):
        """Build the base agent with verbosity disabled (``verbose_prob=0.0``)."""
        super().__init__(
            agent_id,
            accuracy=accuracy,
            confidence_bias=confidence_bias,
            verbose_prob=0.0,
        )

    def propose(self, topic: DebateTopic, prior_proposals: List[Dict]) -> Dict:
        """Return the base-class proposal with fixed token count and confidence.

        Whatever the base class computed for ``"tokens"`` and ``"confidence"``
        is replaced by the constants 10 and 0.3 respectively.
        """
        proposal = super().propose(topic, prior_proposals)
        # Fixed values override whatever the base class generated.
        proposal["tokens"] = 10
        proposal["confidence"] = 0.3
        return proposal


class DebateAdversarialBenchmark(DebateBenchmark):
    """Debate benchmark whose agent pool mixes good, adversarial and lazy agents.

    Topic generation, the per-strategy ``_resolve_*`` methods and
    ``_summarize`` are inherited from ``DebateBenchmark`` (defined in another
    module); this class only controls which agents take part.
    """

    # Strategy name -> name of the inherited resolver method. Names are looked
    # up lazily so a strategy is only required to exist when it is requested.
    _RESOLVER_METHOD_BY_STRATEGY = {
        "equal_turns": "_resolve_equal_turns",
        "occ": "_resolve_occ_allocation",
        "confidence_weighted": "_resolve_confidence_weighted",
        "majority_vote": "_resolve_majority_vote",
    }

    def __init__(self, n_topics: int = 50, n_agents: int = 4, bad_agent_ratio: float = 0.25, seed: int = 42):
        """Initialise the base benchmark and remember the bad-agent ratio.

        Args:
            n_topics: Forwarded to ``DebateBenchmark``.
            n_agents: Forwarded to ``DebateBenchmark``; total pool size.
            bad_agent_ratio: Fraction of the pool that should be bad agents.
            seed: Forwarded to ``DebateBenchmark``.
        """
        super().__init__(n_topics=n_topics, n_agents=n_agents, seed=seed)
        self.bad_agent_ratio = bad_agent_ratio

    def create_mixed_agents(self) -> List[SimulatedDebateAgent]:
        """Build a pool of ``n_agents`` agents, good ones first, then bad ones.

        A positive ``bad_agent_ratio`` always yields at least one bad agent
        (and never more than ``n_agents``); a ratio of zero or less yields
        none, so it can serve as a clean baseline. Bad agents alternate between
        adversarial (even index) and lazy (odd index).

        Returns:
            The list of agents, good agents followed by bad agents.
        """
        if self.bad_agent_ratio > 0:
            n_bad = max(1, int(self.n_agents * self.bad_agent_ratio))
            # Clamp so a ratio above 1.0 cannot produce a negative good count.
            n_bad = min(self.n_agents, n_bad)
        else:
            # Previously max(1, ...) forced one bad agent even at ratio 0.0,
            # making the "no bad agents" configuration impossible.
            n_bad = 0
        n_good = self.n_agents - n_bad

        agents = [
            SimulatedDebateAgent(f"good_{i}", accuracy=0.75, confidence_bias=0.05)
            for i in range(n_good)
        ]
        for i in range(n_bad):
            if i % 2 == 0:
                agents.append(AdversarialDebateAgent(f"bad_adv_{i}", accuracy=0.2, confidence_bias=0.3))
            else:
                agents.append(LazyDebateAgent(f"bad_lazy_{i}", accuracy=0.3))
        return agents

    def run_with_bad_agents(self, strategy: str = "occ") -> Dict:
        """Resolve every topic with a freshly built mixed pool and summarise.

        Args:
            strategy: One of ``"equal_turns"``, ``"occ"``,
                ``"confidence_weighted"`` or ``"majority_vote"``.

        Returns:
            Whatever ``_summarize(topic_results, strategy)`` returns.

        Raises:
            ValueError: If ``strategy`` is not one of the supported names
                (previously such a call silently summarised an empty list).
        """
        try:
            resolver_name = self._RESOLVER_METHOD_BY_STRATEGY[strategy]
        except KeyError:
            valid = ", ".join(sorted(self._RESOLVER_METHOD_BY_STRATEGY))
            raise ValueError(f"Unknown strategy {strategy!r}; expected one of: {valid}") from None

        agents = self.create_mixed_agents()
        resolve = getattr(self, resolver_name)
        topic_results = [resolve(agents, topic) for topic in self.topics]
        return self._summarize(topic_results, strategy)

    def run_all_varying_bad_ratios(self) -> Dict:
        """Run three strategies at bad-agent ratios 0.0, 0.25 and 0.5.

        Prints one progress line per (ratio, strategy) pair. The instance's
        ``bad_agent_ratio`` is changed temporarily during the sweep and
        restored afterwards, even if a run raises.

        Returns:
            Dict keyed ``"ratio_{ratio}_{strategy}"`` holding each run's summary.
        """
        results = {}
        configured_ratio = self.bad_agent_ratio
        try:
            for ratio in [0.0, 0.25, 0.5]:
                self.bad_agent_ratio = ratio
                print(f"\n--- Bad agent ratio: {ratio} ---")
                for strategy in ["equal_turns", "confidence_weighted", "occ"]:
                    res = self.run_with_bad_agents(strategy)
                    results[f"ratio_{ratio}_{strategy}"] = res
                    print(f"  {strategy}: acc={res['accuracy']:.3f}, compute={res['mean_compute_per_topic']:.1f}, "
                          f"quality_per_compute={res['quality_per_compute']:.6f}")
        finally:
            # Do not leak the last swept ratio into later uses of this instance.
            self.bad_agent_ratio = configured_ratio
        return results


def main():
    """Run the bad-agent ratio sweep, print a summary table and save it as JSON.

    Results are written to
    ``/app/occ/reports/benchmark_debate_adversarial_results.json``; the
    directory is created if it does not exist.
    """
    benchmark = DebateAdversarialBenchmark(
        n_topics=50,
        n_agents=4,
        bad_agent_ratio=0.25,
        seed=42,
    )
    benchmark.generate_topics()
    results = benchmark.run_all_varying_bad_ratios()

    # Summary table framed by a 60-character rule.
    rule = "=" * 60
    print("\n" + rule)
    print("ADVERSARIAL DEBATE BENCHMARK")
    print(rule)
    for label, res in results.items():
        print(f"{label:35s}: acc={res['accuracy']:.3f}, compute={res['mean_compute_per_topic']:.1f}, "
              f"quality/compute={res['quality_per_compute']:.6f}")

    reports_dir = Path("/app/occ/reports")
    reports_dir.mkdir(parents=True, exist_ok=True)
    with open("/app/occ/reports/benchmark_debate_adversarial_results.json", "w") as report_file:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(results, report_file, indent=2, default=str)
    # NOTE: the message shows a relative path although the file is written
    # under the absolute /app/occ/reports directory.
    print("\nSaved to reports/benchmark_debate_adversarial_results.json")


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()