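"""
Benchmark 3: Multi-Agent Debate Under Shared Compute

Compares:
  A. equal turns
  B. majority vote
  C. confidence-weighted vote
  D. verifier-only allocation
  E. OCC credit allocation
  F. OCC with decay and non-transferability

Uses simulated factual disputes and code debates.

Note: as written, this module's ``run_all`` executes strategies A, B and C
plus a single OCC allocation strategy with decay enabled. Strategy D
(verifier-only allocation) has no implementation in this module, and the
built-in topic pool contains factual questions only.
"""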
|
|
| import json |
| import random |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Dict, List, Optional |
|
|
| import numpy as np |
|
|
| import sys |
| sys.path.insert(0, str(Path(__file__).parent.parent)) |
| from oracle.oracle import ImpactOracle, OracleResult |
| from ledger.ledger import CreditLedger |
| from broker.broker import ResourceBroker, Decision |
|
|
|
|
@dataclass
class DebateTopic:
    """One debate question with its correct answer and wrong alternatives."""

    # Prompt the agents are asked to answer.
    question: str
    # Ground-truth answer; a final answer equal to this counts as correct.
    correct_answer: str
    # Incorrect answers an agent may pick from when it gets the topic wrong.
    distractors: List[str]
|
|
|
|
class SimulatedDebateAgent:
    """
    Simulates a debate participant with variable accuracy and confidence.

    Each call to ``propose`` draws from the global ``random`` module, so
    results are reproducible only when the caller seeds it beforehand.

    Attributes:
        agent_id: Identifier echoed back in every proposal.
        accuracy: Probability that a proposal carries the correct answer.
        confidence_bias: Constant added to the sampled confidence.
        verbose_prob: Probability that a turn costs four times the tokens.
        collude_with: Stored as given; not used by any logic in this class.
        tokens_used: Running total of tokens spent by this agent.
        turns_taken: Number of proposals made so far.
        influence_score: Running sum of per-proposal influence values.
    """

    def __init__(
        self,
        agent_id: str,
        accuracy: float = 0.6,
        confidence_bias: float = 0.1,
        verbose_prob: float = 0.0,
        collude_with: Optional[str] = None,
    ):
        self.agent_id = agent_id
        self.accuracy = accuracy
        self.confidence_bias = confidence_bias
        self.verbose_prob = verbose_prob
        self.collude_with = collude_with
        self.tokens_used = 0
        self.turns_taken = 0
        self.influence_score = 0.0

    @staticmethod
    def _leading_answer(prior_proposals: List[Dict]) -> str:
        """
        Return the most frequent answer among ``prior_proposals``.

        Ties are resolved in favour of the answer that appeared first. The
        tally is an insertion-ordered dict, so the outcome does not depend on
        string hash randomisation the way iterating a ``set`` would.
        """
        tally: Dict[str, int] = {}
        for earlier in prior_proposals:
            tally[earlier["answer"]] = tally.get(earlier["answer"], 0) + 1
        return max(tally, key=tally.get)

    def propose(self, topic: "DebateTopic", prior_proposals: List[Dict]) -> Dict:
        """
        Propose an answer with confidence.

        Args:
            topic: Object exposing ``correct_answer`` and ``distractors``.
            prior_proposals: Proposals already made on this topic; each must
                carry an ``"answer"`` key. May be empty.

        Returns:
            Dict with keys ``agent_id``, ``answer``, ``confidence``,
            ``correct``, ``tokens`` and ``influence``.

        Raises:
            IndexError: If the agent answers wrongly and ``topic.distractors``
                is empty (raised by ``random.choice``).
        """
        self.turns_taken += 1

        # Token cost: 50-100 per turn, quadrupled on a "verbose" turn.
        tokens = 50 + random.randint(0, 50)
        if random.random() < self.verbose_prob:
            tokens *= 4
        self.tokens_used += tokens

        # Decide whether this proposal is right, then pick the answer text.
        correct = random.random() < self.accuracy
        if correct:
            answer = topic.correct_answer
        else:
            answer = random.choice(topic.distractors)

        # Correct answers draw from a higher confidence band than wrong ones;
        # the bias shifts the draw and the result is clamped to [0, 1].
        if correct:
            confidence = 0.7 + random.random() * 0.3 + self.confidence_bias
        else:
            confidence = 0.4 + random.random() * 0.4 + self.confidence_bias
        confidence = max(0.0, min(1.0, confidence))

        # Influence: agreeing with the current majority adds little, while
        # dissenting adds more; an opening proposal sits in between.
        if not prior_proposals:
            influence = 0.3
        elif answer == self._leading_answer(prior_proposals):
            influence = 0.1
        else:
            influence = 0.5

        self.influence_score += influence

        return {
            "agent_id": self.agent_id,
            "answer": answer,
            "confidence": confidence,
            "correct": correct,
            "tokens": tokens,
            "influence": influence,
        }
|
|
|
|
class DebateBenchmark:
    """
    Benchmark multi-agent debate under shared compute budgets.

    Runs several answer-resolution strategies over the same list of topics
    and summarises accuracy and compute for each one.

    Args:
        n_topics: Number of topics to generate (the built-in pool is cycled).
        n_agents: Number of simulated agents; the built-in profiles are
            cycled when more agents are requested than profiles exist.
        budget_per_topic: Token budget per topic for the OCC strategy; also
            passed to the oracle as its compute budget.
        seed: Seed applied to ``random`` and ``numpy.random``.
    """

    # (accuracy, confidence_bias) of the simulated participants, in roster order.
    _AGENT_PROFILES = (
        (0.75, 0.05),
        (0.60, 0.15),
        (0.55, -0.05),
        (0.50, 0.20),
    )

    # Tunables of the OCC strategy.
    _DECAY_LAMBDA = 0.1            # ledger decay when use_decay is on
    _OPENING_REWARD_SCALE = 5.0    # reward multiplier in the opening round
    _FOLLOWUP_REWARD_SCALE = 3.0   # reward multiplier for brokered turns
    _WRONG_PROPOSAL_COST = 0.3     # credits charged for a wrong brokered turn
    _MIN_VOTE_WEIGHT = 0.1         # floor so every proposal keeps some weight

    def __init__(
        self,
        n_topics: int = 50,
        n_agents: int = 4,
        budget_per_topic: float = 500.0,
        seed: int = 42,
    ):
        self.n_topics = n_topics
        self.n_agents = n_agents
        self.budget_per_topic = budget_per_topic
        self.seed = seed
        self.topics: List["DebateTopic"] = []
        self.oracle = ImpactOracle(compute_budget=budget_per_topic)

    # ------------------------------------------------------------------
    # Setup helpers
    # ------------------------------------------------------------------

    def _seed_rng(self) -> None:
        """Seed both global random number generators with ``self.seed``."""
        random.seed(self.seed)
        np.random.seed(self.seed)

    def _build_agents(self) -> List["SimulatedDebateAgent"]:
        """
        Create ``self.n_agents`` agents named ``agent_1`` .. ``agent_N``.

        Raises:
            ValueError: If ``n_agents`` is smaller than 1.
        """
        if self.n_agents < 1:
            raise ValueError(f"n_agents must be >= 1, got {self.n_agents}")
        profiles = self._AGENT_PROFILES
        roster = []
        for idx in range(self.n_agents):
            accuracy, bias = profiles[idx % len(profiles)]
            roster.append(
                SimulatedDebateAgent(
                    f"agent_{idx + 1}", accuracy=accuracy, confidence_bias=bias
                )
            )
        return roster

    def generate_topics(self):
        """
        Seed the RNGs and (re)build ``self.topics`` from the built-in pool.

        The list is rebuilt from scratch, so calling this more than once does
        not accumulate duplicate topics.
        """
        self._seed_rng()

        topic_pool = [
            ("What is 15 * 17?", "255", ["245", "265", "225", "275"]),
            ("Capital of Australia?", "Canberra", ["Sydney", "Melbourne", "Perth", "Brisbane"]),
            ("Author of '1984'?", "George Orwell", ["Aldous Huxley", "Ray Bradbury", "H.G. Wells", "Kurt Vonnegut"]),
            ("Square root of 256?", "16", ["14", "18", "12", "20"]),
            ("Element with symbol Au?", "Gold", ["Silver", "Aluminum", "Argon", "Astatine"]),
            ("Year WWI ended?", "1918", ["1919", "1917", "1920", "1916"]),
            ("Smallest prime number?", "2", ["1", "3", "0", "-1"]),
            ("Largest planet?", "Jupiter", ["Saturn", "Neptune", "Uranus", "Earth"]),
            ("Speed of light (m/s)?", "299792458", ["300000000", "299000000", "310000000", "280000000"]),
            ("First US president?", "George Washington", ["Thomas Jefferson", "John Adams", "Abraham Lincoln", "Benjamin Franklin"]),
        ]

        topics = []
        for i in range(self.n_topics):
            question, answer, distractors = topic_pool[i % len(topic_pool)]
            # Copy the distractor list so topics do not share one mutable list.
            topics.append(
                DebateTopic(
                    question=question,
                    correct_answer=answer,
                    distractors=list(distractors),
                )
            )
        self.topics = topics

    # ------------------------------------------------------------------
    # Shared building blocks
    # ------------------------------------------------------------------

    @staticmethod
    def _majority_answer(answers: List[str]) -> str:
        """
        Return the most frequent entry of ``answers``.

        Ties go to the answer seen first. An insertion-ordered tally is used
        instead of iterating a ``set`` because set order for strings varies
        with hash randomisation, which made tied votes non-reproducible.

        Raises:
            ValueError: If ``answers`` is empty.
        """
        tally: Dict[str, int] = {}
        for answer in answers:
            tally[answer] = tally.get(answer, 0) + 1
        return max(tally, key=tally.get)

    @staticmethod
    def _collect_proposals(agents, topic, turns_per_agent: int = 1) -> List[Dict]:
        """
        Let every agent propose ``turns_per_agent`` times in roster order.

        Each agent takes all of its turns before the next agent starts, and
        every proposal sees all proposals made before it.
        """
        proposals: List[Dict] = []
        for agent in agents:
            for _ in range(turns_per_agent):
                proposals.append(agent.propose(topic, proposals))
        return proposals

    @staticmethod
    def _outcome(strategy: str, topic, final_answer: str, proposals: List[Dict],
                 compute_used: float, n_turns: int) -> Dict:
        """Assemble the per-topic result record shared by all strategies."""
        return {
            "strategy": strategy,
            "correct": final_answer == topic.correct_answer,
            "final_answer": final_answer,
            "compute_used": compute_used,
            "n_turns": n_turns,
            "proposals": proposals,
        }

    # ------------------------------------------------------------------
    # Strategies
    # ------------------------------------------------------------------

    def _resolve_equal_turns(self, agents: List["SimulatedDebateAgent"], topic: "DebateTopic", turns_per_agent: int = 2) -> Dict:
        """Strategy A: equal turns, then majority vote."""
        proposals = self._collect_proposals(agents, topic, turns_per_agent)
        compute_used = float(sum(p["tokens"] for p in proposals))
        final_answer = self._majority_answer([p["answer"] for p in proposals])
        return self._outcome("equal_turns", topic, final_answer, proposals,
                             compute_used, len(proposals))

    def _resolve_majority_vote(self, agents: List["SimulatedDebateAgent"], topic: "DebateTopic", turns_per_agent: int = 2) -> Dict:
        """
        Strategy B: majority vote on first proposal per agent.

        ``turns_per_agent`` is accepted for signature parity with strategy A
        but is not used: every agent proposes exactly once.
        """
        proposals = self._collect_proposals(agents, topic)
        compute_used = float(sum(p["tokens"] for p in proposals))
        final_answer = self._majority_answer([p["answer"] for p in proposals])
        return self._outcome("majority_vote", topic, final_answer, proposals,
                             compute_used, len(proposals))

    def _resolve_confidence_weighted(self, agents: List["SimulatedDebateAgent"], topic: "DebateTopic", turns_per_agent: int = 2) -> Dict:
        """
        Strategy C: confidence-weighted vote.

        ``turns_per_agent`` is accepted for signature parity with strategy A
        but is not used: every agent proposes exactly once.
        """
        proposals = self._collect_proposals(agents, topic)
        compute_used = float(sum(p["tokens"] for p in proposals))

        # Each proposal votes for its answer with its stated confidence.
        vote_scores: Dict[str, float] = {}
        for p in proposals:
            vote_scores[p["answer"]] = vote_scores.get(p["answer"], 0.0) + p["confidence"]
        final_answer = max(vote_scores, key=vote_scores.get)

        return self._outcome("confidence_weighted", topic, final_answer, proposals,
                             compute_used, len(proposals))

    def _settle_turn(
        self,
        ledger,
        agent,
        topic,
        prop: Dict,
        turn_index: int,
        previous_correct: bool,
        reward_scale: float,
        wrong_cost: Optional[float] = None,
    ) -> None:
        """
        Score one proposal with the oracle and book the result in the ledger.

        A correct proposal earns ``reward_value * reward_scale``. A wrong one
        is charged ``wrong_cost`` when that is given and is otherwise free.
        """
        oracle_res = self.oracle.score(
            mode="debate",
            action={"tokens_used": prop["tokens"]},
            context={"previous_correct": previous_correct},
            result={
                "final_correct": prop["correct"],
                "agent_contribution": prop["influence"],
                "compute_cost": prop["tokens"],
                "tokens_used": prop["tokens"],
                "total_turns": turn_index,
            },
            agent_id=agent.agent_id,
        )

        task_id = topic.question[:30]
        action_id = f"turn_{turn_index}"
        if prop["correct"]:
            ledger.earn(
                agent_id=agent.agent_id,
                task_id=task_id,
                action_id=action_id,
                amount=oracle_res.reward_value * reward_scale,
                oracle_score=oracle_res.raw_score,
                compute_cost=prop["tokens"],
                reason="correct_proposal",
            )
        elif wrong_cost is not None:
            ledger.spend(
                agent_id=agent.agent_id,
                task_id=task_id,
                action_id=action_id,
                amount=wrong_cost,
                reason="wrong_proposal_cost",
            )

    def _resolve_occ_allocation(
        self,
        agents: List["SimulatedDebateAgent"],
        topic: "DebateTopic",
        max_turns: int = 12,
        use_decay: bool = True,
    ) -> Dict:
        """
        Strategy E/F: OCC allocates turns based on marginal contribution.

        Phase 1 gives every agent one opening proposal. Phase 2 repeatedly
        offers the next turn to the agent with the highest ledger balance
        until ``max_turns`` or the per-topic budget is reached, or the broker
        denies both the top and the runner-up agent. The final answer is a
        vote weighted by each proposer's ledger balance.

        Args:
            agents: Debate participants.
            topic: Topic under debate.
            max_turns: Upper bound on total proposals for this topic.
            use_decay: When false the ledger is created with zero decay.
        """
        ledger = CreditLedger(decay_lambda=self._DECAY_LAMBDA if use_decay else 0.0)
        broker = ResourceBroker()
        proposals: List[Dict] = []
        compute_used = 0.0
        turns = 0

        # Phase 1: one opening proposal per agent; wrong answers are free here.
        for agent in agents:
            prop = agent.propose(topic, proposals)
            proposals.append(prop)
            compute_used += prop["tokens"]
            turns += 1
            self._settle_turn(
                ledger, agent, topic, prop, turns,
                previous_correct=False,
                reward_scale=self._OPENING_REWARD_SCALE,
            )

        # Phase 2: brokered turns, richest agent first.
        while turns < max_turns and compute_used < self.budget_per_topic:
            ranked = sorted(
                ((a, ledger.balance(a.agent_id, "general", "global")) for a in agents),
                key=lambda pair: pair[1],
                reverse=True,
            )

            speaker, balance = ranked[0]
            progress = sum(1 for p in proposals if p["correct"]) / len(proposals)
            dec = broker.request(
                "debate_turn",
                speaker.agent_id,
                balance,
                task_state={"progress": progress},
            )

            if dec.decision == Decision.DENY:
                # Fall back to the runner-up once; stop if there is none.
                if len(ranked) < 2:
                    break
                speaker, balance = ranked[1]
                # NOTE(review): the fallback request carries an empty
                # task_state, unlike the first request - confirm with the
                # broker's contract whether "progress" should be passed too.
                dec = broker.request("debate_turn", speaker.agent_id, balance, task_state={})
                if dec.decision == Decision.DENY:
                    break

            # Evaluated before the new proposal is appended, so it covers
            # exactly the earlier proposals.
            earlier_correct = any(p["correct"] for p in proposals)

            prop = speaker.propose(topic, proposals)
            proposals.append(prop)
            compute_used += prop["tokens"]
            turns += 1
            self._settle_turn(
                ledger, speaker, topic, prop, turns,
                previous_correct=earlier_correct,
                reward_scale=self._FOLLOWUP_REWARD_SCALE,
                wrong_cost=self._WRONG_PROPOSAL_COST,
            )

        # Final vote: each proposal counts with its author's current balance,
        # floored so agents without credit still have a small say.
        vote_scores: Dict[str, float] = {}
        for p in proposals:
            weight = max(self._MIN_VOTE_WEIGHT, ledger.balance(p["agent_id"], "general", "global"))
            vote_scores[p["answer"]] = vote_scores.get(p["answer"], 0.0) + weight
        final_answer = max(vote_scores, key=vote_scores.get)

        return self._outcome("occ_allocation", topic, final_answer, proposals,
                             compute_used, turns)

    # ------------------------------------------------------------------
    # Aggregation and driver
    # ------------------------------------------------------------------

    def _summarize(self, results: List[Dict], label: str) -> Dict:
        """
        Aggregate per-topic results into accuracy and compute statistics.

        All ratios fall back to 0.0 when ``results`` is empty (or, for
        ``quality_per_compute``, when no compute was spent).
        """
        n = len(results)
        correct = sum(1 for r in results if r["correct"])
        total_compute = sum(r["compute_used"] for r in results)
        total_turns = sum(r["n_turns"] for r in results)

        return {
            "label": label,
            "n_topics": n,
            "accuracy": correct / n if n else 0.0,
            "total_compute": float(total_compute),
            "mean_compute_per_topic": float(total_compute / n) if n else 0.0,
            "total_turns": total_turns,
            "mean_turns_per_topic": float(total_turns / n) if n else 0.0,
            "quality_per_compute": (correct / n) / (total_compute / n) if total_compute else 0.0,
            "results": results,
        }

    def run_all(self) -> Dict[str, Dict]:
        """
        Run every strategy over all topics and return summaries by name.

        The RNGs are reseeded at the start of every call so repeated calls
        on the same instance produce the same results. All strategies share
        one agent roster; the agents' counters are reset before each one.
        """
        if not self.topics:
            self.generate_topics()
        self._seed_rng()

        agents = self._build_agents()

        strategies = [
            ("equal_turns", self._resolve_equal_turns),
            ("majority_vote", self._resolve_majority_vote),
            ("confidence_weighted", self._resolve_confidence_weighted),
            ("occ_allocation", self._resolve_occ_allocation),
        ]

        results = {}
        for name, resolve in strategies:
            # Start every strategy with clean per-agent accounting.
            for a in agents:
                a.tokens_used = 0
                a.turns_taken = 0
                a.influence_score = 0.0

            topic_results = [resolve(agents, topic) for topic in self.topics]
            results[name] = self._summarize(topic_results, name)

        return results
|
|
|
|
def main():
    """Run the debate benchmark, print one summary per strategy, save JSON."""
    benchmark = DebateBenchmark(n_topics=50, n_agents=4, seed=42)
    benchmark.generate_topics()
    summaries = benchmark.run_all()

    rule = "=" * 60
    print(rule)
    print("MULTI-AGENT DEBATE BENCHMARK")
    print(rule)

    # One short report per strategy, in the order run_all produced them.
    for strategy_name in summaries:
        stats = summaries[strategy_name]
        print(f"\n{strategy_name}")
        print(f" accuracy: {stats['accuracy']:.3f}")
        print(f" mean compute/topic: {stats['mean_compute_per_topic']:.1f}")
        print(f" mean turns/topic: {stats['mean_turns_per_topic']:.1f}")
        print(f" quality per compute: {stats['quality_per_compute']:.6f}")

    # Persist the full results; values json cannot encode are stringified.
    Path("/app/occ/reports").mkdir(parents=True, exist_ok=True)
    with open("/app/occ/reports/benchmark_debate_results.json", "w") as out_file:
        json.dump(summaries, out_file, indent=2, default=str)
    print("\nSaved to reports/benchmark_debate_results.json")
|
|
|
|
# Script entry point: run the benchmark only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|