""" Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests. Produces consolidated reports. """ import json import random from pathlib import Path from typing import Dict, List import numpy as np from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent from oracle.oracle import ImpactOracle from ledger.ledger import CreditLedger from broker.broker import ResourceBroker class AblationRunner: """Run ablation studies by disabling OCC components one at a time.""" def __init__(self, seed: int = 42): self.seed = seed random.seed(seed) np.random.seed(seed) # ------------------------------------------------------------------ # Ablations for Code Benchmark # ------------------------------------------------------------------ def ablation_code(self) -> Dict[str, Dict]: """Run code benchmark with ablated configurations.""" bench = CodeBenchmark(max_problems=50, seed=self.seed) bench.load_data() base_agents = [ SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80), SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60), SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120), ] results = {} # 1. Full OCC results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5) # 2. No credit ledger (oracle score only) # Simulate by running baseline_fixed but with oracle scoring results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3) # 3. No cost penalty (effectively baseline) # Approximate by increasing compute budget so cost penalty vanishes bench_no_cost = CodeBenchmark(max_problems=50, seed=self.seed) bench_no_cost.load_data() bench_no_cost.oracle.compute_budget = 1e12 results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(base_agents, max_attempts=5) # 4. No anti-gaming penalty bench_no_game = CodeBenchmark(max_problems=50, seed=self.seed) bench_no_game.load_data() bench_no_game.oracle.gaming_weight = 0.0 gaming_agents = [ SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80, verbose_padding_prob=0.3), SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60, verbose_padding_prob=0.3), SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120, verbose_padding_prob=0.3), ] results["no_anti_gaming"] = bench_no_game.run_occ_allocation(gaming_agents, max_attempts=5) # 5. No broker (oracle score only) bench_no_broker = CodeBenchmark(max_problems=50, seed=self.seed) bench_no_broker.load_data() results["no_broker"] = bench_no_broker.run_baseline_fixed(base_agents, fixed_attempts=5) return results # ------------------------------------------------------------------ # Ablations for Retrieval QA # ------------------------------------------------------------------ def ablation_retrieval_qa(self) -> Dict[str, Dict]: """Run retrieval QA benchmark with ablated configurations.""" bench = RetrievalQABenchmark(n_questions=100, seed=self.seed) bench.generate_questions() agent = SimulatedRetrievalAgent( agent_id="rag_agent", accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.1, ) results = {} results["full_occ"] = bench.run_occ(agent) results["direct_answer"] = bench.run_direct_answer(agent) results["rag_baseline"] = bench.run_rag_baseline(agent) results["rag_verifier"] = bench.run_rag_verifier(agent) # Ablation: no abstention reward # Approximate by setting abstention rate very low agent_no_abstain = SimulatedRetrievalAgent( agent_id="rag_agent_no_abstain", accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.0, ) results["no_abstention"] = bench.run_occ(agent_no_abstain) # Ablation: no calibration penalty agent_no_calib = SimulatedRetrievalAgent( agent_id="rag_agent_no_calib", accuracy=0.65, hallucination_rate=0.12, calibration_error=0.0, abstention_rate=0.1, ) results["no_calibration"] = bench.run_occ(agent_no_calib) return results # ------------------------------------------------------------------ # Anti-Gaming Tests # ------------------------------------------------------------------ def anti_gaming_tests(self) -> Dict[str, Dict]: """Run adversarial tests against the credit system.""" random.seed(self.seed) np.random.seed(self.seed) results = {} # 1. Spam low-value actions bench = CodeBenchmark(max_problems=50, seed=self.seed) bench.load_data() spam_agents = [ SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50), SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50), ] results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10) # 2. Hoarding credits ledger = CreditLedger(decay_lambda=0.0) # no decay = hoarding # We'll simulate this via a custom run bench_hoard = CodeBenchmark(max_problems=50, seed=self.seed) bench_hoard.load_data() hoard_agents = [ SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100), ] # Force many initial successes to build credit, then stop earning results["hoarding"] = bench_hoard.run_occ_allocation(hoard_agents, max_attempts=10) # 3. Hidden test gaming bench_game = CodeBenchmark(max_problems=50, seed=self.seed) bench_game.load_data() gaming_agents = [ SimulatedCodeAgent("gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True), ] results["hidden_test_gaming"] = bench_game.run_occ_allocation(gaming_agents, max_attempts=5) # 4. Over-abstention in retrieval bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed) bench_qa.generate_questions() abstain_agent = SimulatedRetrievalAgent( agent_id="abstainer", accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.9, # over-abstain ) results["over_abstention"] = bench_qa.run_occ(abstain_agent) # 5. Collusion in debate bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed) bench_debate.generate_topics() colluding_agents = [ SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"), SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"), SimulatedDebateAgent("honest_1", accuracy=0.6), SimulatedDebateAgent("honest_2", accuracy=0.6), ] # Run equal turns to simulate collusion effect topic_results = [] for topic in bench_debate.topics: topic_results.append(bench_debate._resolve_equal_turns(colluding_agents, topic)) results["collusion_equal_turns"] = bench_debate._summarize(topic_results, "collusion_equal_turns") # OCC with colluders topic_results_occ = [] for topic in bench_debate.topics: topic_results_occ.append(bench_debate._resolve_occ_allocation(colluding_agents, topic)) results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ") return results # ------------------------------------------------------------------ # Consolidated run # ------------------------------------------------------------------ def run_all(self) -> Dict: print("Running code ablations...") code_ablations = self.ablation_code() print("Running retrieval QA ablations...") qa_ablations = self.ablation_retrieval_qa() print("Running anti-gaming tests...") anti_gaming = self.anti_gaming_tests() report = { "code_ablations": code_ablations, "qa_ablations": qa_ablations, "anti_gaming": anti_gaming, } Path("/app/occ/reports").mkdir(parents=True, exist_ok=True) with open("/app/occ/reports/ablation_and_anti_gaming.json", "w") as f: json.dump(report, f, indent=2, default=str) print("\nSaved ablation/anti-gaming results to reports/ablation_and_anti_gaming.json") return report def main(): runner = AblationRunner(seed=42) report = runner.run_all() print("\n" + "=" * 60) print("ABLATION SUMMARY") print("=" * 60) print("\n--- Code Ablations ---") for k, v in report["code_ablations"].items(): print(f"{k:20s}: pass@1={v.get('pass@1', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}") print("\n--- QA Ablations ---") for k, v in report["qa_ablations"].items(): print(f"{k:20s}: acc={v.get('accuracy', 'N/A'):.3f}, ECE={v.get('ece', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}") print("\n--- Anti-Gaming ---") for k, v in report["anti_gaming"].items(): if "accuracy" in v: print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}") elif "pass@1" in v: print(f"{k:20s}: pass@1={v['pass@1']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}") if __name__ == "__main__": main()