| """ |
| Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests. |
| Produces consolidated reports. |
| """ |
|
|
| import json |
| import random |
| from pathlib import Path |
| from typing import Dict, List |
|
|
| import numpy as np |
|
|
| from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent |
| from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent |
| from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent |
| from oracle.oracle import ImpactOracle |
| from ledger.ledger import CreditLedger |
| from broker.broker import ResourceBroker |
|
|
|
|
class AblationRunner:
    """Run ablation studies and adversarial (anti-gaming) tests for the OCC system.

    Each public method builds fresh benchmark objects, runs a set of named
    configurations, and returns a mapping from configuration name to the
    result dictionary produced by the benchmark. ``run_all`` executes every
    suite and writes one consolidated JSON report.

    The global ``random`` and ``numpy.random`` generators are re-seeded with
    ``self.seed`` at construction time and at the start of every suite, so a
    suite does not depend on which other suites ran before it.
    """

    # Default location and file name of the consolidated JSON report.
    DEFAULT_REPORT_DIR = "/app/occ/reports"
    REPORT_FILENAME = "ablation_and_anti_gaming.json"

    def __init__(self, seed: int = 42):
        """Store ``seed`` and seed the global Python and NumPy generators.

        Args:
            seed: Seed passed to every benchmark and used for the global RNGs.
        """
        self.seed = seed
        self._reseed()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _reseed(self) -> None:
        """Reset the global ``random`` and ``numpy.random`` state to ``self.seed``."""
        random.seed(self.seed)
        np.random.seed(self.seed)

    def _new_code_bench(self) -> "CodeBenchmark":
        """Return a 50-problem ``CodeBenchmark`` with its data already loaded."""
        bench = CodeBenchmark(max_problems=50, seed=self.seed)
        bench.load_data()
        return bench

    @staticmethod
    def _code_agents(**extra) -> "List[SimulatedCodeAgent]":
        """Build the three reference code agents (A, B, C).

        Args:
            **extra: Additional keyword arguments forwarded unchanged to every
                ``SimulatedCodeAgent`` (e.g. ``verbose_padding_prob=0.3``).
        """
        return [
            SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80, **extra),
            SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60, **extra),
            SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120, **extra),
        ]

    @staticmethod
    def _qa_agent(agent_id: str, **overrides) -> "SimulatedRetrievalAgent":
        """Build a retrieval agent from the shared reference profile.

        Args:
            agent_id: Identifier given to the agent.
            **overrides: Profile fields to replace (``accuracy``,
                ``hallucination_rate``, ``calibration_error``,
                ``abstention_rate``).
        """
        profile = {
            "accuracy": 0.65,
            "hallucination_rate": 0.12,
            "calibration_error": 0.15,
            "abstention_rate": 0.1,
        }
        profile.update(overrides)
        return SimulatedRetrievalAgent(agent_id=agent_id, **profile)

    # ------------------------------------------------------------------
    # Ablations
    # ------------------------------------------------------------------

    def ablation_code(self) -> Dict[str, Dict]:
        """Run the code benchmark under the full system and four ablations.

        Returns:
            Mapping with the keys ``full_occ``, ``no_ledger``,
            ``no_cost_penalty``, ``no_anti_gaming`` and ``no_broker``; each
            value is whatever the corresponding benchmark run returned.
        """
        self._reseed()

        bench = self._new_code_bench()
        # NOTE(review): the same agent objects are shared by several
        # configurations below; if agents keep internal state between runs
        # the configurations are not independent -- confirm.
        base_agents = self._code_agents()

        results = {}

        # Reference configuration: OCC allocation, up to 5 attempts.
        results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5)

        # "No ledger" is represented by the fixed baseline with 3 attempts.
        results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3)

        # "No cost penalty": raise the oracle's compute budget to 1e12
        # (presumably large enough that compute cost never binds).
        bench_no_cost = self._new_code_bench()
        bench_no_cost.oracle.compute_budget = 1e12
        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(
            base_agents, max_attempts=5
        )

        # "No anti-gaming": zero the oracle's gaming weight and run agents
        # that pad their output with probability 0.3.
        bench_no_game = self._new_code_bench()
        bench_no_game.oracle.gaming_weight = 0.0
        padding_agents = self._code_agents(verbose_padding_prob=0.3)
        results["no_anti_gaming"] = bench_no_game.run_occ_allocation(
            padding_agents, max_attempts=5
        )

        # "No broker" is represented by the fixed baseline with 5 attempts.
        bench_no_broker = self._new_code_bench()
        results["no_broker"] = bench_no_broker.run_baseline_fixed(
            base_agents, fixed_attempts=5
        )

        return results

    def ablation_retrieval_qa(self) -> Dict[str, Dict]:
        """Run the retrieval-QA benchmark under OCC, three baselines and two ablations.

        Returns:
            Mapping with the keys ``full_occ``, ``direct_answer``,
            ``rag_baseline``, ``rag_verifier``, ``no_abstention`` and
            ``no_calibration``.
        """
        self._reseed()

        bench = RetrievalQABenchmark(n_questions=100, seed=self.seed)
        bench.generate_questions()

        agent = self._qa_agent("rag_agent")

        results = {}
        results["full_occ"] = bench.run_occ(agent)
        results["direct_answer"] = bench.run_direct_answer(agent)
        results["rag_baseline"] = bench.run_rag_baseline(agent)
        results["rag_verifier"] = bench.run_rag_verifier(agent)

        # Same profile, but the agent never abstains.
        agent_no_abstain = self._qa_agent("rag_agent_no_abstain", abstention_rate=0.0)
        results["no_abstention"] = bench.run_occ(agent_no_abstain)

        # Same profile, but with zero calibration error.
        agent_no_calib = self._qa_agent("rag_agent_no_calib", calibration_error=0.0)
        results["no_calibration"] = bench.run_occ(agent_no_calib)

        return results

    # ------------------------------------------------------------------
    # Anti-gaming
    # ------------------------------------------------------------------

    def anti_gaming_tests(self) -> Dict[str, Dict]:
        """Run adversarial scenarios against the credit system.

        Returns:
            Mapping with the keys ``spam``, ``hoarding``,
            ``hidden_test_gaming``, ``over_abstention``,
            ``collusion_equal_turns`` and ``collusion_occ``.
        """
        self._reseed()

        results = {}

        # Spam: two very low-quality, cheap agents with a large attempt budget.
        bench = self._new_code_bench()
        spam_agents = [
            SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50),
            SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50),
        ]
        results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10)

        # Hoarding: one strong agent with a large attempt budget.
        # NOTE(review): this method used to construct
        # ``CreditLedger(decay_lambda=0.0)`` here but never attached it to the
        # benchmark, so the run below uses whatever ledger the benchmark
        # creates itself. The dead local was removed; wiring a zero-decay
        # ledger into the benchmark still needs to be done if that was the
        # intent.
        bench_hoard = self._new_code_bench()
        hoard_agents = [
            SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
        ]
        results["hoarding"] = bench_hoard.run_occ_allocation(hoard_agents, max_attempts=10)

        # Hidden-test gaming: one agent created with ``gaming_mode=True``.
        bench_game = self._new_code_bench()
        gaming_agents = [
            SimulatedCodeAgent("gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True),
        ]
        results["hidden_test_gaming"] = bench_game.run_occ_allocation(
            gaming_agents, max_attempts=5
        )

        # Over-abstention: reference QA profile with a 0.9 abstention rate.
        bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
        bench_qa.generate_questions()
        abstain_agent = self._qa_agent("abstainer", abstention_rate=0.9)
        results["over_abstention"] = bench_qa.run_occ(abstain_agent)

        # Collusion: two agents configured to collude with each other plus
        # two honest agents, resolved first with equal turns and then with
        # OCC allocation over the same topics and the same agent objects.
        bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
        bench_debate.generate_topics()
        colluding_agents = [
            SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
            SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
            SimulatedDebateAgent("honest_1", accuracy=0.6),
            SimulatedDebateAgent("honest_2", accuracy=0.6),
        ]

        equal_turn_outcomes = [
            bench_debate._resolve_equal_turns(colluding_agents, topic)
            for topic in bench_debate.topics
        ]
        results["collusion_equal_turns"] = bench_debate._summarize(
            equal_turn_outcomes, "collusion_equal_turns"
        )

        occ_outcomes = [
            bench_debate._resolve_occ_allocation(colluding_agents, topic)
            for topic in bench_debate.topics
        ]
        results["collusion_occ"] = bench_debate._summarize(occ_outcomes, "collusion_occ")

        return results

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def run_all(self, output_dir: str = DEFAULT_REPORT_DIR) -> Dict:
        """Run every suite and write the consolidated JSON report.

        Args:
            output_dir: Directory that receives ``ablation_and_anti_gaming.json``.
                It is created (with parents) if missing. Defaults to
                ``/app/occ/reports``.

        Returns:
            Dictionary with the keys ``code_ablations``, ``qa_ablations`` and
            ``anti_gaming`` holding the results of the three suites.
        """
        print("Running code ablations...")
        code_ablations = self.ablation_code()

        print("Running retrieval QA ablations...")
        qa_ablations = self.ablation_retrieval_qa()

        print("Running anti-gaming tests...")
        anti_gaming = self.anti_gaming_tests()

        report = {
            "code_ablations": code_ablations,
            "qa_ablations": qa_ablations,
            "anti_gaming": anti_gaming,
        }

        report_path = Path(output_dir) / self.REPORT_FILENAME
        report_path.parent.mkdir(parents=True, exist_ok=True)
        with report_path.open("w", encoding="utf-8") as f:
            # default=str stringifies any value json cannot encode natively.
            json.dump(report, f, indent=2, default=str)
        print(f"\nSaved ablation/anti-gaming results to {report_path}")
        return report
|
|
|
|
def _format_metric(metrics, key, spec):
    """Return ``metrics[key]`` formatted with ``spec``, or a placeholder.

    A missing key yields ``"N/A"``. A value that does not accept ``spec``
    (for example a string or ``None``) is rendered as plain text instead of
    raising, so one incomplete result cannot abort the whole summary.
    """
    value = metrics.get(key, "N/A")
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return "N/A" if value is None else str(value)


def main():
    """Run all ablation and anti-gaming suites with seed 42 and print a summary."""
    runner = AblationRunner(seed=42)
    report = runner.run_all()

    print("\n" + "=" * 60)
    print("ABLATION SUMMARY")
    print("=" * 60)

    print("\n--- Code Ablations ---")
    for k, v in report["code_ablations"].items():
        pass_at_1 = _format_metric(v, "pass@1", ".3f")
        compute = _format_metric(v, "total_compute", ".0f")
        print(f"{k:20s}: pass@1={pass_at_1}, compute={compute}")

    print("\n--- QA Ablations ---")
    for k, v in report["qa_ablations"].items():
        acc = _format_metric(v, "accuracy", ".3f")
        ece = _format_metric(v, "ece", ".3f")
        compute = _format_metric(v, "total_compute", ".0f")
        print(f"{k:20s}: acc={acc}, ECE={ece}, compute={compute}")

    print("\n--- Anti-Gaming ---")
    for k, v in report["anti_gaming"].items():
        compute = _format_metric(v, "total_compute", ".0f")
        if "accuracy" in v:
            print(f"{k:20s}: acc={_format_metric(v, 'accuracy', '.3f')}, compute={compute}")
        elif "pass@1" in v:
            print(f"{k:20s}: pass@1={_format_metric(v, 'pass@1', '.3f')}, compute={compute}")
        else:
            # Keep scenarios without a headline metric visible in the summary.
            print(f"{k:20s}: compute={compute} (no accuracy/pass@1 reported)")
|
|
|
|
# Script entry point: run the full evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|