narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken commited on 27 days ago

Commit

bc02d39

verified ·

1 Parent(s): b40184a

Upload eval_runner.py

Browse files

Files changed (1) hide show

eval_runner.py +250 -0

eval_runner.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""
+Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
+Produces consolidated reports.
+"""
+import json
+import random
+from pathlib import Path
+from typing import Dict, List
+import numpy as np
+from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
+from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent
+from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent
+from oracle.oracle import ImpactOracle
+from ledger.ledger import CreditLedger
+from broker.broker import ResourceBroker
+class AblationRunner:
+    """Run ablation studies by disabling OCC components one at a time."""
+    def __init__(self, seed: int = 42):
+        self.seed = seed
+        random.seed(seed)
+        np.random.seed(seed)
+    # ------------------------------------------------------------------
+    # Ablations for Code Benchmark
+    # ------------------------------------------------------------------
+    def ablation_code(self) -> Dict[str, Dict]:
+        """Run code benchmark with ablated configurations."""
+        bench = CodeBenchmark(max_problems=50, seed=self.seed)
+        bench.load_data()
+        base_agents = [
+            SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80),
+            SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60),
+            SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120),
+        ]
+        results = {}
+        # 1. Full OCC
+        results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5)
+        # 2. No credit ledger (oracle score only)
+        # Simulate by running baseline_fixed but with oracle scoring
+        results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3)
+        # 3. No cost penalty (effectively baseline)
+        # Approximate by increasing compute budget so cost penalty vanishes
+        bench_no_cost = CodeBenchmark(max_problems=50, seed=self.seed)
+        bench_no_cost.load_data()
+        bench_no_cost.oracle.compute_budget = 1e12
+        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(base_agents, max_attempts=5)
+        # 4. No anti-gaming penalty
+        bench_no_game = CodeBenchmark(max_problems=50, seed=self.seed)
+        bench_no_game.load_data()
+        bench_no_game.oracle.gaming_weight = 0.0
+        gaming_agents = [
+            SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80, verbose_padding_prob=0.3),
+            SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60, verbose_padding_prob=0.3),
+            SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120, verbose_padding_prob=0.3),
+        ]
+        results["no_anti_gaming"] = bench_no_game.run_occ_allocation(gaming_agents, max_attempts=5)
+        # 5. No broker (oracle score only)
+        bench_no_broker = CodeBenchmark(max_problems=50, seed=self.seed)
+        bench_no_broker.load_data()
+        results["no_broker"] = bench_no_broker.run_baseline_fixed(base_agents, fixed_attempts=5)
+        return results
+    # ------------------------------------------------------------------
+    # Ablations for Retrieval QA
+    # ------------------------------------------------------------------
+    def ablation_retrieval_qa(self) -> Dict[str, Dict]:
+        """Run retrieval QA benchmark with ablated configurations."""
+        bench = RetrievalQABenchmark(n_questions=100, seed=self.seed)
+        bench.generate_questions()
+        agent = SimulatedRetrievalAgent(
+            agent_id="rag_agent",
+            accuracy=0.65,
+            hallucination_rate=0.12,
+            calibration_error=0.15,
+            abstention_rate=0.1,
+        )
+        results = {}
+        results["full_occ"] = bench.run_occ(agent)
+        results["direct_answer"] = bench.run_direct_answer(agent)
+        results["rag_baseline"] = bench.run_rag_baseline(agent)
+        results["rag_verifier"] = bench.run_rag_verifier(agent)
+        # Ablation: no abstention reward
+        # Approximate by setting abstention rate very low
+        agent_no_abstain = SimulatedRetrievalAgent(
+            agent_id="rag_agent_no_abstain",
+            accuracy=0.65,
+            hallucination_rate=0.12,
+            calibration_error=0.15,
+            abstention_rate=0.0,
+        )
+        results["no_abstention"] = bench.run_occ(agent_no_abstain)
+        # Ablation: no calibration penalty
+        agent_no_calib = SimulatedRetrievalAgent(
+            agent_id="rag_agent_no_calib",
+            accuracy=0.65,
+            hallucination_rate=0.12,
+            calibration_error=0.0,
+            abstention_rate=0.1,
+        )
+        results["no_calibration"] = bench.run_occ(agent_no_calib)
+        return results
+    # ------------------------------------------------------------------
+    # Anti-Gaming Tests
+    # ------------------------------------------------------------------
+    def anti_gaming_tests(self) -> Dict[str, Dict]:
+        """Run adversarial tests against the credit system."""
+        random.seed(self.seed)
+        np.random.seed(self.seed)
+        results = {}
+        # 1. Spam low-value actions
+        bench = CodeBenchmark(max_problems=50, seed=self.seed)
+        bench.load_data()
+        spam_agents = [
+            SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50),
+            SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50),
+        ]
+        results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10)
+        # 2. Hoarding credits
+        ledger = CreditLedger(decay_lambda=0.0)  # no decay = hoarding
+        # We'll simulate this via a custom run
+        bench_hoard = CodeBenchmark(max_problems=50, seed=self.seed)
+        bench_hoard.load_data()
+        hoard_agents = [
+            SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
+        ]
+        # Force many initial successes to build credit, then stop earning
+        results["hoarding"] = bench_hoard.run_occ_allocation(hoard_agents, max_attempts=10)
+        # 3. Hidden test gaming
+        bench_game = CodeBenchmark(max_problems=50, seed=self.seed)
+        bench_game.load_data()
+        gaming_agents = [
+            SimulatedCodeAgent("gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True),
+        ]
+        results["hidden_test_gaming"] = bench_game.run_occ_allocation(gaming_agents, max_attempts=5)
+        # 4. Over-abstention in retrieval
+        bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
+        bench_qa.generate_questions()
+        abstain_agent = SimulatedRetrievalAgent(
+            agent_id="abstainer",
+            accuracy=0.65,
+            hallucination_rate=0.12,
+            calibration_error=0.15,
+            abstention_rate=0.9,  # over-abstain
+        )
+        results["over_abstention"] = bench_qa.run_occ(abstain_agent)
+        # 5. Collusion in debate
+        bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
+        bench_debate.generate_topics()
+        colluding_agents = [
+            SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
+            SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
+            SimulatedDebateAgent("honest_1", accuracy=0.6),
+            SimulatedDebateAgent("honest_2", accuracy=0.6),
+        ]
+        # Run equal turns to simulate collusion effect
+        topic_results = []
+        for topic in bench_debate.topics:
+            topic_results.append(bench_debate._resolve_equal_turns(colluding_agents, topic))
+        results["collusion_equal_turns"] = bench_debate._summarize(topic_results, "collusion_equal_turns")
+        # OCC with colluders
+        topic_results_occ = []
+        for topic in bench_debate.topics:
+            topic_results_occ.append(bench_debate._resolve_occ_allocation(colluding_agents, topic))
+        results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
+        return results
+    # ------------------------------------------------------------------
+    # Consolidated run
+    # ------------------------------------------------------------------
+    def run_all(self) -> Dict:
+        print("Running code ablations...")
+        code_ablations = self.ablation_code()
+        print("Running retrieval QA ablations...")
+        qa_ablations = self.ablation_retrieval_qa()
+        print("Running anti-gaming tests...")
+        anti_gaming = self.anti_gaming_tests()
+        report = {
+            "code_ablations": code_ablations,
+            "qa_ablations": qa_ablations,
+            "anti_gaming": anti_gaming,
+        }
+        Path("/app/occ/reports").mkdir(parents=True, exist_ok=True)
+        with open("/app/occ/reports/ablation_and_anti_gaming.json", "w") as f:
+            json.dump(report, f, indent=2, default=str)
+        print("\nSaved ablation/anti-gaming results to reports/ablation_and_anti_gaming.json")
+        return report
+def main():
+    runner = AblationRunner(seed=42)
+    report = runner.run_all()
+    print("\n" + "=" * 60)
+    print("ABLATION SUMMARY")
+    print("=" * 60)
+    print("\n--- Code Ablations ---")
+    for k, v in report["code_ablations"].items():
+        print(f"{k:20s}: pass@1={v.get('pass@1', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
+    print("\n--- QA Ablations ---")
+    for k, v in report["qa_ablations"].items():
+        print(f"{k:20s}: acc={v.get('accuracy', 'N/A'):.3f}, ECE={v.get('ece', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
+    print("\n--- Anti-Gaming ---")
+    for k, v in report["anti_gaming"].items():
+        if "accuracy" in v:
+            print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
+        elif "pass@1" in v:
+            print(f"{k:20s}: pass@1={v['pass@1']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
+if __name__ == "__main__":
+    main()