narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed 26 days ago

Commit

9312235

verified ·

1 Parent(s): fc4adc2

Upload eval_runner.py

Browse files

Files changed (1) hide show

eval_runner.py +90 -110

eval_runner.py CHANGED Viewed

@@ -1,8 +1,7 @@
 """
 Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
-Produces consolidated reports.
 """
 import json
 import random
 from pathlib import Path
@@ -27,56 +26,37 @@ class AblationRunner:
         np.random.seed(seed)
     # ------------------------------------------------------------------
-    # Ablations for Code Benchmark
     # ------------------------------------------------------------------
     def ablation_code(self) -> Dict[str, Dict]:
         """Run code benchmark with ablated configurations."""
-        bench = CodeBenchmark(max_problems=50, seed=self.seed)
-        bench.load_data()
-        base_agents = [
-            SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80),
-            SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60),
-            SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120),
-        ]
         results = {}
-        # 1. Full OCC
-        results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5)
-        # 2. No credit ledger (oracle score only)
-        # Simulate by running baseline_fixed but with oracle scoring
-        results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3)
-        # 3. No cost penalty (effectively baseline)
-        # Approximate by increasing compute budget so cost penalty vanishes
-        bench_no_cost = CodeBenchmark(max_problems=50, seed=self.seed)
-        bench_no_cost.load_data()
-        bench_no_cost.oracle.compute_budget = 1e12
-        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(base_agents, max_attempts=5)
-        # 4. No anti-gaming penalty
-        bench_no_game = CodeBenchmark(max_problems=50, seed=self.seed)
-        bench_no_game.load_data()
-        bench_no_game.oracle.gaming_weight = 0.0
-        gaming_agents = [
-            SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80, verbose_padding_prob=0.3),
-            SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60, verbose_padding_prob=0.3),
-            SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120, verbose_padding_prob=0.3),
-        ]
-        results["no_anti_gaming"] = bench_no_game.run_occ_allocation(gaming_agents, max_attempts=5)
-        # 5. No broker (oracle score only)
-        bench_no_broker = CodeBenchmark(max_problems=50, seed=self.seed)
-        bench_no_broker.load_data()
-        results["no_broker"] = bench_no_broker.run_baseline_fixed(base_agents, fixed_attempts=5)
         return results
     # ------------------------------------------------------------------
-    # Ablations for Retrieval QA
     # ------------------------------------------------------------------
     def ablation_retrieval_qa(self) -> Dict[str, Dict]:
@@ -94,28 +74,27 @@ class AblationRunner:
         results = {}
         results["full_occ"] = bench.run_occ(agent)
-        results["direct_answer"] = bench.run_direct_answer(agent)
-        results["rag_baseline"] = bench.run_rag_baseline(agent)
-        results["rag_verifier"] = bench.run_rag_verifier(agent)
-        # Ablation: no abstention reward
-        # Approximate by setting abstention rate very low
         agent_no_abstain = SimulatedRetrievalAgent(
-            agent_id="rag_agent_no_abstain",
-            accuracy=0.65,
-            hallucination_rate=0.12,
-            calibration_error=0.15,
-            abstention_rate=0.0,
         )
         results["no_abstention"] = bench.run_occ(agent_no_abstain)
-        # Ablation: no calibration penalty
         agent_no_calib = SimulatedRetrievalAgent(
-            agent_id="rag_agent_no_calib",
-            accuracy=0.65,
-            hallucination_rate=0.12,
-            calibration_error=0.0,
-            abstention_rate=0.1,
         )
         results["no_calibration"] = bench.run_occ(agent_no_calib)
@@ -129,74 +108,67 @@ class AblationRunner:
         """Run adversarial tests against the credit system."""
         random.seed(self.seed)
         np.random.seed(self.seed)
         results = {}
         # 1. Spam low-value actions
-        bench = CodeBenchmark(max_problems=50, seed=self.seed)
-        bench.load_data()
-        spam_agents = [
-            SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50),
-            SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50),
-        ]
-        results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10)
-        # 2. Hoarding credits
-        ledger = CreditLedger(decay_lambda=0.0)  # no decay = hoarding
-        # We'll simulate this via a custom run
-        bench_hoard = CodeBenchmark(max_problems=50, seed=self.seed)
-        bench_hoard.load_data()
-        hoard_agents = [
-            SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
         ]
-        # Force many initial successes to build credit, then stop earning
-        results["hoarding"] = bench_hoard.run_occ_allocation(hoard_agents, max_attempts=10)
-        # 3. Hidden test gaming
-        bench_game = CodeBenchmark(max_problems=50, seed=self.seed)
-        bench_game.load_data()
-        gaming_agents = [
-            SimulatedCodeAgent("gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True),
-        ]
-        results["hidden_test_gaming"] = bench_game.run_occ_allocation(gaming_agents, max_attempts=5)
-        # 4. Over-abstention in retrieval
         bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
         bench_qa.generate_questions()
-        abstain_agent = SimulatedRetrievalAgent(
             agent_id="abstainer",
-            accuracy=0.65,
-            hallucination_rate=0.12,
-            calibration_error=0.15,
-            abstention_rate=0.9,  # over-abstain
         )
-        results["over_abstention"] = bench_qa.run_occ(abstain_agent)
-        # 5. Collusion in debate
         bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
         bench_debate.generate_topics()
-        colluding_agents = [
             SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
             SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
             SimulatedDebateAgent("honest_1", accuracy=0.6),
             SimulatedDebateAgent("honest_2", accuracy=0.6),
         ]
-        # Run equal turns to simulate collusion effect
-        topic_results = []
-        for topic in bench_debate.topics:
-            topic_results.append(bench_debate._resolve_equal_turns(colluding_agents, topic))
-        results["collusion_equal_turns"] = bench_debate._summarize(topic_results, "collusion_equal_turns")
-        # OCC with colluders
         topic_results_occ = []
         for topic in bench_debate.topics:
-            topic_results_occ.append(bench_debate._resolve_occ_allocation(colluding_agents, topic))
         results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
         return results
     # ------------------------------------------------------------------
-    # Consolidated run
     # ------------------------------------------------------------------
     def run_all(self) -> Dict:
@@ -215,10 +187,12 @@ class AblationRunner:
             "anti_gaming": anti_gaming,
         }
-        Path("/app/occ/reports").mkdir(parents=True, exist_ok=True)
-        with open("/app/occ/reports/ablation_and_anti_gaming.json", "w") as f:
             json.dump(report, f, indent=2, default=str)
-        print("\nSaved ablation/anti-gaming results to reports/ablation_and_anti_gaming.json")
         return report
@@ -232,18 +206,24 @@ def main():
     print("\n--- Code Ablations ---")
     for k, v in report["code_ablations"].items():
-        print(f"{k:20s}: pass@1={v.get('pass@1', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
     print("\n--- QA Ablations ---")
     for k, v in report["qa_ablations"].items():
-        print(f"{k:20s}: acc={v.get('accuracy', 'N/A'):.3f}, ECE={v.get('ece', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
     print("\n--- Anti-Gaming ---")
     for k, v in report["anti_gaming"].items():
         if "accuracy" in v:
-            print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
-        elif "pass@1" in v:
-            print(f"{k:20s}: pass@1={v['pass@1']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
 if __name__ == "__main__":

 """
 Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
+Produces consolidated reports compatible with the current benchmark APIs.
 """
 import json
 import random
 from pathlib import Path
         np.random.seed(seed)
     # ------------------------------------------------------------------
+    # Code Benchmark Ablations
     # ------------------------------------------------------------------
     def ablation_code(self) -> Dict[str, Dict]:
         """Run code benchmark with ablated configurations."""
+        bench = CodeBenchmark(n_problems=50, seed=self.seed)
+        cheap = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
+                                    cost_per_attempt=60, hidden_test_falloff=0.20)
+        medium = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
+                                     cost_per_attempt=150, hidden_test_falloff=0.15)
+        expensive = SimulatedCodeAgent("expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
+                                        cost_per_attempt=350, hidden_test_falloff=0.10)
         results = {}
+        results["full_occ"] = bench.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
+        results["fixed_budget"] = bench.run_fixed_budget(expensive, max_attempts=1)
+        results["verifier_guided"] = bench.run_verifier_guided(
+            SimulatedCodeAgent("verifier", pass_rate_easy=0.95, pass_rate_hard=0.65,
+                               cost_per_attempt=350, hidden_test_falloff=0.10),
+            max_attempts=3)
+        # No cost penalty: shrink the oracle's compute penalty rate to effectively zero
+        bench_no_cost = CodeBenchmark(n_problems=50, seed=self.seed)
+        bench_no_cost.oracle.compute_penalty_rate = 1e-12
+        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
         return results
     # ------------------------------------------------------------------
+    # Retrieval QA Ablations
     # ------------------------------------------------------------------
     def ablation_retrieval_qa(self) -> Dict[str, Dict]:
         results = {}
         results["full_occ"] = bench.run_occ(agent)
+        results["direct_answer"] = bench.run_direct_answer(
+            SimulatedRetrievalAgent("direct", accuracy=0.65, hallucination_rate=0.12,
+                                     calibration_error=0.15, abstention_rate=0.1))
+        results["rag_baseline"] = bench.run_rag_baseline(
+            SimulatedRetrievalAgent("rag", accuracy=0.65, hallucination_rate=0.12,
+                                    calibration_error=0.15, abstention_rate=0.1))
+        results["rag_verifier"] = bench.run_rag_verifier(
+            SimulatedRetrievalAgent("verifier", accuracy=0.65, hallucination_rate=0.12,
+                                    calibration_error=0.15, abstention_rate=0.1))
+        # No abstention reward
         agent_no_abstain = SimulatedRetrievalAgent(
+            agent_id="rag_no_abstain",
+            accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.0,
         )
         results["no_abstention"] = bench.run_occ(agent_no_abstain)
+        # No calibration penalty
         agent_no_calib = SimulatedRetrievalAgent(
+            agent_id="rag_no_calib",
+            accuracy=0.65, hallucination_rate=0.12, calibration_error=0.0, abstention_rate=0.1,
         )
         results["no_calibration"] = bench.run_occ(agent_no_calib)
         """Run adversarial tests against the credit system."""
         random.seed(self.seed)
         np.random.seed(self.seed)
         results = {}
         # 1. Spam low-value actions
+        bench = CodeBenchmark(n_problems=50, seed=self.seed)
+        spam = [
+            SimulatedCodeAgent("spam_1", pass_rate_easy=0.05, pass_rate_hard=0.0,
+                               cost_per_attempt=50, hidden_test_falloff=0.0),
+            SimulatedCodeAgent("spam_2", pass_rate_easy=0.05, pass_rate_hard=0.0,
+                               cost_per_attempt=50, hidden_test_falloff=0.0),
         ]
+        results["spam"] = bench.run_occ_allocation(spam, max_attempts=10)
+        # 2. Hidden-test gaming: public pass but hidden fail
+        bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
+        # Simulate gaming with an agent meant to pass the public tests but fail the hidden ones.
+        # Approximated by setting hidden_test_falloff=1.0, which is intended to make hidden tests always fail.
+        gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
+                                    cost_per_attempt=100, hidden_test_falloff=1.0)
+        results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
+        # 3. Over-abstention in retrieval QA
         bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
         bench_qa.generate_questions()
+        abstainer = SimulatedRetrievalAgent(
             agent_id="abstainer",
+            accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.9,
         )
+        results["over_abstention"] = bench_qa.run_occ(abstainer)
+        # 4. Collusion in debate
         bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
         bench_debate.generate_topics()
+        agents = [
             SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
             SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
             SimulatedDebateAgent("honest_1", accuracy=0.6),
             SimulatedDebateAgent("honest_2", accuracy=0.6),
         ]
+        # Use the internal resolution directly
+        topic_results_eq = []
         topic_results_occ = []
         for topic in bench_debate.topics:
+            topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
+            # Reset per-agent state (token, turn, and influence counters) between strategies
+            for a in agents:
+                a.tokens_used = 0
+                a.turns_taken = 0
+                a.influence_score = 0.0
+            topic_results_occ.append(bench_debate._resolve_occ_allocation(agents, topic))
+            for a in agents:
+                a.tokens_used = 0
+                a.turns_taken = 0
+                a.influence_score = 0.0
+        results["collusion_equal_turns"] = bench_debate._summarize(topic_results_eq, "collusion_equal_turns")
         results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
         return results
     # ------------------------------------------------------------------
+    # Full run
     # ------------------------------------------------------------------
     def run_all(self) -> Dict:
             "anti_gaming": anti_gaming,
         }
+        out_dir = Path(__file__).parent / "reports"
+        out_dir.mkdir(parents=True, exist_ok=True)
+        out_path = out_dir / "ablation_and_anti_gaming.json"
+        with open(out_path, "w") as f:
             json.dump(report, f, indent=2, default=str)
+        print(f"\nSaved to {out_path}")
         return report
     print("\n--- Code Ablations ---")
     for k, v in report["code_ablations"].items():
+        p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
+        comp = v.get('total_compute', 'N/A')
+        print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
     print("\n--- QA Ablations ---")
     for k, v in report["qa_ablations"].items():
+        acc = v.get('accuracy', 'N/A')
+        ece = v.get('ece', 'N/A')
+        comp = v.get('total_compute', 'N/A')
+        print(f"{k:20s}: acc={acc if isinstance(acc, str) else f'{acc:.3f}'}, ECE={ece if isinstance(ece, str) else f'{ece:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
     print("\n--- Anti-Gaming ---")
     for k, v in report["anti_gaming"].items():
         if "accuracy" in v:
+            print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A')}")
+        elif "pass_at_1" in v or "pass@1" in v:
+            p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
+            print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={v.get('total_compute', 'N/A')}")
 if __name__ == "__main__":