narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed on 26 days ago

Commit

ae2b06a

verified ·

1 Parent(s): e83f963

Upload eval_runner.py

Browse files

Files changed (1) hide show

eval_runner.py +300 -205

eval_runner.py CHANGED Viewed

@@ -1,227 +1,322 @@
 """
-Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
-Produces consolidated reports compatible with the current benchmark APIs.
 """
 import json
 import random
 from pathlib import Path
-from typing import Dict, List
 import numpy as np
-from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
-from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent
-from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent
 from oracle.oracle import ImpactOracle
 from ledger.ledger import CreditLedger
-from broker.broker import ResourceBroker
-class AblationRunner:
-    """Run ablation studies by disabling OCC components one at a time."""
-    def __init__(self, seed: int = 42):
-        self.seed = seed
-        random.seed(seed)
-        np.random.seed(seed)
-    # ------------------------------------------------------------------
-    # Code Benchmark Ablations
-    # ------------------------------------------------------------------
-    def ablation_code(self) -> Dict[str, Dict]:
-        """Run code benchmark with ablated configurations."""
-        bench = CodeBenchmark(n_problems=50, seed=self.seed)
-        cheap = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
-                                    cost_per_attempt=60, hidden_test_falloff=0.20)
-        medium = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
-                                     cost_per_attempt=150, hidden_test_falloff=0.15)
-        expensive = SimulatedCodeAgent("expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
-                                        cost_per_attempt=350, hidden_test_falloff=0.10)
-        results = {}
-        results["full_occ"] = bench.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
-        results["fixed_budget"] = bench.run_fixed_budget(expensive, max_attempts=1)
-        results["verifier_guided"] = bench.run_verifier_guided(
-            SimulatedCodeAgent("verifier", pass_rate_easy=0.95, pass_rate_hard=0.65,
-                               cost_per_attempt=350, hidden_test_falloff=0.10),
-            max_attempts=3)
-        # No cost penalty: inflate budget to near-zero penalty
-        bench_no_cost = CodeBenchmark(n_problems=50, seed=self.seed)
-        bench_no_cost.oracle.compute_penalty_rate = 1e-12
-        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
-        return results
-    # ------------------------------------------------------------------
-    # Retrieval QA Ablations
-    # ------------------------------------------------------------------
-    def ablation_retrieval_qa(self) -> Dict[str, Dict]:
-        """Run retrieval QA benchmark with ablated configurations."""
-        bench = RetrievalQABenchmark(n_questions=100, seed=self.seed)
-        bench.generate_questions()
-        agent = SimulatedRetrievalAgent(
-            agent_id="rag_agent",
-            accuracy=0.65,
-            hallucination_rate=0.12,
-            calibration_error=0.15,
-            abstention_rate=0.1,
-        )
-        results = {}
-        results["full_occ"] = bench.run_occ(agent)
-        results["direct_answer"] = bench.run_direct_answer(
-            SimulatedRetrievalAgent("direct", accuracy=0.65, hallucination_rate=0.12,
-                                     calibration_error=0.15, abstention_rate=0.1))
-        results["rag_baseline"] = bench.run_rag_baseline(
-            SimulatedRetrievalAgent("rag", accuracy=0.65, hallucination_rate=0.12,
-                                    calibration_error=0.15, abstention_rate=0.1))
-        results["rag_verifier"] = bench.run_rag_verifier(
-            SimulatedRetrievalAgent("verifier", accuracy=0.65, hallucination_rate=0.12,
-                                    calibration_error=0.15, abstention_rate=0.1))
-        # No abstention reward
-        agent_no_abstain = SimulatedRetrievalAgent(
-            agent_id="rag_no_abstain",
-            accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.0,
-        )
-        results["no_abstention"] = bench.run_occ(agent_no_abstain)
-        # No calibration penalty
-        agent_no_calib = SimulatedRetrievalAgent(
-            agent_id="rag_no_calib",
-            accuracy=0.65, hallucination_rate=0.12, calibration_error=0.0, abstention_rate=0.1,
-        )
-        results["no_calibration"] = bench.run_occ(agent_no_calib)
-        return results
-    # ------------------------------------------------------------------
-    # Anti-Gaming Tests
-    # ------------------------------------------------------------------
-    def anti_gaming_tests(self) -> Dict[str, Dict]:
-        """Run adversarial tests against the credit system."""
-        random.seed(self.seed)
-        np.random.seed(self.seed)
-        results = {}
-        # 1. Spam low-value actions
-        bench = CodeBenchmark(n_problems=50, seed=self.seed)
-        spam = [
-            SimulatedCodeAgent("spam_1", pass_rate_easy=0.05, pass_rate_hard=0.0,
-                               cost_per_attempt=50, hidden_test_falloff=0.0),
-            SimulatedCodeAgent("spam_2", pass_rate_easy=0.05, pass_rate_hard=0.0,
-                               cost_per_attempt=50, hidden_test_falloff=0.0),
-        ]
-        results["spam"] = bench.run_occ_allocation(spam, max_attempts=10)
-        # 2. Hidden-test gaming: public pass but hidden fail
-        bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
-        # Simulate gaming by creating an agent that always passes public but fails hidden
-        gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
-                                    cost_per_attempt=100, hidden_test_falloff=1.0)
-        results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
-        # 3. Over-abstention in retrieval QA
-        bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
-        bench_qa.generate_questions()
-        abstainer = SimulatedRetrievalAgent(
-            agent_id="abstainer",
-            accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.9,
-        )
-        results["over_abstention"] = bench_qa.run_occ(abstainer)
-        # 4. Collusion in debate
-        bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
-        bench_debate.generate_topics()
-        agents = [
-            SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
-            SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
-            SimulatedDebateAgent("honest_1", accuracy=0.6),
-            SimulatedDebateAgent("honest_2", accuracy=0.6),
-        ]
-        topic_results_eq = []
-        topic_results_occ = []
-        for topic in bench_debate.topics:
-            topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
-            for a in agents:
-                a.tokens_used = 0
-                a.turns_taken = 0
-                a.influence_score = 0.0
-            topic_results_occ.append(bench_debate._resolve_occ_allocation(agents, topic))
-            for a in agents:
-                a.tokens_used = 0
-                a.turns_taken = 0
-                a.influence_score = 0.0
-        results["collusion_equal_turns"] = bench_debate._summarize(topic_results_eq, "collusion_equal_turns")
-        results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
-        return results
-    # ------------------------------------------------------------------
-    # Full run
-    # ------------------------------------------------------------------
-    def run_all(self) -> Dict:
-        print("Running code ablations...")
-        code_ablations = self.ablation_code()
-        print("Running retrieval QA ablations...")
-        qa_ablations = self.ablation_retrieval_qa()
-        print("Running anti-gaming tests...")
-        anti_gaming = self.anti_gaming_tests()
-        report = {
-            "code_ablations": code_ablations,
-            "qa_ablations": qa_ablations,
-            "anti_gaming": anti_gaming,
         }
-        out_dir = Path(__file__).parent / "reports"
-        out_dir.mkdir(parents=True, exist_ok=True)
-        out_path = out_dir / "ablation_and_anti_gaming.json"
-        with open(out_path, "w") as f:
-            json.dump(report, f, indent=2, default=str)
-        print(f"\nSaved to {out_path}")
-        return report
-def main():
-    runner = AblationRunner(seed=42)
-    report = runner.run_all()
-    print("\n" + "=" * 60)
-    print("ABLATION SUMMARY")
     print("=" * 60)
-    print("\n--- Code Ablations ---")
-    for k, v in report["code_ablations"].items():
-        p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
-        comp = v.get('total_compute', 'N/A')
-        print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
-    print("\n--- QA Ablations ---")
-    for k, v in report["qa_ablations"].items():
-        acc = v.get('accuracy', 'N/A')
-        ece = v.get('ece', 'N/A')
-        comp = v.get('total_compute', 'N/A')
-        print(f"{k:20s}: acc={acc if isinstance(acc, str) else f'{acc:.3f}'}, ECE={ece if isinstance(ece, str) else f'{ece:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
-    print("\n--- Anti-Gaming ---")
-    for k, v in report["anti_gaming"].items():
-        if "accuracy" in v:
-            print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A')}")
-        elif "pass_at_1" in v or "pass@1" in v:
-            p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
-            print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={v.get('total_compute', 'N/A')}")
 if __name__ == "__main__":
-    main()

 """
+Unified evaluation runner: all ablations + anti-gaming tests.
+Runs simulated benchmarks under 10 ablation conditions and 4 anti-gaming tests.
 """
 import json
 import random
+import sys
+from dataclasses import dataclass
 from pathlib import Path
+from typing import Any, Dict, List, Tuple
 import numpy as np
+# Ensure imports work
+sys.path.insert(0, str(Path(__file__).parent))
 from oracle.oracle import ImpactOracle
 from ledger.ledger import CreditLedger
+from broker.broker import ResourceBroker, Decision
+from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
+from benchmarks.benchmark_retrieval_qa import (
+    QABenchmark,
+    SimulatedAgent,
+    create_qa_dataset,
+)
+from benchmarks.benchmark_debate_v2 import (
+    DebateBenchmark,
+    FactualAgent,
+    OverconfidentAgent,
+    UncertainAgent,
+    SycophantAgent,
+)
+@dataclass
+class AblConfig:
+    """Settings for a single ablation condition of the OCC stack.
+    Each entry of ABLATIONS is one instance; the run_ablation_* functions read
+    these fields to construct the oracle, ledger and broker for that condition.
+    """
+    name: str  # short identifier; used as the key under "ablations" in the results
+    description: str  # human-readable summary printed by run_all
+    oracle_weights: Dict[str, Any]  # NOTE(review): not read by any function visible in this file
+    broker_thresholds: Dict[str, float]  # passed as ResourceBroker(thresholds=...)
+    decay_lambda: float  # passed as CreditLedger(decay_lambda=...)
+    gaming_penalty: float  # passed to ImpactOracle when anti_gaming_on, otherwise 0.0 is used
+    compute_penalty_rate: float  # passed as ImpactOracle(compute_penalty_rate=...)
+    anti_gaming_on: bool  # False forces the oracle's gaming_penalty to 0.0
+ABLATIONS = [
+    AblConfig("default", "Full OCC stack", {}, {}, 0.02, 2.0, 0.0001, True),
+    AblConfig("no_decay", "No credit decay (lambda=0)", {}, {}, 0.0, 2.0, 0.0001, True),
+    AblConfig("fast_decay", "Aggressive decay (lambda=0.1)", {}, {}, 0.1, 2.0, 0.0001, True),
+    AblConfig("no_gaming_penalty", "No gaming penalties", {}, {}, 0.02, 0.0, 0.0001, True),
+    AblConfig("high_gaming_penalty", "Severe gaming penalties (5.0)", {}, {}, 0.02, 5.0, 0.0001, True),
+    AblConfig("lenient_broker", "Lenient broker (thresholds x0.5)", {}, {"low": 0.25, "medium": 1.0, "high": 2.5}, 0.02, 2.0, 0.0001, True),
+    AblConfig("strict_broker", "Strict broker (thresholds x2.0)", {}, {"low": 1.0, "medium": 4.0, "high": 10.0}, 0.02, 2.0, 0.0001, True),
+    AblConfig("high_compute_cost", "High compute penalty (x10)", {}, {}, 0.02, 2.0, 0.001, True),
+    AblConfig("low_compute_cost", "Low compute penalty (x0.1)", {}, {}, 0.02, 2.0, 0.00001, True),
+    AblConfig("anti_gaming_off", "Disable all anti-gaming detectors", {}, {}, 0.02, 2.0, 0.0001, False),
+]
+def run_ablation_code(config: AblConfig, seed: int = 42, n_problems: int = 50) -> Dict:
+    random.seed(seed)
+    np.random.seed(seed)
+    oracle = ImpactOracle(
+        code_weights={"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001},
+        compute_penalty_rate=config.compute_penalty_rate,
+        gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0,
+    )
+    ledger = CreditLedger(decay_lambda=config.decay_lambda)
+    broker = ResourceBroker(thresholds=config.broker_thresholds)
+    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
+    cheap = SimulatedCodeAgent("cheap", 0.65, 0.15, 0.20, 60)
+    medium = SimulatedCodeAgent("medium", 0.85, 0.35, 0.15, 150)
+    expensive = SimulatedCodeAgent("expensive", 0.95, 0.65, 0.10, 350)
+    # Seed ledger
+    for a in [cheap, medium, expensive]:
+        q = (a.pass_rate_easy + a.pass_rate_hard) / 2
+        ledger.earn(a.agent_id, "seed", "seed", q * 20, 0.0, 0.0, "initial", "model_call")
+    # Override benchmark's oracle/ledger/broker
+    results = bench.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
+    # (the benchmark internally uses its own instances; we use the standalone below)
+    # Actually the benchmark creates its own objects. Let's run standalone:
+    return _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed)
+def _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
+    agents = [cheap, medium, expensive]
+    for a in agents:
+        q = (a.pass_rate_easy + a.pass_rate_hard) / 2
+        ledger.earn(a.agent_id, "seed", "seed", q * 20, 0.0, 0.0, "initial", "model_call")
+    total_compute = 0
+    results = []
+    for problem in bench.problems:
+        solved = False
+        cost = 0
+        used = []
+        ranked = sorted(agents, key=lambda a: a.cost_per_attempt / max(0.1, (a.pass_rate_easy + a.pass_rate_hard) / 2))
+        for agent in ranked:
+            if solved or len(used) >= 3:
+                break
+            a.attempts += 1
+            r = agent.solve(problem)
+            cost += r["compute_cost"]
+            total_compute += r["compute_cost"]
+            used.append(agent.agent_id)
+            solved = r["public_pass"]
+            hidden = r["hidden_pass"]
+            oracle_res = oracle.score(
+                "code", {"attempt": len(used)}, {},
+                {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
+                 "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
+                agent_id=agent.agent_id,
+            )
+            if oracle_res.raw_score > 0:
+                ledger.earn(agent.agent_id, problem.task_id, "solve", oracle_res.raw_score * 5,
+                            oracle_res.raw_score, cost, "pass", "model_call")
+            else:
+                ledger.spend(agent.agent_id, problem.task_id, "solve", 1.0, "model_call", "fail")
+            if hidden:
+                break
+        results.append({"solved": solved, "cost": cost, "agents": used})
+    acc = sum(1 for r in results if r["solved"]) / len(results)
+    return {
+        "accuracy": acc,
+        "total_compute": total_compute,
+        "mean_compute": total_compute / len(results),
+        "mean_agents": sum(len(r["agents"]) for r in results) / len(results),
+    }
+def run_ablation_qa(config: AblConfig, seed: int = 42) -> Dict:
+    random.seed(seed)
+    np.random.seed(seed)
+    oracle = ImpactOracle(
+        compute_penalty_rate=config.compute_penalty_rate,
+        gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0,
+    )
+    ledger = CreditLedger(decay_lambda=config.decay_lambda)
+    broker = ResourceBroker(thresholds=config.broker_thresholds)
+    data = create_qa_dataset(seed=seed)
+    bench = QABenchmark(data, oracle, ledger, broker, seed=seed)
+    agent = SimulatedAgent("qa_agent", oracle, ledger, broker, 0.85)
+    agent.budget = 50000
+    agent.strategy = "adaptive"
+    results = bench.run_occ(agent)
+    return {
+        "accuracy": results["accuracy"],
+        "total_compute": results["total_compute"],
+        "mean_compute": results["mean_compute"],
+        "precision": results.get("precision", 0),
+        "recall": results.get("recall", 0),
+    }
+def run_ablation_debate(config: AblConfig, seed: int = 42, n_debates: int = 20) -> Dict:
+    random.seed(seed)
+    np.random.seed(seed)
+    oracle = ImpactOracle(
+        compute_penalty_rate=config.compute_penalty_rate,
+        gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0,
+    )
+    ledger = CreditLedger(decay_lambda=config.decay_lambda)
+    broker = ResourceBroker(thresholds=config.broker_thresholds)
+    bench = DebateBenchmark(
+        oracle=oracle, ledger=ledger, broker=broker,
+        factual_accuracy=0.9, adversarial_ratio=0.0, n_agents=3, seed=seed
+    )
+    agents = [FactualAgent(f"f{i}", 0.9) for i in range(3)]
+    results = bench.run_debate(agents)
+    return {
+        "accuracy": results["accuracy"],
+        "consensus_reached": results["consensus_reached"],
+        "total_compute": results["total_compute"],
+        "mean_compute": results["mean_compute"],
+    }
+def run_anti_gaming_code(seed: int = 42, n_problems: int = 50) -> Dict:
+    """Test anti-gaming: create a gaming agent that always passes public but fails hidden."""
+    random.seed(seed)
+    np.random.seed(seed)
+    oracle = ImpactOracle(gaming_penalty=2.0)
+    ledger = CreditLedger()
+    broker = ResourceBroker()
+    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
+    # Normal agent
+    normal = SimulatedCodeAgent("normal", 0.9, 0.5, 0.1, 150)
+    # Gaming agent: high public pass, high hidden fail
+    gamer = SimulatedCodeAgent("gamer", 0.95, 0.05, 0.8, 100)
+    ledger.earn("normal", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")
+    ledger.earn("gamer", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")
+    results = {"normal": [], "gamer": []}
+    for problem in bench.problems:
+        for agent in [normal, gamer]:
+            r = agent.solve(problem)
+            oracle_res = oracle.score(
+                "code", {}, {},
+                {"correctness": 1.0 if r["public_pass"] else 0.0,
+                 "pass_at_k": 1.0 if r["hidden_pass"] else 0.0,
+                 "compute_cost": r["compute_cost"],
+                 "public_pass": r["public_pass"],
+                 "hidden_tests_pass": r["hidden_pass"]},
+                agent_id=agent.agent_id,
+            )
+            results[agent.agent_id].append({
+                "public_pass": r["public_pass"],
+                "hidden_pass": r["hidden_pass"],
+                "raw_score": oracle_res.raw_score,
+                "penalized": "gaming_hidden_tests" in oracle_res.failure_tags,
+            })
+    def summarize(who):
+        rs = results[who]
+        return {
+            "public_acc": sum(1 for r in rs if r["public_pass"]) / len(rs),
+            "hidden_acc": sum(1 for r in rs if r["hidden_pass"]) / len(rs),
+            "mean_raw": sum(r["raw_score"] for r in rs) / len(rs),
+            "penalized_rate": sum(1 for r in rs if r["penalized"]) / len(rs),
         }
+    return {"normal": summarize("normal"), "gamer": summarize("gamer")}
+def run_anti_gaming_collusion(seed: int = 42) -> Dict:
+    """Test that credit transfers are always blocked."""
+    ledger = CreditLedger()
+    ledger.earn("alice", "seed", "seed", 10, 0.0, 0.0, "initial")
+    ledger.earn("bob", "seed", "seed", 1, 0.0, 0.0, "initial")
+    ok = ledger.transfer("alice", "bob", 5.0, "global")
+    alice_bal = ledger.balance("alice")
+    bob_bal = ledger.balance("bob")
+    collusion = ledger.detect_collusion(window=10)
+    return {
+        "transfer_allowed": ok,
+        "alice_balance": alice_bal,
+        "bob_balance": bob_bal,
+        "collusion_detected": bool(collusion),
+        "transfer_blocked": not ok,
+    }
+def run_anti_gaming_abstention(seed: int = 42) -> Dict:
+    """Test over-abstention penalty."""
+    oracle = ImpactOracle()
+    # Agent abstains on everything
+    results = []
+    for i in range(10):
+        res = oracle.score("retrieval_qa", {"abstained": True}, {"is_unanswerable": False, "gold_answer": "yes"},
+                           {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50})
+        results.append(res.reward_value)
+    return {"mean_reward": sum(results) / len(results), "expected_negative": sum(results) < 0}
+def run_anti_gaming_spam(seed: int = 42) -> Dict:
+    """Test spam detection: high compute, low score."""
+    oracle = ImpactOracle()
+    # High compute but wrong answer
+    res = oracle.score("retrieval_qa", {}, {"gold_answer": "paris"},
+                       {"answer": "london", "confidence": 0.1, "evidence": {}, "compute_cost": 5000})
+    return {"reward": res.reward_value, "tagged": bool(res.failure_tags), "tags": res.failure_tags}
+def run_all() -> Dict:
     print("=" * 60)
+    print("OCC UNIFIED EVALUATION RUNNER")
+    print("=" * 60)
+    all_results: Dict[str, Any] = {"ablations": {}, "anti_gaming": {}}
+    # Ablations
+    for abl in ABLATIONS:
+        print(f"\n--- ABLATION: {abl.name} ---")
+        print(f"  {abl.description}")
+        code_res = run_ablation_code(abl, seed=42, n_problems=50)
+        qa_res = run_ablation_qa(abl, seed=42)
+        debate_res = run_ablation_debate(abl, seed=42)
+        print(f"  Code: acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")
+        print(f"  QA:   acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")
+        print(f"  Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")
+        all_results["ablations"][abl.name] = {
+            "config": abl.__dict__,
+            "code": code_res,
+            "qa": qa_res,
+            "debate": debate_res,
+        }
+    # Anti-gaming
+    print("\n--- ANTI-GAMING TESTS ---")
+    all_results["anti_gaming"]["hidden_test_gaming"] = run_anti_gaming_code(seed=42)
+    all_results["anti_gaming"]["collusion"] = run_anti_gaming_collusion(seed=42)
+    all_results["anti_gaming"]["abstention"] = run_anti_gaming_abstention(seed=42)
+    all_results["anti_gaming"]["spam"] = run_anti_gaming_spam(seed=42)
+    for test_name, res in all_results["anti_gaming"].items():
+        print(f"\n  {test_name}: {json.dumps(res, indent=2, default=str)}")
+    # Save
+    out = Path("/app/occ/reports")
+    out.mkdir(parents=True, exist_ok=True)
+    with open(out / "eval_runner_results.json", "w") as f:
+        json.dump(all_results, f, indent=2, default=str)
+    print(f"\nSaved to {out / 'eval_runner_results.json'}")
+    return all_results
 if __name__ == "__main__":
+    # Executing the file as a script runs the whole evaluation with run_all's defaults.
+    run_all()