narcolepticchicken committed on
Commit
ae2b06a
·
verified ·
1 Parent(s): e83f963

Upload eval_runner.py

Browse files
Files changed (1) hide show
  1. eval_runner.py +300 -205
eval_runner.py CHANGED
@@ -1,227 +1,322 @@
1
  """
2
- Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
3
- Produces consolidated reports compatible with the current benchmark APIs.
4
  """
5
  import json
6
  import random
 
 
7
  from pathlib import Path
8
- from typing import Dict, List
9
 
10
  import numpy as np
11
 
12
- from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
13
- from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent
14
- from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent
15
  from oracle.oracle import ImpactOracle
16
  from ledger.ledger import CreditLedger
17
- from broker.broker import ResourceBroker
18
-
19
-
20
- class AblationRunner:
21
- """Run ablation studies by disabling OCC components one at a time."""
22
-
23
- def __init__(self, seed: int = 42):
24
- self.seed = seed
25
- random.seed(seed)
26
- np.random.seed(seed)
27
-
28
- # ------------------------------------------------------------------
29
- # Code Benchmark Ablations
30
- # ------------------------------------------------------------------
31
-
32
- def ablation_code(self) -> Dict[str, Dict]:
33
- """Run code benchmark with ablated configurations."""
34
- bench = CodeBenchmark(n_problems=50, seed=self.seed)
35
-
36
- cheap = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
37
- cost_per_attempt=60, hidden_test_falloff=0.20)
38
- medium = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
39
- cost_per_attempt=150, hidden_test_falloff=0.15)
40
- expensive = SimulatedCodeAgent("expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
41
- cost_per_attempt=350, hidden_test_falloff=0.10)
42
-
43
- results = {}
44
- results["full_occ"] = bench.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
45
- results["fixed_budget"] = bench.run_fixed_budget(expensive, max_attempts=1)
46
- results["verifier_guided"] = bench.run_verifier_guided(
47
- SimulatedCodeAgent("verifier", pass_rate_easy=0.95, pass_rate_hard=0.65,
48
- cost_per_attempt=350, hidden_test_falloff=0.10),
49
- max_attempts=3)
50
-
51
- # No cost penalty: inflate budget to near-zero penalty
52
- bench_no_cost = CodeBenchmark(n_problems=50, seed=self.seed)
53
- bench_no_cost.oracle.compute_penalty_rate = 1e-12
54
- results["no_cost_penalty"] = bench_no_cost.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
55
-
56
- return results
57
-
58
- # ------------------------------------------------------------------
59
- # Retrieval QA Ablations
60
- # ------------------------------------------------------------------
61
-
62
- def ablation_retrieval_qa(self) -> Dict[str, Dict]:
63
- """Run retrieval QA benchmark with ablated configurations."""
64
- bench = RetrievalQABenchmark(n_questions=100, seed=self.seed)
65
- bench.generate_questions()
66
-
67
- agent = SimulatedRetrievalAgent(
68
- agent_id="rag_agent",
69
- accuracy=0.65,
70
- hallucination_rate=0.12,
71
- calibration_error=0.15,
72
- abstention_rate=0.1,
73
- )
74
-
75
- results = {}
76
- results["full_occ"] = bench.run_occ(agent)
77
- results["direct_answer"] = bench.run_direct_answer(
78
- SimulatedRetrievalAgent("direct", accuracy=0.65, hallucination_rate=0.12,
79
- calibration_error=0.15, abstention_rate=0.1))
80
- results["rag_baseline"] = bench.run_rag_baseline(
81
- SimulatedRetrievalAgent("rag", accuracy=0.65, hallucination_rate=0.12,
82
- calibration_error=0.15, abstention_rate=0.1))
83
- results["rag_verifier"] = bench.run_rag_verifier(
84
- SimulatedRetrievalAgent("verifier", accuracy=0.65, hallucination_rate=0.12,
85
- calibration_error=0.15, abstention_rate=0.1))
86
-
87
- # No abstention reward
88
- agent_no_abstain = SimulatedRetrievalAgent(
89
- agent_id="rag_no_abstain",
90
- accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.0,
91
- )
92
- results["no_abstention"] = bench.run_occ(agent_no_abstain)
93
-
94
- # No calibration penalty
95
- agent_no_calib = SimulatedRetrievalAgent(
96
- agent_id="rag_no_calib",
97
- accuracy=0.65, hallucination_rate=0.12, calibration_error=0.0, abstention_rate=0.1,
98
- )
99
- results["no_calibration"] = bench.run_occ(agent_no_calib)
100
-
101
- return results
102
-
103
- # ------------------------------------------------------------------
104
- # Anti-Gaming Tests
105
- # ------------------------------------------------------------------
106
-
107
- def anti_gaming_tests(self) -> Dict[str, Dict]:
108
- """Run adversarial tests against the credit system."""
109
- random.seed(self.seed)
110
- np.random.seed(self.seed)
111
- results = {}
112
-
113
- # 1. Spam low-value actions
114
- bench = CodeBenchmark(n_problems=50, seed=self.seed)
115
- spam = [
116
- SimulatedCodeAgent("spam_1", pass_rate_easy=0.05, pass_rate_hard=0.0,
117
- cost_per_attempt=50, hidden_test_falloff=0.0),
118
- SimulatedCodeAgent("spam_2", pass_rate_easy=0.05, pass_rate_hard=0.0,
119
- cost_per_attempt=50, hidden_test_falloff=0.0),
120
- ]
121
- results["spam"] = bench.run_occ_allocation(spam, max_attempts=10)
122
-
123
- # 2. Hidden-test gaming: public pass but hidden fail
124
- bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
125
- # Simulate gaming by creating an agent that always passes public but fails hidden
126
- gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
127
- cost_per_attempt=100, hidden_test_falloff=1.0)
128
- results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
129
-
130
- # 3. Over-abstention in retrieval QA
131
- bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
132
- bench_qa.generate_questions()
133
- abstainer = SimulatedRetrievalAgent(
134
- agent_id="abstainer",
135
- accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.9,
136
- )
137
- results["over_abstention"] = bench_qa.run_occ(abstainer)
138
-
139
- # 4. Collusion in debate
140
- bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
141
- bench_debate.generate_topics()
142
- agents = [
143
- SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
144
- SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
145
- SimulatedDebateAgent("honest_1", accuracy=0.6),
146
- SimulatedDebateAgent("honest_2", accuracy=0.6),
147
- ]
148
- topic_results_eq = []
149
- topic_results_occ = []
150
- for topic in bench_debate.topics:
151
- topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
152
- for a in agents:
153
- a.tokens_used = 0
154
- a.turns_taken = 0
155
- a.influence_score = 0.0
156
- topic_results_occ.append(bench_debate._resolve_occ_allocation(agents, topic))
157
- for a in agents:
158
- a.tokens_used = 0
159
- a.turns_taken = 0
160
- a.influence_score = 0.0
161
-
162
- results["collusion_equal_turns"] = bench_debate._summarize(topic_results_eq, "collusion_equal_turns")
163
- results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
164
-
165
- return results
166
-
167
- # ------------------------------------------------------------------
168
- # Full run
169
- # ------------------------------------------------------------------
170
-
171
- def run_all(self) -> Dict:
172
- print("Running code ablations...")
173
- code_ablations = self.ablation_code()
174
-
175
- print("Running retrieval QA ablations...")
176
- qa_ablations = self.ablation_retrieval_qa()
177
-
178
- print("Running anti-gaming tests...")
179
- anti_gaming = self.anti_gaming_tests()
180
-
181
- report = {
182
- "code_ablations": code_ablations,
183
- "qa_ablations": qa_ablations,
184
- "anti_gaming": anti_gaming,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  }
186
 
187
- out_dir = Path(__file__).parent / "reports"
188
- out_dir.mkdir(parents=True, exist_ok=True)
189
- out_path = out_dir / "ablation_and_anti_gaming.json"
190
- with open(out_path, "w") as f:
191
- json.dump(report, f, indent=2, default=str)
192
- print(f"\nSaved to {out_path}")
193
- return report
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
 
196
- def main():
197
- runner = AblationRunner(seed=42)
198
- report = runner.run_all()
 
 
 
 
 
 
 
199
 
200
- print("\n" + "=" * 60)
201
- print("ABLATION SUMMARY")
 
 
 
 
 
 
 
 
 
202
  print("=" * 60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
- print("\n--- Code Ablations ---")
205
- for k, v in report["code_ablations"].items():
206
- p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
207
- comp = v.get('total_compute', 'N/A')
208
- print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
 
209
 
210
- print("\n--- QA Ablations ---")
211
- for k, v in report["qa_ablations"].items():
212
- acc = v.get('accuracy', 'N/A')
213
- ece = v.get('ece', 'N/A')
214
- comp = v.get('total_compute', 'N/A')
215
- print(f"{k:20s}: acc={acc if isinstance(acc, str) else f'{acc:.3f}'}, ECE={ece if isinstance(ece, str) else f'{ece:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
216
 
217
- print("\n--- Anti-Gaming ---")
218
- for k, v in report["anti_gaming"].items():
219
- if "accuracy" in v:
220
- print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A')}")
221
- elif "pass_at_1" in v or "pass@1" in v:
222
- p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
223
- print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={v.get('total_compute', 'N/A')}")
224
 
225
 
226
  if __name__ == "__main__":
227
- main()
 
1
  """
2
+ Unified evaluation runner: all ablations + anti-gaming tests.
3
+ Runs simulated benchmarks under 10 ablation conditions and 4 anti-gaming tests.
4
  """
5
  import json
6
  import random
7
+ import sys
8
+ from dataclasses import dataclass
9
  from pathlib import Path
10
+ from typing import Any, Dict, List, Tuple
11
 
12
  import numpy as np
13
 
14
+ # Ensure imports work
15
+ sys.path.insert(0, str(Path(__file__).parent))
 
16
  from oracle.oracle import ImpactOracle
17
  from ledger.ledger import CreditLedger
18
+ from broker.broker import ResourceBroker, Decision
19
+ from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
20
+ from benchmarks.benchmark_retrieval_qa import (
21
+ QABenchmark,
22
+ SimulatedAgent,
23
+ create_qa_dataset,
24
+ )
25
+ from benchmarks.benchmark_debate_v2 import (
26
+ DebateBenchmark,
27
+ FactualAgent,
28
+ OverconfidentAgent,
29
+ UncertainAgent,
30
+ SycophantAgent,
31
+ )
32
+
33
+
34
@dataclass
class AblConfig:
    """Settings for one ablation condition of the evaluation runner.

    Each instance names a condition and carries the values that the
    ``run_ablation_*`` functions forward to ``ImpactOracle``,
    ``CreditLedger`` and ``ResourceBroker``.
    """

    # Short identifier; used as the key under "ablations" in the results.
    name: str
    # Human-readable summary printed when the condition is run.
    description: str
    # NOTE(review): no function in this file reads this field, and every
    # built-in condition sets it to {} — confirm whether it should be wired
    # into the oracle.
    oracle_weights: Dict[str, Any]
    # Forwarded as ``thresholds=`` to ResourceBroker.
    broker_thresholds: Dict[str, float]
    # Forwarded as ``decay_lambda=`` to CreditLedger.
    decay_lambda: float
    # Forwarded as ``gaming_penalty=`` to ImpactOracle, unless
    # ``anti_gaming_on`` is False, in which case 0.0 is sent instead.
    gaming_penalty: float
    # Forwarded as ``compute_penalty_rate=`` to ImpactOracle.
    compute_penalty_rate: float
    # When False, the runners pass a gaming penalty of 0.0 to the oracle.
    anti_gaming_on: bool
44
+
45
+
46
# The ablation conditions, in the order they are run and reported.
# Every entry differs from "default" in exactly one setting.
ABLATIONS = [
    AblConfig(
        name="default", description="Full OCC stack",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.02, gaming_penalty=2.0,
        compute_penalty_rate=0.0001, anti_gaming_on=True,
    ),
    AblConfig(
        name="no_decay", description="No credit decay (lambda=0)",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.0, gaming_penalty=2.0,
        compute_penalty_rate=0.0001, anti_gaming_on=True,
    ),
    AblConfig(
        name="fast_decay", description="Aggressive decay (lambda=0.1)",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.1, gaming_penalty=2.0,
        compute_penalty_rate=0.0001, anti_gaming_on=True,
    ),
    AblConfig(
        name="no_gaming_penalty", description="No gaming penalties",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.02, gaming_penalty=0.0,
        compute_penalty_rate=0.0001, anti_gaming_on=True,
    ),
    AblConfig(
        name="high_gaming_penalty", description="Severe gaming penalties (5.0)",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.02, gaming_penalty=5.0,
        compute_penalty_rate=0.0001, anti_gaming_on=True,
    ),
    AblConfig(
        name="lenient_broker", description="Lenient broker (thresholds x0.5)",
        oracle_weights={},
        broker_thresholds={"low": 0.25, "medium": 1.0, "high": 2.5},
        decay_lambda=0.02, gaming_penalty=2.0,
        compute_penalty_rate=0.0001, anti_gaming_on=True,
    ),
    AblConfig(
        name="strict_broker", description="Strict broker (thresholds x2.0)",
        oracle_weights={},
        broker_thresholds={"low": 1.0, "medium": 4.0, "high": 10.0},
        decay_lambda=0.02, gaming_penalty=2.0,
        compute_penalty_rate=0.0001, anti_gaming_on=True,
    ),
    AblConfig(
        name="high_compute_cost", description="High compute penalty (x10)",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.02, gaming_penalty=2.0,
        compute_penalty_rate=0.001, anti_gaming_on=True,
    ),
    AblConfig(
        name="low_compute_cost", description="Low compute penalty (x0.1)",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.02, gaming_penalty=2.0,
        compute_penalty_rate=0.00001, anti_gaming_on=True,
    ),
    AblConfig(
        name="anti_gaming_off", description="Disable all anti-gaming detectors",
        oracle_weights={}, broker_thresholds={},
        decay_lambda=0.02, gaming_penalty=2.0,
        compute_penalty_rate=0.0001, anti_gaming_on=False,
    ),
]
58
+
59
+
60
def run_ablation_code(config: AblConfig, seed: int = 42, n_problems: int = 50) -> Dict:
    """Run the code benchmark under one ablation configuration.

    Builds an oracle, ledger and broker from ``config``, creates the three
    simulated agents (cheap / medium / expensive) and hands everything to
    ``_run_occ_code_standalone``, which seeds the ledger and runs the
    allocation loop.

    Args:
        config: Ablation settings forwarded to the oracle, ledger and broker.
        seed: Seed applied to ``random`` and ``numpy.random`` and passed on
            to the standalone runner.
        n_problems: Number of benchmark problems to generate.

    Returns:
        The metrics dict produced by ``_run_occ_code_standalone``.
    """
    random.seed(seed)
    np.random.seed(seed)

    oracle = ImpactOracle(
        code_weights={"correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001},
        compute_penalty_rate=config.compute_penalty_rate,
        # Turning anti-gaming off is modelled as a zero gaming penalty.
        gaming_penalty=config.gaming_penalty if config.anti_gaming_on else 0.0,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)

    # Keyword arguments keep each value bound to the intended parameter
    # regardless of the constructor's positional order.
    cheap = SimulatedCodeAgent(
        "cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
        hidden_test_falloff=0.20, cost_per_attempt=60,
    )
    medium = SimulatedCodeAgent(
        "medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
        hidden_test_falloff=0.15, cost_per_attempt=150,
    )
    expensive = SimulatedCodeAgent(
        "expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
        hidden_test_falloff=0.10, cost_per_attempt=350,
    )

    # The ledger is seeded inside the standalone runner; seeding it here as
    # well would double every agent's initial credit. The benchmark's own
    # run_occ_allocation is not called because it builds its own
    # oracle/ledger/broker and its result was never used.
    return _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed)
87
+
88
+
89
def _run_occ_code_standalone(oracle, ledger, broker, cheap, medium, expensive, n_problems, seed):
    """Run the code benchmark with caller-supplied oracle and ledger.

    Seeds each agent's ledger balance from its mean pass rate, then for every
    problem tries the agents in ascending order of cost per unit of mean pass
    rate. Each attempt is scored by ``oracle.score``; a positive raw score
    earns credit (raw score x 5) and anything else spends 1.0 credit. A
    problem stops being attempted once an agent passes the public or hidden
    tests, or after three attempts.

    Args:
        oracle: Object whose ``score(...)`` result exposes ``raw_score``.
        ledger: Object providing ``earn(...)`` and ``spend(...)``.
        broker: Accepted for signature symmetry; not consulted here.
        cheap: First simulated agent.
        medium: Second simulated agent.
        expensive: Third simulated agent.
        n_problems: Number of benchmark problems to generate.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.

    Returns:
        Dict with ``accuracy`` (fraction of problems whose last attempt
        passed the public tests), ``total_compute``, ``mean_compute`` and
        ``mean_agents``. All ratios are 0.0 when there are no problems.
    """
    max_attempts = 3

    random.seed(seed)
    np.random.seed(seed)
    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
    agents = [cheap, medium, expensive]

    def mean_pass_rate(agent):
        return (agent.pass_rate_easy + agent.pass_rate_hard) / 2

    for agent in agents:
        ledger.earn(agent.agent_id, "seed", "seed", mean_pass_rate(agent) * 20, 0.0, 0.0, "initial", "model_call")

    total_compute = 0
    results = []
    for problem in bench.problems:
        solved = False
        cost = 0
        used = []
        # Cheapest cost per unit of expected pass rate goes first; the 0.1
        # floor avoids dividing by a near-zero pass rate.
        ranked = sorted(agents, key=lambda ag: ag.cost_per_attempt / max(0.1, mean_pass_rate(ag)))
        for agent in ranked:
            if solved or len(used) >= max_attempts:
                break
            # Count the attempt against the agent that is actually solving
            # (previously a stale loop variable charged the last agent).
            agent.attempts += 1
            attempt = agent.solve(problem)
            cost += attempt["compute_cost"]
            total_compute += attempt["compute_cost"]
            used.append(agent.agent_id)
            solved = attempt["public_pass"]
            hidden = attempt["hidden_pass"]
            oracle_res = oracle.score(
                "code", {"attempt": len(used)}, {},
                {"correctness": 1.0 if solved else 0.0, "pass_at_k": 1.0 if hidden else 0.0,
                 "compute_cost": cost, "public_pass": solved, "hidden_tests_pass": hidden},
                agent_id=agent.agent_id,
            )
            if oracle_res.raw_score > 0:
                ledger.earn(agent.agent_id, problem.task_id, "solve", oracle_res.raw_score * 5,
                            oracle_res.raw_score, cost, "pass", "model_call")
            else:
                ledger.spend(agent.agent_id, problem.task_id, "solve", 1.0, "model_call", "fail")
            if hidden:
                break
        results.append({"solved": solved, "cost": cost, "agents": used})

    n_results = len(results)
    if n_results == 0:
        # No problems were generated: report zeros instead of dividing by zero.
        return {
            "accuracy": 0.0,
            "total_compute": total_compute,
            "mean_compute": 0.0,
            "mean_agents": 0.0,
        }

    return {
        "accuracy": sum(1 for r in results if r["solved"]) / n_results,
        "total_compute": total_compute,
        "mean_compute": total_compute / n_results,
        "mean_agents": sum(len(r["agents"]) for r in results) / n_results,
    }
137
+
138
+
139
def run_ablation_qa(config: AblConfig, seed: int = 42) -> Dict:
    """Run the retrieval-QA benchmark under one ablation configuration.

    Args:
        config: Ablation settings forwarded to the oracle, ledger and broker.
        seed: Seed for ``random``, ``numpy.random``, the dataset and the
            benchmark.

    Returns:
        Dict with ``accuracy``, ``total_compute`` and ``mean_compute`` taken
        from the benchmark result, plus ``precision`` and ``recall``
        (0 when the benchmark result does not include them).
    """
    random.seed(seed)
    np.random.seed(seed)

    # Turning anti-gaming off is modelled as a zero gaming penalty.
    penalty = config.gaming_penalty if config.anti_gaming_on else 0.0
    oracle = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=penalty,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)

    bench = QABenchmark(create_qa_dataset(seed=seed), oracle, ledger, broker, seed=seed)

    agent = SimulatedAgent("qa_agent", oracle, ledger, broker, 0.85)
    agent.budget = 50000
    agent.strategy = "adaptive"

    outcome = bench.run_occ(agent)

    summary = {key: outcome[key] for key in ("accuracy", "total_compute", "mean_compute")}
    for optional_key in ("precision", "recall"):
        summary[optional_key] = outcome.get(optional_key, 0)
    return summary
162
+
163
+
164
def run_ablation_debate(config: AblConfig, seed: int = 42, n_debates: int = 20) -> Dict:
    """Run the debate benchmark with three factual agents under one ablation.

    Args:
        config: Ablation settings forwarded to the oracle, ledger and broker.
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_debates: Accepted but not used by this function.

    Returns:
        Dict with ``accuracy``, ``consensus_reached``, ``total_compute`` and
        ``mean_compute`` taken from the benchmark result.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Turning anti-gaming off is modelled as a zero gaming penalty.
    penalty = config.gaming_penalty if config.anti_gaming_on else 0.0
    oracle = ImpactOracle(
        compute_penalty_rate=config.compute_penalty_rate,
        gaming_penalty=penalty,
    )
    ledger = CreditLedger(decay_lambda=config.decay_lambda)
    broker = ResourceBroker(thresholds=config.broker_thresholds)

    bench = DebateBenchmark(
        oracle=oracle,
        ledger=ledger,
        broker=broker,
        factual_accuracy=0.9,
        adversarial_ratio=0.0,
        n_agents=3,
        seed=seed,
    )

    panel = []
    for idx in range(3):
        panel.append(FactualAgent(f"f{idx}", 0.9))

    outcome = bench.run_debate(panel)

    reported = ("accuracy", "consensus_reached", "total_compute", "mean_compute")
    return {key: outcome[key] for key in reported}
186
+
187
+
188
def run_anti_gaming_code(seed: int = 42, n_problems: int = 50) -> Dict:
    """Compare oracle scoring of a normal agent and a hidden-test gamer.

    Both agents attempt every benchmark problem. Each attempt is scored by an
    ``ImpactOracle`` created with ``gaming_penalty=2.0``, and the attempt is
    recorded as penalized when the oracle's ``failure_tags`` contain
    ``"gaming_hidden_tests"``.

    Args:
        seed: Seed for ``random``, ``numpy.random`` and the benchmark.
        n_problems: Number of benchmark problems to generate.

    Returns:
        ``{"normal": {...}, "gamer": {...}}`` where each inner dict has
        ``public_acc``, ``hidden_acc``, ``mean_raw`` and ``penalized_rate``.
        All four are 0.0 when no problems were attempted.
    """
    random.seed(seed)
    np.random.seed(seed)
    oracle = ImpactOracle(gaming_penalty=2.0)
    # The ledger and broker are created (and the ledger seeded) but are not
    # consulted by the scoring loop below.
    ledger = CreditLedger()
    broker = ResourceBroker()

    bench = CodeBenchmark(n_problems=n_problems, seed=seed)
    # Keyword arguments keep each value bound to the intended parameter
    # regardless of the constructor's positional order.
    normal = SimulatedCodeAgent(
        "normal", pass_rate_easy=0.9, pass_rate_hard=0.5,
        hidden_test_falloff=0.1, cost_per_attempt=150,
    )
    # Gaming agent: high public pass rate, large falloff on hidden tests.
    gamer = SimulatedCodeAgent(
        "gamer", pass_rate_easy=0.95, pass_rate_hard=0.05,
        hidden_test_falloff=0.8, cost_per_attempt=100,
    )

    ledger.earn("normal", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")
    ledger.earn("gamer", "seed", "seed", 15, 0.0, 0.0, "initial", "model_call")

    results = {"normal": [], "gamer": []}
    for problem in bench.problems:
        for agent in [normal, gamer]:
            r = agent.solve(problem)
            oracle_res = oracle.score(
                "code", {}, {},
                {"correctness": 1.0 if r["public_pass"] else 0.0,
                 "pass_at_k": 1.0 if r["hidden_pass"] else 0.0,
                 "compute_cost": r["compute_cost"],
                 "public_pass": r["public_pass"],
                 "hidden_tests_pass": r["hidden_pass"]},
                agent_id=agent.agent_id,
            )
            results[agent.agent_id].append({
                "public_pass": r["public_pass"],
                "hidden_pass": r["hidden_pass"],
                "raw_score": oracle_res.raw_score,
                "penalized": "gaming_hidden_tests" in oracle_res.failure_tags,
            })

    def summarize(who):
        rows = results[who]
        if not rows:
            # No attempts recorded: report zeros instead of dividing by zero.
            return {"public_acc": 0.0, "hidden_acc": 0.0, "mean_raw": 0.0, "penalized_rate": 0.0}
        count = len(rows)
        return {
            "public_acc": sum(1 for row in rows if row["public_pass"]) / count,
            "hidden_acc": sum(1 for row in rows if row["hidden_pass"]) / count,
            "mean_raw": sum(row["raw_score"] for row in rows) / count,
            "penalized_rate": sum(1 for row in rows if row["penalized"]) / count,
        }

    return {"normal": summarize("normal"), "gamer": summarize("gamer")}
235
+
236
+
237
def run_anti_gaming_collusion(seed: int = 42) -> Dict:
    """Attempt a credit transfer between two agents and report the outcome.

    Seeds "alice" with 10 credits and "bob" with 1, asks the ledger to
    transfer 5.0 from alice to bob, then reads both balances and the ledger's
    collusion detector.

    Args:
        seed: Accepted for signature symmetry; not used here.

    Returns:
        Dict with ``transfer_allowed``, ``alice_balance``, ``bob_balance``,
        ``collusion_detected`` and ``transfer_blocked`` (the negation of
        ``transfer_allowed``).
    """
    ledger = CreditLedger()
    for holder, amount in (("alice", 10), ("bob", 1)):
        ledger.earn(holder, "seed", "seed", amount, 0.0, 0.0, "initial")

    transferred = ledger.transfer("alice", "bob", 5.0, "global")

    report = {"transfer_allowed": transferred}
    report["alice_balance"] = ledger.balance("alice")
    report["bob_balance"] = ledger.balance("bob")
    report["collusion_detected"] = bool(ledger.detect_collusion(window=10))
    report["transfer_blocked"] = not transferred
    return report
255
 
256
 
257
def run_anti_gaming_abstention(seed: int = 42) -> Dict:
    """Score ten abstentions on an answerable question and average the reward.

    Args:
        seed: Accepted for signature symmetry; not used here.

    Returns:
        Dict with ``mean_reward`` (average ``reward_value`` over the ten
        scores) and ``expected_negative`` (True when the rewards sum to less
        than zero).
    """
    oracle = ImpactOracle()
    # Fresh dicts are built for every call, so nothing is shared between
    # iterations.
    rewards = [
        oracle.score(
            "retrieval_qa",
            {"abstained": True},
            {"is_unanswerable": False, "gold_answer": "yes"},
            {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
        ).reward_value
        for _ in range(10)
    ]
    total = sum(rewards)
    return {"mean_reward": total / len(rewards), "expected_negative": total < 0}
267
 
268
+
269
def run_anti_gaming_spam(seed: int = 42) -> Dict:
    """Score one expensive, wrong retrieval-QA answer and report its tags.

    Args:
        seed: Accepted for signature symmetry; not used here.

    Returns:
        Dict with ``reward`` (the oracle's ``reward_value``), ``tagged``
        (True when ``failure_tags`` is non-empty) and ``tags``.
    """
    oracle = ImpactOracle()
    # High compute cost, low confidence, and an answer that differs from gold.
    expected = {"gold_answer": "paris"}
    produced = {
        "answer": "london",
        "confidence": 0.1,
        "evidence": {},
        "compute_cost": 5000,
    }
    verdict = oracle.score("retrieval_qa", {}, expected, produced)

    summary = {"reward": verdict.reward_value}
    summary["tagged"] = bool(verdict.failure_tags)
    summary["tags"] = verdict.failure_tags
    return summary
276
+
277
+
278
def run_all(seed: int = 42, out_dir="/app/occ/reports") -> Dict:
    """Run every ablation and anti-gaming test, print and save the results.

    For each entry of ``ABLATIONS`` the code, QA and debate benchmarks are
    run and a one-line summary is printed. The four anti-gaming tests are run
    afterwards. Everything is written as JSON to
    ``<out_dir>/eval_runner_results.json``.

    Args:
        seed: Seed passed to every benchmark and test (previously fixed
            at 42).
        out_dir: Directory for the JSON report; created if missing. The
            default keeps the original hard-coded location.

    Returns:
        Dict with two keys: ``"ablations"`` (per-condition config and code /
        qa / debate metrics) and ``"anti_gaming"`` (per-test results).
    """
    print("=" * 60)
    print("OCC UNIFIED EVALUATION RUNNER")
    print("=" * 60)

    all_results: Dict[str, Any] = {"ablations": {}, "anti_gaming": {}}

    # Ablations
    for abl in ABLATIONS:
        print(f"\n--- ABLATION: {abl.name} ---")
        print(f"  {abl.description}")
        code_res = run_ablation_code(abl, seed=seed, n_problems=50)
        qa_res = run_ablation_qa(abl, seed=seed)
        debate_res = run_ablation_debate(abl, seed=seed)
        print(f"  Code:   acc={code_res['accuracy']:.3f}, compute={code_res['total_compute']:.0f}")
        print(f"  QA:     acc={qa_res['accuracy']:.3f}, compute={qa_res['total_compute']:.0f}")
        print(f"  Debate: acc={debate_res['accuracy']:.3f}, compute={debate_res['total_compute']:.0f}")
        all_results["ablations"][abl.name] = {
            # Snapshot of the config fields, so the report does not alias
            # the live config object's attribute dict.
            "config": dict(vars(abl)),
            "code": code_res,
            "qa": qa_res,
            "debate": debate_res,
        }

    # Anti-gaming
    print("\n--- ANTI-GAMING TESTS ---")
    all_results["anti_gaming"]["hidden_test_gaming"] = run_anti_gaming_code(seed=seed)
    all_results["anti_gaming"]["collusion"] = run_anti_gaming_collusion(seed=seed)
    all_results["anti_gaming"]["abstention"] = run_anti_gaming_abstention(seed=seed)
    all_results["anti_gaming"]["spam"] = run_anti_gaming_spam(seed=seed)

    for test_name, res in all_results["anti_gaming"].items():
        print(f"\n  {test_name}: {json.dumps(res, indent=2, default=str)}")

    # Save
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    out_path = out / "eval_runner_results.json"
    with open(out_path, "w", encoding="utf-8") as f:
        # default=str stringifies anything json cannot serialise natively.
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nSaved to {out_path}")
    return all_results
319
 
320
 
321
  if __name__ == "__main__":
322
+ run_all()