narcolepticchicken committed on
Commit
bc02d39
·
verified ·
1 Parent(s): b40184a

Upload eval_runner.py

Browse files
Files changed (1) hide show
  1. eval_runner.py +250 -0
eval_runner.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
3
+ Produces consolidated reports.
4
+ """
5
+
6
+ import json
7
+ import random
8
+ from pathlib import Path
9
+ from typing import Dict, List
10
+
11
+ import numpy as np
12
+
13
+ from benchmarks.benchmark_code import CodeBenchmark, SimulatedCodeAgent
14
+ from benchmarks.benchmark_retrieval_qa import RetrievalQABenchmark, SimulatedRetrievalAgent
15
+ from benchmarks.benchmark_debate import DebateBenchmark, SimulatedDebateAgent
16
+ from oracle.oracle import ImpactOracle
17
+ from ledger.ledger import CreditLedger
18
+ from broker.broker import ResourceBroker
19
+
20
+
21
class AblationRunner:
    """Run OCC ablation studies and adversarial (anti-gaming) scenarios.

    Every public suite builds fresh benchmark objects seeded with
    ``self.seed`` and returns a mapping from configuration name to the
    result object produced by the underlying benchmark.

    Each suite reseeds the global ``random`` and NumPy generators before it
    starts, so a suite's results do not depend on which other suites ran
    before it in the same process.
    """

    def __init__(self, seed: int = 42):
        """Store *seed* and seed the global RNGs with it."""
        self.seed = seed
        self._reseed()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _reseed(self) -> None:
        """Reset the global ``random`` and NumPy RNGs to ``self.seed``."""
        random.seed(self.seed)
        np.random.seed(self.seed)

    def _new_code_bench(self) -> "CodeBenchmark":
        """Return a freshly loaded 50-problem code benchmark."""
        bench = CodeBenchmark(max_problems=50, seed=self.seed)
        bench.load_data()
        return bench

    def _new_qa_bench(self) -> "RetrievalQABenchmark":
        """Return a 100-question retrieval QA benchmark with questions generated."""
        bench = RetrievalQABenchmark(n_questions=100, seed=self.seed)
        bench.generate_questions()
        return bench

    @staticmethod
    def _code_agents(**extra) -> list:
        """Build the standard A/B/C code agents; *extra* is passed to each one."""
        specs = (
            ("agent_A", 0.30, 80),
            ("agent_B", 0.22, 60),
            ("agent_C", 0.40, 120),
        )
        return [
            SimulatedCodeAgent(name, quality=quality, cost_per_attempt=cost, **extra)
            for name, quality, cost in specs
        ]

    @staticmethod
    def _retrieval_agent(agent_id: str, *, calibration_error: float = 0.15,
                         abstention_rate: float = 0.1) -> "SimulatedRetrievalAgent":
        """Build a retrieval agent with the shared accuracy/hallucination settings."""
        return SimulatedRetrievalAgent(
            agent_id=agent_id,
            accuracy=0.65,
            hallucination_rate=0.12,
            calibration_error=calibration_error,
            abstention_rate=abstention_rate,
        )

    # ------------------------------------------------------------------
    # Ablations for Code Benchmark
    # ------------------------------------------------------------------

    def ablation_code(self) -> Dict[str, Dict]:
        """Run the code benchmark under the full and ablated configurations.

        Returns:
            Mapping with keys ``full_occ``, ``no_ledger``, ``no_cost_penalty``,
            ``no_anti_gaming`` and ``no_broker``.
        """
        self._reseed()

        bench = self._new_code_bench()
        base_agents = self._code_agents()

        results = {}

        # 1. Full OCC
        results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5)

        # 2. No credit ledger: approximated by the fixed-attempt baseline on
        #    the same benchmark instance.
        results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3)

        # 3. No cost penalty: approximated by raising the oracle's compute
        #    budget so far that the penalty vanishes.
        bench_no_cost = self._new_code_bench()
        bench_no_cost.oracle.compute_budget = 1e12
        results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(
            base_agents, max_attempts=5
        )

        # 4. No anti-gaming penalty: gaming weight zeroed, agents pad output.
        bench_no_game = self._new_code_bench()
        bench_no_game.oracle.gaming_weight = 0.0
        gaming_agents = self._code_agents(verbose_padding_prob=0.3)
        results["no_anti_gaming"] = bench_no_game.run_occ_allocation(
            gaming_agents, max_attempts=5
        )

        # 5. No broker: approximated by the fixed-attempt baseline with the
        #    same attempt count the OCC runs are capped at.
        bench_no_broker = self._new_code_bench()
        results["no_broker"] = bench_no_broker.run_baseline_fixed(
            base_agents, fixed_attempts=5
        )

        return results

    # ------------------------------------------------------------------
    # Ablations for Retrieval QA
    # ------------------------------------------------------------------

    def ablation_retrieval_qa(self) -> Dict[str, Dict]:
        """Run the retrieval QA benchmark under baseline and ablated setups.

        Returns:
            Mapping with keys ``full_occ``, ``direct_answer``, ``rag_baseline``,
            ``rag_verifier``, ``no_abstention`` and ``no_calibration``.
        """
        self._reseed()

        bench = self._new_qa_bench()
        agent = self._retrieval_agent("rag_agent")

        results = {
            "full_occ": bench.run_occ(agent),
            "direct_answer": bench.run_direct_answer(agent),
            "rag_baseline": bench.run_rag_baseline(agent),
            "rag_verifier": bench.run_rag_verifier(agent),
        }

        # Ablation: no abstention reward, approximated by an agent that
        # never abstains.
        agent_no_abstain = self._retrieval_agent(
            "rag_agent_no_abstain", abstention_rate=0.0
        )
        results["no_abstention"] = bench.run_occ(agent_no_abstain)

        # Ablation: no calibration penalty, approximated by an agent with
        # zero calibration error.
        agent_no_calib = self._retrieval_agent(
            "rag_agent_no_calib", calibration_error=0.0
        )
        results["no_calibration"] = bench.run_occ(agent_no_calib)

        return results

    # ------------------------------------------------------------------
    # Anti-Gaming Tests
    # ------------------------------------------------------------------

    def anti_gaming_tests(self) -> Dict[str, Dict]:
        """Run adversarial scenarios against the credit system.

        Returns:
            Mapping with keys ``spam``, ``hoarding``, ``hidden_test_gaming``,
            ``over_abstention``, ``collusion_equal_turns`` and
            ``collusion_occ``.
        """
        self._reseed()

        results = {}

        # 1. Spam low-value actions
        bench_spam = self._new_code_bench()
        spam_agents = [
            SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50),
            SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50),
        ]
        results["spam"] = bench_spam.run_occ_allocation(spam_agents, max_attempts=10)

        # 2. Hoarding credits
        # NOTE(review): an earlier version built CreditLedger(decay_lambda=0.0)
        # here but never attached it to the benchmark, so the unused local was
        # removed. This scenario therefore runs with whatever ledger
        # CodeBenchmark sets up itself. TODO: wire a zero-decay ledger in
        # once the benchmark exposes a way to do so.
        bench_hoard = self._new_code_bench()
        hoard_agents = [
            SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
        ]
        results["hoarding"] = bench_hoard.run_occ_allocation(
            hoard_agents, max_attempts=10
        )

        # 3. Hidden test gaming
        bench_game = self._new_code_bench()
        gaming_agents = [
            SimulatedCodeAgent(
                "gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True
            ),
        ]
        results["hidden_test_gaming"] = bench_game.run_occ_allocation(
            gaming_agents, max_attempts=5
        )

        # 4. Over-abstention in retrieval
        bench_qa = self._new_qa_bench()
        abstain_agent = self._retrieval_agent("abstainer", abstention_rate=0.9)
        results["over_abstention"] = bench_qa.run_occ(abstain_agent)

        # 5. Collusion in debate
        bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
        bench_debate.generate_topics()
        colluding_agents = [
            SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
            SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
            SimulatedDebateAgent("honest_1", accuracy=0.6),
            SimulatedDebateAgent("honest_2", accuracy=0.6),
        ]

        # Equal turns first, then OCC allocation, each over every topic in
        # order (same sequence as two consecutive loops).
        equal_turn_results = [
            bench_debate._resolve_equal_turns(colluding_agents, topic)
            for topic in bench_debate.topics
        ]
        results["collusion_equal_turns"] = bench_debate._summarize(
            equal_turn_results, "collusion_equal_turns"
        )

        occ_results = [
            bench_debate._resolve_occ_allocation(colluding_agents, topic)
            for topic in bench_debate.topics
        ]
        results["collusion_occ"] = bench_debate._summarize(
            occ_results, "collusion_occ"
        )

        return results

    # ------------------------------------------------------------------
    # Consolidated run
    # ------------------------------------------------------------------

    def run_all(self, output_dir: str = "/app/occ/reports") -> Dict:
        """Run every suite and write the combined report as JSON.

        Args:
            output_dir: Directory that receives
                ``ablation_and_anti_gaming.json``; created if missing.
                Defaults to the original hard-coded location.

        Returns:
            Dict with keys ``code_ablations``, ``qa_ablations`` and
            ``anti_gaming``.
        """
        print("Running code ablations...")
        code_ablations = self.ablation_code()

        print("Running retrieval QA ablations...")
        qa_ablations = self.ablation_retrieval_qa()

        print("Running anti-gaming tests...")
        anti_gaming = self.anti_gaming_tests()

        report = {
            "code_ablations": code_ablations,
            "qa_ablations": qa_ablations,
            "anti_gaming": anti_gaming,
        }

        report_dir = Path(output_dir)
        report_dir.mkdir(parents=True, exist_ok=True)
        report_path = report_dir / "ablation_and_anti_gaming.json"
        with open(report_path, "w", encoding="utf-8") as f:
            # default=str stringifies any value json cannot serialize natively.
            json.dump(report, f, indent=2, default=str)
        # Report the path actually written rather than a relative guess.
        print(f"\nSaved ablation/anti-gaming results to {report_path}")
        return report
223
+
224
+
225
def _format_metric(value, spec: str) -> str:
    """Format *value* with *spec*, falling back to ``str(value)``.

    Missing metrics arrive as the placeholder string ``"N/A"``, which cannot
    take a numeric format code such as ``.3f``; the fallback keeps the
    summary printable instead of raising.
    """
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return str(value)


def main():
    """Run every ablation and anti-gaming suite and print a text summary."""
    runner = AblationRunner(seed=42)
    report = runner.run_all()

    print("\n" + "=" * 60)
    print("ABLATION SUMMARY")
    print("=" * 60)

    print("\n--- Code Ablations ---")
    for name, metrics in report["code_ablations"].items():
        pass_at_1 = _format_metric(metrics.get("pass@1", "N/A"), ".3f")
        compute = _format_metric(metrics.get("total_compute", "N/A"), ".0f")
        print(f"{name:20s}: pass@1={pass_at_1}, compute={compute}")

    print("\n--- QA Ablations ---")
    for name, metrics in report["qa_ablations"].items():
        accuracy = _format_metric(metrics.get("accuracy", "N/A"), ".3f")
        ece = _format_metric(metrics.get("ece", "N/A"), ".3f")
        compute = _format_metric(metrics.get("total_compute", "N/A"), ".0f")
        print(f"{name:20s}: acc={accuracy}, ECE={ece}, compute={compute}")

    print("\n--- Anti-Gaming ---")
    for name, metrics in report["anti_gaming"].items():
        compute = _format_metric(metrics.get("total_compute", "N/A"), ".0f")
        if "accuracy" in metrics:
            accuracy = _format_metric(metrics["accuracy"], ".3f")
            print(f"{name:20s}: acc={accuracy}, compute={compute}")
        elif "pass@1" in metrics:
            pass_at_1 = _format_metric(metrics["pass@1"], ".3f")
            print(f"{name:20s}: pass@1={pass_at_1}, compute={compute}")
        else:
            # Keep scenarios without a headline metric visible in the summary
            # instead of dropping them silently.
            print(f"{name:20s}: acc=N/A, compute={compute}")


if __name__ == "__main__":
    main()