narcolepticchicken committed on
Commit
9312235
·
verified ·
1 Parent(s): fc4adc2

Upload eval_runner.py

Browse files
Files changed (1) hide show
  1. eval_runner.py +90 -110
eval_runner.py CHANGED
@@ -1,8 +1,7 @@
1
  """
2
  Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
3
- Produces consolidated reports.
4
  """
5
-
6
  import json
7
  import random
8
  from pathlib import Path
@@ -27,56 +26,37 @@ class AblationRunner:
27
  np.random.seed(seed)
28
 
29
  # ------------------------------------------------------------------
30
- # Ablations for Code Benchmark
31
  # ------------------------------------------------------------------
32
 
33
  def ablation_code(self) -> Dict[str, Dict]:
34
  """Run code benchmark with ablated configurations."""
35
- bench = CodeBenchmark(max_problems=50, seed=self.seed)
36
- bench.load_data()
37
 
38
- base_agents = [
39
- SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80),
40
- SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60),
41
- SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120),
42
- ]
 
43
 
44
  results = {}
45
-
46
- # 1. Full OCC
47
- results["full_occ"] = bench.run_occ_allocation(base_agents, max_attempts=5)
48
-
49
- # 2. No credit ledger (oracle score only)
50
- # Simulate by running baseline_fixed but with oracle scoring
51
- results["no_ledger"] = bench.run_baseline_fixed(base_agents, fixed_attempts=3)
52
-
53
- # 3. No cost penalty (effectively baseline)
54
- # Approximate by increasing compute budget so cost penalty vanishes
55
- bench_no_cost = CodeBenchmark(max_problems=50, seed=self.seed)
56
- bench_no_cost.load_data()
57
- bench_no_cost.oracle.compute_budget = 1e12
58
- results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(base_agents, max_attempts=5)
59
-
60
- # 4. No anti-gaming penalty
61
- bench_no_game = CodeBenchmark(max_problems=50, seed=self.seed)
62
- bench_no_game.load_data()
63
- bench_no_game.oracle.gaming_weight = 0.0
64
- gaming_agents = [
65
- SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80, verbose_padding_prob=0.3),
66
- SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60, verbose_padding_prob=0.3),
67
- SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120, verbose_padding_prob=0.3),
68
- ]
69
- results["no_anti_gaming"] = bench_no_game.run_occ_allocation(gaming_agents, max_attempts=5)
70
-
71
- # 5. No broker (oracle score only)
72
- bench_no_broker = CodeBenchmark(max_problems=50, seed=self.seed)
73
- bench_no_broker.load_data()
74
- results["no_broker"] = bench_no_broker.run_baseline_fixed(base_agents, fixed_attempts=5)
75
 
76
  return results
77
 
78
  # ------------------------------------------------------------------
79
- # Ablations for Retrieval QA
80
  # ------------------------------------------------------------------
81
 
82
  def ablation_retrieval_qa(self) -> Dict[str, Dict]:
@@ -94,28 +74,27 @@ class AblationRunner:
94
 
95
  results = {}
96
  results["full_occ"] = bench.run_occ(agent)
97
- results["direct_answer"] = bench.run_direct_answer(agent)
98
- results["rag_baseline"] = bench.run_rag_baseline(agent)
99
- results["rag_verifier"] = bench.run_rag_verifier(agent)
100
-
101
- # Ablation: no abstention reward
102
- # Approximate by setting abstention rate very low
 
 
 
 
 
103
  agent_no_abstain = SimulatedRetrievalAgent(
104
- agent_id="rag_agent_no_abstain",
105
- accuracy=0.65,
106
- hallucination_rate=0.12,
107
- calibration_error=0.15,
108
- abstention_rate=0.0,
109
  )
110
  results["no_abstention"] = bench.run_occ(agent_no_abstain)
111
 
112
- # Ablation: no calibration penalty
113
  agent_no_calib = SimulatedRetrievalAgent(
114
- agent_id="rag_agent_no_calib",
115
- accuracy=0.65,
116
- hallucination_rate=0.12,
117
- calibration_error=0.0,
118
- abstention_rate=0.1,
119
  )
120
  results["no_calibration"] = bench.run_occ(agent_no_calib)
121
 
@@ -129,74 +108,67 @@ class AblationRunner:
129
  """Run adversarial tests against the credit system."""
130
  random.seed(self.seed)
131
  np.random.seed(self.seed)
132
-
133
  results = {}
134
 
135
  # 1. Spam low-value actions
136
- bench = CodeBenchmark(max_problems=50, seed=self.seed)
137
- bench.load_data()
138
- spam_agents = [
139
- SimulatedCodeAgent("spam_1", quality=0.05, cost_per_attempt=50),
140
- SimulatedCodeAgent("spam_2", quality=0.05, cost_per_attempt=50),
141
- ]
142
- results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10)
143
-
144
- # 2. Hoarding credits
145
- ledger = CreditLedger(decay_lambda=0.0) # no decay = hoarding
146
- # We'll simulate this via a custom run
147
- bench_hoard = CodeBenchmark(max_problems=50, seed=self.seed)
148
- bench_hoard.load_data()
149
- hoard_agents = [
150
- SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
151
  ]
152
- # Force many initial successes to build credit, then stop earning
153
- results["hoarding"] = bench_hoard.run_occ_allocation(hoard_agents, max_attempts=10)
154
-
155
- # 3. Hidden test gaming
156
- bench_game = CodeBenchmark(max_problems=50, seed=self.seed)
157
- bench_game.load_data()
158
- gaming_agents = [
159
- SimulatedCodeAgent("gamer", quality=0.5, cost_per_attempt=100, gaming_mode=True),
160
- ]
161
- results["hidden_test_gaming"] = bench_game.run_occ_allocation(gaming_agents, max_attempts=5)
162
 
163
- # 4. Over-abstention in retrieval
164
  bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
165
  bench_qa.generate_questions()
166
- abstain_agent = SimulatedRetrievalAgent(
167
  agent_id="abstainer",
168
- accuracy=0.65,
169
- hallucination_rate=0.12,
170
- calibration_error=0.15,
171
- abstention_rate=0.9, # over-abstain
172
  )
173
- results["over_abstention"] = bench_qa.run_occ(abstain_agent)
174
 
175
- # 5. Collusion in debate
176
  bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
177
  bench_debate.generate_topics()
178
- colluding_agents = [
179
  SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
180
  SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
181
  SimulatedDebateAgent("honest_1", accuracy=0.6),
182
  SimulatedDebateAgent("honest_2", accuracy=0.6),
183
  ]
184
- # Run equal turns to simulate collusion effect
185
- topic_results = []
186
- for topic in bench_debate.topics:
187
- topic_results.append(bench_debate._resolve_equal_turns(colluding_agents, topic))
188
- results["collusion_equal_turns"] = bench_debate._summarize(topic_results, "collusion_equal_turns")
189
-
190
- # OCC with colluders
191
  topic_results_occ = []
192
  for topic in bench_debate.topics:
193
- topic_results_occ.append(bench_debate._resolve_occ_allocation(colluding_agents, topic))
 
 
 
 
 
 
 
 
 
 
 
 
194
  results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
195
 
196
  return results
197
 
198
  # ------------------------------------------------------------------
199
- # Consolidated run
200
  # ------------------------------------------------------------------
201
 
202
  def run_all(self) -> Dict:
@@ -215,10 +187,12 @@ class AblationRunner:
215
  "anti_gaming": anti_gaming,
216
  }
217
 
218
- Path("/app/occ/reports").mkdir(parents=True, exist_ok=True)
219
- with open("/app/occ/reports/ablation_and_anti_gaming.json", "w") as f:
 
 
220
  json.dump(report, f, indent=2, default=str)
221
- print("\nSaved ablation/anti-gaming results to reports/ablation_and_anti_gaming.json")
222
  return report
223
 
224
 
@@ -232,18 +206,24 @@ def main():
232
 
233
  print("\n--- Code Ablations ---")
234
  for k, v in report["code_ablations"].items():
235
- print(f"{k:20s}: pass@1={v.get('pass@1', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
 
 
236
 
237
  print("\n--- QA Ablations ---")
238
  for k, v in report["qa_ablations"].items():
239
- print(f"{k:20s}: acc={v.get('accuracy', 'N/A'):.3f}, ECE={v.get('ece', 'N/A'):.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
 
 
 
240
 
241
  print("\n--- Anti-Gaming ---")
242
  for k, v in report["anti_gaming"].items():
243
  if "accuracy" in v:
244
- print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
245
- elif "pass@1" in v:
246
- print(f"{k:20s}: pass@1={v['pass@1']:.3f}, compute={v.get('total_compute', 'N/A'):.0f}")
 
247
 
248
 
249
  if __name__ == "__main__":
 
1
  """
2
  Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
3
+ Produces consolidated reports compatible with the current benchmark APIs.
4
  """
 
5
  import json
6
  import random
7
  from pathlib import Path
 
26
  np.random.seed(seed)
27
 
28
  # ------------------------------------------------------------------
29
+ # Code Benchmark Ablations
30
  # ------------------------------------------------------------------
31
 
32
  def ablation_code(self) -> Dict[str, Dict]:
33
  """Run code benchmark with ablated configurations."""
34
+ bench = CodeBenchmark(n_problems=50, seed=self.seed)
 
35
 
36
+ cheap = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
37
+ cost_per_attempt=60, hidden_test_falloff=0.20)
38
+ medium = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
39
+ cost_per_attempt=150, hidden_test_falloff=0.15)
40
+ expensive = SimulatedCodeAgent("expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
41
+ cost_per_attempt=350, hidden_test_falloff=0.10)
42
 
43
  results = {}
44
+ results["full_occ"] = bench.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
45
+ results["fixed_budget"] = bench.run_fixed_budget(expensive, max_attempts=1)
46
+ results["verifier_guided"] = bench.run_verifier_guided(
47
+ SimulatedCodeAgent("verifier", pass_rate_easy=0.95, pass_rate_hard=0.65,
48
+ cost_per_attempt=350, hidden_test_falloff=0.10),
49
+ max_attempts=3)
50
+
51
+ # No cost penalty: shrink the oracle's compute penalty rate to effectively zero
52
+ bench_no_cost = CodeBenchmark(n_problems=50, seed=self.seed)
53
+ bench_no_cost.oracle.compute_penalty_rate = 1e-12
54
+ results["no_cost_penalty"] = bench_no_cost.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  return results
57
 
58
  # ------------------------------------------------------------------
59
+ # Retrieval QA Ablations
60
  # ------------------------------------------------------------------
61
 
62
  def ablation_retrieval_qa(self) -> Dict[str, Dict]:
 
74
 
75
  results = {}
76
  results["full_occ"] = bench.run_occ(agent)
77
+ results["direct_answer"] = bench.run_direct_answer(
78
+ SimulatedRetrievalAgent("direct", accuracy=0.65, hallucination_rate=0.12,
79
+ calibration_error=0.15, abstention_rate=0.1))
80
+ results["rag_baseline"] = bench.run_rag_baseline(
81
+ SimulatedRetrievalAgent("rag", accuracy=0.65, hallucination_rate=0.12,
82
+ calibration_error=0.15, abstention_rate=0.1))
83
+ results["rag_verifier"] = bench.run_rag_verifier(
84
+ SimulatedRetrievalAgent("verifier", accuracy=0.65, hallucination_rate=0.12,
85
+ calibration_error=0.15, abstention_rate=0.1))
86
+
87
+ # No abstention reward
88
  agent_no_abstain = SimulatedRetrievalAgent(
89
+ agent_id="rag_no_abstain",
90
+ accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.0,
 
 
 
91
  )
92
  results["no_abstention"] = bench.run_occ(agent_no_abstain)
93
 
94
+ # No calibration penalty
95
  agent_no_calib = SimulatedRetrievalAgent(
96
+ agent_id="rag_no_calib",
97
+ accuracy=0.65, hallucination_rate=0.12, calibration_error=0.0, abstention_rate=0.1,
 
 
 
98
  )
99
  results["no_calibration"] = bench.run_occ(agent_no_calib)
100
 
 
108
  """Run adversarial tests against the credit system."""
109
  random.seed(self.seed)
110
  np.random.seed(self.seed)
 
111
  results = {}
112
 
113
  # 1. Spam low-value actions
114
+ bench = CodeBenchmark(n_problems=50, seed=self.seed)
115
+ spam = [
116
+ SimulatedCodeAgent("spam_1", pass_rate_easy=0.05, pass_rate_hard=0.0,
117
+ cost_per_attempt=50, hidden_test_falloff=0.0),
118
+ SimulatedCodeAgent("spam_2", pass_rate_easy=0.05, pass_rate_hard=0.0,
119
+ cost_per_attempt=50, hidden_test_falloff=0.0),
 
 
 
 
 
 
 
 
 
120
  ]
121
+ results["spam"] = bench.run_occ_allocation(spam, max_attempts=10)
122
+
123
+ # 2. Hidden-test gaming: public pass but hidden fail
124
+ bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
125
+ # Simulate gaming by creating an agent that always passes public but fails hidden
126
+ # We approximate this by making hidden_test_falloff huge so hidden always fails
127
+ gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
128
+ cost_per_attempt=100, hidden_test_falloff=1.0)
129
+ results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
 
130
 
131
+ # 3. Over-abstention in retrieval QA
132
  bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
133
  bench_qa.generate_questions()
134
+ abstainer = SimulatedRetrievalAgent(
135
  agent_id="abstainer",
136
+ accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.9,
 
 
 
137
  )
138
+ results["over_abstention"] = bench_qa.run_occ(abstainer)
139
 
140
+ # 4. Collusion in debate
141
  bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
142
  bench_debate.generate_topics()
143
+ agents = [
144
  SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
145
  SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
146
  SimulatedDebateAgent("honest_1", accuracy=0.6),
147
  SimulatedDebateAgent("honest_2", accuracy=0.6),
148
  ]
149
+ # Use the internal resolution directly
150
+ topic_results_eq = []
 
 
 
 
 
151
  topic_results_occ = []
152
  for topic in bench_debate.topics:
153
+ topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
154
+ # Reset agents between strategies (token/turn counters)
155
+ for a in agents:
156
+ a.tokens_used = 0
157
+ a.turns_taken = 0
158
+ a.influence_score = 0.0
159
+ topic_results_occ.append(bench_debate._resolve_occ_allocation(agents, topic))
160
+ for a in agents:
161
+ a.tokens_used = 0
162
+ a.turns_taken = 0
163
+ a.influence_score = 0.0
164
+
165
+ results["collusion_equal_turns"] = bench_debate._summarize(topic_results_eq, "collusion_equal_turns")
166
  results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
167
 
168
  return results
169
 
170
  # ------------------------------------------------------------------
171
+ # Full run
172
  # ------------------------------------------------------------------
173
 
174
  def run_all(self) -> Dict:
 
187
  "anti_gaming": anti_gaming,
188
  }
189
 
190
+ out_dir = Path(__file__).parent / "reports"
191
+ out_dir.mkdir(parents=True, exist_ok=True)
192
+ out_path = out_dir / "ablation_and_anti_gaming.json"
193
+ with open(out_path, "w") as f:
194
  json.dump(report, f, indent=2, default=str)
195
+ print(f"\nSaved to {out_path}")
196
  return report
197
 
198
 
 
206
 
207
  print("\n--- Code Ablations ---")
208
  for k, v in report["code_ablations"].items():
209
+ p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
210
+ comp = v.get('total_compute', 'N/A')
211
+ print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
212
 
213
  print("\n--- QA Ablations ---")
214
  for k, v in report["qa_ablations"].items():
215
+ acc = v.get('accuracy', 'N/A')
216
+ ece = v.get('ece', 'N/A')
217
+ comp = v.get('total_compute', 'N/A')
218
+ print(f"{k:20s}: acc={acc if isinstance(acc, str) else f'{acc:.3f}'}, ECE={ece if isinstance(ece, str) else f'{ece:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
219
 
220
  print("\n--- Anti-Gaming ---")
221
  for k, v in report["anti_gaming"].items():
222
  if "accuracy" in v:
223
+ print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A')}")
224
+ elif "pass_at_1" in v or "pass@1" in v:
225
+ p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
226
+ print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={v.get('total_compute', 'N/A')}")
227
 
228
 
229
  if __name__ == "__main__":