Upload eval_runner.py
Browse files- eval_runner.py +90 -110
eval_runner.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
|
| 3 |
-
Produces consolidated reports.
|
| 4 |
"""
|
| 5 |
-
|
| 6 |
import json
|
| 7 |
import random
|
| 8 |
from pathlib import Path
|
|
@@ -27,56 +26,37 @@ class AblationRunner:
|
|
| 27 |
np.random.seed(seed)
|
| 28 |
|
| 29 |
# ------------------------------------------------------------------
|
| 30 |
-
#
|
| 31 |
# ------------------------------------------------------------------
|
| 32 |
|
| 33 |
def ablation_code(self) -> Dict[str, Dict]:
|
| 34 |
"""Run code benchmark with ablated configurations."""
|
| 35 |
-
bench = CodeBenchmark(
|
| 36 |
-
bench.load_data()
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
| 43 |
|
| 44 |
results = {}
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
results["
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
bench_no_cost.load_data()
|
| 57 |
-
bench_no_cost.oracle.compute_budget = 1e12
|
| 58 |
-
results["no_cost_penalty"] = bench_no_cost.run_occ_allocation(base_agents, max_attempts=5)
|
| 59 |
-
|
| 60 |
-
# 4. No anti-gaming penalty
|
| 61 |
-
bench_no_game = CodeBenchmark(max_problems=50, seed=self.seed)
|
| 62 |
-
bench_no_game.load_data()
|
| 63 |
-
bench_no_game.oracle.gaming_weight = 0.0
|
| 64 |
-
gaming_agents = [
|
| 65 |
-
SimulatedCodeAgent("agent_A", quality=0.30, cost_per_attempt=80, verbose_padding_prob=0.3),
|
| 66 |
-
SimulatedCodeAgent("agent_B", quality=0.22, cost_per_attempt=60, verbose_padding_prob=0.3),
|
| 67 |
-
SimulatedCodeAgent("agent_C", quality=0.40, cost_per_attempt=120, verbose_padding_prob=0.3),
|
| 68 |
-
]
|
| 69 |
-
results["no_anti_gaming"] = bench_no_game.run_occ_allocation(gaming_agents, max_attempts=5)
|
| 70 |
-
|
| 71 |
-
# 5. No broker (oracle score only)
|
| 72 |
-
bench_no_broker = CodeBenchmark(max_problems=50, seed=self.seed)
|
| 73 |
-
bench_no_broker.load_data()
|
| 74 |
-
results["no_broker"] = bench_no_broker.run_baseline_fixed(base_agents, fixed_attempts=5)
|
| 75 |
|
| 76 |
return results
|
| 77 |
|
| 78 |
# ------------------------------------------------------------------
|
| 79 |
-
#
|
| 80 |
# ------------------------------------------------------------------
|
| 81 |
|
| 82 |
def ablation_retrieval_qa(self) -> Dict[str, Dict]:
|
|
@@ -94,28 +74,27 @@ class AblationRunner:
|
|
| 94 |
|
| 95 |
results = {}
|
| 96 |
results["full_occ"] = bench.run_occ(agent)
|
| 97 |
-
results["direct_answer"] = bench.run_direct_answer(
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
agent_no_abstain = SimulatedRetrievalAgent(
|
| 104 |
-
agent_id="
|
| 105 |
-
accuracy=0.65,
|
| 106 |
-
hallucination_rate=0.12,
|
| 107 |
-
calibration_error=0.15,
|
| 108 |
-
abstention_rate=0.0,
|
| 109 |
)
|
| 110 |
results["no_abstention"] = bench.run_occ(agent_no_abstain)
|
| 111 |
|
| 112 |
-
#
|
| 113 |
agent_no_calib = SimulatedRetrievalAgent(
|
| 114 |
-
agent_id="
|
| 115 |
-
accuracy=0.65,
|
| 116 |
-
hallucination_rate=0.12,
|
| 117 |
-
calibration_error=0.0,
|
| 118 |
-
abstention_rate=0.1,
|
| 119 |
)
|
| 120 |
results["no_calibration"] = bench.run_occ(agent_no_calib)
|
| 121 |
|
|
@@ -129,74 +108,67 @@ class AblationRunner:
|
|
| 129 |
"""Run adversarial tests against the credit system."""
|
| 130 |
random.seed(self.seed)
|
| 131 |
np.random.seed(self.seed)
|
| 132 |
-
|
| 133 |
results = {}
|
| 134 |
|
| 135 |
# 1. Spam low-value actions
|
| 136 |
-
bench = CodeBenchmark(
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
SimulatedCodeAgent("spam_2",
|
| 141 |
-
|
| 142 |
-
results["spam"] = bench.run_occ_allocation(spam_agents, max_attempts=10)
|
| 143 |
-
|
| 144 |
-
# 2. Hoarding credits
|
| 145 |
-
ledger = CreditLedger(decay_lambda=0.0) # no decay = hoarding
|
| 146 |
-
# We'll simulate this via a custom run
|
| 147 |
-
bench_hoard = CodeBenchmark(max_problems=50, seed=self.seed)
|
| 148 |
-
bench_hoard.load_data()
|
| 149 |
-
hoard_agents = [
|
| 150 |
-
SimulatedCodeAgent("hoarder", quality=0.5, cost_per_attempt=100),
|
| 151 |
]
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
]
|
| 161 |
-
results["hidden_test_gaming"] = bench_game.run_occ_allocation(gaming_agents, max_attempts=5)
|
| 162 |
|
| 163 |
-
#
|
| 164 |
bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
|
| 165 |
bench_qa.generate_questions()
|
| 166 |
-
|
| 167 |
agent_id="abstainer",
|
| 168 |
-
accuracy=0.65,
|
| 169 |
-
hallucination_rate=0.12,
|
| 170 |
-
calibration_error=0.15,
|
| 171 |
-
abstention_rate=0.9, # over-abstain
|
| 172 |
)
|
| 173 |
-
results["over_abstention"] = bench_qa.run_occ(
|
| 174 |
|
| 175 |
-
#
|
| 176 |
bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
|
| 177 |
bench_debate.generate_topics()
|
| 178 |
-
|
| 179 |
SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
|
| 180 |
SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
|
| 181 |
SimulatedDebateAgent("honest_1", accuracy=0.6),
|
| 182 |
SimulatedDebateAgent("honest_2", accuracy=0.6),
|
| 183 |
]
|
| 184 |
-
#
|
| 185 |
-
|
| 186 |
-
for topic in bench_debate.topics:
|
| 187 |
-
topic_results.append(bench_debate._resolve_equal_turns(colluding_agents, topic))
|
| 188 |
-
results["collusion_equal_turns"] = bench_debate._summarize(topic_results, "collusion_equal_turns")
|
| 189 |
-
|
| 190 |
-
# OCC with colluders
|
| 191 |
topic_results_occ = []
|
| 192 |
for topic in bench_debate.topics:
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
|
| 195 |
|
| 196 |
return results
|
| 197 |
|
| 198 |
# ------------------------------------------------------------------
|
| 199 |
-
#
|
| 200 |
# ------------------------------------------------------------------
|
| 201 |
|
| 202 |
def run_all(self) -> Dict:
|
|
@@ -215,10 +187,12 @@ class AblationRunner:
|
|
| 215 |
"anti_gaming": anti_gaming,
|
| 216 |
}
|
| 217 |
|
| 218 |
-
Path(
|
| 219 |
-
|
|
|
|
|
|
|
| 220 |
json.dump(report, f, indent=2, default=str)
|
| 221 |
-
print("\nSaved
|
| 222 |
return report
|
| 223 |
|
| 224 |
|
|
@@ -232,18 +206,24 @@ def main():
|
|
| 232 |
|
| 233 |
print("\n--- Code Ablations ---")
|
| 234 |
for k, v in report["code_ablations"].items():
|
| 235 |
-
|
|
|
|
|
|
|
| 236 |
|
| 237 |
print("\n--- QA Ablations ---")
|
| 238 |
for k, v in report["qa_ablations"].items():
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
print("\n--- Anti-Gaming ---")
|
| 242 |
for k, v in report["anti_gaming"].items():
|
| 243 |
if "accuracy" in v:
|
| 244 |
-
print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A')
|
| 245 |
-
elif "pass@1" in v:
|
| 246 |
-
|
|
|
|
| 247 |
|
| 248 |
|
| 249 |
if __name__ == "__main__":
|
|
|
|
| 1 |
"""
|
| 2 |
Evaluation runner: executes all benchmarks, ablations, and anti-gaming tests.
|
| 3 |
+
Produces consolidated reports compatible with the current benchmark APIs.
|
| 4 |
"""
|
|
|
|
| 5 |
import json
|
| 6 |
import random
|
| 7 |
from pathlib import Path
|
|
|
|
| 26 |
np.random.seed(seed)
|
| 27 |
|
| 28 |
# ------------------------------------------------------------------
|
| 29 |
+
# Code Benchmark Ablations
|
| 30 |
# ------------------------------------------------------------------
|
| 31 |
|
| 32 |
def ablation_code(self) -> Dict[str, Dict]:
|
| 33 |
"""Run code benchmark with ablated configurations."""
|
| 34 |
+
bench = CodeBenchmark(n_problems=50, seed=self.seed)
|
|
|
|
| 35 |
|
| 36 |
+
cheap = SimulatedCodeAgent("cheap", pass_rate_easy=0.65, pass_rate_hard=0.15,
|
| 37 |
+
cost_per_attempt=60, hidden_test_falloff=0.20)
|
| 38 |
+
medium = SimulatedCodeAgent("medium", pass_rate_easy=0.85, pass_rate_hard=0.35,
|
| 39 |
+
cost_per_attempt=150, hidden_test_falloff=0.15)
|
| 40 |
+
expensive = SimulatedCodeAgent("expensive", pass_rate_easy=0.95, pass_rate_hard=0.65,
|
| 41 |
+
cost_per_attempt=350, hidden_test_falloff=0.10)
|
| 42 |
|
| 43 |
results = {}
|
| 44 |
+
results["full_occ"] = bench.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
|
| 45 |
+
results["fixed_budget"] = bench.run_fixed_budget(expensive, max_attempts=1)
|
| 46 |
+
results["verifier_guided"] = bench.run_verifier_guided(
|
| 47 |
+
SimulatedCodeAgent("verifier", pass_rate_easy=0.95, pass_rate_hard=0.65,
|
| 48 |
+
cost_per_attempt=350, hidden_test_falloff=0.10),
|
| 49 |
+
max_attempts=3)
|
| 50 |
+
|
| 51 |
+
# No cost penalty: shrink the oracle's compute_penalty_rate to effectively zero
|
| 52 |
+
bench_no_cost = CodeBenchmark(n_problems=50, seed=self.seed)
|
| 53 |
+
bench_no_cost.oracle.compute_penalty_rate = 1e-12
|
| 54 |
+
results["no_cost_penalty"] = bench_no_cost.run_occ_allocation([cheap, medium, expensive], max_attempts=3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
return results
|
| 57 |
|
| 58 |
# ------------------------------------------------------------------
|
| 59 |
+
# Retrieval QA Ablations
|
| 60 |
# ------------------------------------------------------------------
|
| 61 |
|
| 62 |
def ablation_retrieval_qa(self) -> Dict[str, Dict]:
|
|
|
|
| 74 |
|
| 75 |
results = {}
|
| 76 |
results["full_occ"] = bench.run_occ(agent)
|
| 77 |
+
results["direct_answer"] = bench.run_direct_answer(
|
| 78 |
+
SimulatedRetrievalAgent("direct", accuracy=0.65, hallucination_rate=0.12,
|
| 79 |
+
calibration_error=0.15, abstention_rate=0.1))
|
| 80 |
+
results["rag_baseline"] = bench.run_rag_baseline(
|
| 81 |
+
SimulatedRetrievalAgent("rag", accuracy=0.65, hallucination_rate=0.12,
|
| 82 |
+
calibration_error=0.15, abstention_rate=0.1))
|
| 83 |
+
results["rag_verifier"] = bench.run_rag_verifier(
|
| 84 |
+
SimulatedRetrievalAgent("verifier", accuracy=0.65, hallucination_rate=0.12,
|
| 85 |
+
calibration_error=0.15, abstention_rate=0.1))
|
| 86 |
+
|
| 87 |
+
# No abstention reward
|
| 88 |
agent_no_abstain = SimulatedRetrievalAgent(
|
| 89 |
+
agent_id="rag_no_abstain",
|
| 90 |
+
accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.0,
|
|
|
|
|
|
|
|
|
|
| 91 |
)
|
| 92 |
results["no_abstention"] = bench.run_occ(agent_no_abstain)
|
| 93 |
|
| 94 |
+
# No calibration penalty
|
| 95 |
agent_no_calib = SimulatedRetrievalAgent(
|
| 96 |
+
agent_id="rag_no_calib",
|
| 97 |
+
accuracy=0.65, hallucination_rate=0.12, calibration_error=0.0, abstention_rate=0.1,
|
|
|
|
|
|
|
|
|
|
| 98 |
)
|
| 99 |
results["no_calibration"] = bench.run_occ(agent_no_calib)
|
| 100 |
|
|
|
|
| 108 |
"""Run adversarial tests against the credit system."""
|
| 109 |
random.seed(self.seed)
|
| 110 |
np.random.seed(self.seed)
|
|
|
|
| 111 |
results = {}
|
| 112 |
|
| 113 |
# 1. Spam low-value actions
|
| 114 |
+
bench = CodeBenchmark(n_problems=50, seed=self.seed)
|
| 115 |
+
spam = [
|
| 116 |
+
SimulatedCodeAgent("spam_1", pass_rate_easy=0.05, pass_rate_hard=0.0,
|
| 117 |
+
cost_per_attempt=50, hidden_test_falloff=0.0),
|
| 118 |
+
SimulatedCodeAgent("spam_2", pass_rate_easy=0.05, pass_rate_hard=0.0,
|
| 119 |
+
cost_per_attempt=50, hidden_test_falloff=0.0),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
]
|
| 121 |
+
results["spam"] = bench.run_occ_allocation(spam, max_attempts=10)
|
| 122 |
+
|
| 123 |
+
# 2. Hidden-test gaming: public pass but hidden fail
|
| 124 |
+
bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
|
| 125 |
+
# Simulate gaming by creating an agent that always passes public but fails hidden
|
| 126 |
+
# We approximate this by setting hidden_test_falloff=1.0 so that hidden tests are expected to always fail
|
| 127 |
+
gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
|
| 128 |
+
cost_per_attempt=100, hidden_test_falloff=1.0)
|
| 129 |
+
results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
|
|
|
|
| 130 |
|
| 131 |
+
# 3. Over-abstention in retrieval QA
|
| 132 |
bench_qa = RetrievalQABenchmark(n_questions=100, seed=self.seed)
|
| 133 |
bench_qa.generate_questions()
|
| 134 |
+
abstainer = SimulatedRetrievalAgent(
|
| 135 |
agent_id="abstainer",
|
| 136 |
+
accuracy=0.65, hallucination_rate=0.12, calibration_error=0.15, abstention_rate=0.9,
|
|
|
|
|
|
|
|
|
|
| 137 |
)
|
| 138 |
+
results["over_abstention"] = bench_qa.run_occ(abstainer)
|
| 139 |
|
| 140 |
+
# 4. Collusion in debate
|
| 141 |
bench_debate = DebateBenchmark(n_topics=50, n_agents=4, seed=self.seed)
|
| 142 |
bench_debate.generate_topics()
|
| 143 |
+
agents = [
|
| 144 |
SimulatedDebateAgent("collude_1", accuracy=0.6, collude_with="collude_2"),
|
| 145 |
SimulatedDebateAgent("collude_2", accuracy=0.6, collude_with="collude_1"),
|
| 146 |
SimulatedDebateAgent("honest_1", accuracy=0.6),
|
| 147 |
SimulatedDebateAgent("honest_2", accuracy=0.6),
|
| 148 |
]
|
| 149 |
+
# Call the benchmark's private per-topic resolvers directly so both strategies run on the same agent list
|
| 150 |
+
topic_results_eq = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
topic_results_occ = []
|
| 152 |
for topic in bench_debate.topics:
|
| 153 |
+
topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
|
| 154 |
+
# Reset per-agent state between strategies (token/turn counters and influence score)
|
| 155 |
+
for a in agents:
|
| 156 |
+
a.tokens_used = 0
|
| 157 |
+
a.turns_taken = 0
|
| 158 |
+
a.influence_score = 0.0
|
| 159 |
+
topic_results_occ.append(bench_debate._resolve_occ_allocation(agents, topic))
|
| 160 |
+
for a in agents:
|
| 161 |
+
a.tokens_used = 0
|
| 162 |
+
a.turns_taken = 0
|
| 163 |
+
a.influence_score = 0.0
|
| 164 |
+
|
| 165 |
+
results["collusion_equal_turns"] = bench_debate._summarize(topic_results_eq, "collusion_equal_turns")
|
| 166 |
results["collusion_occ"] = bench_debate._summarize(topic_results_occ, "collusion_occ")
|
| 167 |
|
| 168 |
return results
|
| 169 |
|
| 170 |
# ------------------------------------------------------------------
|
| 171 |
+
# Full run
|
| 172 |
# ------------------------------------------------------------------
|
| 173 |
|
| 174 |
def run_all(self) -> Dict:
|
|
|
|
| 187 |
"anti_gaming": anti_gaming,
|
| 188 |
}
|
| 189 |
|
| 190 |
+
out_dir = Path(__file__).parent / "reports"
|
| 191 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
| 192 |
+
out_path = out_dir / "ablation_and_anti_gaming.json"
|
| 193 |
+
with open(out_path, "w") as f:
|
| 194 |
json.dump(report, f, indent=2, default=str)
|
| 195 |
+
print(f"\nSaved to {out_path}")
|
| 196 |
return report
|
| 197 |
|
| 198 |
|
|
|
|
| 206 |
|
| 207 |
print("\n--- Code Ablations ---")
|
| 208 |
for k, v in report["code_ablations"].items():
|
| 209 |
+
p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
|
| 210 |
+
comp = v.get('total_compute', 'N/A')
|
| 211 |
+
print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
|
| 212 |
|
| 213 |
print("\n--- QA Ablations ---")
|
| 214 |
for k, v in report["qa_ablations"].items():
|
| 215 |
+
acc = v.get('accuracy', 'N/A')
|
| 216 |
+
ece = v.get('ece', 'N/A')
|
| 217 |
+
comp = v.get('total_compute', 'N/A')
|
| 218 |
+
print(f"{k:20s}: acc={acc if isinstance(acc, str) else f'{acc:.3f}'}, ECE={ece if isinstance(ece, str) else f'{ece:.3f}'}, compute={comp if isinstance(comp, str) else f'{comp:.0f}'}")
|
| 219 |
|
| 220 |
print("\n--- Anti-Gaming ---")
|
| 221 |
for k, v in report["anti_gaming"].items():
|
| 222 |
if "accuracy" in v:
|
| 223 |
+
print(f"{k:20s}: acc={v['accuracy']:.3f}, compute={v.get('total_compute', 'N/A')}")
|
| 224 |
+
elif "pass_at_1" in v or "pass@1" in v:
|
| 225 |
+
p1 = v.get('pass_at_1', v.get('pass@1', 'N/A'))
|
| 226 |
+
print(f"{k:20s}: pass@1={p1 if isinstance(p1, str) else f'{p1:.3f}'}, compute={v.get('total_compute', 'N/A')}")
|
| 227 |
|
| 228 |
|
| 229 |
if __name__ == "__main__":
|