Upload eval_runner.py
Browse files
- eval_runner.py +0 -3
eval_runner.py
CHANGED
|
@@ -123,7 +123,6 @@ class AblationRunner:
|
|
| 123 |
# 2. Hidden-test gaming: public pass but hidden fail
|
| 124 |
bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
|
| 125 |
# Simulate gaming by creating an agent that always passes public but fails hidden
|
| 126 |
-
# We approximate this by making hidden_test_falloff huge so hidden always fails
|
| 127 |
gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
|
| 128 |
cost_per_attempt=100, hidden_test_falloff=1.0)
|
| 129 |
results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
|
|
@@ -146,12 +145,10 @@ class AblationRunner:
|
|
| 146 |
SimulatedDebateAgent("honest_1", accuracy=0.6),
|
| 147 |
SimulatedDebateAgent("honest_2", accuracy=0.6),
|
| 148 |
]
|
| 149 |
-
# Use the internal resolution directly
|
| 150 |
topic_results_eq = []
|
| 151 |
topic_results_occ = []
|
| 152 |
for topic in bench_debate.topics:
|
| 153 |
topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
|
| 154 |
-
# Reset agents between strategies (token/turn counters)
|
| 155 |
for a in agents:
|
| 156 |
a.tokens_used = 0
|
| 157 |
a.turns_taken = 0
|
|
|
|
| 123 |
# 2. Hidden-test gaming: public pass but hidden fail
|
| 124 |
bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
|
| 125 |
# Simulate gaming by creating an agent that always passes public but fails hidden
|
|
|
|
| 126 |
gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
|
| 127 |
cost_per_attempt=100, hidden_test_falloff=1.0)
|
| 128 |
results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
|
|
|
|
| 145 |
SimulatedDebateAgent("honest_1", accuracy=0.6),
|
| 146 |
SimulatedDebateAgent("honest_2", accuracy=0.6),
|
| 147 |
]
|
|
|
|
| 148 |
topic_results_eq = []
|
| 149 |
topic_results_occ = []
|
| 150 |
for topic in bench_debate.topics:
|
| 151 |
topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
|
|
|
|
| 152 |
for a in agents:
|
| 153 |
a.tokens_used = 0
|
| 154 |
a.turns_taken = 0
|