narcolepticchicken committed on
Commit
69dc3e0
·
verified ·
1 Parent(s): 9312235

Upload eval_runner.py

Browse files
Files changed (1) hide show
  1. eval_runner.py +0 -3
eval_runner.py CHANGED
@@ -123,7 +123,6 @@ class AblationRunner:
123
  # 2. Hidden-test gaming: public pass but hidden fail
124
  bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
125
  # Simulate gaming by creating an agent that always passes public but fails hidden
126
- # We approximate this by making hidden_test_falloff huge so hidden always fails
127
  gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
128
  cost_per_attempt=100, hidden_test_falloff=1.0)
129
  results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
@@ -146,12 +145,10 @@ class AblationRunner:
146
  SimulatedDebateAgent("honest_1", accuracy=0.6),
147
  SimulatedDebateAgent("honest_2", accuracy=0.6),
148
  ]
149
- # Use the internal resolution directly
150
  topic_results_eq = []
151
  topic_results_occ = []
152
  for topic in bench_debate.topics:
153
  topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
154
- # Reset agents between strategies (token/turn counters)
155
  for a in agents:
156
  a.tokens_used = 0
157
  a.turns_taken = 0
 
123
  # 2. Hidden-test gaming: public pass but hidden fail
124
  bench_game = CodeBenchmark(n_problems=50, seed=self.seed)
125
  # Simulate gaming by creating an agent that always passes public but fails hidden
 
126
  gamer = SimulatedCodeAgent("gamer", pass_rate_easy=1.0, pass_rate_hard=0.0,
127
  cost_per_attempt=100, hidden_test_falloff=1.0)
128
  results["hidden_test_gaming"] = bench_game.run_occ_allocation([gamer], max_attempts=5)
 
145
  SimulatedDebateAgent("honest_1", accuracy=0.6),
146
  SimulatedDebateAgent("honest_2", accuracy=0.6),
147
  ]
 
148
  topic_results_eq = []
149
  topic_results_occ = []
150
  for topic in bench_debate.topics:
151
  topic_results_eq.append(bench_debate._resolve_equal_turns(agents, topic))
 
152
  for a in agents:
153
  a.tokens_used = 0
154
  a.turns_taken = 0