Hassan Shaikh committed on
Commit
2916eb9
·
1 Parent(s): d586ce5

fix: enforce strict open-interval task scores

Browse files
inference.py CHANGED
@@ -26,6 +26,8 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "12"))
26
  TEMPERATURE = 0.0
27
  MAX_TOKENS = 260
28
  BENCHMARK = "code_security_auditor_env"
 
 
29
 
30
  SYSTEM_PROMPT = (
31
  "You are a senior application security reviewer. Produce strictly valid JSON for the next action. "
@@ -202,7 +204,7 @@ async def run_task(env: CodeSecurityAuditorEnv, client: OpenAI, task_id: str) ->
202
  break
203
 
204
  score = float(obs.reward or 0.0)
205
- score = min(max(score, 0.0), 1.0)
206
  success = score >= 0.6
207
  except Exception as exc:
208
  # Keep evaluator contract: do not crash inference.py on transient/runtime errors.
@@ -210,7 +212,7 @@ async def run_task(env: CodeSecurityAuditorEnv, client: OpenAI, task_id: str) ->
210
  if not rewards:
211
  rewards.append(0.0)
212
  steps_taken = max(1, steps_taken)
213
- score = 0.0
214
  success = False
215
  finally:
216
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
@@ -232,7 +234,7 @@ async def main() -> None:
232
  for task_id in TASK_IDS:
233
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
234
  log_step(step=1, action="{}", reward=0.0, done=True, error=err)
235
- log_end(success=False, steps=1, score=0.0, rewards=[0.0])
236
  return
237
 
238
  try:
 
26
  TEMPERATURE = 0.0
27
  MAX_TOKENS = 260
28
  BENCHMARK = "code_security_auditor_env"
29
+ MIN_STRICT_SCORE = 0.001
30
+ MAX_STRICT_SCORE = 0.999
31
 
32
  SYSTEM_PROMPT = (
33
  "You are a senior application security reviewer. Produce strictly valid JSON for the next action. "
 
204
  break
205
 
206
  score = float(obs.reward or 0.0)
207
+ score = min(max(score, MIN_STRICT_SCORE), MAX_STRICT_SCORE)
208
  success = score >= 0.6
209
  except Exception as exc:
210
  # Keep evaluator contract: do not crash inference.py on transient/runtime errors.
 
212
  if not rewards:
213
  rewards.append(0.0)
214
  steps_taken = max(1, steps_taken)
215
+ score = MIN_STRICT_SCORE
216
  success = False
217
  finally:
218
  log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
234
  for task_id in TASK_IDS:
235
  log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
236
  log_step(step=1, action="{}", reward=0.0, done=True, error=err)
237
+ log_end(success=False, steps=1, score=MIN_STRICT_SCORE, rewards=[MIN_STRICT_SCORE])
238
  return
239
 
240
  try:
server/grader.py CHANGED
@@ -5,6 +5,9 @@ from typing import Iterable, Optional
5
 
6
  from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec
7
 
 
 
 
8
 
9
  @dataclass(frozen=True)
10
  class FindingEvaluation:
@@ -178,4 +181,4 @@ def final_grade(
178
  )
179
  score -= fp_penalty + dup_penalty + volume_penalty
180
 
181
- return max(0.0, min(1.0, score))
 
5
 
6
  from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec
7
 
8
+ MIN_STRICT_SCORE = 0.001
9
+ MAX_STRICT_SCORE = 0.999
10
+
11
 
12
  @dataclass(frozen=True)
13
  class FindingEvaluation:
 
181
  )
182
  score -= fp_penalty + dup_penalty + volume_penalty
183
 
184
+ return max(MIN_STRICT_SCORE, min(MAX_STRICT_SCORE, score))
server/security_environment.py CHANGED
@@ -38,6 +38,8 @@ class CodeSecurityAuditorEnvironment(
38
  """Real-world code security auditing simulator with deterministic graders."""
39
 
40
  SUPPORTS_CONCURRENT_SESSIONS = True
 
 
41
 
42
  def __init__(self, default_task_id: str = "easy"):
43
  self._default_task_id = default_task_id
@@ -320,7 +322,7 @@ class CodeSecurityAuditorEnvironment(
320
  # This quality factor makes spam and random guesses strictly dominated,
321
  # limiting reward hacking while preserving partial-credit gradients.
322
  score *= self._state.quality_multiplier
323
- return max(0.0, min(1.0, score))
324
 
325
  def _build_observation(
326
  self,
 
38
  """Real-world code security auditing simulator with deterministic graders."""
39
 
40
  SUPPORTS_CONCURRENT_SESSIONS = True
41
+ MIN_STRICT_SCORE = 0.001
42
+ MAX_STRICT_SCORE = 0.999
43
 
44
  def __init__(self, default_task_id: str = "easy"):
45
  self._default_task_id = default_task_id
 
322
  # This quality factor makes spam and random guesses strictly dominated,
323
  # limiting reward hacking while preserving partial-credit gradients.
324
  score *= self._state.quality_multiplier
325
+ return max(self.MIN_STRICT_SCORE, min(self.MAX_STRICT_SCORE, score))
326
 
327
  def _build_observation(
328
  self,