Spaces:
Sleeping
Sleeping
Hassan Shaikh committed on
Commit ·
2916eb9
1
Parent(s): d586ce5
fix: enforce strict open-interval task scores
Browse files
- inference.py +5 -3
- server/grader.py +4 -1
- server/security_environment.py +3 -1
inference.py
CHANGED
|
@@ -26,6 +26,8 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "12"))
|
|
| 26 |
TEMPERATURE = 0.0
|
| 27 |
MAX_TOKENS = 260
|
| 28 |
BENCHMARK = "code_security_auditor_env"
|
|
|
|
|
|
|
| 29 |
|
| 30 |
SYSTEM_PROMPT = (
|
| 31 |
"You are a senior application security reviewer. Produce strictly valid JSON for the next action. "
|
|
@@ -202,7 +204,7 @@ async def run_task(env: CodeSecurityAuditorEnv, client: OpenAI, task_id: str) ->
|
|
| 202 |
break
|
| 203 |
|
| 204 |
score = float(obs.reward or 0.0)
|
| 205 |
-
score = min(max(score,
|
| 206 |
success = score >= 0.6
|
| 207 |
except Exception as exc:
|
| 208 |
# Keep evaluator contract: do not crash inference.py on transient/runtime errors.
|
|
@@ -210,7 +212,7 @@ async def run_task(env: CodeSecurityAuditorEnv, client: OpenAI, task_id: str) ->
|
|
| 210 |
if not rewards:
|
| 211 |
rewards.append(0.0)
|
| 212 |
steps_taken = max(1, steps_taken)
|
| 213 |
-
score =
|
| 214 |
success = False
|
| 215 |
finally:
|
| 216 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
|
@@ -232,7 +234,7 @@ async def main() -> None:
|
|
| 232 |
for task_id in TASK_IDS:
|
| 233 |
log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 234 |
log_step(step=1, action="{}", reward=0.0, done=True, error=err)
|
| 235 |
-
log_end(success=False, steps=1, score=
|
| 236 |
return
|
| 237 |
|
| 238 |
try:
|
|
|
|
| 26 |
TEMPERATURE = 0.0
|
| 27 |
MAX_TOKENS = 260
|
| 28 |
BENCHMARK = "code_security_auditor_env"
|
| 29 |
+
MIN_STRICT_SCORE = 0.001
|
| 30 |
+
MAX_STRICT_SCORE = 0.999
|
| 31 |
|
| 32 |
SYSTEM_PROMPT = (
|
| 33 |
"You are a senior application security reviewer. Produce strictly valid JSON for the next action. "
|
|
|
|
| 204 |
break
|
| 205 |
|
| 206 |
score = float(obs.reward or 0.0)
|
| 207 |
+
score = min(max(score, MIN_STRICT_SCORE), MAX_STRICT_SCORE)
|
| 208 |
success = score >= 0.6
|
| 209 |
except Exception as exc:
|
| 210 |
# Keep evaluator contract: do not crash inference.py on transient/runtime errors.
|
|
|
|
| 212 |
if not rewards:
|
| 213 |
rewards.append(0.0)
|
| 214 |
steps_taken = max(1, steps_taken)
|
| 215 |
+
score = MIN_STRICT_SCORE
|
| 216 |
success = False
|
| 217 |
finally:
|
| 218 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
|
|
|
| 234 |
for task_id in TASK_IDS:
|
| 235 |
log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 236 |
log_step(step=1, action="{}", reward=0.0, done=True, error=err)
|
| 237 |
+
log_end(success=False, steps=1, score=MIN_STRICT_SCORE, rewards=[MIN_STRICT_SCORE])
|
| 238 |
return
|
| 239 |
|
| 240 |
try:
|
server/grader.py
CHANGED
|
@@ -5,6 +5,9 @@ from typing import Iterable, Optional
|
|
| 5 |
|
| 6 |
from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
@dataclass(frozen=True)
|
| 10 |
class FindingEvaluation:
|
|
@@ -178,4 +181,4 @@ def final_grade(
|
|
| 178 |
)
|
| 179 |
score -= fp_penalty + dup_penalty + volume_penalty
|
| 180 |
|
| 181 |
-
return max(
|
|
|
|
| 5 |
|
| 6 |
from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec
|
| 7 |
|
| 8 |
+
MIN_STRICT_SCORE = 0.001
|
| 9 |
+
MAX_STRICT_SCORE = 0.999
|
| 10 |
+
|
| 11 |
|
| 12 |
@dataclass(frozen=True)
|
| 13 |
class FindingEvaluation:
|
|
|
|
| 181 |
)
|
| 182 |
score -= fp_penalty + dup_penalty + volume_penalty
|
| 183 |
|
| 184 |
+
return max(MIN_STRICT_SCORE, min(MAX_STRICT_SCORE, score))
|
server/security_environment.py
CHANGED
|
@@ -38,6 +38,8 @@ class CodeSecurityAuditorEnvironment(
|
|
| 38 |
"""Real-world code security auditing simulator with deterministic graders."""
|
| 39 |
|
| 40 |
SUPPORTS_CONCURRENT_SESSIONS = True
|
|
|
|
|
|
|
| 41 |
|
| 42 |
def __init__(self, default_task_id: str = "easy"):
|
| 43 |
self._default_task_id = default_task_id
|
|
@@ -320,7 +322,7 @@ class CodeSecurityAuditorEnvironment(
|
|
| 320 |
# This quality factor makes spam and random guesses strictly dominated,
|
| 321 |
# limiting reward hacking while preserving partial-credit gradients.
|
| 322 |
score *= self._state.quality_multiplier
|
| 323 |
-
return max(
|
| 324 |
|
| 325 |
def _build_observation(
|
| 326 |
self,
|
|
|
|
| 38 |
"""Real-world code security auditing simulator with deterministic graders."""
|
| 39 |
|
| 40 |
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 41 |
+
MIN_STRICT_SCORE = 0.001
|
| 42 |
+
MAX_STRICT_SCORE = 0.999
|
| 43 |
|
| 44 |
def __init__(self, default_task_id: str = "easy"):
|
| 45 |
self._default_task_id = default_task_id
|
|
|
|
| 322 |
# This quality factor makes spam and random guesses strictly dominated,
|
| 323 |
# limiting reward hacking while preserving partial-credit gradients.
|
| 324 |
score *= self._state.quality_multiplier
|
| 325 |
+
return max(self.MIN_STRICT_SCORE, min(self.MAX_STRICT_SCORE, score))
|
| 326 |
|
| 327 |
def _build_observation(
|
| 328 |
self,
|