Ajayyy00 committed on
Commit ·
f7f5220
1
Parent(s): 533611a
Gracefully handle invalid LLM actions with -0.2 penalty instead of 500 crash
Browse files — server/play_environment.py +18 -2
server/play_environment.py
CHANGED
|
@@ -407,8 +407,24 @@ class CyberSOCEnvironment(Environment):
|
|
| 407 |
action: SOCActionWrapper,
|
| 408 |
) -> SOCObservation:
|
| 409 |
"""Execute one Blue turn."""
|
| 410 |
-
# Convert wrapper to typed action
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
args = typed_action.model_dump(exclude={"metadata", "type"})
|
| 413 |
|
| 414 |
# Pre-flight validation — penalise without consuming a step
|
|
|
|
| 407 |
action: SOCActionWrapper,
|
| 408 |
) -> SOCObservation:
|
| 409 |
"""Execute one Blue turn."""
|
| 410 |
+
# Convert wrapper to typed action — gracefully handle hallucinated
|
| 411 |
+
# action types or wrong parameters from the LLM instead of crashing.
|
| 412 |
+
try:
|
| 413 |
+
typed_action = action.to_typed_action()
|
| 414 |
+
except Exception as exc:
|
| 415 |
+
# Return a negative reward signal so GRPO can learn from the mistake
|
| 416 |
+
penalty = -0.2
|
| 417 |
+
self._state.total_reward += penalty
|
| 418 |
+
self._state.timeline.append({
|
| 419 |
+
"step": self._state.step_count + 1,
|
| 420 |
+
"action_type": getattr(action, "type", "unknown"),
|
| 421 |
+
"target": "N/A",
|
| 422 |
+
"result": f"INVALID_ACTION: {exc}",
|
| 423 |
+
"reward": penalty,
|
| 424 |
+
})
|
| 425 |
+
self._state.step_count += 1
|
| 426 |
+
return self._build_observation(reward=penalty, done=False)
|
| 427 |
+
|
| 428 |
args = typed_action.model_dump(exclude={"metadata", "type"})
|
| 429 |
|
| 430 |
# Pre-flight validation — penalise without consuming a step
|