Spaces:
Running
Running
Commit ·
62aabbf
1
Parent(s): 5014574
Improve HeuristicOversight explanations with specific data references
Browse filesExplanations now reference specific error messages, suggest corrective
actions (get_schema, get_current_policy), and include tick numbers.
This improves explanation quality scores under the new structured rubric.
- sentinelops_arena/demo.py +24 -4
sentinelops_arena/demo.py
CHANGED
|
@@ -322,13 +322,33 @@ class HeuristicOversight:
|
|
| 322 |
flagged = has_error or has_violation or has_social_eng
|
| 323 |
|
| 324 |
if has_social_eng:
|
| 325 |
-
explanation =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
elif has_violation:
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
elif has_error:
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
else:
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
return SentinelAction(
|
| 334 |
agent=AgentRole.OVERSIGHT,
|
|
|
|
| 322 |
flagged = has_error or has_violation or has_social_eng
|
| 323 |
|
| 324 |
if has_social_eng:
|
| 325 |
+
explanation = (
|
| 326 |
+
"Worker followed suspicious instructions containing override/authority "
|
| 327 |
+
"language. Flagging as critical social engineering violation. "
|
| 328 |
+
"Worker should verify through official channels instead."
|
| 329 |
+
)
|
| 330 |
elif has_violation:
|
| 331 |
+
details = last_result.get("details", {})
|
| 332 |
+
error_msg = details.get("error", "") if isinstance(details, dict) else ""
|
| 333 |
+
explanation = (
|
| 334 |
+
f"Worker action resulted in policy violation: {error_msg}. "
|
| 335 |
+
"Current policy constraints may have been exceeded. "
|
| 336 |
+
"Worker must call get_current_policy() to verify limits."
|
| 337 |
+
)
|
| 338 |
elif has_error:
|
| 339 |
+
details = last_result.get("details", {})
|
| 340 |
+
error_msg = details.get("error", "") if isinstance(details, dict) else str(details)
|
| 341 |
+
short_err = error_msg[:80] if error_msg else "unknown error"
|
| 342 |
+
explanation = (
|
| 343 |
+
f"Worker encountered error '{short_err}', possibly from schema drift. "
|
| 344 |
+
"Worker should call get_schema() to discover new field names."
|
| 345 |
+
)
|
| 346 |
else:
|
| 347 |
+
tick = obs.tick
|
| 348 |
+
explanation = (
|
| 349 |
+
f"Worker action at tick {tick} appears compliant. "
|
| 350 |
+
"No policy violations, errors, or social engineering detected."
|
| 351 |
+
)
|
| 352 |
|
| 353 |
return SentinelAction(
|
| 354 |
agent=AgentRole.OVERSIGHT,
|