Spaces:
Running
Running
Add process-aware reward engine reports
Browse files- README.md +13 -4
- app.py +11 -5
- docs/ROLL_OUT.md +2 -2
- docs/diagrams/VISUAL_SYSTEM.md +31 -3
- environment.py +115 -1
- graders.py +95 -8
- openenv.yaml +20 -2
- outputs/baseline_comparison.png +0 -0
- outputs/evaluation_results.json +0 -0
- static/index.html +11 -11
- tests/test_app.py +26 -1
- tests/test_environment.py +21 -0
- tests/test_graders.py +6 -1
README.md
CHANGED
|
@@ -126,6 +126,14 @@ Task 3 terminal score:
|
|
| 126 |
|
| 127 |
The episode `score` exposed in `info` and inference logs is the mean reward over emitted grading events, normalized to `0.0-1.0`. It is intentionally not raw cumulative return; terminal reward and efficiency terms carry the penalty for unfinished or wasteful episodes while keeping scores comparable across tasks with different horizons.
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
## WOW Factor Features
|
| 130 |
|
| 131 |
SENTINEL now includes three judge-facing upgrades:
|
|
@@ -160,6 +168,7 @@ curl "http://localhost:7860/mission?task_type=task3"
|
|
| 160 |
curl http://localhost:7860/metadata
|
| 161 |
curl http://localhost:7860/tasks
|
| 162 |
curl http://localhost:7860/schema
|
|
|
|
| 163 |
curl http://localhost:7860/difficulty
|
| 164 |
```
|
| 165 |
|
|
@@ -352,9 +361,9 @@ Latest local comparison, 20 episodes per task and policy:
|
|
| 352 |
|
| 353 |
| Policy | Overall | Task 1 | Task 2 | Task 3 |
|
| 354 |
| --- | ---: | ---: | ---: | ---: |
|
| 355 |
-
| Random | 0.
|
| 356 |
-
| Heuristic trust-weighted | 0.
|
| 357 |
-
| Oracle-lite upper bound | 0.
|
| 358 |
|
| 359 |
The demo story is the score gap: the reward function distinguishes blind delegation from trust-aware routing, and the oracle-lite upper bound shows room for onsite RL training.
|
| 360 |
|
|
@@ -384,7 +393,7 @@ Title: `SENTINEL: Training AI to Trust Wisely in Multi-Agent Systems`
|
|
| 384 |
|
| 385 |
SENTINEL is an OpenEnv RL environment for one failure mode: multi-agent systems delegate blindly. One orchestrator must complete long tasks by routing work across five specialist agents whose reliability profiles are hidden and reshuffled every episode. The orchestrator only sees behavior, confidence, stakes, and history, so it must learn skepticism, verification, recovery, and calibrated trust.
|
| 386 |
|
| 387 |
-
The specialists are deterministic FSMs on purpose: they give stable reward signals while the orchestrator remains the trainable target.
|
| 388 |
|
| 389 |
## Hackathon Alignment
|
| 390 |
|
|
|
|
| 126 |
|
| 127 |
The episode `score` exposed in `info` and inference logs is the mean reward over emitted grading events, normalized to `0.0-1.0`. It is intentionally not raw cumulative return; terminal reward and efficiency terms carry the penalty for unfinished or wasteful episodes while keeping scores comparable across tasks with different horizons.
|
| 128 |
|
| 129 |
+
Reward Engine v2 adds process-aware signals on top of outcome scoring:
|
| 130 |
+
|
| 131 |
+
- `confidence_alignment`: penalizes confident wrong outputs.
|
| 132 |
+
- `domain_routing`: rewards domain-bound behavior only when it is actually in-domain.
|
| 133 |
+
- `verification_quality`: rewards verification when it catches real high-stakes risk, and discourages blind verification everywhere.
|
| 134 |
+
|
| 135 |
+
The active step formulas are exposed at `/grader`, and each active episode exposes a full component trace at `/reward-report?session_id=<id>`.
|
| 136 |
+
|
| 137 |
## WOW Factor Features
|
| 138 |
|
| 139 |
SENTINEL now includes three judge-facing upgrades:
|
|
|
|
| 168 |
curl http://localhost:7860/metadata
|
| 169 |
curl http://localhost:7860/tasks
|
| 170 |
curl http://localhost:7860/schema
|
| 171 |
+
curl "http://localhost:7860/reward-report?session_id=<session_id>"
|
| 172 |
curl http://localhost:7860/difficulty
|
| 173 |
```
|
| 174 |
|
|
|
|
| 361 |
|
| 362 |
| Policy | Overall | Task 1 | Task 2 | Task 3 |
|
| 363 |
| --- | ---: | ---: | ---: | ---: |
|
| 364 |
+
| Random | 0.6954 | 0.7702 | 0.6505 | 0.6655 |
|
| 365 |
+
| Heuristic trust-weighted | 0.7960 | 0.8690 | 0.7677 | 0.7513 |
|
| 366 |
+
| Oracle-lite upper bound | 0.8553 | 0.9180 | 0.7801 | 0.8678 |
|
| 367 |
|
| 368 |
The demo story is the score gap: the reward function distinguishes blind delegation from trust-aware routing, and the oracle-lite upper bound shows room for onsite RL training.
|
| 369 |
|
|
|
|
| 393 |
|
| 394 |
SENTINEL is an OpenEnv RL environment for one failure mode: multi-agent systems delegate blindly. One orchestrator must complete long tasks by routing work across five specialist agents whose reliability profiles are hidden and reshuffled every episode. The orchestrator only sees behavior, confidence, stakes, and history, so it must learn skepticism, verification, recovery, and calibrated trust.
|
| 395 |
|
| 396 |
+
The specialists are deterministic FSMs on purpose: they give stable reward signals while the orchestrator remains the trainable target. Under Reward Engine v2, random routing scores `0.6954`, trust-weighted routing scores `0.7960`, and oracle-lite reaches `0.8553`, showing the environment has a meaningful learning signal before onsite GRPO training.
|
| 397 |
|
| 398 |
## Hackathon Alignment
|
| 399 |
|
app.py
CHANGED
|
@@ -170,7 +170,7 @@ def root():
|
|
| 170 |
),
|
| 171 |
"routes": [
|
| 172 |
"/health", "/problem", "/mission", "/metadata", "/tasks", "/schema",
|
| 173 |
-
"/grader", "/difficulty", "/stream", "/trust-dashboard",
|
| 174 |
"/reset", "/step", "/state",
|
| 175 |
],
|
| 176 |
}
|
|
@@ -204,7 +204,7 @@ def api_root():
|
|
| 204 |
),
|
| 205 |
"routes": [
|
| 206 |
"/health", "/problem", "/mission", "/metadata", "/tasks", "/schema",
|
| 207 |
-
"/grader", "/difficulty", "/stream", "/trust-dashboard",
|
| 208 |
"/reset", "/step", "/state",
|
| 209 |
],
|
| 210 |
}
|
|
@@ -304,20 +304,26 @@ def schema():
|
|
| 304 |
def grader():
|
| 305 |
return {
|
| 306 |
"task1": {
|
| 307 |
-
"step": "
|
| 308 |
"terminal": "same as last step",
|
| 309 |
},
|
| 310 |
"task2": {
|
| 311 |
-
"step": "
|
| 312 |
"terminal": "completion_rate×0.65 + trust_calibration×0.35",
|
| 313 |
},
|
| 314 |
"task3": {
|
| 315 |
-
"step": "
|
| 316 |
"terminal": "completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10",
|
| 317 |
},
|
| 318 |
}
|
| 319 |
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
@app.get("/difficulty")
|
| 322 |
def difficulty():
|
| 323 |
return {
|
|
|
|
| 170 |
),
|
| 171 |
"routes": [
|
| 172 |
"/health", "/problem", "/mission", "/metadata", "/tasks", "/schema",
|
| 173 |
+
"/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
|
| 174 |
"/reset", "/step", "/state",
|
| 175 |
],
|
| 176 |
}
|
|
|
|
| 204 |
),
|
| 205 |
"routes": [
|
| 206 |
"/health", "/problem", "/mission", "/metadata", "/tasks", "/schema",
|
| 207 |
+
"/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
|
| 208 |
"/reset", "/step", "/state",
|
| 209 |
],
|
| 210 |
}
|
|
|
|
| 304 |
def grader():
|
| 305 |
return {
|
| 306 |
"task1": {
|
| 307 |
+
"step": "accuracy×0.43 + stakes×0.30 + efficiency×0.12 + confidence×0.07 + domain×0.04 + verify×0.04",
|
| 308 |
"terminal": "same as last step",
|
| 309 |
},
|
| 310 |
"task2": {
|
| 311 |
+
"step": "accuracy×0.55 + efficiency×0.25 + confidence×0.10 + domain×0.10",
|
| 312 |
"terminal": "completion_rate×0.65 + trust_calibration×0.35",
|
| 313 |
},
|
| 314 |
"task3": {
|
| 315 |
+
"step": "accuracy×0.32 + stakes×0.33 + efficiency×0.10 + confidence×0.10 + verify×0.10 + domain×0.05",
|
| 316 |
"terminal": "completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10",
|
| 317 |
},
|
| 318 |
}
|
| 319 |
|
| 320 |
|
| 321 |
+
@app.get("/reward-report")
|
| 322 |
+
def reward_report(session_id: str = Query(...)):
|
| 323 |
+
env = _get_env(session_id)
|
| 324 |
+
return env.reward_report()
|
| 325 |
+
|
| 326 |
+
|
| 327 |
@app.get("/difficulty")
|
| 328 |
def difficulty():
|
| 329 |
return {
|
docs/ROLL_OUT.md
CHANGED
|
@@ -14,11 +14,11 @@ SENTINEL wins if the repo, Space, README, UI, and pitch all tell the same story:
|
|
| 14 |
|
| 15 |
| Area | Status | Notes |
|
| 16 |
| --- | --- | --- |
|
| 17 |
-
| Environment core | Strong | `reset()`, `step()`, `state()`,
|
| 18 |
| OpenEnv / deploy | Strong | Space live, Docker passing, validation passing |
|
| 19 |
| UI clarity | Improving | Trust Mission Control is live, but still needs full judge-demo mode |
|
| 20 |
| Presentation assets | Partial | Story exists, but diagrams and finale pack need stronger structure |
|
| 21 |
-
| Training evidence | Partial | Baselines are
|
| 22 |
| Submission completeness | Partial | Mini-blog/video and final finale package still needed |
|
| 23 |
|
| 24 |
## What We Borrow From MiroFish
|
|
|
|
| 14 |
|
| 15 |
| Area | Status | Notes |
|
| 16 |
| --- | --- | --- |
|
| 17 |
+
| Environment core | Strong | `reset()`, `step()`, `state()`, reward v2, task graph, specialists, trust ledger |
|
| 18 |
| OpenEnv / deploy | Strong | Space live, Docker passing, validation passing |
|
| 19 |
| UI clarity | Improving | Trust Mission Control is live, but still needs full judge-demo mode |
|
| 20 |
| Presentation assets | Partial | Story exists, but diagrams and finale pack need stronger structure |
|
| 21 |
+
| Training evidence | Partial | Baselines are refreshed under Reward Engine v2; final onsite GRPO curve still missing |
|
| 22 |
| Submission completeness | Partial | Mini-blog/video and final finale package still needed |
|
| 23 |
|
| 24 |
## What We Borrow From MiroFish
|
docs/diagrams/VISUAL_SYSTEM.md
CHANGED
|
@@ -9,6 +9,7 @@ This file is the diagram source of truth. Every diagram used in README, UI, blog
|
|
| 9 |
| System stack | show the code architecture | ready |
|
| 10 |
| Episode lifecycle | explain `reset()` to terminal reward | ready |
|
| 11 |
| Trust and reward flow | show how state turns into learning signal | ready |
|
|
|
|
| 12 |
| Before / after | show why SENTINEL matters | ready |
|
| 13 |
| Theme fit | map the project to the hackathon | ready |
|
| 14 |
| Training loop | show OpenEnv -> TRL / Unsloth pipeline | ready |
|
|
@@ -77,7 +78,34 @@ flowchart LR
|
|
| 77 |
|
| 78 |
---
|
| 79 |
|
| 80 |
-
## 4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
```mermaid
|
| 83 |
flowchart LR
|
|
@@ -98,7 +126,7 @@ flowchart LR
|
|
| 98 |
|
| 99 |
---
|
| 100 |
|
| 101 |
-
##
|
| 102 |
|
| 103 |
```mermaid
|
| 104 |
flowchart TD
|
|
@@ -115,7 +143,7 @@ flowchart TD
|
|
| 115 |
|
| 116 |
---
|
| 117 |
|
| 118 |
-
##
|
| 119 |
|
| 120 |
```mermaid
|
| 121 |
flowchart LR
|
|
|
|
| 9 |
| System stack | show the code architecture | ready |
|
| 10 |
| Episode lifecycle | explain `reset()` to terminal reward | ready |
|
| 11 |
| Trust and reward flow | show how state turns into learning signal | ready |
|
| 12 |
+
| Reward engine v2 | show process-aware reward components | ready |
|
| 13 |
| Before / after | show why SENTINEL matters | ready |
|
| 14 |
| Theme fit | map the project to the hackathon | ready |
|
| 15 |
| Training loop | show OpenEnv -> TRL / Unsloth pipeline | ready |
|
|
|
|
| 78 |
|
| 79 |
---
|
| 80 |
|
| 81 |
+
## 4. Reward Engine V2
|
| 82 |
+
|
| 83 |
+
```mermaid
|
| 84 |
+
flowchart LR
|
| 85 |
+
A["Specialist result<br/>outcome, confidence, metadata"] --> B["Step reward"]
|
| 86 |
+
C["TaskGraph<br/>completion, detections, poisonings"] --> D["Terminal reward"]
|
| 87 |
+
E["TrustLedger<br/>calibration, fingerprints"] --> D
|
| 88 |
+
|
| 89 |
+
B --> B1["task accuracy"]
|
| 90 |
+
B --> B2["stakes awareness"]
|
| 91 |
+
B --> B3["efficiency"]
|
| 92 |
+
B --> B4["confidence alignment"]
|
| 93 |
+
B --> B5["verification quality"]
|
| 94 |
+
B --> B6["domain routing"]
|
| 95 |
+
|
| 96 |
+
D --> D1["completion rate"]
|
| 97 |
+
D --> D2["detection rate"]
|
| 98 |
+
D --> D3["trust calibration"]
|
| 99 |
+
D --> D4["episode efficiency"]
|
| 100 |
+
|
| 101 |
+
B --> R["reward-report endpoint"]
|
| 102 |
+
D --> R
|
| 103 |
+
R --> T["component trace for judges"]
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
## 5. Before / After
|
| 109 |
|
| 110 |
```mermaid
|
| 111 |
flowchart LR
|
|
|
|
| 126 |
|
| 127 |
---
|
| 128 |
|
| 129 |
+
## 6. Theme Fit
|
| 130 |
|
| 131 |
```mermaid
|
| 132 |
flowchart TD
|
|
|
|
| 143 |
|
| 144 |
---
|
| 145 |
|
| 146 |
+
## 7. Training Loop
|
| 147 |
|
| 148 |
```mermaid
|
| 149 |
flowchart LR
|
environment.py
CHANGED
|
@@ -73,6 +73,7 @@ class SentinelEnv:
|
|
| 73 |
self.done: bool = False
|
| 74 |
self.episode_status: str = "active"
|
| 75 |
self.last_action_summary: str | None = None
|
|
|
|
| 76 |
|
| 77 |
self._graph: TaskGraph | None = None
|
| 78 |
self._ledger: TrustLedger = TrustLedger()
|
|
@@ -119,6 +120,7 @@ class SentinelEnv:
|
|
| 119 |
self.done = False
|
| 120 |
self.episode_status = "active"
|
| 121 |
self.last_action_summary = None
|
|
|
|
| 122 |
|
| 123 |
# Reset subcomponents
|
| 124 |
self._graph = TaskGraph(scenario)
|
|
@@ -172,6 +174,9 @@ class SentinelEnv:
|
|
| 172 |
|
| 173 |
subtask = node.subtask
|
| 174 |
stakes = subtask["stakes"]
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
step_cost = 1
|
| 177 |
|
|
@@ -203,6 +208,8 @@ class SentinelEnv:
|
|
| 203 |
step_cost = int(result.metadata.get("step_cost", 1)) + VERIFY_EXTRA_STEP_COST
|
| 204 |
outcome = result.outcome if not result.is_adversarial else 0.0
|
| 205 |
was_adversarial = result.is_adversarial
|
|
|
|
|
|
|
| 206 |
# Verification means agent caught adversarial — treat as detection
|
| 207 |
if result.is_adversarial:
|
| 208 |
outcome = 1.0 # successfully avoided
|
|
@@ -230,6 +237,8 @@ class SentinelEnv:
|
|
| 230 |
step_cost = int(result.metadata.get("step_cost", 1))
|
| 231 |
was_adversarial = result.is_adversarial
|
| 232 |
outcome = 0.0 if was_adversarial else result.outcome
|
|
|
|
|
|
|
| 233 |
self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
|
| 234 |
self._ledger.update(
|
| 235 |
specialist_id,
|
|
@@ -245,12 +254,26 @@ class SentinelEnv:
|
|
| 245 |
# --- Grade this step ---
|
| 246 |
reward_value, reason, breakdown = self._grade_step(
|
| 247 |
task_type, action_type, specialist_id, outcome,
|
| 248 |
-
stakes, was_adversarial,
|
| 249 |
)
|
| 250 |
|
| 251 |
self.last_reward = reward_value
|
| 252 |
self.total_reward += reward_value
|
| 253 |
self.reward_events += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
# --- Check episode end ---
|
| 256 |
all_done = self._graph.is_done()
|
|
@@ -309,6 +332,9 @@ class SentinelEnv:
|
|
| 309 |
outcome: float,
|
| 310 |
stakes: float,
|
| 311 |
was_adversarial: bool,
|
|
|
|
|
|
|
|
|
|
| 312 |
) -> tuple[float, str, dict]:
|
| 313 |
|
| 314 |
if task_type == "task1":
|
|
@@ -318,6 +344,9 @@ class SentinelEnv:
|
|
| 318 |
stakes=stakes,
|
| 319 |
was_adversarial=was_adversarial,
|
| 320 |
action_type=action_type,
|
|
|
|
|
|
|
|
|
|
| 321 |
)
|
| 322 |
elif task_type == "task2":
|
| 323 |
return grade_task2_step(
|
|
@@ -325,6 +354,8 @@ class SentinelEnv:
|
|
| 325 |
action_type=action_type,
|
| 326 |
step_count=self.step_count,
|
| 327 |
max_steps=self.max_steps,
|
|
|
|
|
|
|
| 328 |
)
|
| 329 |
else: # task3
|
| 330 |
return grade_task3_step(
|
|
@@ -334,6 +365,9 @@ class SentinelEnv:
|
|
| 334 |
action_type=action_type,
|
| 335 |
step_count=self.step_count,
|
| 336 |
max_steps=self.max_steps,
|
|
|
|
|
|
|
|
|
|
| 337 |
)
|
| 338 |
|
| 339 |
def _terminal_reward(
|
|
@@ -376,6 +410,20 @@ class SentinelEnv:
|
|
| 376 |
self.reward_events += 1
|
| 377 |
self.done = True
|
| 378 |
self.episode_status = "failed" if forced_end else "completed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
if self._difficulty_profile.adaptive:
|
| 380 |
self._difficulty_controller.update(
|
| 381 |
{
|
|
@@ -396,6 +444,7 @@ class SentinelEnv:
|
|
| 396 |
"trust_snapshot": self._ledger.snapshot(),
|
| 397 |
"forced_end": forced_end,
|
| 398 |
"difficulty_profile": self._difficulty_profile.to_dict(),
|
|
|
|
| 399 |
},
|
| 400 |
)
|
| 401 |
|
|
@@ -486,6 +535,71 @@ class SentinelEnv:
|
|
| 486 |
"last_reward": round(self.last_reward, 4),
|
| 487 |
}
|
| 488 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
def _apply_difficulty_profile(
|
| 490 |
self,
|
| 491 |
scenario: Scenario,
|
|
|
|
| 73 |
self.done: bool = False
|
| 74 |
self.episode_status: str = "active"
|
| 75 |
self.last_action_summary: str | None = None
|
| 76 |
+
self._reward_trace: list[dict[str, Any]] = []
|
| 77 |
|
| 78 |
self._graph: TaskGraph | None = None
|
| 79 |
self._ledger: TrustLedger = TrustLedger()
|
|
|
|
| 120 |
self.done = False
|
| 121 |
self.episode_status = "active"
|
| 122 |
self.last_action_summary = None
|
| 123 |
+
self._reward_trace = []
|
| 124 |
|
| 125 |
# Reset subcomponents
|
| 126 |
self._graph = TaskGraph(scenario)
|
|
|
|
| 174 |
|
| 175 |
subtask = node.subtask
|
| 176 |
stakes = subtask["stakes"]
|
| 177 |
+
confidence: float | None = None
|
| 178 |
+
result_metadata: dict[str, Any] = {}
|
| 179 |
+
trust_before = self._ledger.trust(specialist_id) if specialist_id else None
|
| 180 |
|
| 181 |
step_cost = 1
|
| 182 |
|
|
|
|
| 208 |
step_cost = int(result.metadata.get("step_cost", 1)) + VERIFY_EXTRA_STEP_COST
|
| 209 |
outcome = result.outcome if not result.is_adversarial else 0.0
|
| 210 |
was_adversarial = result.is_adversarial
|
| 211 |
+
confidence = result.confidence
|
| 212 |
+
result_metadata = dict(result.metadata)
|
| 213 |
# Verification means agent caught adversarial — treat as detection
|
| 214 |
if result.is_adversarial:
|
| 215 |
outcome = 1.0 # successfully avoided
|
|
|
|
| 237 |
step_cost = int(result.metadata.get("step_cost", 1))
|
| 238 |
was_adversarial = result.is_adversarial
|
| 239 |
outcome = 0.0 if was_adversarial else result.outcome
|
| 240 |
+
confidence = result.confidence
|
| 241 |
+
result_metadata = dict(result.metadata)
|
| 242 |
self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
|
| 243 |
self._ledger.update(
|
| 244 |
specialist_id,
|
|
|
|
| 254 |
# --- Grade this step ---
|
| 255 |
reward_value, reason, breakdown = self._grade_step(
|
| 256 |
task_type, action_type, specialist_id, outcome,
|
| 257 |
+
stakes, was_adversarial, confidence, result_metadata, trust_before,
|
| 258 |
)
|
| 259 |
|
| 260 |
self.last_reward = reward_value
|
| 261 |
self.total_reward += reward_value
|
| 262 |
self.reward_events += 1
|
| 263 |
+
self._record_reward_event(
|
| 264 |
+
kind="step",
|
| 265 |
+
action_type=action_type,
|
| 266 |
+
specialist_id=specialist_id,
|
| 267 |
+
subtask=subtask,
|
| 268 |
+
stakes=stakes,
|
| 269 |
+
reward_value=reward_value,
|
| 270 |
+
reason=reason,
|
| 271 |
+
breakdown=breakdown,
|
| 272 |
+
was_adversarial=was_adversarial,
|
| 273 |
+
confidence=confidence,
|
| 274 |
+
result_metadata=result_metadata,
|
| 275 |
+
trust_before=trust_before,
|
| 276 |
+
)
|
| 277 |
|
| 278 |
# --- Check episode end ---
|
| 279 |
all_done = self._graph.is_done()
|
|
|
|
| 332 |
outcome: float,
|
| 333 |
stakes: float,
|
| 334 |
was_adversarial: bool,
|
| 335 |
+
confidence: float | None,
|
| 336 |
+
result_metadata: dict[str, Any],
|
| 337 |
+
trust_score: float | None,
|
| 338 |
) -> tuple[float, str, dict]:
|
| 339 |
|
| 340 |
if task_type == "task1":
|
|
|
|
| 344 |
stakes=stakes,
|
| 345 |
was_adversarial=was_adversarial,
|
| 346 |
action_type=action_type,
|
| 347 |
+
confidence=confidence,
|
| 348 |
+
result_metadata=result_metadata,
|
| 349 |
+
trust_score=trust_score,
|
| 350 |
)
|
| 351 |
elif task_type == "task2":
|
| 352 |
return grade_task2_step(
|
|
|
|
| 354 |
action_type=action_type,
|
| 355 |
step_count=self.step_count,
|
| 356 |
max_steps=self.max_steps,
|
| 357 |
+
confidence=confidence,
|
| 358 |
+
result_metadata=result_metadata,
|
| 359 |
)
|
| 360 |
else: # task3
|
| 361 |
return grade_task3_step(
|
|
|
|
| 365 |
action_type=action_type,
|
| 366 |
step_count=self.step_count,
|
| 367 |
max_steps=self.max_steps,
|
| 368 |
+
confidence=confidence,
|
| 369 |
+
result_metadata=result_metadata,
|
| 370 |
+
trust_score=trust_score,
|
| 371 |
)
|
| 372 |
|
| 373 |
def _terminal_reward(
|
|
|
|
| 410 |
self.reward_events += 1
|
| 411 |
self.done = True
|
| 412 |
self.episode_status = "failed" if forced_end else "completed"
|
| 413 |
+
self._record_reward_event(
|
| 414 |
+
kind="terminal",
|
| 415 |
+
action_type="terminal",
|
| 416 |
+
specialist_id=None,
|
| 417 |
+
subtask=None,
|
| 418 |
+
stakes=0.0,
|
| 419 |
+
reward_value=terminal_value,
|
| 420 |
+
reason=terminal_reason,
|
| 421 |
+
breakdown=terminal_breakdown,
|
| 422 |
+
was_adversarial=False,
|
| 423 |
+
confidence=None,
|
| 424 |
+
result_metadata={},
|
| 425 |
+
trust_before=None,
|
| 426 |
+
)
|
| 427 |
if self._difficulty_profile.adaptive:
|
| 428 |
self._difficulty_controller.update(
|
| 429 |
{
|
|
|
|
| 444 |
"trust_snapshot": self._ledger.snapshot(),
|
| 445 |
"forced_end": forced_end,
|
| 446 |
"difficulty_profile": self._difficulty_profile.to_dict(),
|
| 447 |
+
"reward_report": self.reward_report(),
|
| 448 |
},
|
| 449 |
)
|
| 450 |
|
|
|
|
| 535 |
"last_reward": round(self.last_reward, 4),
|
| 536 |
}
|
| 537 |
|
| 538 |
+
def reward_report(self) -> dict:
|
| 539 |
+
return {
|
| 540 |
+
"episode_id": self.episode_id,
|
| 541 |
+
"session_id": self.session_id,
|
| 542 |
+
"task_type": self.current_scenario["task_type"] if self.current_scenario else "",
|
| 543 |
+
"score": round(self.normalized_score(), 4),
|
| 544 |
+
"total_reward": round(self.total_reward, 4),
|
| 545 |
+
"reward_events": self.reward_events,
|
| 546 |
+
"component_averages": self._reward_component_averages(),
|
| 547 |
+
"events": list(self._reward_trace),
|
| 548 |
+
"formula": {
|
| 549 |
+
"task1_step": "0.43 accuracy + 0.30 stakes + 0.12 efficiency + 0.07 confidence + 0.04 domain + 0.04 verify",
|
| 550 |
+
"task2_step": "0.55 accuracy + 0.25 efficiency + 0.10 confidence + 0.10 domain",
|
| 551 |
+
"task3_step": "0.32 accuracy + 0.33 stakes + 0.10 efficiency + 0.10 confidence + 0.10 verify + 0.05 domain",
|
| 552 |
+
"task3_terminal": "0.35 completion + 0.30 detection + 0.25 calibration + 0.10 efficiency",
|
| 553 |
+
},
|
| 554 |
+
}
|
| 555 |
+
|
| 556 |
+
def _record_reward_event(
|
| 557 |
+
self,
|
| 558 |
+
kind: str,
|
| 559 |
+
action_type: str,
|
| 560 |
+
specialist_id: str | None,
|
| 561 |
+
subtask: dict[str, Any] | None,
|
| 562 |
+
stakes: float,
|
| 563 |
+
reward_value: float,
|
| 564 |
+
reason: str,
|
| 565 |
+
breakdown: dict,
|
| 566 |
+
was_adversarial: bool,
|
| 567 |
+
confidence: float | None,
|
| 568 |
+
result_metadata: dict[str, Any],
|
| 569 |
+
trust_before: float | None,
|
| 570 |
+
) -> None:
|
| 571 |
+
event = {
|
| 572 |
+
"kind": kind,
|
| 573 |
+
"step_count": self.step_count,
|
| 574 |
+
"action_type": action_type,
|
| 575 |
+
"specialist_id": specialist_id,
|
| 576 |
+
"subtask_id": subtask.get("id") if subtask else None,
|
| 577 |
+
"domain": subtask.get("domain") if subtask else None,
|
| 578 |
+
"stakes": round(stakes, 3),
|
| 579 |
+
"reward": round(reward_value, 4),
|
| 580 |
+
"reason": reason,
|
| 581 |
+
"signal_breakdown": breakdown,
|
| 582 |
+
"was_adversarial": was_adversarial,
|
| 583 |
+
"confidence": round(confidence, 3) if confidence is not None else None,
|
| 584 |
+
"trust_before": round(trust_before, 3) if trust_before is not None else None,
|
| 585 |
+
"trust_after": self._ledger.snapshot().get(specialist_id) if specialist_id else None,
|
| 586 |
+
"result_metadata": result_metadata,
|
| 587 |
+
}
|
| 588 |
+
self._reward_trace.append(event)
|
| 589 |
+
|
| 590 |
+
def _reward_component_averages(self) -> dict[str, float]:
|
| 591 |
+
totals: dict[str, float] = {}
|
| 592 |
+
counts: dict[str, int] = {}
|
| 593 |
+
for event in self._reward_trace:
|
| 594 |
+
for key, value in event.get("signal_breakdown", {}).items():
|
| 595 |
+
if isinstance(value, (int, float)):
|
| 596 |
+
totals[key] = totals.get(key, 0.0) + float(value)
|
| 597 |
+
counts[key] = counts.get(key, 0) + 1
|
| 598 |
+
return {
|
| 599 |
+
key: round(total / max(1, counts[key]), 4)
|
| 600 |
+
for key, total in sorted(totals.items())
|
| 601 |
+
}
|
| 602 |
+
|
| 603 |
def _apply_difficulty_profile(
|
| 604 |
self,
|
| 605 |
scenario: Scenario,
|
graders.py
CHANGED
|
@@ -14,6 +14,58 @@ def clamp(value: float) -> float:
|
|
| 14 |
return max(0.01, min(0.99, value))
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# ---------------------------------------------------------------------------
|
| 18 |
# Task 1 — Single-step trust decision (Easy)
|
| 19 |
#
|
|
@@ -27,6 +79,9 @@ def grade_task1_step(
|
|
| 27 |
stakes: float,
|
| 28 |
was_adversarial: bool,
|
| 29 |
action_type: str, # delegate | verify | solve_independently | skip
|
|
|
|
|
|
|
|
|
|
| 30 |
) -> tuple[float, str, dict]:
|
| 31 |
|
| 32 |
if action_type == "skip":
|
|
@@ -49,16 +104,26 @@ def grade_task1_step(
|
|
| 49 |
# Efficiency: verify and solve_independently cost extra steps
|
| 50 |
efficiency = {"delegate": 1.0, "verify": 0.7, "solve_independently": 0.5, "skip": 0.0}.get(action_type, 0.5)
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
score = (
|
| 53 |
-
0.
|
| 54 |
-
+ 0.
|
| 55 |
-
+ 0.
|
|
|
|
|
|
|
|
|
|
| 56 |
)
|
| 57 |
|
| 58 |
breakdown = {
|
| 59 |
"task_accuracy": round(task_accuracy, 3),
|
| 60 |
"stakes_awareness": round(stakes_awareness, 3),
|
| 61 |
"efficiency": round(efficiency, 3),
|
|
|
|
|
|
|
|
|
|
| 62 |
}
|
| 63 |
return clamp(score), reason, breakdown
|
| 64 |
|
|
@@ -75,6 +140,8 @@ def grade_task2_step(
|
|
| 75 |
action_type: str,
|
| 76 |
step_count: int,
|
| 77 |
max_steps: int,
|
|
|
|
|
|
|
| 78 |
) -> tuple[float, str, dict]:
|
| 79 |
|
| 80 |
if action_type == "skip":
|
|
@@ -88,9 +155,14 @@ def grade_task2_step(
|
|
| 88 |
action_cost = {"delegate": 0.0, "verify": 0.10, "solve_independently": 0.20}.get(action_type, 0.0)
|
| 89 |
efficiency = max(0.0, efficiency - action_cost)
|
| 90 |
|
|
|
|
|
|
|
|
|
|
| 91 |
score = (
|
| 92 |
-
0.
|
| 93 |
-
+ 0.
|
|
|
|
|
|
|
| 94 |
)
|
| 95 |
|
| 96 |
reason = (
|
|
@@ -100,6 +172,8 @@ def grade_task2_step(
|
|
| 100 |
breakdown = {
|
| 101 |
"task_accuracy": round(task_accuracy, 3),
|
| 102 |
"efficiency": round(efficiency, 3),
|
|
|
|
|
|
|
| 103 |
}
|
| 104 |
return clamp(score), reason, breakdown
|
| 105 |
|
|
@@ -143,6 +217,9 @@ def grade_task3_step(
|
|
| 143 |
action_type: str,
|
| 144 |
step_count: int,
|
| 145 |
max_steps: int,
|
|
|
|
|
|
|
|
|
|
| 146 |
) -> tuple[float, str, dict]:
|
| 147 |
|
| 148 |
if action_type == "skip":
|
|
@@ -169,16 +246,26 @@ def grade_task3_step(
|
|
| 169 |
step_budget_used = step_count / max_steps
|
| 170 |
efficiency = max(0.0, 1.0 - step_budget_used)
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
score = (
|
| 173 |
-
0.
|
| 174 |
-
+ 0.
|
| 175 |
-
+ 0.
|
|
|
|
|
|
|
|
|
|
| 176 |
)
|
| 177 |
|
| 178 |
breakdown = {
|
| 179 |
"task_accuracy": round(task_accuracy, 3),
|
| 180 |
"stakes_awareness": round(stakes_awareness, 3),
|
| 181 |
"efficiency": round(efficiency, 3),
|
|
|
|
|
|
|
|
|
|
| 182 |
}
|
| 183 |
return clamp(score), reason, breakdown
|
| 184 |
|
|
|
|
| 14 |
return max(0.01, min(0.99, value))
|
| 15 |
|
| 16 |
|
| 17 |
+
def confidence_alignment(confidence: float | None, outcome: float) -> float:
|
| 18 |
+
"""
|
| 19 |
+
Reward honest confidence. High confidence on wrong output is suspicious.
|
| 20 |
+
|
| 21 |
+
Returns 0.0-1.0. Unknown confidence is neutral so older callers remain stable.
|
| 22 |
+
"""
|
| 23 |
+
if confidence is None:
|
| 24 |
+
return 0.5
|
| 25 |
+
overconfidence_gap = max(0.0, confidence - outcome)
|
| 26 |
+
return max(0.0, min(1.0, 1.0 - overconfidence_gap))
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def domain_routing_score(result_metadata: dict | None) -> float:
|
| 30 |
+
"""
|
| 31 |
+
Reward using a domain-bound specialist only when it is actually in-domain.
|
| 32 |
+
|
| 33 |
+
If the specialist does not expose domain metadata, return neutral.
|
| 34 |
+
"""
|
| 35 |
+
metadata = result_metadata or {}
|
| 36 |
+
if "in_domain" not in metadata:
|
| 37 |
+
return 0.5
|
| 38 |
+
return 1.0 if metadata.get("in_domain") else 0.1
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def verification_quality(
|
| 42 |
+
action_type: str,
|
| 43 |
+
was_adversarial: bool,
|
| 44 |
+
stakes: float,
|
| 45 |
+
trust_score: float | None = None,
|
| 46 |
+
) -> float:
|
| 47 |
+
"""
|
| 48 |
+
Reward verification when it catches real risk, not as a blind habit.
|
| 49 |
+
"""
|
| 50 |
+
trust = 0.5 if trust_score is None else trust_score
|
| 51 |
+
high_stakes = stakes >= ADVERSARIAL_AWARENESS_STAKES
|
| 52 |
+
|
| 53 |
+
if action_type == "verify":
|
| 54 |
+
if was_adversarial and high_stakes:
|
| 55 |
+
return 1.0
|
| 56 |
+
if high_stakes:
|
| 57 |
+
return 0.75
|
| 58 |
+
if trust < 0.45:
|
| 59 |
+
return 0.65
|
| 60 |
+
return 0.25
|
| 61 |
+
|
| 62 |
+
if action_type == "delegate" and was_adversarial and high_stakes:
|
| 63 |
+
return 0.0
|
| 64 |
+
if action_type == "solve_independently" and high_stakes:
|
| 65 |
+
return 0.7
|
| 66 |
+
return 0.55
|
| 67 |
+
|
| 68 |
+
|
| 69 |
# ---------------------------------------------------------------------------
|
| 70 |
# Task 1 — Single-step trust decision (Easy)
|
| 71 |
#
|
|
|
|
| 79 |
stakes: float,
|
| 80 |
was_adversarial: bool,
|
| 81 |
action_type: str, # delegate | verify | solve_independently | skip
|
| 82 |
+
confidence: float | None = None,
|
| 83 |
+
result_metadata: dict | None = None,
|
| 84 |
+
trust_score: float | None = None,
|
| 85 |
) -> tuple[float, str, dict]:
|
| 86 |
|
| 87 |
if action_type == "skip":
|
|
|
|
| 104 |
# Efficiency: verify and solve_independently cost extra steps
|
| 105 |
efficiency = {"delegate": 1.0, "verify": 0.7, "solve_independently": 0.5, "skip": 0.0}.get(action_type, 0.5)
|
| 106 |
|
| 107 |
+
confidence_score = confidence_alignment(confidence, task_accuracy)
|
| 108 |
+
domain_score = domain_routing_score(result_metadata)
|
| 109 |
+
verify_score = verification_quality(action_type, was_adversarial, stakes, trust_score)
|
| 110 |
+
|
| 111 |
score = (
|
| 112 |
+
0.43 * task_accuracy
|
| 113 |
+
+ 0.30 * stakes_awareness
|
| 114 |
+
+ 0.12 * efficiency
|
| 115 |
+
+ 0.07 * confidence_score
|
| 116 |
+
+ 0.04 * domain_score
|
| 117 |
+
+ 0.04 * verify_score
|
| 118 |
)
|
| 119 |
|
| 120 |
breakdown = {
|
| 121 |
"task_accuracy": round(task_accuracy, 3),
|
| 122 |
"stakes_awareness": round(stakes_awareness, 3),
|
| 123 |
"efficiency": round(efficiency, 3),
|
| 124 |
+
"confidence_alignment": round(confidence_score, 3),
|
| 125 |
+
"domain_routing": round(domain_score, 3),
|
| 126 |
+
"verification_quality": round(verify_score, 3),
|
| 127 |
}
|
| 128 |
return clamp(score), reason, breakdown
|
| 129 |
|
|
|
|
| 140 |
action_type: str,
|
| 141 |
step_count: int,
|
| 142 |
max_steps: int,
|
| 143 |
+
confidence: float | None = None,
|
| 144 |
+
result_metadata: dict | None = None,
|
| 145 |
) -> tuple[float, str, dict]:
|
| 146 |
|
| 147 |
if action_type == "skip":
|
|
|
|
| 155 |
action_cost = {"delegate": 0.0, "verify": 0.10, "solve_independently": 0.20}.get(action_type, 0.0)
|
| 156 |
efficiency = max(0.0, efficiency - action_cost)
|
| 157 |
|
| 158 |
+
confidence_score = confidence_alignment(confidence, task_accuracy)
|
| 159 |
+
domain_score = domain_routing_score(result_metadata)
|
| 160 |
+
|
| 161 |
score = (
|
| 162 |
+
0.55 * task_accuracy
|
| 163 |
+
+ 0.25 * efficiency
|
| 164 |
+
+ 0.10 * confidence_score
|
| 165 |
+
+ 0.10 * domain_score
|
| 166 |
)
|
| 167 |
|
| 168 |
reason = (
|
|
|
|
| 172 |
breakdown = {
|
| 173 |
"task_accuracy": round(task_accuracy, 3),
|
| 174 |
"efficiency": round(efficiency, 3),
|
| 175 |
+
"confidence_alignment": round(confidence_score, 3),
|
| 176 |
+
"domain_routing": round(domain_score, 3),
|
| 177 |
}
|
| 178 |
return clamp(score), reason, breakdown
|
| 179 |
|
|
|
|
| 217 |
action_type: str,
|
| 218 |
step_count: int,
|
| 219 |
max_steps: int,
|
| 220 |
+
confidence: float | None = None,
|
| 221 |
+
result_metadata: dict | None = None,
|
| 222 |
+
trust_score: float | None = None,
|
| 223 |
) -> tuple[float, str, dict]:
|
| 224 |
|
| 225 |
if action_type == "skip":
|
|
|
|
| 246 |
step_budget_used = step_count / max_steps
|
| 247 |
efficiency = max(0.0, 1.0 - step_budget_used)
|
| 248 |
|
| 249 |
+
confidence_score = confidence_alignment(confidence, task_accuracy)
|
| 250 |
+
domain_score = domain_routing_score(result_metadata)
|
| 251 |
+
verify_score = verification_quality(action_type, was_adversarial, stakes, trust_score)
|
| 252 |
+
|
| 253 |
score = (
|
| 254 |
+
0.32 * task_accuracy
|
| 255 |
+
+ 0.33 * stakes_awareness
|
| 256 |
+
+ 0.10 * efficiency
|
| 257 |
+
+ 0.10 * confidence_score
|
| 258 |
+
+ 0.10 * verify_score
|
| 259 |
+
+ 0.05 * domain_score
|
| 260 |
)
|
| 261 |
|
| 262 |
breakdown = {
|
| 263 |
"task_accuracy": round(task_accuracy, 3),
|
| 264 |
"stakes_awareness": round(stakes_awareness, 3),
|
| 265 |
"efficiency": round(efficiency, 3),
|
| 266 |
+
"confidence_alignment": round(confidence_score, 3),
|
| 267 |
+
"verification_quality": round(verify_score, 3),
|
| 268 |
+
"domain_routing": round(domain_score, 3),
|
| 269 |
}
|
| 270 |
return clamp(score), reason, breakdown
|
| 271 |
|
openenv.yaml
CHANGED
|
@@ -97,6 +97,15 @@ api:
|
|
| 97 |
required: true
|
| 98 |
returns: SentinelState with trust_snapshot, completion, adversarial stats
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
difficulty:
|
| 101 |
method: GET
|
| 102 |
path: /difficulty
|
|
@@ -144,7 +153,7 @@ tasks:
|
|
| 144 |
subtasks: 15
|
| 145 |
max_steps: 30
|
| 146 |
adversary_active: false
|
| 147 |
-
reward: "per-step accuracy + efficiency | terminal completion×0.65 + calibration×0.35"
|
| 148 |
|
| 149 |
task3:
|
| 150 |
name: Full Adversarial Episode
|
|
@@ -152,7 +161,16 @@ tasks:
|
|
| 152 |
subtasks: 20
|
| 153 |
max_steps: 45
|
| 154 |
adversary_active: true
|
| 155 |
-
reward: "terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
specialists:
|
| 158 |
S0: "AccurateSlow — 90% accurate, costs 2 steps"
|
|
|
|
| 97 |
required: true
|
| 98 |
returns: SentinelState with trust_snapshot, completion, adversarial stats
|
| 99 |
|
| 100 |
+
reward_report:
|
| 101 |
+
method: GET
|
| 102 |
+
path: /reward-report
|
| 103 |
+
params:
|
| 104 |
+
session_id:
|
| 105 |
+
type: string
|
| 106 |
+
required: true
|
| 107 |
+
returns: Reward component trace with per-step process-aware signals
|
| 108 |
+
|
| 109 |
difficulty:
|
| 110 |
method: GET
|
| 111 |
path: /difficulty
|
|
|
|
| 153 |
subtasks: 15
|
| 154 |
max_steps: 30
|
| 155 |
adversary_active: false
|
| 156 |
+
reward: "per-step accuracy + efficiency + confidence alignment + domain routing | terminal completion×0.65 + calibration×0.35"
|
| 157 |
|
| 158 |
task3:
|
| 159 |
name: Full Adversarial Episode
|
|
|
|
| 161 |
subtasks: 20
|
| 162 |
max_steps: 45
|
| 163 |
adversary_active: true
|
| 164 |
+
reward: "step accuracy + stakes awareness + efficiency + confidence alignment + verification quality + domain routing | terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
|
| 165 |
+
|
| 166 |
+
reward_engine_v2:
|
| 167 |
+
source: verifier/execution-style behavioral outcomes
|
| 168 |
+
granularity: step plus terminal trajectory
|
| 169 |
+
aggregation: fixed weighted multi-signal reward
|
| 170 |
+
process_signals:
|
| 171 |
+
confidence_alignment: penalizes high confidence on wrong outputs
|
| 172 |
+
domain_routing: rewards in-domain specialist behavior when metadata exists
|
| 173 |
+
verification_quality: rewards verification when it catches high-stakes adversarial risk
|
| 174 |
|
| 175 |
specialists:
|
| 176 |
S0: "AccurateSlow — 90% accurate, costs 2 steps"
|
outputs/baseline_comparison.png
CHANGED
|
|
outputs/evaluation_results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/index.html
CHANGED
|
@@ -1867,11 +1867,11 @@
|
|
| 1867 |
<div class="hero-stats">
|
| 1868 |
<div class="hero-stat">
|
| 1869 |
<div class="label">Random overall</div>
|
| 1870 |
-
<div id="heroRandomScore" class="value">0.
|
| 1871 |
</div>
|
| 1872 |
<div class="hero-stat">
|
| 1873 |
<div class="label">Heuristic overall</div>
|
| 1874 |
-
<div id="heroHeuristicScore" class="value">0.
|
| 1875 |
</div>
|
| 1876 |
<div class="hero-stat">
|
| 1877 |
<div class="label">Task 3 detect</div>
|
|
@@ -2114,7 +2114,7 @@
|
|
| 2114 |
<div class="story-lane before">
|
| 2115 |
<div class="story-title">
|
| 2116 |
<strong>Without SENTINEL</strong>
|
| 2117 |
-
<span id="storyBeforeScore" class="story-score">task3 random 0.
|
| 2118 |
</div>
|
| 2119 |
<div class="story-flow">
|
| 2120 |
<div class="story-step">All public slots start near the same trust. The orchestrator delegates with weak evidence.</div>
|
|
@@ -2154,12 +2154,12 @@
|
|
| 2154 |
<div class="judge-stats">
|
| 2155 |
<div class="judge-card bad">
|
| 2156 |
<div class="label">Random baseline</div>
|
| 2157 |
-
<div id="judgeRandomScore" class="value">0.
|
| 2158 |
<span class="muted">Blind delegation baseline. Good enough to move, weak at skepticism.</span>
|
| 2159 |
</div>
|
| 2160 |
<div class="judge-card warn">
|
| 2161 |
<div class="label">Heuristic policy</div>
|
| 2162 |
-
<div id="judgeHeuristicScore" class="value">0.
|
| 2163 |
<span class="muted">Trust-weighted routing plus verification at risky gates.</span>
|
| 2164 |
</div>
|
| 2165 |
<div class="judge-card good">
|
|
@@ -2228,18 +2228,18 @@
|
|
| 2228 |
<div class="baseline-table">
|
| 2229 |
<div class="baseline-row">
|
| 2230 |
<span>Random</span>
|
| 2231 |
-
<div class="mini-bar"><span id="proofRandomBar" style="width:
|
| 2232 |
-
<strong id="proofRandomScore">0.
|
| 2233 |
</div>
|
| 2234 |
<div class="baseline-row">
|
| 2235 |
<span>Heuristic</span>
|
| 2236 |
-
<div class="mini-bar"><span id="proofHeuristicBar" style="width:
|
| 2237 |
-
<strong id="proofHeuristicScore">0.
|
| 2238 |
</div>
|
| 2239 |
<div class="baseline-row">
|
| 2240 |
<span>Oracle-lite</span>
|
| 2241 |
-
<div class="mini-bar"><span id="proofOracleBar" style="width:
|
| 2242 |
-
<strong id="proofOracleScore">0.
|
| 2243 |
</div>
|
| 2244 |
<div class="baseline-row">
|
| 2245 |
<span>T3 detect</span>
|
|
|
|
| 1867 |
<div class="hero-stats">
|
| 1868 |
<div class="hero-stat">
|
| 1869 |
<div class="label">Random overall</div>
|
| 1870 |
+
<div id="heroRandomScore" class="value">0.695</div>
|
| 1871 |
</div>
|
| 1872 |
<div class="hero-stat">
|
| 1873 |
<div class="label">Heuristic overall</div>
|
| 1874 |
+
<div id="heroHeuristicScore" class="value">0.796</div>
|
| 1875 |
</div>
|
| 1876 |
<div class="hero-stat">
|
| 1877 |
<div class="label">Task 3 detect</div>
|
|
|
|
| 2114 |
<div class="story-lane before">
|
| 2115 |
<div class="story-title">
|
| 2116 |
<strong>Without SENTINEL</strong>
|
| 2117 |
+
<span id="storyBeforeScore" class="story-score">task3 random 0.666</span>
|
| 2118 |
</div>
|
| 2119 |
<div class="story-flow">
|
| 2120 |
<div class="story-step">All public slots start near the same trust. The orchestrator delegates with weak evidence.</div>
|
|
|
|
| 2154 |
<div class="judge-stats">
|
| 2155 |
<div class="judge-card bad">
|
| 2156 |
<div class="label">Random baseline</div>
|
| 2157 |
+
<div id="judgeRandomScore" class="value">0.695</div>
|
| 2158 |
<span class="muted">Blind delegation baseline. Good enough to move, weak at skepticism.</span>
|
| 2159 |
</div>
|
| 2160 |
<div class="judge-card warn">
|
| 2161 |
<div class="label">Heuristic policy</div>
|
| 2162 |
+
<div id="judgeHeuristicScore" class="value">0.796</div>
|
| 2163 |
<span class="muted">Trust-weighted routing plus verification at risky gates.</span>
|
| 2164 |
</div>
|
| 2165 |
<div class="judge-card good">
|
|
|
|
| 2228 |
<div class="baseline-table">
|
| 2229 |
<div class="baseline-row">
|
| 2230 |
<span>Random</span>
|
| 2231 |
+
<div class="mini-bar"><span id="proofRandomBar" style="width:69.5%;background:#ff5f45"></span></div>
|
| 2232 |
+
<strong id="proofRandomScore">0.695</strong>
|
| 2233 |
</div>
|
| 2234 |
<div class="baseline-row">
|
| 2235 |
<span>Heuristic</span>
|
| 2236 |
+
<div class="mini-bar"><span id="proofHeuristicBar" style="width:79.6%;background:#73a7ff"></span></div>
|
| 2237 |
+
<strong id="proofHeuristicScore">0.796</strong>
|
| 2238 |
</div>
|
| 2239 |
<div class="baseline-row">
|
| 2240 |
<span>Oracle-lite</span>
|
| 2241 |
+
<div class="mini-bar"><span id="proofOracleBar" style="width:85.5%;background:#27e0a1"></span></div>
|
| 2242 |
+
<strong id="proofOracleScore">0.855</strong>
|
| 2243 |
</div>
|
| 2244 |
<div class="baseline-row">
|
| 2245 |
<span>T3 detect</span>
|
tests/test_app.py
CHANGED
|
@@ -4,7 +4,9 @@ import time
|
|
| 4 |
import unittest
|
| 5 |
|
| 6 |
from app import SessionStore
|
|
|
|
| 7 |
from environment import SentinelEnv
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class SessionStoreTests(unittest.TestCase):
|
|
@@ -29,7 +31,30 @@ class SessionStoreTests(unittest.TestCase):
|
|
| 29 |
self.assertIsNone(store.get("first"))
|
| 30 |
self.assertIs(store.get("second"), second)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
if __name__ == "__main__":
|
| 34 |
unittest.main()
|
| 35 |
-
|
|
|
|
| 4 |
import unittest
|
| 5 |
|
| 6 |
from app import SessionStore
|
| 7 |
+
from app import app
|
| 8 |
from environment import SentinelEnv
|
| 9 |
+
from fastapi.testclient import TestClient
|
| 10 |
|
| 11 |
|
| 12 |
class SessionStoreTests(unittest.TestCase):
|
|
|
|
| 31 |
self.assertIsNone(store.get("first"))
|
| 32 |
self.assertIs(store.get("second"), second)
|
| 33 |
|
| 34 |
+
def test_reward_report_endpoint_returns_active_trace(self) -> None:
|
| 35 |
+
client = TestClient(app)
|
| 36 |
+
reset = client.post("/reset", json={"task_type": "task3", "seed": 42})
|
| 37 |
+
self.assertEqual(reset.status_code, 200)
|
| 38 |
+
payload = reset.json()
|
| 39 |
+
sid = payload["info"]["session_id"]
|
| 40 |
+
obs = payload["observation"]
|
| 41 |
+
|
| 42 |
+
step = client.post(
|
| 43 |
+
f"/step?session_id={sid}",
|
| 44 |
+
json={
|
| 45 |
+
"session_id": sid,
|
| 46 |
+
"task_type": obs["task_type"],
|
| 47 |
+
"action_type": "delegate",
|
| 48 |
+
"specialist_id": "S0",
|
| 49 |
+
},
|
| 50 |
+
)
|
| 51 |
+
self.assertEqual(step.status_code, 200)
|
| 52 |
+
|
| 53 |
+
report = client.get(f"/reward-report?session_id={sid}")
|
| 54 |
+
|
| 55 |
+
self.assertEqual(report.status_code, 200)
|
| 56 |
+
self.assertEqual(report.json()["reward_events"], 1)
|
| 57 |
+
|
| 58 |
|
| 59 |
if __name__ == "__main__":
|
| 60 |
unittest.main()
|
|
|
tests/test_environment.py
CHANGED
|
@@ -67,6 +67,27 @@ class EnvironmentTests(unittest.TestCase):
|
|
| 67 |
self.assertGreater(result["info"]["step_count"], 2)
|
| 68 |
self.assertGreaterEqual(result["info"]["score"], 0.0)
|
| 69 |
self.assertLessEqual(result["info"]["score"], 1.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
if __name__ == "__main__":
|
|
|
|
| 67 |
self.assertGreater(result["info"]["step_count"], 2)
|
| 68 |
self.assertGreaterEqual(result["info"]["score"], 0.0)
|
| 69 |
self.assertLessEqual(result["info"]["score"], 1.0)
|
| 70 |
+
self.assertIn("reward_report", result["info"])
|
| 71 |
+
self.assertGreater(result["info"]["reward_report"]["reward_events"], 0)
|
| 72 |
+
|
| 73 |
+
def test_reward_report_tracks_process_components(self) -> None:
|
| 74 |
+
env = SentinelEnv()
|
| 75 |
+
result = env.reset(task_type="task3", seed=42)
|
| 76 |
+
obs = result["observation"]
|
| 77 |
+
|
| 78 |
+
result = env.step({
|
| 79 |
+
"session_id": obs["session_id"],
|
| 80 |
+
"task_type": "task3",
|
| 81 |
+
"action_type": "delegate",
|
| 82 |
+
"specialist_id": "S0",
|
| 83 |
+
})
|
| 84 |
+
|
| 85 |
+
report = env.reward_report()
|
| 86 |
+
|
| 87 |
+
self.assertEqual(report["reward_events"], 1)
|
| 88 |
+
self.assertIn("confidence_alignment", report["component_averages"])
|
| 89 |
+
self.assertIn("domain_routing", report["component_averages"])
|
| 90 |
+
self.assertEqual(report["events"][0]["action_type"], "delegate")
|
| 91 |
|
| 92 |
|
| 93 |
if __name__ == "__main__":
|
tests/test_graders.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import unittest
|
| 4 |
|
| 5 |
-
from graders import clamp, grade_task3_step
|
| 6 |
from scenarios import get_scenario
|
| 7 |
from task_graph import TaskGraph
|
| 8 |
|
|
@@ -26,6 +26,11 @@ class GraderAndGraphTests(unittest.TestCase):
|
|
| 26 |
self.assertGreater(reward, 0.8)
|
| 27 |
self.assertIn("Adversarial detected", reason)
|
| 28 |
self.assertEqual(breakdown["stakes_awareness"], 0.99)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def test_failed_nodes_are_retriable_then_resolved(self) -> None:
|
| 31 |
graph = TaskGraph(get_scenario("SCN-TASK1-001"))
|
|
|
|
| 2 |
|
| 3 |
import unittest
|
| 4 |
|
| 5 |
+
from graders import clamp, confidence_alignment, grade_task3_step
|
| 6 |
from scenarios import get_scenario
|
| 7 |
from task_graph import TaskGraph
|
| 8 |
|
|
|
|
| 26 |
self.assertGreater(reward, 0.8)
|
| 27 |
self.assertIn("Adversarial detected", reason)
|
| 28 |
self.assertEqual(breakdown["stakes_awareness"], 0.99)
|
| 29 |
+
self.assertIn("verification_quality", breakdown)
|
| 30 |
+
|
| 31 |
+
def test_overconfident_wrong_answer_is_penalized(self) -> None:
|
| 32 |
+
self.assertLess(confidence_alignment(0.95, 0.0), 0.1)
|
| 33 |
+
self.assertGreater(confidence_alignment(0.85, 1.0), 0.8)
|
| 34 |
|
| 35 |
def test_failed_nodes_are_retriable_then_resolved(self) -> None:
|
| 36 |
graph = TaskGraph(get_scenario("SCN-TASK1-001"))
|