XcodeAddy committed
Commit b3b9bbd · 1 Parent(s): 74b74f1

Add process-aware reward engine reports

README.md CHANGED
@@ -126,6 +126,14 @@ Task 3 terminal score:
 
 The episode `score` exposed in `info` and inference logs is the mean reward over emitted grading events, normalized to `0.0-1.0`. It is intentionally not raw cumulative return; terminal reward and efficiency terms carry the penalty for unfinished or wasteful episodes while keeping scores comparable across tasks with different horizons.
 
+Reward Engine v2 adds process-aware signals on top of outcome scoring:
+
+- `confidence_alignment`: penalizes confident wrong outputs.
+- `domain_routing`: rewards domain-bound behavior only when it is actually in-domain.
+- `verification_quality`: rewards verification when it catches real high-stakes risk, and discourages blind verification everywhere.
+
+The active step formulas are exposed at `/grader`, and each active episode exposes a full component trace at `/reward-report?session_id=<id>`.
+
 ## WOW Factor Features
 
 SENTINEL now includes three judge-facing upgrades:
@@ -160,6 +168,7 @@ curl "http://localhost:7860/mission?task_type=task3"
 curl http://localhost:7860/metadata
 curl http://localhost:7860/tasks
 curl http://localhost:7860/schema
+curl "http://localhost:7860/reward-report?session_id=<session_id>"
 curl http://localhost:7860/difficulty
 ```
 
@@ -352,9 +361,9 @@ Latest local comparison, 20 episodes per task and policy:
 
 | Policy | Overall | Task 1 | Task 2 | Task 3 |
 | --- | ---: | ---: | ---: | ---: |
-| Random | 0.7144 | 0.7948 | 0.6493 | 0.6990 |
-| Heuristic trust-weighted | 0.8162 | 0.8911 | 0.7736 | 0.7838 |
-| Oracle-lite upper bound | 0.8718 | 0.9445 | 0.7760 | 0.8950 |
+| Random | 0.6954 | 0.7702 | 0.6505 | 0.6655 |
+| Heuristic trust-weighted | 0.7960 | 0.8690 | 0.7677 | 0.7513 |
+| Oracle-lite upper bound | 0.8553 | 0.9180 | 0.7801 | 0.8678 |
 
 The demo story is the score gap: the reward function distinguishes blind delegation from trust-aware routing, and the oracle-lite upper bound shows room for onsite RL training.
 
@@ -384,7 +393,7 @@ Title: `SENTINEL: Training AI to Trust Wisely in Multi-Agent Systems`
 
 SENTINEL is an OpenEnv RL environment for one failure mode: multi-agent systems delegate blindly. One orchestrator must complete long tasks by routing work across five specialist agents whose reliability profiles are hidden and reshuffled every episode. The orchestrator only sees behavior, confidence, stakes, and history, so it must learn skepticism, verification, recovery, and calibrated trust.
 
-The specialists are deterministic FSMs on purpose: they give stable reward signals while the orchestrator remains the trainable target. Random routing scores `0.7144`, trust-weighted routing scores `0.8162`, and oracle-lite reaches `0.8718`, showing the environment has a meaningful learning signal before onsite GRPO training.
+The specialists are deterministic FSMs on purpose: they give stable reward signals while the orchestrator remains the trainable target. Under Reward Engine v2, random routing scores `0.6954`, trust-weighted routing scores `0.7960`, and oracle-lite reaches `0.8553`, showing the environment has a meaningful learning signal before onsite GRPO training.
 
 ## Hackathon Alignment
 
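The README hunk above documents the new trace endpoint. As a quick orientation, here is a minimal client sketch that exercises it end to end; it assumes a running server on `http://localhost:7860` and the third-party `requests` package (not a dependency shown in this commit), and the payload shapes follow the `/reset` and `/step` calls in `tests/test_app.py` below.

```python
# Minimal sketch: run one step, then read the per-component reward trace.
import requests

BASE = "http://localhost:7860"  # assumed local server address

# /reset returns the session id inside `info` (see tests/test_app.py).
reset = requests.post(f"{BASE}/reset", json={"task_type": "task3", "seed": 42}).json()
sid = reset["info"]["session_id"]

# One delegate action so at least one grading event exists.
requests.post(
    f"{BASE}/step",
    params={"session_id": sid},
    json={"session_id": sid, "task_type": "task3",
          "action_type": "delegate", "specialist_id": "S0"},
)

# The new endpoint returns the episode's process-aware component trace.
report = requests.get(f"{BASE}/reward-report", params={"session_id": sid}).json()
print(report["score"], report["component_averages"])
```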
app.py CHANGED
@@ -170,7 +170,7 @@ def root():
         ),
         "routes": [
             "/health", "/problem", "/mission", "/metadata", "/tasks", "/schema",
-            "/grader", "/difficulty", "/stream", "/trust-dashboard",
+            "/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
             "/reset", "/step", "/state",
         ],
     }
@@ -204,7 +204,7 @@ def api_root():
         ),
         "routes": [
             "/health", "/problem", "/mission", "/metadata", "/tasks", "/schema",
-            "/grader", "/difficulty", "/stream", "/trust-dashboard",
+            "/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
            "/reset", "/step", "/state",
         ],
     }
@@ -304,20 +304,26 @@ def schema():
 def grader():
     return {
         "task1": {
-            "step": "task_accuracy×0.50 + stakes_awareness×0.35 + efficiency×0.15",
+            "step": "accuracy×0.43 + stakes×0.30 + efficiency×0.12 + confidence×0.07 + domain×0.04 + verify×0.04",
             "terminal": "same as last step",
         },
         "task2": {
-            "step": "task_accuracy×0.65 + efficiency×0.35",
+            "step": "accuracy×0.55 + efficiency×0.25 + confidence×0.10 + domain×0.10",
             "terminal": "completion_rate×0.65 + trust_calibration×0.35",
         },
         "task3": {
-            "step": "task_accuracy×0.40 + stakes_awareness×0.45 + efficiency×0.15",
+            "step": "accuracy×0.32 + stakes×0.33 + efficiency×0.10 + confidence×0.10 + verify×0.10 + domain×0.05",
             "terminal": "completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10",
         },
     }
 
 
+@app.get("/reward-report")
+def reward_report(session_id: str = Query(...)):
+    env = _get_env(session_id)
+    return env.reward_report()
+
+
 @app.get("/difficulty")
 def difficulty():
     return {
docs/ROLL_OUT.md CHANGED
@@ -14,11 +14,11 @@ SENTINEL wins if the repo, Space, README, UI, and pitch all tell the same story:
 
 | Area | Status | Notes |
 | --- | --- | --- |
-| Environment core | Strong | `reset()`, `step()`, `state()`, rewards, task graph, specialists, trust ledger |
+| Environment core | Strong | `reset()`, `step()`, `state()`, reward v2, task graph, specialists, trust ledger |
 | OpenEnv / deploy | Strong | Space live, Docker passing, validation passing |
 | UI clarity | Improving | Trust Mission Control is live, but still needs full judge-demo mode |
 | Presentation assets | Partial | Story exists, but diagrams and finale pack need stronger structure |
-| Training evidence | Partial | Baselines are real; final onsite GRPO curve still missing |
+| Training evidence | Partial | Baselines are refreshed under Reward Engine v2; final onsite GRPO curve still missing |
 | Submission completeness | Partial | Mini-blog/video and final finale package still needed |
 
 ## What We Borrow From MiroFish
docs/diagrams/VISUAL_SYSTEM.md CHANGED
@@ -9,6 +9,7 @@ This file is the diagram source of truth. Every diagram used in README, UI, blog
 | System stack | show the code architecture | ready |
 | Episode lifecycle | explain `reset()` to terminal reward | ready |
 | Trust and reward flow | show how state turns into learning signal | ready |
+| Reward engine v2 | show process-aware reward components | ready |
 | Before / after | show why SENTINEL matters | ready |
 | Theme fit | map the project to the hackathon | ready |
 | Training loop | show OpenEnv -> TRL / Unsloth pipeline | ready |
@@ -77,7 +78,34 @@ flowchart LR
 
 ---
 
-## 4. Before / After
+## 4. Reward Engine V2
+
+```mermaid
+flowchart LR
+    A["Specialist result<br/>outcome, confidence, metadata"] --> B["Step reward"]
+    C["TaskGraph<br/>completion, detections, poisonings"] --> D["Terminal reward"]
+    E["TrustLedger<br/>calibration, fingerprints"] --> D
+
+    B --> B1["task accuracy"]
+    B --> B2["stakes awareness"]
+    B --> B3["efficiency"]
+    B --> B4["confidence alignment"]
+    B --> B5["verification quality"]
+    B --> B6["domain routing"]
+
+    D --> D1["completion rate"]
+    D --> D2["detection rate"]
+    D --> D3["trust calibration"]
+    D --> D4["episode efficiency"]
+
+    B --> R["reward-report endpoint"]
+    D --> R
+    R --> T["component trace for judges"]
+```
+
+---
+
+## 5. Before / After
 
 ```mermaid
 flowchart LR
@@ -98,7 +126,7 @@ flowchart LR
 
 ---
 
-## 5. Theme Fit
+## 6. Theme Fit
 
 ```mermaid
 flowchart TD
@@ -115,7 +143,7 @@ flowchart TD
 
 ---
 
-## 6. Training Loop
+## 7. Training Loop
 
 ```mermaid
 flowchart LR
environment.py CHANGED
@@ -73,6 +73,7 @@ class SentinelEnv:
         self.done: bool = False
         self.episode_status: str = "active"
         self.last_action_summary: str | None = None
+        self._reward_trace: list[dict[str, Any]] = []
 
         self._graph: TaskGraph | None = None
         self._ledger: TrustLedger = TrustLedger()
@@ -119,6 +120,7 @@
         self.done = False
         self.episode_status = "active"
         self.last_action_summary = None
+        self._reward_trace = []
 
         # Reset subcomponents
         self._graph = TaskGraph(scenario)
@@ -172,6 +174,9 @@
 
         subtask = node.subtask
         stakes = subtask["stakes"]
+        confidence: float | None = None
+        result_metadata: dict[str, Any] = {}
+        trust_before = self._ledger.trust(specialist_id) if specialist_id else None
 
         step_cost = 1
 
@@ -203,6 +208,8 @@
             step_cost = int(result.metadata.get("step_cost", 1)) + VERIFY_EXTRA_STEP_COST
             outcome = result.outcome if not result.is_adversarial else 0.0
             was_adversarial = result.is_adversarial
+            confidence = result.confidence
+            result_metadata = dict(result.metadata)
             # Verification means agent caught adversarial — treat as detection
             if result.is_adversarial:
                 outcome = 1.0  # successfully avoided
@@ -230,6 +237,8 @@
             step_cost = int(result.metadata.get("step_cost", 1))
             was_adversarial = result.is_adversarial
             outcome = 0.0 if was_adversarial else result.outcome
+            confidence = result.confidence
+            result_metadata = dict(result.metadata)
         self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
         self._ledger.update(
             specialist_id,
@@ -245,12 +254,26 @@
         # --- Grade this step ---
         reward_value, reason, breakdown = self._grade_step(
             task_type, action_type, specialist_id, outcome,
-            stakes, was_adversarial,
+            stakes, was_adversarial, confidence, result_metadata, trust_before,
         )
 
         self.last_reward = reward_value
         self.total_reward += reward_value
         self.reward_events += 1
+        self._record_reward_event(
+            kind="step",
+            action_type=action_type,
+            specialist_id=specialist_id,
+            subtask=subtask,
+            stakes=stakes,
+            reward_value=reward_value,
+            reason=reason,
+            breakdown=breakdown,
+            was_adversarial=was_adversarial,
+            confidence=confidence,
+            result_metadata=result_metadata,
+            trust_before=trust_before,
+        )
 
         # --- Check episode end ---
         all_done = self._graph.is_done()
@@ -309,6 +332,9 @@
         outcome: float,
         stakes: float,
         was_adversarial: bool,
+        confidence: float | None,
+        result_metadata: dict[str, Any],
+        trust_score: float | None,
     ) -> tuple[float, str, dict]:
 
         if task_type == "task1":
@@ -318,6 +344,9 @@
                 stakes=stakes,
                 was_adversarial=was_adversarial,
                 action_type=action_type,
+                confidence=confidence,
+                result_metadata=result_metadata,
+                trust_score=trust_score,
             )
         elif task_type == "task2":
             return grade_task2_step(
@@ -325,6 +354,8 @@
                 action_type=action_type,
                 step_count=self.step_count,
                 max_steps=self.max_steps,
+                confidence=confidence,
+                result_metadata=result_metadata,
             )
         else:  # task3
             return grade_task3_step(
@@ -334,6 +365,9 @@
                 action_type=action_type,
                 step_count=self.step_count,
                 max_steps=self.max_steps,
+                confidence=confidence,
+                result_metadata=result_metadata,
+                trust_score=trust_score,
             )
 
     def _terminal_reward(
@@ -376,6 +410,20 @@
         self.reward_events += 1
         self.done = True
         self.episode_status = "failed" if forced_end else "completed"
+        self._record_reward_event(
+            kind="terminal",
+            action_type="terminal",
+            specialist_id=None,
+            subtask=None,
+            stakes=0.0,
+            reward_value=terminal_value,
+            reason=terminal_reason,
+            breakdown=terminal_breakdown,
+            was_adversarial=False,
+            confidence=None,
+            result_metadata={},
+            trust_before=None,
+        )
         if self._difficulty_profile.adaptive:
             self._difficulty_controller.update(
                 {
@@ -396,6 +444,7 @@
                 "trust_snapshot": self._ledger.snapshot(),
                 "forced_end": forced_end,
                 "difficulty_profile": self._difficulty_profile.to_dict(),
+                "reward_report": self.reward_report(),
             },
         )
 
@@ -486,6 +535,71 @@
             "last_reward": round(self.last_reward, 4),
         }
 
+    def reward_report(self) -> dict:
+        return {
+            "episode_id": self.episode_id,
+            "session_id": self.session_id,
+            "task_type": self.current_scenario["task_type"] if self.current_scenario else "",
+            "score": round(self.normalized_score(), 4),
+            "total_reward": round(self.total_reward, 4),
+            "reward_events": self.reward_events,
+            "component_averages": self._reward_component_averages(),
+            "events": list(self._reward_trace),
+            "formula": {
+                "task1_step": "0.43 accuracy + 0.30 stakes + 0.12 efficiency + 0.07 confidence + 0.04 domain + 0.04 verify",
+                "task2_step": "0.55 accuracy + 0.25 efficiency + 0.10 confidence + 0.10 domain",
+                "task3_step": "0.32 accuracy + 0.33 stakes + 0.10 efficiency + 0.10 confidence + 0.10 verify + 0.05 domain",
+                "task3_terminal": "0.35 completion + 0.30 detection + 0.25 calibration + 0.10 efficiency",
+            },
+        }
+
+    def _record_reward_event(
+        self,
+        kind: str,
+        action_type: str,
+        specialist_id: str | None,
+        subtask: dict[str, Any] | None,
+        stakes: float,
+        reward_value: float,
+        reason: str,
+        breakdown: dict,
+        was_adversarial: bool,
+        confidence: float | None,
+        result_metadata: dict[str, Any],
+        trust_before: float | None,
+    ) -> None:
+        event = {
+            "kind": kind,
+            "step_count": self.step_count,
+            "action_type": action_type,
+            "specialist_id": specialist_id,
+            "subtask_id": subtask.get("id") if subtask else None,
+            "domain": subtask.get("domain") if subtask else None,
+            "stakes": round(stakes, 3),
+            "reward": round(reward_value, 4),
+            "reason": reason,
+            "signal_breakdown": breakdown,
+            "was_adversarial": was_adversarial,
+            "confidence": round(confidence, 3) if confidence is not None else None,
+            "trust_before": round(trust_before, 3) if trust_before is not None else None,
+            "trust_after": self._ledger.snapshot().get(specialist_id) if specialist_id else None,
+            "result_metadata": result_metadata,
+        }
+        self._reward_trace.append(event)
+
+    def _reward_component_averages(self) -> dict[str, float]:
+        totals: dict[str, float] = {}
+        counts: dict[str, int] = {}
+        for event in self._reward_trace:
+            for key, value in event.get("signal_breakdown", {}).items():
+                if isinstance(value, (int, float)):
+                    totals[key] = totals.get(key, 0.0) + float(value)
+                    counts[key] = counts.get(key, 0) + 1
+        return {
+            key: round(total / max(1, counts[key]), 4)
+            for key, total in sorted(totals.items())
+        }
+
     def _apply_difficulty_profile(
         self,
         scenario: Scenario,
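To make the aggregation concrete: `_reward_component_averages` above is a plain mean per breakdown key across the trace. The same logic run standalone on a hand-made two-event trace (values invented for the example):

```python
# Stand-alone replica of the averaging in _reward_component_averages.
trace = [
    {"signal_breakdown": {"task_accuracy": 0.9, "confidence_alignment": 1.0}},
    {"signal_breakdown": {"task_accuracy": 0.5, "confidence_alignment": 0.4}},
]

totals: dict[str, float] = {}
counts: dict[str, int] = {}
for event in trace:
    for key, value in event.get("signal_breakdown", {}).items():
        if isinstance(value, (int, float)):  # skip non-numeric entries
            totals[key] = totals.get(key, 0.0) + float(value)
            counts[key] = counts.get(key, 0) + 1

print({k: round(t / max(1, counts[k]), 4) for k, t in sorted(totals.items())})
# -> {'confidence_alignment': 0.7, 'task_accuracy': 0.7}
```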
graders.py CHANGED
@@ -14,6 +14,58 @@ def clamp(value: float) -> float:
     return max(0.01, min(0.99, value))
 
 
+def confidence_alignment(confidence: float | None, outcome: float) -> float:
+    """
+    Reward honest confidence. High confidence on wrong output is suspicious.
+
+    Returns 0.0-1.0. Unknown confidence is neutral so older callers remain stable.
+    """
+    if confidence is None:
+        return 0.5
+    overconfidence_gap = max(0.0, confidence - outcome)
+    return max(0.0, min(1.0, 1.0 - overconfidence_gap))
+
+
+def domain_routing_score(result_metadata: dict | None) -> float:
+    """
+    Reward using a domain-bound specialist only when it is actually in-domain.
+
+    If the specialist does not expose domain metadata, return neutral.
+    """
+    metadata = result_metadata or {}
+    if "in_domain" not in metadata:
+        return 0.5
+    return 1.0 if metadata.get("in_domain") else 0.1
+
+
+def verification_quality(
+    action_type: str,
+    was_adversarial: bool,
+    stakes: float,
+    trust_score: float | None = None,
+) -> float:
+    """
+    Reward verification when it catches real risk, not as a blind habit.
+    """
+    trust = 0.5 if trust_score is None else trust_score
+    high_stakes = stakes >= ADVERSARIAL_AWARENESS_STAKES
+
+    if action_type == "verify":
+        if was_adversarial and high_stakes:
+            return 1.0
+        if high_stakes:
+            return 0.75
+        if trust < 0.45:
+            return 0.65
+        return 0.25
+
+    if action_type == "delegate" and was_adversarial and high_stakes:
+        return 0.0
+    if action_type == "solve_independently" and high_stakes:
+        return 0.7
+    return 0.55
+
+
 # ---------------------------------------------------------------------------
 # Task 1 — Single-step trust decision (Easy)
 #
@@ -27,6 +79,9 @@ def grade_task1_step(
     stakes: float,
     was_adversarial: bool,
     action_type: str,  # delegate | verify | solve_independently | skip
+    confidence: float | None = None,
+    result_metadata: dict | None = None,
+    trust_score: float | None = None,
 ) -> tuple[float, str, dict]:
 
     if action_type == "skip":
@@ -49,16 +104,26 @@
     # Efficiency: verify and solve_independently cost extra steps
     efficiency = {"delegate": 1.0, "verify": 0.7, "solve_independently": 0.5, "skip": 0.0}.get(action_type, 0.5)
 
+    confidence_score = confidence_alignment(confidence, task_accuracy)
+    domain_score = domain_routing_score(result_metadata)
+    verify_score = verification_quality(action_type, was_adversarial, stakes, trust_score)
+
     score = (
-        0.50 * task_accuracy
-        + 0.35 * stakes_awareness
-        + 0.15 * efficiency
+        0.43 * task_accuracy
+        + 0.30 * stakes_awareness
+        + 0.12 * efficiency
+        + 0.07 * confidence_score
+        + 0.04 * domain_score
+        + 0.04 * verify_score
     )
 
     breakdown = {
         "task_accuracy": round(task_accuracy, 3),
         "stakes_awareness": round(stakes_awareness, 3),
         "efficiency": round(efficiency, 3),
+        "confidence_alignment": round(confidence_score, 3),
+        "domain_routing": round(domain_score, 3),
+        "verification_quality": round(verify_score, 3),
     }
     return clamp(score), reason, breakdown
 
@@ -75,6 +140,8 @@ def grade_task2_step(
     action_type: str,
     step_count: int,
     max_steps: int,
+    confidence: float | None = None,
+    result_metadata: dict | None = None,
 ) -> tuple[float, str, dict]:
 
     if action_type == "skip":
@@ -88,9 +155,14 @@
     action_cost = {"delegate": 0.0, "verify": 0.10, "solve_independently": 0.20}.get(action_type, 0.0)
     efficiency = max(0.0, efficiency - action_cost)
 
+    confidence_score = confidence_alignment(confidence, task_accuracy)
+    domain_score = domain_routing_score(result_metadata)
+
     score = (
-        0.65 * task_accuracy
-        + 0.35 * efficiency
+        0.55 * task_accuracy
+        + 0.25 * efficiency
+        + 0.10 * confidence_score
+        + 0.10 * domain_score
     )
 
     reason = (
@@ -100,6 +172,8 @@
     breakdown = {
         "task_accuracy": round(task_accuracy, 3),
         "efficiency": round(efficiency, 3),
+        "confidence_alignment": round(confidence_score, 3),
+        "domain_routing": round(domain_score, 3),
     }
     return clamp(score), reason, breakdown
 
@@ -143,6 +217,9 @@ def grade_task3_step(
     action_type: str,
     step_count: int,
     max_steps: int,
+    confidence: float | None = None,
+    result_metadata: dict | None = None,
+    trust_score: float | None = None,
 ) -> tuple[float, str, dict]:
 
     if action_type == "skip":
@@ -169,16 +246,26 @@
     step_budget_used = step_count / max_steps
     efficiency = max(0.0, 1.0 - step_budget_used)
 
+    confidence_score = confidence_alignment(confidence, task_accuracy)
+    domain_score = domain_routing_score(result_metadata)
+    verify_score = verification_quality(action_type, was_adversarial, stakes, trust_score)
+
     score = (
-        0.40 * task_accuracy
-        + 0.45 * stakes_awareness
-        + 0.15 * efficiency
+        0.32 * task_accuracy
+        + 0.33 * stakes_awareness
+        + 0.10 * efficiency
+        + 0.10 * confidence_score
+        + 0.10 * verify_score
+        + 0.05 * domain_score
     )
 
     breakdown = {
         "task_accuracy": round(task_accuracy, 3),
         "stakes_awareness": round(stakes_awareness, 3),
         "efficiency": round(efficiency, 3),
+        "confidence_alignment": round(confidence_score, 3),
+        "verification_quality": round(verify_score, 3),
+        "domain_routing": round(domain_score, 3),
    }
     return clamp(score), reason, breakdown
 
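A few worked values for the process signals added above; the numbers follow directly from the function bodies in this diff. `verification_quality` also depends on `ADVERSARIAL_AWARENESS_STAKES`, an existing module constant not shown in this hunk, so its branches are only noted in a comment.

```python
# Worked examples for the new graders helpers (outputs computed by hand).
from graders import confidence_alignment, domain_routing_score

print(confidence_alignment(None, 0.8))             # 0.5  - unknown confidence is neutral
print(confidence_alignment(0.95, 0.0))             # 0.05 - confident and wrong, near zero
print(confidence_alignment(0.60, 0.9))             # 1.0  - underconfidence is not penalized

print(domain_routing_score(None))                  # 0.5  - no metadata, neutral
print(domain_routing_score({"in_domain": True}))   # 1.0
print(domain_routing_score({"in_domain": False}))  # 0.1

# Task 3 step score with all six components at illustrative values
# (verification_quality taken as 1.0, i.e. a verify that caught a
# high-stakes adversarial result):
score = (0.32 * 0.9 + 0.33 * 0.8 + 0.10 * 0.6
         + 0.10 * 1.0 + 0.10 * 1.0 + 0.05 * 0.5)
print(round(score, 4))  # 0.837, and clamp() keeps scores within [0.01, 0.99]
```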
openenv.yaml CHANGED
@@ -97,6 +97,15 @@ api:
         required: true
     returns: SentinelState with trust_snapshot, completion, adversarial stats
 
+  reward_report:
+    method: GET
+    path: /reward-report
+    params:
+      session_id:
+        type: string
+        required: true
+    returns: Reward component trace with per-step process-aware signals
+
   difficulty:
     method: GET
     path: /difficulty
@@ -144,7 +153,7 @@ tasks:
     subtasks: 15
     max_steps: 30
     adversary_active: false
-    reward: "per-step accuracy + efficiency | terminal completion×0.65 + calibration×0.35"
+    reward: "per-step accuracy + efficiency + confidence alignment + domain routing | terminal completion×0.65 + calibration×0.35"
 
   task3:
     name: Full Adversarial Episode
@@ -152,7 +161,16 @@
     subtasks: 20
     max_steps: 45
     adversary_active: true
-    reward: "terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
+    reward: "step accuracy + stakes awareness + efficiency + confidence alignment + verification quality + domain routing | terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
+
+reward_engine_v2:
+  source: verifier/execution-style behavioral outcomes
+  granularity: step plus terminal trajectory
+  aggregation: fixed weighted multi-signal reward
+  process_signals:
+    confidence_alignment: penalizes high confidence on wrong outputs
+    domain_routing: rewards in-domain specialist behavior when metadata exists
+    verification_quality: rewards verification when it catches high-stakes adversarial risk
 
 specialists:
   S0: "AccurateSlow — 90% accurate, costs 2 steps"
outputs/baseline_comparison.png CHANGED
outputs/evaluation_results.json CHANGED
The diff for this file is too large to render. See raw diff
 
static/index.html CHANGED
@@ -1867,11 +1867,11 @@
     <div class="hero-stats">
       <div class="hero-stat">
         <div class="label">Random overall</div>
-        <div id="heroRandomScore" class="value">0.714</div>
+        <div id="heroRandomScore" class="value">0.695</div>
       </div>
       <div class="hero-stat">
         <div class="label">Heuristic overall</div>
-        <div id="heroHeuristicScore" class="value">0.816</div>
+        <div id="heroHeuristicScore" class="value">0.796</div>
       </div>
       <div class="hero-stat">
         <div class="label">Task 3 detect</div>
@@ -2114,7 +2114,7 @@
     <div class="story-lane before">
       <div class="story-title">
         <strong>Without SENTINEL</strong>
-        <span id="storyBeforeScore" class="story-score">task3 random 0.699</span>
+        <span id="storyBeforeScore" class="story-score">task3 random 0.666</span>
       </div>
       <div class="story-flow">
         <div class="story-step">All public slots start near the same trust. The orchestrator delegates with weak evidence.</div>
@@ -2154,12 +2154,12 @@
     <div class="judge-stats">
       <div class="judge-card bad">
         <div class="label">Random baseline</div>
-        <div id="judgeRandomScore" class="value">0.714</div>
+        <div id="judgeRandomScore" class="value">0.695</div>
         <span class="muted">Blind delegation baseline. Good enough to move, weak at skepticism.</span>
       </div>
       <div class="judge-card warn">
         <div class="label">Heuristic policy</div>
-        <div id="judgeHeuristicScore" class="value">0.816</div>
+        <div id="judgeHeuristicScore" class="value">0.796</div>
         <span class="muted">Trust-weighted routing plus verification at risky gates.</span>
       </div>
       <div class="judge-card good">
@@ -2228,18 +2228,18 @@
     <div class="baseline-table">
      <div class="baseline-row">
         <span>Random</span>
-        <div class="mini-bar"><span id="proofRandomBar" style="width:71.4%;background:#ff5f45"></span></div>
-        <strong id="proofRandomScore">0.714</strong>
+        <div class="mini-bar"><span id="proofRandomBar" style="width:69.5%;background:#ff5f45"></span></div>
+        <strong id="proofRandomScore">0.695</strong>
       </div>
       <div class="baseline-row">
         <span>Heuristic</span>
-        <div class="mini-bar"><span id="proofHeuristicBar" style="width:81.6%;background:#73a7ff"></span></div>
-        <strong id="proofHeuristicScore">0.816</strong>
+        <div class="mini-bar"><span id="proofHeuristicBar" style="width:79.6%;background:#73a7ff"></span></div>
+        <strong id="proofHeuristicScore">0.796</strong>
       </div>
       <div class="baseline-row">
         <span>Oracle-lite</span>
-        <div class="mini-bar"><span id="proofOracleBar" style="width:87.2%;background:#27e0a1"></span></div>
-        <strong id="proofOracleScore">0.872</strong>
+        <div class="mini-bar"><span id="proofOracleBar" style="width:85.5%;background:#27e0a1"></span></div>
+        <strong id="proofOracleScore">0.855</strong>
       </div>
       <div class="baseline-row">
         <span>T3 detect</span>
tests/test_app.py CHANGED
@@ -4,7 +4,9 @@ import time
 import unittest
 
 from app import SessionStore
+from app import app
 from environment import SentinelEnv
+from fastapi.testclient import TestClient
 
 
 class SessionStoreTests(unittest.TestCase):
@@ -29,7 +31,30 @@
         self.assertIsNone(store.get("first"))
         self.assertIs(store.get("second"), second)
 
+    def test_reward_report_endpoint_returns_active_trace(self) -> None:
+        client = TestClient(app)
+        reset = client.post("/reset", json={"task_type": "task3", "seed": 42})
+        self.assertEqual(reset.status_code, 200)
+        payload = reset.json()
+        sid = payload["info"]["session_id"]
+        obs = payload["observation"]
+
+        step = client.post(
+            f"/step?session_id={sid}",
+            json={
+                "session_id": sid,
+                "task_type": obs["task_type"],
+                "action_type": "delegate",
+                "specialist_id": "S0",
+            },
+        )
+        self.assertEqual(step.status_code, 200)
+
+        report = client.get(f"/reward-report?session_id={sid}")
+
+        self.assertEqual(report.status_code, 200)
+        self.assertEqual(report.json()["reward_events"], 1)
+
 
 if __name__ == "__main__":
     unittest.main()
-
tests/test_environment.py CHANGED
@@ -67,6 +67,27 @@ class EnvironmentTests(unittest.TestCase):
         self.assertGreater(result["info"]["step_count"], 2)
         self.assertGreaterEqual(result["info"]["score"], 0.0)
         self.assertLessEqual(result["info"]["score"], 1.0)
+        self.assertIn("reward_report", result["info"])
+        self.assertGreater(result["info"]["reward_report"]["reward_events"], 0)
+
+    def test_reward_report_tracks_process_components(self) -> None:
+        env = SentinelEnv()
+        result = env.reset(task_type="task3", seed=42)
+        obs = result["observation"]
+
+        result = env.step({
+            "session_id": obs["session_id"],
+            "task_type": "task3",
+            "action_type": "delegate",
+            "specialist_id": "S0",
+        })
+
+        report = env.reward_report()
+
+        self.assertEqual(report["reward_events"], 1)
+        self.assertIn("confidence_alignment", report["component_averages"])
+        self.assertIn("domain_routing", report["component_averages"])
+        self.assertEqual(report["events"][0]["action_type"], "delegate")
 
 
 if __name__ == "__main__":
tests/test_graders.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import unittest
 
-from graders import clamp, grade_task3_step
+from graders import clamp, confidence_alignment, grade_task3_step
 from scenarios import get_scenario
 from task_graph import TaskGraph
 
@@ -26,6 +26,11 @@
         self.assertGreater(reward, 0.8)
         self.assertIn("Adversarial detected", reason)
         self.assertEqual(breakdown["stakes_awareness"], 0.99)
+        self.assertIn("verification_quality", breakdown)
+
+    def test_overconfident_wrong_answer_is_penalized(self) -> None:
+        self.assertLess(confidence_alignment(0.95, 0.0), 0.1)
+        self.assertGreater(confidence_alignment(0.85, 1.0), 0.8)
 
     def test_failed_nodes_are_retriable_then_resolved(self) -> None:
         graph = TaskGraph(get_scenario("SCN-TASK1-001"))