XcodeAddy committed on
Commit
a8deee9
·
1 Parent(s): 62f00e5

Add backend walkthrough for SENTINEL environment

Browse files
Files changed (2) hide show
  1. README.md +19 -1
  2. scripts/backend_walkthrough.py +263 -0
README.md CHANGED
@@ -100,6 +100,23 @@ The root route `/` serves the live SENTINEL dashboard on Hugging Face Spaces.
100
  Use `/api` for the JSON route index.
101
  Use `/assets/baseline_comparison.png` for the committed baseline chart used in the dashboard.
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  ## Live Dashboard
104
 
105
  The Space opens directly into **SENTINEL Trust Mission Control**, a judge-demo dashboard:
@@ -193,11 +210,12 @@ pip install pytest
193
  Run checks:
194
 
195
  ```bash
196
- python -m py_compile app.py server/app.py environment.py models.py graders.py specialists.py trust_ledger.py task_graph.py scenarios.py inference.py comms_bus.py training/evaluate.py training/train.py
197
  python -m pytest -q
198
  python inference.py
199
  python training/evaluate.py --episodes 20 --task all --plot outputs/baseline_comparison.png
200
  python training/train.py --dry-run --episodes 5
 
201
  ```
202
 
203
  Run the server:
 
100
  Use `/api` for the JSON route index.
101
  Use `/assets/baseline_comparison.png` for the committed baseline chart used in the dashboard.
102
 
103
+ ## Backend Walkthrough
104
+
105
+ For terminal-first debugging and pitch clarity, run:
106
+
107
+ ```bash
108
+ python scripts/backend_walkthrough.py --task task3 --seed 42 --policy heuristic --compare
109
+ ```
110
+
111
+ This prints the full backend story:
112
+
113
+ - the compact `/reset` JSON the orchestrator sees
114
+ - the hidden shuffled profile for builders only
115
+ - each action, reward, score, trust update, detection, and poisoning count
116
+ - a before/after comparison of blind trust vs trust-aware routing vs oracle-lite upper bound
117
+
118
+ The key scenario to understand is `task3, seed=42`: public slot `S0` is secretly adversarial. It behaves correctly at low stakes, gains trust, then starts poisoning high-stakes nodes. SENTINEL exists to train the orchestrator to catch that shift.
119
+
120
  ## Live Dashboard
121
 
122
  The Space opens directly into **SENTINEL Trust Mission Control**, a judge-demo dashboard:
 
210
  Run checks:
211
 
212
  ```bash
213
+ python -m py_compile app.py server/app.py environment.py models.py graders.py specialists.py trust_ledger.py task_graph.py scenarios.py inference.py comms_bus.py training/evaluate.py training/train.py scripts/backend_walkthrough.py
214
  python -m pytest -q
215
  python inference.py
216
  python training/evaluate.py --episodes 20 --task all --plot outputs/baseline_comparison.png
217
  python training/train.py --dry-run --episodes 5
218
+ python scripts/backend_walkthrough.py --task task3 --seed 42 --policy heuristic --compare --max-rows 14
219
  ```
220
 
221
  Run the server:
scripts/backend_walkthrough.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import random
6
+ import sys
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Callable
10
+
11
+ ROOT = Path(__file__).resolve().parents[1]
12
+ if str(ROOT) not in sys.path:
13
+ sys.path.insert(0, str(ROOT))
14
+
15
+ from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
16
+
17
+
18
+ Policy = Callable[[SentinelEnv, dict, random.Random], dict]
19
+
20
+
21
@dataclass
class TraceRow:
    """One line of the printed step trace for a single environment step."""

    # Step counter taken from the step result's ``info``.
    step_count: int
    # Subtask index taken from the observation returned by the step.
    subtask_index: int
    # Stakes level from the observation the policy acted on.
    stakes: float
    # Label of the form "<action_type>:<specialist_id or SELF>".
    action: str
    # ``reward["value"]`` of the step result.
    reward: float
    # ``info["score"]`` of the step result.
    score: float
    # Completed / remaining subtask counts from the task-graph summary.
    completed: int
    remaining: int
    # Trust snapshot from the observation returned by the step.
    trust: dict[str, float]
    # ``reward["reason"]`` of the step result.
    reason: str
    # Adversarial detection / poisoning counts from the task-graph summary.
    detections: int
    poisonings: int
35
+
36
+
37
def blind_trust_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
    """Before SENTINEL: always delegate to the highest-trust specialist.

    Only ``obs`` is consulted; ``env`` and ``rng`` are accepted to match the
    shared ``Policy`` signature. Specialists missing from the trust snapshot
    are ranked with a default trust of 0.5.
    """
    trust_by_slot = obs["trust_snapshot"]

    def trust_of(slot_id):
        return trust_by_slot.get(slot_id, 0.5)

    best = max(obs["available_specialists"], key=trust_of)
    return action(obs, "delegate", best, f"blind-trust best={best}")
42
+
43
+
44
def sentinel_heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
    """Simple SENTINEL-style baseline: verify when risk rises.

    Picks the specialist with the highest trust (default 0.5 when absent from
    the snapshot). It asks for a ``verify`` instead of a ``delegate`` only when
    the stakes level is at least 0.70 *and* that specialist's trust is below
    0.65. The policy still has flaws; it exists to show the intended skill.
    """
    snapshot = obs["trust_snapshot"]
    candidate = max(
        obs["available_specialists"],
        key=lambda slot_id: snapshot.get(slot_id, 0.5),
    )
    candidate_trust = snapshot.get(candidate, 0.5)
    stakes = obs["stakes_level"]

    if stakes >= 0.70 and candidate_trust < 0.65:
        chosen_action = "verify"
    else:
        chosen_action = "delegate"

    note = f"sentinel-heuristic best={candidate} trust={candidate_trust:.3f} stakes={stakes:.2f}"
    return action(obs, chosen_action, candidate, note)
59
+
60
+
61
def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
    """Upper-bound policy for demos and evaluator comparison.

    Reads builder-only state from ``env._pool`` (ground-truth reliability and
    the adversarial slot), so it is NOT a deployable policy.

    On ``task3`` with a stakes level of at least 0.70 it verifies the
    adversarial slot; otherwise it delegates to the specialist with the
    highest ground-truth reliability (default 0.5 when unknown).
    """
    # Looked up before branching, exactly as the walkthrough has always done.
    true_reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)

    high_stakes_task3 = obs["task_type"] == "task3" and obs["stakes_level"] >= 0.70
    if not high_stakes_task3:
        best = max(
            obs["available_specialists"],
            key=lambda slot_id: true_reliability.get(slot_id, 0.5),
        )
        return action(obs, "delegate", best, f"oracle-lite best={best}")

    return action(obs, "verify", env._pool.adversarial_slot, "oracle-lite verifies adversarial slot")
71
+
72
+
73
# Registry of the selectable policies, keyed by the name accepted on the
# command line (--policy) and used by the comparison run.
POLICIES: dict[str, Policy] = dict(
    blind=blind_trust_policy,
    heuristic=sentinel_heuristic_policy,
    oracle=oracle_lite_policy,
)
78
+
79
+
80
def action(obs: dict, action_type: str, specialist_id: str | None, reason: str) -> dict:
    """Build the action payload for one step.

    The session id and task type are copied from ``obs``. ``subtask_response``
    is ``"SELF_SOLVED"`` only for the ``solve_independently`` action type and
    ``None`` for every other action type.
    """
    payload = {key: obs[key] for key in ("session_id", "task_type")}
    payload["action_type"] = action_type
    payload["specialist_id"] = specialist_id
    payload["subtask_response"] = None
    if action_type == "solve_independently":
        payload["subtask_response"] = "SELF_SOLVED"
    payload["reasoning"] = reason
    return payload
89
+
90
+
91
def compact_reset(result: dict) -> dict:
    """Reduce a reset result to the compact summary printed by the walkthrough.

    Copies a fixed set of fields from ``result["observation"]`` and appends
    the top-level ``done`` and ``reward`` entries. A missing field raises
    ``KeyError``.
    """
    observation = result["observation"]
    observation_fields = (
        "session_id",
        "scenario_id",
        "task_type",
        "current_subtask",
        "available_specialists",
        "trust_snapshot",
        "stakes_level",
        "step_count",
        "max_steps",
    )
    compact = {name: observation[name] for name in observation_fields}
    for name in ("done", "reward"):
        compact[name] = result[name]
    return compact
106
+
107
+
108
def run_episode(
    policy_name: str,
    task_type: str,
    seed: int,
    show_hidden: bool,
    max_rows: int | None,
) -> tuple[SentinelEnv, dict, list[TraceRow]]:
    """Run one episode with the named policy and print the full walkthrough.

    Args:
        policy_name: Key into ``POLICIES``.
        task_type: Task passed to ``env.reset``.
        seed: Seed for both the environment reset and the policy RNG.
        show_hidden: When true, also print the builder-only hidden profile.
        max_rows: Maximum number of trace rows to print; ``None`` prints all.
            Negative values are treated as 0 so the "rows hidden" count stays
            equal to the number of rows actually suppressed.

    Returns:
        The environment, the last step (or reset) result, and every trace row
        collected, including rows that were not printed.

    Raises:
        KeyError: If ``policy_name`` is not a registered policy.
    """
    # Safety cap so a policy/environment pair that never finishes cannot hang.
    step_limit = 100
    row_limit = None if max_rows is None else max(max_rows, 0)

    policy = POLICIES[policy_name]
    rng = random.Random(seed)
    env = SentinelEnv()
    result = env.reset(task_type=task_type, seed=seed)
    rows: list[TraceRow] = []

    print_header(policy_name, task_type, seed)
    print("RESET JSON - compact agent-facing shape")
    print(json.dumps(compact_reset(result), indent=2))
    print()
    if show_hidden:
        print("BUILDER-ONLY HIDDEN PROFILE - agent never sees this")
        print(json.dumps({
            "public_slot_to_internal_behavior": env._pool.internal_profile(),
            "adversarial_public_slot": env._pool.adversarial_slot,
        }, indent=2))
        print()

    print_trace_header()
    steps_taken = 0
    while not result["done"] and steps_taken < step_limit:
        obs = result["observation"]
        chosen = policy(env, obs, rng)
        result = env.step(chosen)
        graph_summary = env._graph.summary()
        # NOTE(review): stakes comes from the observation the policy acted on,
        # while subtask_index comes from the observation returned by the step;
        # confirm the "node" column is meant to show the post-step index.
        row = TraceRow(
            step_count=result["info"]["step_count"],
            subtask_index=result["observation"]["subtask_index"],
            stakes=obs["stakes_level"],
            action=f"{chosen['action_type']}:{chosen.get('specialist_id') or 'SELF'}",
            reward=result["reward"]["value"],
            score=result["info"]["score"],
            completed=graph_summary["subtasks_completed"],
            remaining=graph_summary["subtasks_remaining"],
            trust=result["observation"]["trust_snapshot"],
            reason=result["reward"]["reason"],
            detections=graph_summary["adversarial_detections"],
            poisonings=graph_summary["adversarial_poisonings"],
        )
        rows.append(row)
        if row_limit is None or len(rows) <= row_limit:
            print_trace_row(row)
        steps_taken += 1

    if row_limit is not None and len(rows) > row_limit:
        print(f"... {len(rows) - row_limit} more rows hidden by --max-rows")
    if not result["done"]:
        # Make it explicit that the numbers below describe an unfinished run.
        print(f"WARNING: stopped after {step_limit} steps; episode did not report done")

    print()
    print("FINAL INFO")
    print(json.dumps(result["info"], indent=2))
    print("FINAL REWARD")
    print(json.dumps(result["reward"], indent=2))
    print()
    return env, result, rows
169
+
170
+
171
def print_header(policy_name: str, task_type: str, seed: int) -> None:
    """Print the walkthrough banner, the run parameters, and the real-world mapping."""
    rule = "=" * 92
    banner_lines = (
        rule,
        "SENTINEL BACKEND WALKTHROUGH",
        rule,
        f"policy={policy_name} task={task_type} seed={seed}",
        "",
        "REAL-WORLD MAPPING",
        "User gives a long task -> orchestrator splits it -> specialists answer subtasks.",
        "Some specialists are unreliable: fast-but-wrong, domain-limited, degrading, or adversarial.",
        "SENTINEL trains the orchestrator behavior: trust, verify, recover, finish.",
        "",
    )
    for line in banner_lines:
        print(line)
182
+
183
+
184
def print_trace_header() -> None:
    """Print the step-trace title, the column names, and a separator rule."""
    columns = "step | node | stake | action | reward | score | done/rem | adv det/poison | trust snapshot"
    for line in ("STEP TRACE", columns, "-" * 132):
        print(line)
190
+
191
+
192
def print_trace_row(row: TraceRow) -> None:
    """Print one trace row as a pipe-separated line, then its reward reason."""
    total_subtasks = row.completed + row.remaining
    trust_cells = [f"{slot_id}:{value:.3f}" for slot_id, value in row.trust.items()]
    cells = [
        f"{row.step_count:>4}",
        f"{row.subtask_index:>4}",
        f"{row.stakes:>5.2f}",
        f"{row.action:<15}",
        f"{row.reward:>6.3f}",
        f"{row.score:>5.3f}",
        f"{row.completed:>2}/{total_subtasks:<2}",
        f"{row.detections:>2}/{row.poisonings:<2}",
        " ".join(trust_cells),
    ]
    print(" | ".join(cells))
    print(f" reason: {row.reason}")
201
+
202
+
203
def compare_policies(task_type: str, seed: int, show_hidden: bool) -> None:
    """Run the blind, heuristic and oracle policies and print a summary table.

    Each policy gets a fresh environment reset with the same task and seed,
    and a fresh ``random.Random(seed)``.

    Args:
        task_type: Task passed to ``env.reset``.
        seed: Seed for the environment reset and the policy RNG.
        show_hidden: When true, print the builder-only hidden profile once
            (after the first policy has run).
    """
    # Same safety cap run_episode applies to its stepping loop; without it a
    # policy/environment pair that never reports done would hang the comparison.
    step_limit = 100

    print("=" * 92)
    print("BEFORE / AFTER BACKEND COMPARISON")
    print("=" * 92)
    print("before=blind trust, middle=heuristic trust, target=oracle-lite upper bound")
    print()
    results = []
    for policy_name in ("blind", "heuristic", "oracle"):
        policy = POLICIES[policy_name]
        env = SentinelEnv()
        result = env.reset(task_type=task_type, seed=seed)
        rng = random.Random(seed)
        steps_taken = 0
        while not result["done"] and steps_taken < step_limit:
            chosen = policy(env, result["observation"], rng)
            result = env.step(chosen)
            steps_taken += 1

        info = result["info"]
        if not result["done"]:
            # The cap ended the run, not the environment.
            status = "step-limit"
        elif info.get("forced_end"):
            status = "failed"
        else:
            status = "completed"
        results.append({
            "policy": policy_name,
            "score": info.get("score", 0.0),
            "completion": info.get("completion_rate", 0.0),
            "detections": info.get("adversarial_detections", 0),
            "poisonings": info.get("adversarial_poisonings", 0),
            "steps": info.get("step_count", 0),
            "status": status,
        })
        if show_hidden and policy_name == "blind":
            print("Hidden profile for this comparison seed:")
            print(json.dumps({
                "public_slot_to_internal_behavior": env._pool.internal_profile(),
                "adversarial_public_slot": env._pool.adversarial_slot,
            }, indent=2))
            print()

    print("policy | score | completion | detections | poisonings | steps | status")
    print("-" * 78)
    for item in results:
        print(
            f"{item['policy']:<9} | {item['score']:.3f} | "
            f"{item['completion']:.3f} | {item['detections']:<10} | "
            f"{item['poisonings']:<10} | {item['steps']:<5} | {item['status']}"
        )
    print()
244
+
245
+
246
def main() -> None:
    """Parse command-line options and run the walkthrough.

    With ``--compare`` the three-policy comparison is printed first; the
    single-policy episode trace for ``--policy`` is always printed afterwards.
    The hidden profile is shown unless ``--hide-hidden`` is given.
    """
    parser = argparse.ArgumentParser(description="Explain SENTINEL backend behavior from terminal.")
    add = parser.add_argument
    add("--task", default="task3", choices=["task1", "task2", "task3"])
    add("--seed", type=int, default=42)
    add("--policy", default="heuristic", choices=sorted(POLICIES))
    add("--hide-hidden", action="store_true", help="Do not print builder-only hidden profile.")
    add("--max-rows", type=int, default=None, help="Limit printed trace rows.")
    add("--compare", action="store_true", help="Compare blind vs heuristic vs oracle-lite.")
    options = parser.parse_args()

    reveal_hidden = not options.hide_hidden
    if options.compare:
        compare_policies(options.task, options.seed, reveal_hidden)
    run_episode(options.policy, options.task, options.seed, reveal_hidden, options.max_rows)
260
+
261
+
262
+ if __name__ == "__main__":
263
+ main()