XcodeAddy committed on
Commit
7b81543
·
1 Parent(s): 2bd0413

Add live SENTINEL trust dashboard

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -0
  2. README.md +15 -0
  3. app.py +21 -1
  4. static/index.html +807 -0
Dockerfile CHANGED
@@ -21,6 +21,7 @@ COPY inference.py .
21
  COPY README.md .
22
  COPY pyproject.toml .
23
  COPY server ./server
 
24
 
25
  # Create outputs directory for baseline scores
26
  RUN mkdir -p outputs
 
21
  COPY README.md .
22
  COPY pyproject.toml .
23
  COPY server ./server
24
+ COPY static ./static
25
 
26
  # Create outputs directory for baseline scores
27
  RUN mkdir -p outputs
README.md CHANGED
@@ -82,11 +82,26 @@ The episode `score` exposed in `info` and inference logs is normalized to `0.0-1
82
  ```bash
83
  curl http://localhost:7860/health
84
  curl http://localhost:7860/
 
85
  curl http://localhost:7860/metadata
86
  curl http://localhost:7860/tasks
87
  curl http://localhost:7860/schema
88
  ```
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  Start an episode:
91
 
92
  ```bash
 
82
  ```bash
83
  curl http://localhost:7860/health
84
  curl http://localhost:7860/
85
+ curl http://localhost:7860/api
86
  curl http://localhost:7860/metadata
87
  curl http://localhost:7860/tasks
88
  curl http://localhost:7860/schema
89
  ```
90
 
91
+ The root route `/` serves the live SENTINEL dashboard (this is the page that opens on Hugging Face Spaces); if `static/index.html` is missing, it falls back to a JSON summary.
92
+ Use `/api` for the JSON route index.
93
+
94
+ ## Live Dashboard
95
+
96
+ The Space opens directly into a judge-demo dashboard:
97
+
98
+ - live task progress and score
99
+ - S0-S4 trust ledger bars
100
+ - manual `delegate`, `verify`, `solve_independently`, and `skip` controls
101
+ - heuristic auto-run
102
+ - adversarial detection and poisoning counters
103
+ - baseline proof table for random, heuristic, and oracle-lite policies
104
+
105
  Start an episode:
106
 
107
  ```bash
app.py CHANGED
@@ -1,10 +1,11 @@
1
  from __future__ import annotations
2
 
3
  import os
 
4
  from typing import Any
5
 
6
  from fastapi import FastAPI, HTTPException, Query
7
- from fastapi.responses import JSONResponse
8
  from pydantic import BaseModel
9
 
10
  from environment import SentinelEnv
@@ -25,6 +26,7 @@ app = FastAPI(
25
 
26
  # One env instance per session_id
27
  _sessions: dict[str, SentinelEnv] = {}
 
28
 
29
  def _get_env(session_id: str) -> SentinelEnv:
30
  if session_id not in _sessions:
@@ -61,6 +63,24 @@ def health():
61
 
62
  @app.get("/")
63
  def root():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  return {
65
  "name": "sentinel-env",
66
  "status": "ok",
 
1
  from __future__ import annotations
2
 
3
  import os
4
+ from pathlib import Path
5
  from typing import Any
6
 
7
  from fastapi import FastAPI, HTTPException, Query
8
+ from fastapi.responses import FileResponse, JSONResponse
9
  from pydantic import BaseModel
10
 
11
  from environment import SentinelEnv
 
26
 
27
  # One env instance per session_id
28
  _sessions: dict[str, SentinelEnv] = {}
29
+ _STATIC_DIR = Path(__file__).resolve().parent / "static"
30
 
31
  def _get_env(session_id: str) -> SentinelEnv:
32
  if session_id not in _sessions:
 
63
 
64
  @app.get("/")
65
  def root():
66
+ index_path = _STATIC_DIR / "index.html"
67
+ if index_path.exists():
68
+ return FileResponse(index_path)
69
+ return JSONResponse(
70
+ {
71
+ "name": "sentinel-env",
72
+ "status": "ok",
73
+ "summary": (
74
+ "SENTINEL trains an orchestrator to calibrate trust, verify risky "
75
+ "outputs, recover from failures, and finish long multi-agent tasks."
76
+ ),
77
+ "routes": ["/health", "/metadata", "/tasks", "/schema", "/grader", "/reset", "/step", "/state"],
78
+ }
79
+ )
80
+
81
+
82
+ @app.get("/api")
83
+ def api_root():
84
  return {
85
  "name": "sentinel-env",
86
  "status": "ok",
static/index.html ADDED
@@ -0,0 +1,807 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1">
6
+ <title>SENTINEL</title>
7
+ <style>
8
+ :root {
9
+ --bg: #f7f8fb;
10
+ --panel: #ffffff;
11
+ --ink: #172033;
12
+ --muted: #667085;
13
+ --line: #d9dee8;
14
+ --accent: #0f766e;
15
+ --blue: #2563eb;
16
+ --red: #dc2626;
17
+ --amber: #b45309;
18
+ --green: #059669;
19
+ --shadow: 0 10px 30px rgba(16, 24, 40, 0.08);
20
+ }
21
+
22
+ * { box-sizing: border-box; }
23
+
24
+ body {
25
+ margin: 0;
26
+ min-height: 100vh;
27
+ color: var(--ink);
28
+ background: var(--bg);
29
+ font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
30
+ letter-spacing: 0;
31
+ }
32
+
33
+ button, select, input {
34
+ font: inherit;
35
+ }
36
+
37
+ .shell {
38
+ min-height: 100vh;
39
+ display: grid;
40
+ grid-template-rows: auto 1fr;
41
+ }
42
+
43
+ header {
44
+ display: flex;
45
+ align-items: center;
46
+ justify-content: space-between;
47
+ gap: 16px;
48
+ padding: 18px 24px;
49
+ background: #111827;
50
+ color: #f8fafc;
51
+ border-bottom: 1px solid #0b1220;
52
+ }
53
+
54
+ .brand {
55
+ display: flex;
56
+ align-items: center;
57
+ gap: 14px;
58
+ min-width: 0;
59
+ }
60
+
61
+ .mark {
62
+ width: 34px;
63
+ height: 34px;
64
+ display: grid;
65
+ place-items: center;
66
+ border: 1px solid rgba(255,255,255,0.22);
67
+ background: #0f766e;
68
+ border-radius: 7px;
69
+ font-weight: 800;
70
+ }
71
+
72
+ h1 {
73
+ margin: 0;
74
+ font-size: 20px;
75
+ line-height: 1.1;
76
+ font-weight: 780;
77
+ }
78
+
79
+ .subhead {
80
+ margin-top: 3px;
81
+ color: #cbd5e1;
82
+ font-size: 13px;
83
+ white-space: nowrap;
84
+ overflow: hidden;
85
+ text-overflow: ellipsis;
86
+ }
87
+
88
+ .header-actions {
89
+ display: flex;
90
+ align-items: center;
91
+ gap: 10px;
92
+ flex-wrap: wrap;
93
+ justify-content: flex-end;
94
+ }
95
+
96
+ main {
97
+ padding: 20px;
98
+ display: grid;
99
+ gap: 16px;
100
+ grid-template-columns: minmax(320px, 1.05fr) minmax(320px, 1fr);
101
+ grid-template-areas:
102
+ "mission trust"
103
+ "controls trust"
104
+ "events metrics";
105
+ align-items: start;
106
+ max-width: 1480px;
107
+ width: 100%;
108
+ margin: 0 auto;
109
+ }
110
+
111
+ section {
112
+ background: var(--panel);
113
+ border: 1px solid var(--line);
114
+ border-radius: 8px;
115
+ box-shadow: var(--shadow);
116
+ min-width: 0;
117
+ }
118
+
119
+ .section-head {
120
+ padding: 14px 16px 10px;
121
+ border-bottom: 1px solid var(--line);
122
+ display: flex;
123
+ align-items: center;
124
+ justify-content: space-between;
125
+ gap: 12px;
126
+ }
127
+
128
+ h2 {
129
+ margin: 0;
130
+ font-size: 14px;
131
+ text-transform: uppercase;
132
+ color: #475467;
133
+ letter-spacing: 0;
134
+ }
135
+
136
+ .mission { grid-area: mission; }
137
+ .controls { grid-area: controls; }
138
+ .trust { grid-area: trust; }
139
+ .events { grid-area: events; }
140
+ .metrics { grid-area: metrics; }
141
+
142
+ .body {
143
+ padding: 16px;
144
+ }
145
+
146
+ .grid {
147
+ display: grid;
148
+ grid-template-columns: repeat(4, minmax(0, 1fr));
149
+ gap: 10px;
150
+ }
151
+
152
+ .stat {
153
+ border: 1px solid var(--line);
154
+ border-radius: 7px;
155
+ padding: 12px;
156
+ background: #fbfcfe;
157
+ min-height: 76px;
158
+ }
159
+
160
+ .label {
161
+ color: var(--muted);
162
+ font-size: 12px;
163
+ line-height: 1.2;
164
+ margin-bottom: 8px;
165
+ }
166
+
167
+ .value {
168
+ font-size: 22px;
169
+ font-weight: 760;
170
+ line-height: 1.05;
171
+ overflow-wrap: anywhere;
172
+ }
173
+
174
+ .subtask {
175
+ margin-top: 14px;
176
+ padding: 14px;
177
+ border: 1px solid var(--line);
178
+ border-radius: 7px;
179
+ background: #f8fafc;
180
+ line-height: 1.45;
181
+ color: #1f2937;
182
+ min-height: 82px;
183
+ }
184
+
185
+ .toolbar {
186
+ display: flex;
187
+ gap: 10px;
188
+ flex-wrap: wrap;
189
+ align-items: center;
190
+ }
191
+
192
+ select, input {
193
+ height: 38px;
194
+ border: 1px solid #cbd5e1;
195
+ background: #fff;
196
+ border-radius: 7px;
197
+ padding: 0 10px;
198
+ color: #111827;
199
+ }
200
+
201
+ input {
202
+ width: 84px;
203
+ }
204
+
205
+ button {
206
+ height: 38px;
207
+ border: 1px solid #cbd5e1;
208
+ background: #fff;
209
+ color: #111827;
210
+ border-radius: 7px;
211
+ padding: 0 12px;
212
+ cursor: pointer;
213
+ display: inline-flex;
214
+ align-items: center;
215
+ gap: 8px;
216
+ white-space: nowrap;
217
+ }
218
+
219
+ button:hover {
220
+ border-color: #94a3b8;
221
+ background: #f8fafc;
222
+ }
223
+
224
+ button.primary {
225
+ background: var(--accent);
226
+ border-color: var(--accent);
227
+ color: #fff;
228
+ }
229
+
230
+ button.danger {
231
+ border-color: #fca5a5;
232
+ color: #991b1b;
233
+ background: #fff5f5;
234
+ }
235
+
236
+ button:disabled {
237
+ cursor: not-allowed;
238
+ opacity: 0.55;
239
+ }
240
+
241
+ .control-grid {
242
+ display: grid;
243
+ grid-template-columns: repeat(4, minmax(0, 1fr));
244
+ gap: 10px;
245
+ }
246
+
247
+ .control-grid button {
248
+ justify-content: center;
249
+ width: 100%;
250
+ }
251
+
252
+ .trust-list {
253
+ display: grid;
254
+ gap: 10px;
255
+ }
256
+
257
+ .specialist {
258
+ display: grid;
259
+ grid-template-columns: 56px 1fr 54px;
260
+ align-items: center;
261
+ gap: 12px;
262
+ padding: 12px;
263
+ border: 1px solid var(--line);
264
+ border-radius: 7px;
265
+ background: #fbfcfe;
266
+ }
267
+
268
+ .sid {
269
+ font-weight: 760;
270
+ color: #111827;
271
+ }
272
+
273
+ .bar {
274
+ height: 12px;
275
+ border-radius: 999px;
276
+ background: #e5e7eb;
277
+ overflow: hidden;
278
+ position: relative;
279
+ }
280
+
281
+ .fill {
282
+ height: 100%;
283
+ width: 50%;
284
+ background: var(--blue);
285
+ border-radius: inherit;
286
+ transition: width 220ms ease, background 220ms ease;
287
+ }
288
+
289
+ .score {
290
+ text-align: right;
291
+ font-variant-numeric: tabular-nums;
292
+ color: #334155;
293
+ font-weight: 700;
294
+ }
295
+
296
+ .chips {
297
+ display: flex;
298
+ flex-wrap: wrap;
299
+ gap: 8px;
300
+ }
301
+
302
+ .chip {
303
+ border-radius: 999px;
304
+ padding: 5px 9px;
305
+ background: #eef2ff;
306
+ color: #3730a3;
307
+ font-size: 12px;
308
+ font-weight: 700;
309
+ }
310
+
311
+ .chip.live { background: #ecfdf3; color: #027a48; }
312
+ .chip.warn { background: #fff7ed; color: #9a3412; }
313
+ .chip.fail { background: #fef2f2; color: #b91c1c; }
314
+
315
+ .progress {
316
+ height: 10px;
317
+ background: #e5e7eb;
318
+ border-radius: 999px;
319
+ overflow: hidden;
320
+ margin-top: 12px;
321
+ }
322
+
323
+ .progress span {
324
+ display: block;
325
+ height: 100%;
326
+ width: 0%;
327
+ background: var(--accent);
328
+ border-radius: inherit;
329
+ transition: width 220ms ease;
330
+ }
331
+
332
+ .event-list {
333
+ display: grid;
334
+ gap: 8px;
335
+ max-height: 360px;
336
+ overflow: auto;
337
+ padding-right: 4px;
338
+ }
339
+
340
+ .event {
341
+ display: grid;
342
+ grid-template-columns: 54px 1fr 64px;
343
+ gap: 10px;
344
+ align-items: start;
345
+ padding: 10px;
346
+ border: 1px solid var(--line);
347
+ border-radius: 7px;
348
+ background: #fbfcfe;
349
+ font-size: 13px;
350
+ }
351
+
352
+ .event strong {
353
+ color: #111827;
354
+ }
355
+
356
+ .event .reward {
357
+ text-align: right;
358
+ font-variant-numeric: tabular-nums;
359
+ font-weight: 760;
360
+ color: var(--accent);
361
+ }
362
+
363
+ .metric-grid {
364
+ display: grid;
365
+ grid-template-columns: repeat(3, minmax(0, 1fr));
366
+ gap: 10px;
367
+ }
368
+
369
+ .baseline-bars {
370
+ margin-top: 14px;
371
+ display: grid;
372
+ gap: 10px;
373
+ }
374
+
375
+ .baseline-row {
376
+ display: grid;
377
+ grid-template-columns: 118px 1fr 48px;
378
+ align-items: center;
379
+ gap: 10px;
380
+ font-size: 13px;
381
+ }
382
+
383
+ .mini-bar {
384
+ height: 10px;
385
+ background: #e5e7eb;
386
+ border-radius: 999px;
387
+ overflow: hidden;
388
+ }
389
+
390
+ .mini-bar span {
391
+ display: block;
392
+ height: 100%;
393
+ border-radius: inherit;
394
+ }
395
+
396
+ .muted {
397
+ color: var(--muted);
398
+ }
399
+
400
+ @media (max-width: 980px) {
401
+ header {
402
+ align-items: flex-start;
403
+ flex-direction: column;
404
+ }
405
+
406
+ main {
407
+ grid-template-columns: 1fr;
408
+ grid-template-areas:
409
+ "mission"
410
+ "trust"
411
+ "controls"
412
+ "metrics"
413
+ "events";
414
+ padding: 14px;
415
+ }
416
+
417
+ .grid, .control-grid, .metric-grid {
418
+ grid-template-columns: repeat(2, minmax(0, 1fr));
419
+ }
420
+ }
421
+
422
+ @media (max-width: 560px) {
423
+ .grid, .control-grid, .metric-grid {
424
+ grid-template-columns: 1fr;
425
+ }
426
+
427
+ .specialist {
428
+ grid-template-columns: 44px 1fr 48px;
429
+ }
430
+
431
+ .event {
432
+ grid-template-columns: 44px 1fr;
433
+ }
434
+
435
+ .event .reward {
436
+ grid-column: 2;
437
+ text-align: left;
438
+ }
439
+ }
440
+ </style>
441
+ </head>
442
+ <body>
443
+ <div class="shell">
444
+ <header>
445
+ <div class="brand">
446
+ <div class="mark">S</div>
447
+ <div>
448
+ <h1>SENTINEL</h1>
449
+ <div class="subhead">Multi-agent trust calibration environment</div>
450
+ </div>
451
+ </div>
452
+ <div class="header-actions">
453
+ <select id="taskSelect" aria-label="Task">
454
+ <option value="task1">Task 1 - Easy</option>
455
+ <option value="task2">Task 2 - Medium</option>
456
+ <option value="task3" selected>Task 3 - Hard</option>
457
+ </select>
458
+ <input id="seedInput" aria-label="Seed" type="number" value="42">
459
+ <button id="resetBtn" class="primary" type="button">Reset</button>
460
+ <button id="autoBtn" type="button">Auto Run</button>
461
+ </div>
462
+ </header>
463
+
464
+ <main>
465
+ <section class="mission">
466
+ <div class="section-head">
467
+ <h2>Mission</h2>
468
+ <div class="chips">
469
+ <span id="statusChip" class="chip live">READY</span>
470
+ <span id="scenarioChip" class="chip">SCENARIO</span>
471
+ </div>
472
+ </div>
473
+ <div class="body">
474
+ <div class="grid">
475
+ <div class="stat">
476
+ <div class="label">Score</div>
477
+ <div id="scoreValue" class="value">0.000</div>
478
+ </div>
479
+ <div class="stat">
480
+ <div class="label">Step</div>
481
+ <div id="stepValue" class="value">0/45</div>
482
+ </div>
483
+ <div class="stat">
484
+ <div class="label">Complete</div>
485
+ <div id="completeValue" class="value">0/20</div>
486
+ </div>
487
+ <div class="stat">
488
+ <div class="label">Stakes</div>
489
+ <div id="stakesValue" class="value">0.00</div>
490
+ </div>
491
+ </div>
492
+ <div class="progress"><span id="progressFill"></span></div>
493
+ <div id="subtaskText" class="subtask">Reset an episode to begin.</div>
494
+ </div>
495
+ </section>
496
+
497
+ <section class="trust">
498
+ <div class="section-head">
499
+ <h2>Trust Ledger</h2>
500
+ <div class="chips">
501
+ <span id="detectChip" class="chip">0 detected</span>
502
+ <span id="poisonChip" class="chip warn">0 poison</span>
503
+ </div>
504
+ </div>
505
+ <div class="body">
506
+ <div id="trustList" class="trust-list"></div>
507
+ </div>
508
+ </section>
509
+
510
+ <section class="controls">
511
+ <div class="section-head">
512
+ <h2>Actions</h2>
513
+ <div class="toolbar">
514
+ <select id="specialistSelect" aria-label="Specialist"></select>
515
+ </div>
516
+ </div>
517
+ <div class="body">
518
+ <div class="control-grid">
519
+ <button id="delegateBtn" class="primary" type="button">Delegate</button>
520
+ <button id="verifyBtn" type="button">Verify</button>
521
+ <button id="selfBtn" type="button">Self Solve</button>
522
+ <button id="skipBtn" class="danger" type="button">Skip</button>
523
+ </div>
524
+ </div>
525
+ </section>
526
+
527
+ <section class="events">
528
+ <div class="section-head">
529
+ <h2>Episode Events</h2>
530
+ <span id="sessionText" class="muted">No session</span>
531
+ </div>
532
+ <div class="body">
533
+ <div id="eventList" class="event-list"></div>
534
+ </div>
535
+ </section>
536
+
537
+ <section class="metrics">
538
+ <div class="section-head">
539
+ <h2>Baseline Proof</h2>
540
+ <span class="muted">20 episodes per task</span>
541
+ </div>
542
+ <div class="body">
543
+ <div class="metric-grid">
544
+ <div class="stat">
545
+ <div class="label">Random</div>
546
+ <div class="value">0.714</div>
547
+ </div>
548
+ <div class="stat">
549
+ <div class="label">Heuristic</div>
550
+ <div class="value">0.816</div>
551
+ </div>
552
+ <div class="stat">
553
+ <div class="label">Oracle-lite</div>
554
+ <div class="value">0.872</div>
555
+ </div>
556
+ </div>
557
+ <div class="baseline-bars">
558
+ <div class="baseline-row">
559
+ <span>Random</span>
560
+ <div class="mini-bar"><span style="width:71.4%;background:#ef4444"></span></div>
561
+ <strong>0.714</strong>
562
+ </div>
563
+ <div class="baseline-row">
564
+ <span>Heuristic</span>
565
+ <div class="mini-bar"><span style="width:81.6%;background:#2563eb"></span></div>
566
+ <strong>0.816</strong>
567
+ </div>
568
+ <div class="baseline-row">
569
+ <span>Oracle-lite</span>
570
+ <div class="mini-bar"><span style="width:87.2%;background:#059669"></span></div>
571
+ <strong>0.872</strong>
572
+ </div>
573
+ </div>
574
+ </div>
575
+ </section>
576
+ </main>
577
+ </div>
578
+
579
+ <script>
580
+ const ids = ["S0", "S1", "S2", "S3", "S4"];
581
+ const state = {
582
+ sessionId: null,
583
+ taskType: "task3",
584
+ observation: null,
585
+ done: true,
586
+ running: false,
587
+ events: []
588
+ };
589
+
590
+ const el = {
591
+ taskSelect: document.getElementById("taskSelect"),
592
+ seedInput: document.getElementById("seedInput"),
593
+ resetBtn: document.getElementById("resetBtn"),
594
+ autoBtn: document.getElementById("autoBtn"),
595
+ specialistSelect: document.getElementById("specialistSelect"),
596
+ delegateBtn: document.getElementById("delegateBtn"),
597
+ verifyBtn: document.getElementById("verifyBtn"),
598
+ selfBtn: document.getElementById("selfBtn"),
599
+ skipBtn: document.getElementById("skipBtn"),
600
+ statusChip: document.getElementById("statusChip"),
601
+ scenarioChip: document.getElementById("scenarioChip"),
602
+ scoreValue: document.getElementById("scoreValue"),
603
+ stepValue: document.getElementById("stepValue"),
604
+ completeValue: document.getElementById("completeValue"),
605
+ stakesValue: document.getElementById("stakesValue"),
606
+ progressFill: document.getElementById("progressFill"),
607
+ subtaskText: document.getElementById("subtaskText"),
608
+ trustList: document.getElementById("trustList"),
609
+ detectChip: document.getElementById("detectChip"),
610
+ poisonChip: document.getElementById("poisonChip"),
611
+ sessionText: document.getElementById("sessionText"),
612
+ eventList: document.getElementById("eventList")
613
+ };
614
+
615
+ function trustColor(value) {
616
+ if (value >= 0.72) return "#059669";
617
+ if (value >= 0.48) return "#2563eb";
618
+ if (value >= 0.30) return "#b45309";
619
+ return "#dc2626";
620
+ }
621
+
622
+ function bestSpecialist() {
623
+ const obs = state.observation;
624
+ if (!obs) return "S0";
625
+ return obs.available_specialists
626
+ .slice()
627
+ .sort((a, b) => (obs.trust_snapshot[b] ?? 0.5) - (obs.trust_snapshot[a] ?? 0.5))[0];
628
+ }
629
+
630
+ function renderTrust() {
631
+ const trust = state.observation?.trust_snapshot || Object.fromEntries(ids.map(id => [id, 0.5]));
632
+ el.trustList.innerHTML = ids.map(id => {
633
+ const value = Number(trust[id] ?? 0.5);
634
+ const pct = Math.round(value * 100);
635
+ return `
636
+ <div class="specialist">
637
+ <div class="sid">${id}</div>
638
+ <div class="bar"><div class="fill" style="width:${pct}%;background:${trustColor(value)}"></div></div>
639
+ <div class="score">${value.toFixed(2)}</div>
640
+ </div>
641
+ `;
642
+ }).join("");
643
+ }
644
+
645
+ function renderSpecialists() {
646
+ const available = state.observation?.available_specialists || ids;
647
+ const selected = el.specialistSelect.value || bestSpecialist();
648
+ el.specialistSelect.innerHTML = available.map(id => `<option value="${id}">${id}</option>`).join("");
649
+ el.specialistSelect.value = available.includes(selected) ? selected : bestSpecialist();
650
+ }
651
+
652
+ function renderEvents() {
653
+ if (!state.events.length) {
654
+ el.eventList.innerHTML = `<div class="muted">No events yet.</div>`;
655
+ return;
656
+ }
657
+ el.eventList.innerHTML = state.events.slice(-16).reverse().map(item => `
658
+ <div class="event">
659
+ <strong>#${item.step}</strong>
660
+ <div>${item.action}<br><span class="muted">${item.summary}</span></div>
661
+ <div class="reward">${item.reward}</div>
662
+ </div>
663
+ `).join("");
664
+ }
665
+
666
+ function render(result) {
667
+ if (result) {
668
+ state.observation = result.observation;
669
+ state.done = Boolean(result.done);
670
+ }
671
+ const obs = state.observation;
672
+ if (!obs) {
673
+ renderTrust();
674
+ renderSpecialists();
675
+ renderEvents();
676
+ return;
677
+ }
678
+
679
+ const info = result?.info || {};
680
+ const completed = obs.subtasks_total - obs.subtasks_remaining;
681
+ const progress = obs.subtasks_total ? (completed / obs.subtasks_total) * 100 : 0;
682
+ const status = state.done ? "DONE" : obs.episode_status.toUpperCase();
683
+
684
+ el.statusChip.textContent = status;
685
+ el.statusChip.className = `chip ${state.done ? "live" : "live"}`;
686
+ el.scenarioChip.textContent = obs.scenario_id;
687
+ el.scoreValue.textContent = Number(info.score ?? 0).toFixed(3);
688
+ el.stepValue.textContent = `${obs.step_count}/${obs.max_steps}`;
689
+ el.completeValue.textContent = `${completed}/${obs.subtasks_total}`;
690
+ el.stakesValue.textContent = Number(obs.stakes_level).toFixed(2);
691
+ el.progressFill.style.width = `${Math.max(0, Math.min(100, progress))}%`;
692
+ el.subtaskText.textContent = state.done ? "Episode complete." : obs.current_subtask;
693
+ el.sessionText.textContent = state.sessionId ? state.sessionId.slice(0, 8) : "No session";
694
+ el.detectChip.textContent = `${info.adversarial_detections ?? 0} detected`;
695
+ el.poisonChip.textContent = `${info.adversarial_poisonings ?? 0} poison`;
696
+ el.poisonChip.className = `chip ${(info.adversarial_poisonings ?? 0) > 0 ? "fail" : "warn"}`;
697
+
698
+ renderTrust();
699
+ renderSpecialists();
700
+ renderEvents();
701
+ setDisabled(state.done || state.running);
702
+ }
703
+
704
+ function setDisabled(disabled) {
705
+ el.delegateBtn.disabled = disabled;
706
+ el.verifyBtn.disabled = disabled;
707
+ el.selfBtn.disabled = disabled;
708
+ el.skipBtn.disabled = disabled;
709
+ }
710
+
711
+ async function resetEpisode() {
712
+ state.running = true;
713
+ el.resetBtn.disabled = true;
714
+ try {
715
+ const seed = Number(el.seedInput.value || 0);
716
+ const response = await fetch("/reset", {
717
+ method: "POST",
718
+ headers: {"Content-Type": "application/json"},
719
+ body: JSON.stringify({task_type: el.taskSelect.value, seed})
720
+ });
721
+ const result = await response.json();
722
+ if (!response.ok) throw new Error(result.detail || "reset failed");
723
+ state.taskType = result.observation.task_type;
724
+ state.sessionId = result.info.session_id;
725
+ state.events = [];
726
+ state.done = false;
727
+ addEvent(0, "reset", "Episode initialized.", "0.00");
728
+ render(result);
729
+ } catch (error) {
730
+ addEvent(0, "error", error.message, "0.00");
731
+ renderEvents();
732
+ } finally {
733
+ state.running = false;
734
+ el.resetBtn.disabled = false;
735
+ setDisabled(state.done);
736
+ }
737
+ }
738
+
739
+ function addEvent(step, action, summary, reward) {
740
+ state.events.push({step, action, summary, reward});
741
+ }
742
+
743
+ function actionPayload(type, specialist) {
744
+ const obs = state.observation;
745
+ return {
746
+ session_id: state.sessionId,
747
+ task_type: obs.task_type,
748
+ action_type: type,
749
+ specialist_id: specialist,
750
+ subtask_response: type === "solve_independently" ? "SELF_SOLVED" : null,
751
+ reasoning: `ui-${type}${specialist ? "-" + specialist : ""}`
752
+ };
753
+ }
754
+
755
+ async function stepEpisode(type, specialist = null) {
756
+ if (!state.sessionId || state.done || state.running) return;
757
+ state.running = true;
758
+ setDisabled(true);
759
+ try {
760
+ const chosen = specialist || el.specialistSelect.value || bestSpecialist();
761
+ const payload = actionPayload(type, type === "delegate" || type === "verify" ? chosen : null);
762
+ const response = await fetch(`/step?session_id=${encodeURIComponent(state.sessionId)}`, {
763
+ method: "POST",
764
+ headers: {"Content-Type": "application/json"},
765
+ body: JSON.stringify(payload)
766
+ });
767
+ const result = await response.json();
768
+ if (!response.ok) throw new Error(result.detail || "step failed");
769
+ const reward = Number(result.reward.value || 0).toFixed(2);
770
+ const label = payload.specialist_id ? `${type}:${payload.specialist_id}` : type;
771
+ addEvent(result.info.step_count, label, result.reward.reason, reward);
772
+ render(result);
773
+ } catch (error) {
774
+ addEvent(state.observation?.step_count || 0, "error", error.message, "0.00");
775
+ renderEvents();
776
+ } finally {
777
+ state.running = false;
778
+ setDisabled(state.done);
779
+ }
780
+ }
781
+
782
+ async function autoRun() {
783
+ if (!state.observation || state.done) await resetEpisode();
784
+ let guard = 0;
785
+ while (!state.done && guard < 60) {
786
+ const obs = state.observation;
787
+ const best = bestSpecialist();
788
+ const trust = obs.trust_snapshot[best] ?? 0.5;
789
+ const type = obs.stakes_level >= 0.70 && trust < 0.65 ? "verify" : "delegate";
790
+ await stepEpisode(type, best);
791
+ guard += 1;
792
+ await new Promise(resolve => setTimeout(resolve, 180));
793
+ }
794
+ }
795
+
796
+ el.resetBtn.addEventListener("click", resetEpisode);
797
+ el.delegateBtn.addEventListener("click", () => stepEpisode("delegate"));
798
+ el.verifyBtn.addEventListener("click", () => stepEpisode("verify"));
799
+ el.selfBtn.addEventListener("click", () => stepEpisode("solve_independently"));
800
+ el.skipBtn.addEventListener("click", () => stepEpisode("skip"));
801
+ el.autoBtn.addEventListener("click", autoRun);
802
+
803
+ render();
804
+ resetEpisode();
805
+ </script>
806
+ </body>
807
+ </html>