XcodeAddy committed on
Commit
1c10148
·
1 Parent(s): 7b81543

Deepen SENTINEL demo dashboard

Browse files
Files changed (2) hide show
  1. README.md +16 -0
  2. static/index.html +285 -9
README.md CHANGED
@@ -99,9 +99,25 @@ The Space opens directly into a judge-demo dashboard:
99
  - S0-S4 trust ledger bars
100
  - manual `delegate`, `verify`, `solve_independently`, and `skip` controls
101
  - heuristic auto-run
 
 
 
 
102
  - adversarial detection and poisoning counters
103
  - baseline proof table for random, heuristic, and oracle-lite policies
104
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  Start an episode:
106
 
107
  ```bash
 
99
  - S0-S4 trust ledger bars
100
  - manual `delegate`, `verify`, `solve_independently`, and `skip` controls
101
  - heuristic auto-run
102
+ - profile reshuffle demo via seed swap
103
+ - before/after failure contrast
104
+ - human-body architecture map
105
+ - hackathon theme coverage map
106
  - adversarial detection and poisoning counters
107
  - baseline proof table for random, heuristic, and oracle-lite policies
108
 
109
+ Current status as of April 22, 2026:
110
+
111
+ | Requirement | Status |
112
+ | --- | --- |
113
+ | Hugging Face Space | Live |
114
+ | Docker build | Passing |
115
+ | OpenEnv validation | Passing |
116
+ | Baseline chart | Committed |
117
+ | Live trust UI | Deployed |
118
+ | Mini-blog/video | Still required before finale |
119
+ | Onsite GRPO curve | Still required during finale |
120
+
121
  Start an episode:
122
 
123
  ```bash
static/index.html CHANGED
@@ -101,7 +101,9 @@
101
  grid-template-areas:
102
  "mission trust"
103
  "controls trust"
104
- "events metrics";
 
 
105
  align-items: start;
106
  max-width: 1480px;
107
  width: 100%;
@@ -138,6 +140,9 @@
138
  .trust { grid-area: trust; }
139
  .events { grid-area: events; }
140
  .metrics { grid-area: metrics; }
 
 
 
141
 
142
  .body {
143
  padding: 16px;
@@ -366,6 +371,133 @@
366
  gap: 10px;
367
  }
368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  .baseline-bars {
370
  margin-top: 14px;
371
  display: grid;
@@ -410,17 +542,32 @@
410
  "trust"
411
  "controls"
412
  "metrics"
413
- "events";
 
 
 
414
  padding: 14px;
415
  }
416
 
417
- .grid, .control-grid, .metric-grid {
418
  grid-template-columns: repeat(2, minmax(0, 1fr));
419
  }
 
 
 
 
 
 
 
 
 
 
 
 
420
  }
421
 
422
  @media (max-width: 560px) {
423
- .grid, .control-grid, .metric-grid {
424
  grid-template-columns: 1fr;
425
  }
426
 
@@ -457,6 +604,7 @@
457
  </select>
458
  <input id="seedInput" aria-label="Seed" type="number" value="42">
459
  <button id="resetBtn" class="primary" type="button">Reset</button>
 
460
  <button id="autoBtn" type="button">Auto Run</button>
461
  </div>
462
  </header>
@@ -512,6 +660,7 @@
512
  <h2>Actions</h2>
513
  <div class="toolbar">
514
  <select id="specialistSelect" aria-label="Specialist"></select>
 
515
  </div>
516
  </div>
517
  <div class="body">
@@ -573,6 +722,108 @@
573
  </div>
574
  </div>
575
  </section>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  </main>
577
  </div>
578
 
@@ -591,8 +842,10 @@
591
  taskSelect: document.getElementById("taskSelect"),
592
  seedInput: document.getElementById("seedInput"),
593
  resetBtn: document.getElementById("resetBtn"),
 
594
  autoBtn: document.getElementById("autoBtn"),
595
  specialistSelect: document.getElementById("specialistSelect"),
 
596
  delegateBtn: document.getElementById("delegateBtn"),
597
  verifyBtn: document.getElementById("verifyBtn"),
598
  selfBtn: document.getElementById("selfBtn"),
@@ -698,9 +951,28 @@
698
  renderTrust();
699
  renderSpecialists();
700
  renderEvents();
 
701
  setDisabled(state.done || state.running);
702
  }
703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  function setDisabled(disabled) {
705
  el.delegateBtn.disabled = disabled;
706
  el.verifyBtn.disabled = disabled;
@@ -783,17 +1055,21 @@
783
  if (!state.observation || state.done) await resetEpisode();
784
  let guard = 0;
785
  while (!state.done && guard < 60) {
786
- const obs = state.observation;
787
- const best = bestSpecialist();
788
- const trust = obs.trust_snapshot[best] ?? 0.5;
789
- const type = obs.stakes_level >= 0.70 && trust < 0.65 ? "verify" : "delegate";
790
- await stepEpisode(type, best);
791
  guard += 1;
792
  await new Promise(resolve => setTimeout(resolve, 180));
793
  }
794
  }
795
 
 
 
 
 
 
 
796
  el.resetBtn.addEventListener("click", resetEpisode);
 
797
  el.delegateBtn.addEventListener("click", () => stepEpisode("delegate"));
798
  el.verifyBtn.addEventListener("click", () => stepEpisode("verify"));
799
  el.selfBtn.addEventListener("click", () => stepEpisode("solve_independently"));
 
101
  grid-template-areas:
102
  "mission trust"
103
  "controls trust"
104
+ "events metrics"
105
+ "story story"
106
+ "architecture themes";
107
  align-items: start;
108
  max-width: 1480px;
109
  width: 100%;
 
140
  .trust { grid-area: trust; }
141
  .events { grid-area: events; }
142
  .metrics { grid-area: metrics; }
143
+ .story { grid-area: story; }
144
+ .architecture { grid-area: architecture; }
145
+ .themes { grid-area: themes; }
146
 
147
  .body {
148
  padding: 16px;
 
371
  gap: 10px;
372
  }
373
 
374
+ .story-grid {
375
+ display: grid;
376
+ grid-template-columns: minmax(0, 1fr) 72px minmax(0, 1fr);
377
+ gap: 14px;
378
+ align-items: stretch;
379
+ }
380
+
381
+ .contrast {
382
+ border: 1px solid var(--line);
383
+ border-radius: 8px;
384
+ padding: 14px;
385
+ background: #fbfcfe;
386
+ }
387
+
388
+ .contrast.before {
389
+ border-color: #fecaca;
390
+ background: #fff7f7;
391
+ }
392
+
393
+ .contrast.after {
394
+ border-color: #a7f3d0;
395
+ background: #f0fdf9;
396
+ }
397
+
398
+ .contrast-title {
399
+ display: flex;
400
+ justify-content: space-between;
401
+ gap: 10px;
402
+ align-items: center;
403
+ font-weight: 780;
404
+ margin-bottom: 12px;
405
+ color: #111827;
406
+ }
407
+
408
+ .score-pill {
409
+ border-radius: 999px;
410
+ padding: 5px 9px;
411
+ font-size: 12px;
412
+ font-weight: 800;
413
+ color: #fff;
414
+ background: #475569;
415
+ font-variant-numeric: tabular-nums;
416
+ }
417
+
418
+ .score-pill.bad { background: var(--red); }
419
+ .score-pill.good { background: var(--accent); }
420
+
421
+ .flow {
422
+ display: grid;
423
+ gap: 8px;
424
+ }
425
+
426
+ .flow-node {
427
+ border: 1px solid var(--line);
428
+ border-radius: 7px;
429
+ background: #fff;
430
+ padding: 10px;
431
+ line-height: 1.35;
432
+ min-height: 48px;
433
+ }
434
+
435
+ .before .flow-node.warn {
436
+ border-color: #fca5a5;
437
+ color: #7f1d1d;
438
+ background: #fff1f2;
439
+ }
440
+
441
+ .after .flow-node.clean {
442
+ border-color: #86efac;
443
+ color: #064e3b;
444
+ background: #ecfdf3;
445
+ }
446
+
447
+ .bridge {
448
+ display: grid;
449
+ place-items: center;
450
+ min-height: 100%;
451
+ color: #64748b;
452
+ font-size: 12px;
453
+ text-align: center;
454
+ line-height: 1.35;
455
+ border-left: 1px dashed #94a3b8;
456
+ border-right: 1px dashed #94a3b8;
457
+ }
458
+
459
+ .role-grid, .theme-grid {
460
+ display: grid;
461
+ grid-template-columns: repeat(2, minmax(0, 1fr));
462
+ gap: 10px;
463
+ }
464
+
465
+ .role, .theme-card {
466
+ border: 1px solid var(--line);
467
+ border-radius: 7px;
468
+ padding: 12px;
469
+ background: #fbfcfe;
470
+ min-height: 88px;
471
+ }
472
+
473
+ .role strong, .theme-card strong {
474
+ display: block;
475
+ margin-bottom: 6px;
476
+ color: #111827;
477
+ }
478
+
479
+ .theme-card.blue { border-color: #bfdbfe; background: #eff6ff; }
480
+ .theme-card.green { border-color: #bbf7d0; background: #f0fdf4; }
481
+ .theme-card.amber { border-color: #fde68a; background: #fffbeb; }
482
+ .theme-card.magenta { border-color: #fbcfe8; background: #fdf2f8; }
483
+
484
+ .status-strip {
485
+ display: grid;
486
+ grid-template-columns: repeat(4, minmax(0, 1fr));
487
+ gap: 10px;
488
+ margin-bottom: 14px;
489
+ }
490
+
491
+ .status-item {
492
+ border: 1px solid #bbf7d0;
493
+ background: #f0fdf4;
494
+ color: #065f46;
495
+ border-radius: 7px;
496
+ padding: 10px 12px;
497
+ font-size: 13px;
498
+ font-weight: 720;
499
+ }
500
+
501
  .baseline-bars {
502
  margin-top: 14px;
503
  display: grid;
 
542
  "trust"
543
  "controls"
544
  "metrics"
545
+ "events"
546
+ "story"
547
+ "architecture"
548
+ "themes";
549
  padding: 14px;
550
  }
551
 
552
+ .grid, .control-grid, .metric-grid, .role-grid, .theme-grid, .status-strip {
553
  grid-template-columns: repeat(2, minmax(0, 1fr));
554
  }
555
+
556
+ .story-grid {
557
+ grid-template-columns: 1fr;
558
+ }
559
+
560
+ .bridge {
561
+ min-height: 58px;
562
+ border-left: 0;
563
+ border-right: 0;
564
+ border-top: 1px dashed #94a3b8;
565
+ border-bottom: 1px dashed #94a3b8;
566
+ }
567
  }
568
 
569
  @media (max-width: 560px) {
570
+ .grid, .control-grid, .metric-grid, .role-grid, .theme-grid, .status-strip {
571
  grid-template-columns: 1fr;
572
  }
573
 
 
604
  </select>
605
  <input id="seedInput" aria-label="Seed" type="number" value="42">
606
  <button id="resetBtn" class="primary" type="button">Reset</button>
607
+ <button id="swapBtn" type="button">Swap Profiles</button>
608
  <button id="autoBtn" type="button">Auto Run</button>
609
  </div>
610
  </header>
 
660
  <h2>Actions</h2>
661
  <div class="toolbar">
662
  <select id="specialistSelect" aria-label="Specialist"></select>
663
+ <span id="recommendChip" class="chip">delegate:S0</span>
664
  </div>
665
  </div>
666
  <div class="body">
 
722
  </div>
723
  </div>
724
  </section>
725
+
726
+ <section class="story">
727
+ <div class="section-head">
728
+ <h2>Before And After</h2>
729
+ <div class="chips">
730
+ <span class="chip fail">blind trust fails</span>
731
+ <span class="chip live">verification recovers</span>
732
+ </div>
733
+ </div>
734
+ <div class="body">
735
+ <div class="status-strip">
736
+ <div class="status-item">HF Space live</div>
737
+ <div class="status-item">OpenEnv validate green</div>
738
+ <div class="status-item">Docker build green</div>
739
+ <div class="status-item">Reward chart committed</div>
740
+ </div>
741
+ <div class="story-grid">
742
+ <div class="contrast before">
743
+ <div class="contrast-title">
744
+ <span>Before SENTINEL</span>
745
+ <span class="score-pill bad">T3 random 0.699</span>
746
+ </div>
747
+ <div class="flow">
748
+ <div class="flow-node">Uniform trust: S0-S4 all start at 0.50</div>
749
+ <div class="flow-node warn">High-stakes delegation can accept poisoned output</div>
750
+ <div class="flow-node warn">Downstream subtasks inherit corrupted state</div>
751
+ <div class="flow-node">Detection rate: 0.433</div>
752
+ </div>
753
+ </div>
754
+ <div class="bridge">
755
+ profile shuffle<br>
756
+ skill not memory
757
+ </div>
758
+ <div class="contrast after">
759
+ <div class="contrast-title">
760
+ <span>After SENTINEL Policy</span>
761
+ <span class="score-pill good">T3 oracle 0.895</span>
762
+ </div>
763
+ <div class="flow">
764
+ <div class="flow-node clean">Trust ledger moves after each behavioral outcome</div>
765
+ <div class="flow-node clean">High-stakes low-trust steps trigger verification</div>
766
+ <div class="flow-node clean">Adversarial attempts are blocked before cascade</div>
767
+ <div class="flow-node">Detection: heuristic 0.735, oracle 1.000</div>
768
+ </div>
769
+ </div>
770
+ </div>
771
+ </div>
772
+ </section>
773
+
774
+ <section class="architecture">
775
+ <div class="section-head">
776
+ <h2>Architecture</h2>
777
+ <span class="muted">human-body model</span>
778
+ </div>
779
+ <div class="body">
780
+ <div class="role-grid">
781
+ <div class="role">
782
+ <strong>Brain</strong>
783
+ Orchestrator chooses delegate, verify, self solve, or skip.
784
+ </div>
785
+ <div class="role">
786
+ <strong>Heart</strong>
787
+ Environment owns reset, step, state, budget, and terminal scoring.
788
+ </div>
789
+ <div class="role">
790
+ <strong>Immune System</strong>
791
+ Trust ledger updates S0-S4 reliability from behavior.
792
+ </div>
793
+ <div class="role">
794
+ <strong>Pathogen</strong>
795
+ Adversarial specialist stays benign until high-stakes steps.
796
+ </div>
797
+ </div>
798
+ </div>
799
+ </section>
800
+
801
+ <section class="themes">
802
+ <div class="section-head">
803
+ <h2>Theme Fit</h2>
804
+ <span class="muted">finale story map</span>
805
+ </div>
806
+ <div class="body">
807
+ <div class="theme-grid">
808
+ <div class="theme-card blue">
809
+ <strong>Theme 1</strong>
810
+ Multi-agent orchestration with partial observability and adversarial dynamics.
811
+ </div>
812
+ <div class="theme-card green">
813
+ <strong>Theme 2</strong>
814
+ Long-horizon task graphs with delayed terminal reward and budget pressure.
815
+ </div>
816
+ <div class="theme-card amber">
817
+ <strong>Theme 4</strong>
818
+ Profile reshuffle creates an auto-curriculum with no identity memorization.
819
+ </div>
820
+ <div class="theme-card magenta">
821
+ <strong>Theme 5</strong>
822
+ Real AI systems problem: blind trust inside agent pipelines.
823
+ </div>
824
+ </div>
825
+ </div>
826
+ </section>
827
  </main>
828
  </div>
829
 
 
842
  taskSelect: document.getElementById("taskSelect"),
843
  seedInput: document.getElementById("seedInput"),
844
  resetBtn: document.getElementById("resetBtn"),
845
+ swapBtn: document.getElementById("swapBtn"),
846
  autoBtn: document.getElementById("autoBtn"),
847
  specialistSelect: document.getElementById("specialistSelect"),
848
+ recommendChip: document.getElementById("recommendChip"),
849
  delegateBtn: document.getElementById("delegateBtn"),
850
  verifyBtn: document.getElementById("verifyBtn"),
851
  selfBtn: document.getElementById("selfBtn"),
 
951
  renderTrust();
952
  renderSpecialists();
953
  renderEvents();
954
+ renderRecommendation();
955
  setDisabled(state.done || state.running);
956
  }
957
 
958
+ function recommendedMove() {
959
+ const obs = state.observation;
960
+ if (!obs) return {type: "delegate", specialist: "S0"};
961
+ const best = bestSpecialist();
962
+ const trust = obs.trust_snapshot[best] ?? 0.5;
963
+ const highStakes = obs.stakes_level >= 0.70;
964
+ if (highStakes && trust < 0.65) return {type: "verify", specialist: best};
965
+ return {type: "delegate", specialist: best};
966
+ }
967
+
968
+ function renderRecommendation() {
969
+ if (!el.recommendChip) return;
970
+ const move = recommendedMove();
971
+ const label = `${move.type}:${move.specialist}`;
972
+ el.recommendChip.textContent = label;
973
+ el.recommendChip.className = `chip ${move.type === "verify" ? "warn" : "live"}`;
974
+ }
975
+
976
  function setDisabled(disabled) {
977
  el.delegateBtn.disabled = disabled;
978
  el.verifyBtn.disabled = disabled;
 
1055
  if (!state.observation || state.done) await resetEpisode();
1056
  let guard = 0;
1057
  while (!state.done && guard < 60) {
1058
+ const move = recommendedMove();
1059
+ await stepEpisode(move.type, move.specialist);
 
 
 
1060
  guard += 1;
1061
  await new Promise(resolve => setTimeout(resolve, 180));
1062
  }
1063
  }
1064
 
1065
+ async function swapProfiles() {
1066
+ const nextSeed = Number(el.seedInput.value || 0) + 1;
1067
+ el.seedInput.value = String(nextSeed);
1068
+ await resetEpisode();
1069
+ }
1070
+
1071
  el.resetBtn.addEventListener("click", resetEpisode);
1072
+ el.swapBtn.addEventListener("click", swapProfiles);
1073
  el.delegateBtn.addEventListener("click", () => stepEpisode("delegate"));
1074
  el.verifyBtn.addEventListener("click", () => stepEpisode("verify"));
1075
  el.selfBtn.addEventListener("click", () => stepEpisode("solve_independently"));