Harshit200431 commited on
Commit
c8e966f
·
2 Parent(s): 1d68c54abef90f
.gitignore CHANGED
@@ -71,6 +71,7 @@ datasets/
71
  # =========================
72
  # BUILD / OUTPUT
73
  # =========================
 
74
  dist/
75
  build/
76
  out/
 
71
  # =========================
72
  # BUILD / OUTPUT
73
  # =========================
74
+ node_modules/
75
  dist/
76
  build/
77
  out/
docs/TRAINING_RUNBOOK.md CHANGED
@@ -173,6 +173,11 @@ outputs/charts/detection_vs_poisoning.png
173
  outputs/charts/cluster_health_timeline.png
174
  outputs/charts/task_radar.png
175
  outputs/charts/ablation.png
 
 
 
 
 
176
  ```
177
 
178
  Then verify:
 
173
  outputs/charts/cluster_health_timeline.png
174
  outputs/charts/task_radar.png
175
  outputs/charts/ablation.png
176
+ outputs/charts/baseline_delta_lines.png
177
+ outputs/charts/cluster_health_policy_lines.png
178
+ outputs/charts/trust_gap_over_time.png
179
+ outputs/charts/reward_component_stacked_area.png
180
+ outputs/charts/failure_fishbone_map.png
181
  ```
182
 
183
  Then verify:
environment.py CHANGED
@@ -583,6 +583,7 @@ class SentinelEnv:
583
  "confidence": round(confidence, 3) if confidence is not None else None,
584
  "trust_before": round(trust_before, 3) if trust_before is not None else None,
585
  "trust_after": self._ledger.snapshot().get(specialist_id) if specialist_id else None,
 
586
  "result_metadata": result_metadata,
587
  }
588
  self._reward_trace.append(event)
 
583
  "confidence": round(confidence, 3) if confidence is not None else None,
584
  "trust_before": round(trust_before, 3) if trust_before is not None else None,
585
  "trust_after": self._ledger.snapshot().get(specialist_id) if specialist_id else None,
586
+ "trust_snapshot": self._ledger.snapshot(),
587
  "result_metadata": result_metadata,
588
  }
589
  self._reward_trace.append(event)
outputs/charts/baseline_delta_lines.png ADDED

Git LFS Details

  • SHA256: 4b37b83f4786fa9ad29a941f37eacb3812e1a04736b9e678faac80bee0e1b1b0
  • Pointer size: 130 Bytes
  • Size of remote file: 17.3 kB
outputs/charts/cluster_health_policy_lines.png ADDED

Git LFS Details

  • SHA256: 626dbd0866e2ab55b6bf635e7fb479b55b91ec9dce2ac47fa3e072b590a5b500
  • Pointer size: 129 Bytes
  • Size of remote file: 6.23 kB
outputs/charts/failure_fishbone_map.png ADDED

Git LFS Details

  • SHA256: 640a614a49f9d1d7ccbb640ce091d11d6a7756cc5a941d74b27f700d65e60817
  • Pointer size: 130 Bytes
  • Size of remote file: 10.4 kB
outputs/charts/reward_component_stacked_area.png ADDED

Git LFS Details

  • SHA256: 18dc556ca3b726e5084dd7e9662cc534a9eb2908c9ff32839ccb4c8880ab194f
  • Pointer size: 130 Bytes
  • Size of remote file: 14.3 kB
outputs/charts/trust_gap_over_time.png ADDED

Git LFS Details

  • SHA256: 512c93574a0c5e55134d70fa509d210409b198fead3469b838cfb5688082c241
  • Pointer size: 129 Bytes
  • Size of remote file: 7.74 kB
outputs/reward_report_task3_seed42.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "episode_id": "b2421ee8-92e4-4d4b-b53d-7b4cdd6c86ee",
3
- "session_id": "0f5acbea-d300-4044-b8dc-e0699bedef81",
4
  "task_type": "task3",
5
  "score": 0.6759,
6
  "total_reward": 17.5723,
@@ -41,6 +41,13 @@
41
  "confidence": 0.88,
42
  "trust_before": 0.5,
43
  "trust_after": 0.706,
 
 
 
 
 
 
 
44
  "result_metadata": {
45
  "step_cost": 1,
46
  "triggered": false,
@@ -70,6 +77,13 @@
70
  "confidence": 0.88,
71
  "trust_before": 0.706,
72
  "trust_after": 0.795,
 
 
 
 
 
 
 
73
  "result_metadata": {
74
  "step_cost": 1,
75
  "triggered": false,
@@ -99,6 +113,13 @@
99
  "confidence": 0.88,
100
  "trust_before": 0.795,
101
  "trust_after": 0.847,
 
 
 
 
 
 
 
102
  "result_metadata": {
103
  "step_cost": 1,
104
  "triggered": false,
@@ -128,6 +149,13 @@
128
  "confidence": 0.88,
129
  "trust_before": 0.847,
130
  "trust_after": 0.878,
 
 
 
 
 
 
 
131
  "result_metadata": {
132
  "step_cost": 1,
133
  "triggered": false,
@@ -157,6 +185,13 @@
157
  "confidence": 0.88,
158
  "trust_before": 0.878,
159
  "trust_after": 0.9,
 
 
 
 
 
 
 
160
  "result_metadata": {
161
  "step_cost": 1,
162
  "triggered": false,
@@ -186,6 +221,13 @@
186
  "confidence": 0.88,
187
  "trust_before": 0.9,
188
  "trust_after": 0.913,
 
 
 
 
 
 
 
189
  "result_metadata": {
190
  "step_cost": 1,
191
  "triggered": false,
@@ -215,6 +257,13 @@
215
  "confidence": 0.88,
216
  "trust_before": 0.913,
217
  "trust_after": 0.786,
 
 
 
 
 
 
 
218
  "result_metadata": {
219
  "step_cost": 1,
220
  "triggered": false,
@@ -244,6 +293,13 @@
244
  "confidence": 0.88,
245
  "trust_before": 0.786,
246
  "trust_after": 0.812,
 
 
 
 
 
 
 
247
  "result_metadata": {
248
  "step_cost": 1,
249
  "triggered": false,
@@ -273,6 +329,13 @@
273
  "confidence": 0.88,
274
  "trust_before": 0.812,
275
  "trust_after": 0.834,
 
 
 
 
 
 
 
276
  "result_metadata": {
277
  "step_cost": 1,
278
  "triggered": false,
@@ -302,6 +365,13 @@
302
  "confidence": 0.88,
303
  "trust_before": 0.834,
304
  "trust_after": 0.852,
 
 
 
 
 
 
 
305
  "result_metadata": {
306
  "step_cost": 1,
307
  "triggered": false,
@@ -331,6 +401,13 @@
331
  "confidence": 0.88,
332
  "trust_before": 0.852,
333
  "trust_after": 0.865,
 
 
 
 
 
 
 
334
  "result_metadata": {
335
  "step_cost": 1,
336
  "triggered": false,
@@ -360,6 +437,13 @@
360
  "confidence": 0.88,
361
  "trust_before": 0.865,
362
  "trust_after": 0.878,
 
 
 
 
 
 
 
363
  "result_metadata": {
364
  "step_cost": 1,
365
  "triggered": false,
@@ -389,6 +473,13 @@
389
  "confidence": 0.88,
390
  "trust_before": 0.878,
391
  "trust_after": 0.888,
 
 
 
 
 
 
 
392
  "result_metadata": {
393
  "step_cost": 1,
394
  "triggered": false,
@@ -418,6 +509,13 @@
418
  "confidence": 0.88,
419
  "trust_before": 0.888,
420
  "trust_after": 0.897,
 
 
 
 
 
 
 
421
  "result_metadata": {
422
  "step_cost": 1,
423
  "triggered": false,
@@ -447,6 +545,13 @@
447
  "confidence": 0.92,
448
  "trust_before": 0.897,
449
  "trust_after": 0.825,
 
 
 
 
 
 
 
450
  "result_metadata": {
451
  "step_cost": 1,
452
  "triggered": true,
@@ -476,6 +581,13 @@
476
  "confidence": 0.92,
477
  "trust_before": 0.825,
478
  "trust_after": 0.763,
 
 
 
 
 
 
 
479
  "result_metadata": {
480
  "step_cost": 1,
481
  "triggered": true,
@@ -505,6 +617,13 @@
505
  "confidence": 0.92,
506
  "trust_before": 0.763,
507
  "trust_after": 0.708,
 
 
 
 
 
 
 
508
  "result_metadata": {
509
  "step_cost": 1,
510
  "triggered": true,
@@ -534,6 +653,13 @@
534
  "confidence": 0.92,
535
  "trust_before": 0.708,
536
  "trust_after": 0.659,
 
 
 
 
 
 
 
537
  "result_metadata": {
538
  "step_cost": 1,
539
  "triggered": true,
@@ -563,6 +689,13 @@
563
  "confidence": 0.92,
564
  "trust_before": 0.659,
565
  "trust_after": 0.619,
 
 
 
 
 
 
 
566
  "result_metadata": {
567
  "step_cost": 1,
568
  "triggered": true,
@@ -592,6 +725,13 @@
592
  "confidence": 0.92,
593
  "trust_before": 0.619,
594
  "trust_after": 0.583,
 
 
 
 
 
 
 
595
  "result_metadata": {
596
  "step_cost": 1,
597
  "triggered": true,
@@ -621,6 +761,13 @@
621
  "confidence": 0.92,
622
  "trust_before": 0.583,
623
  "trust_after": 0.551,
 
 
 
 
 
 
 
624
  "result_metadata": {
625
  "step_cost": 1,
626
  "triggered": true,
@@ -650,6 +797,13 @@
650
  "confidence": 0.92,
651
  "trust_before": 0.551,
652
  "trust_after": 0.52,
 
 
 
 
 
 
 
653
  "result_metadata": {
654
  "step_cost": 1,
655
  "triggered": true,
@@ -679,6 +833,13 @@
679
  "confidence": 0.92,
680
  "trust_before": 0.52,
681
  "trust_after": 0.492,
 
 
 
 
 
 
 
682
  "result_metadata": {
683
  "step_cost": 1,
684
  "triggered": true,
@@ -708,6 +869,13 @@
708
  "confidence": 0.95,
709
  "trust_before": 0.5,
710
  "trust_after": 0.207,
 
 
 
 
 
 
 
711
  "result_metadata": {
712
  "step_cost": 1
713
  }
@@ -734,6 +902,13 @@
734
  "confidence": 0.45,
735
  "trust_before": 0.5,
736
  "trust_after": 0.207,
 
 
 
 
 
 
 
737
  "result_metadata": {
738
  "step_cost": 1,
739
  "in_domain": false,
@@ -762,6 +937,13 @@
762
  "confidence": null,
763
  "trust_before": null,
764
  "trust_after": null,
 
 
 
 
 
 
 
765
  "result_metadata": {}
766
  }
767
  ],
 
1
  {
2
+ "episode_id": "7f3bd324-24c1-4ca3-a365-794120f3de9b",
3
+ "session_id": "9c27bdf1-9627-476c-ac9b-48e59d137638",
4
  "task_type": "task3",
5
  "score": 0.6759,
6
  "total_reward": 17.5723,
 
41
  "confidence": 0.88,
42
  "trust_before": 0.5,
43
  "trust_after": 0.706,
44
+ "trust_snapshot": {
45
+ "S0": 0.706,
46
+ "S1": 0.5,
47
+ "S2": 0.5,
48
+ "S3": 0.5,
49
+ "S4": 0.5
50
+ },
51
  "result_metadata": {
52
  "step_cost": 1,
53
  "triggered": false,
 
77
  "confidence": 0.88,
78
  "trust_before": 0.706,
79
  "trust_after": 0.795,
80
+ "trust_snapshot": {
81
+ "S0": 0.795,
82
+ "S1": 0.5,
83
+ "S2": 0.5,
84
+ "S3": 0.5,
85
+ "S4": 0.5
86
+ },
87
  "result_metadata": {
88
  "step_cost": 1,
89
  "triggered": false,
 
113
  "confidence": 0.88,
114
  "trust_before": 0.795,
115
  "trust_after": 0.847,
116
+ "trust_snapshot": {
117
+ "S0": 0.847,
118
+ "S1": 0.5,
119
+ "S2": 0.5,
120
+ "S3": 0.5,
121
+ "S4": 0.5
122
+ },
123
  "result_metadata": {
124
  "step_cost": 1,
125
  "triggered": false,
 
149
  "confidence": 0.88,
150
  "trust_before": 0.847,
151
  "trust_after": 0.878,
152
+ "trust_snapshot": {
153
+ "S0": 0.878,
154
+ "S1": 0.5,
155
+ "S2": 0.5,
156
+ "S3": 0.5,
157
+ "S4": 0.5
158
+ },
159
  "result_metadata": {
160
  "step_cost": 1,
161
  "triggered": false,
 
185
  "confidence": 0.88,
186
  "trust_before": 0.878,
187
  "trust_after": 0.9,
188
+ "trust_snapshot": {
189
+ "S0": 0.9,
190
+ "S1": 0.5,
191
+ "S2": 0.5,
192
+ "S3": 0.5,
193
+ "S4": 0.5
194
+ },
195
  "result_metadata": {
196
  "step_cost": 1,
197
  "triggered": false,
 
221
  "confidence": 0.88,
222
  "trust_before": 0.9,
223
  "trust_after": 0.913,
224
+ "trust_snapshot": {
225
+ "S0": 0.913,
226
+ "S1": 0.5,
227
+ "S2": 0.5,
228
+ "S3": 0.5,
229
+ "S4": 0.5
230
+ },
231
  "result_metadata": {
232
  "step_cost": 1,
233
  "triggered": false,
 
257
  "confidence": 0.88,
258
  "trust_before": 0.913,
259
  "trust_after": 0.786,
260
+ "trust_snapshot": {
261
+ "S0": 0.786,
262
+ "S1": 0.5,
263
+ "S2": 0.5,
264
+ "S3": 0.5,
265
+ "S4": 0.5
266
+ },
267
  "result_metadata": {
268
  "step_cost": 1,
269
  "triggered": false,
 
293
  "confidence": 0.88,
294
  "trust_before": 0.786,
295
  "trust_after": 0.812,
296
+ "trust_snapshot": {
297
+ "S0": 0.812,
298
+ "S1": 0.5,
299
+ "S2": 0.5,
300
+ "S3": 0.5,
301
+ "S4": 0.5
302
+ },
303
  "result_metadata": {
304
  "step_cost": 1,
305
  "triggered": false,
 
329
  "confidence": 0.88,
330
  "trust_before": 0.812,
331
  "trust_after": 0.834,
332
+ "trust_snapshot": {
333
+ "S0": 0.834,
334
+ "S1": 0.5,
335
+ "S2": 0.5,
336
+ "S3": 0.5,
337
+ "S4": 0.5
338
+ },
339
  "result_metadata": {
340
  "step_cost": 1,
341
  "triggered": false,
 
365
  "confidence": 0.88,
366
  "trust_before": 0.834,
367
  "trust_after": 0.852,
368
+ "trust_snapshot": {
369
+ "S0": 0.852,
370
+ "S1": 0.5,
371
+ "S2": 0.5,
372
+ "S3": 0.5,
373
+ "S4": 0.5
374
+ },
375
  "result_metadata": {
376
  "step_cost": 1,
377
  "triggered": false,
 
401
  "confidence": 0.88,
402
  "trust_before": 0.852,
403
  "trust_after": 0.865,
404
+ "trust_snapshot": {
405
+ "S0": 0.865,
406
+ "S1": 0.5,
407
+ "S2": 0.5,
408
+ "S3": 0.5,
409
+ "S4": 0.5
410
+ },
411
  "result_metadata": {
412
  "step_cost": 1,
413
  "triggered": false,
 
437
  "confidence": 0.88,
438
  "trust_before": 0.865,
439
  "trust_after": 0.878,
440
+ "trust_snapshot": {
441
+ "S0": 0.878,
442
+ "S1": 0.5,
443
+ "S2": 0.5,
444
+ "S3": 0.5,
445
+ "S4": 0.5
446
+ },
447
  "result_metadata": {
448
  "step_cost": 1,
449
  "triggered": false,
 
473
  "confidence": 0.88,
474
  "trust_before": 0.878,
475
  "trust_after": 0.888,
476
+ "trust_snapshot": {
477
+ "S0": 0.888,
478
+ "S1": 0.5,
479
+ "S2": 0.5,
480
+ "S3": 0.5,
481
+ "S4": 0.5
482
+ },
483
  "result_metadata": {
484
  "step_cost": 1,
485
  "triggered": false,
 
509
  "confidence": 0.88,
510
  "trust_before": 0.888,
511
  "trust_after": 0.897,
512
+ "trust_snapshot": {
513
+ "S0": 0.897,
514
+ "S1": 0.5,
515
+ "S2": 0.5,
516
+ "S3": 0.5,
517
+ "S4": 0.5
518
+ },
519
  "result_metadata": {
520
  "step_cost": 1,
521
  "triggered": false,
 
545
  "confidence": 0.92,
546
  "trust_before": 0.897,
547
  "trust_after": 0.825,
548
+ "trust_snapshot": {
549
+ "S0": 0.825,
550
+ "S1": 0.5,
551
+ "S2": 0.5,
552
+ "S3": 0.5,
553
+ "S4": 0.5
554
+ },
555
  "result_metadata": {
556
  "step_cost": 1,
557
  "triggered": true,
 
581
  "confidence": 0.92,
582
  "trust_before": 0.825,
583
  "trust_after": 0.763,
584
+ "trust_snapshot": {
585
+ "S0": 0.763,
586
+ "S1": 0.5,
587
+ "S2": 0.5,
588
+ "S3": 0.5,
589
+ "S4": 0.5
590
+ },
591
  "result_metadata": {
592
  "step_cost": 1,
593
  "triggered": true,
 
617
  "confidence": 0.92,
618
  "trust_before": 0.763,
619
  "trust_after": 0.708,
620
+ "trust_snapshot": {
621
+ "S0": 0.708,
622
+ "S1": 0.5,
623
+ "S2": 0.5,
624
+ "S3": 0.5,
625
+ "S4": 0.5
626
+ },
627
  "result_metadata": {
628
  "step_cost": 1,
629
  "triggered": true,
 
653
  "confidence": 0.92,
654
  "trust_before": 0.708,
655
  "trust_after": 0.659,
656
+ "trust_snapshot": {
657
+ "S0": 0.659,
658
+ "S1": 0.5,
659
+ "S2": 0.5,
660
+ "S3": 0.5,
661
+ "S4": 0.5
662
+ },
663
  "result_metadata": {
664
  "step_cost": 1,
665
  "triggered": true,
 
689
  "confidence": 0.92,
690
  "trust_before": 0.659,
691
  "trust_after": 0.619,
692
+ "trust_snapshot": {
693
+ "S0": 0.619,
694
+ "S1": 0.5,
695
+ "S2": 0.5,
696
+ "S3": 0.5,
697
+ "S4": 0.5
698
+ },
699
  "result_metadata": {
700
  "step_cost": 1,
701
  "triggered": true,
 
725
  "confidence": 0.92,
726
  "trust_before": 0.619,
727
  "trust_after": 0.583,
728
+ "trust_snapshot": {
729
+ "S0": 0.583,
730
+ "S1": 0.5,
731
+ "S2": 0.5,
732
+ "S3": 0.5,
733
+ "S4": 0.5
734
+ },
735
  "result_metadata": {
736
  "step_cost": 1,
737
  "triggered": true,
 
761
  "confidence": 0.92,
762
  "trust_before": 0.583,
763
  "trust_after": 0.551,
764
+ "trust_snapshot": {
765
+ "S0": 0.551,
766
+ "S1": 0.5,
767
+ "S2": 0.5,
768
+ "S3": 0.5,
769
+ "S4": 0.5
770
+ },
771
  "result_metadata": {
772
  "step_cost": 1,
773
  "triggered": true,
 
797
  "confidence": 0.92,
798
  "trust_before": 0.551,
799
  "trust_after": 0.52,
800
+ "trust_snapshot": {
801
+ "S0": 0.52,
802
+ "S1": 0.5,
803
+ "S2": 0.5,
804
+ "S3": 0.5,
805
+ "S4": 0.5
806
+ },
807
  "result_metadata": {
808
  "step_cost": 1,
809
  "triggered": true,
 
833
  "confidence": 0.92,
834
  "trust_before": 0.52,
835
  "trust_after": 0.492,
836
+ "trust_snapshot": {
837
+ "S0": 0.492,
838
+ "S1": 0.5,
839
+ "S2": 0.5,
840
+ "S3": 0.5,
841
+ "S4": 0.5
842
+ },
843
  "result_metadata": {
844
  "step_cost": 1,
845
  "triggered": true,
 
869
  "confidence": 0.95,
870
  "trust_before": 0.5,
871
  "trust_after": 0.207,
872
+ "trust_snapshot": {
873
+ "S0": 0.492,
874
+ "S1": 0.207,
875
+ "S2": 0.5,
876
+ "S3": 0.5,
877
+ "S4": 0.5
878
+ },
879
  "result_metadata": {
880
  "step_cost": 1
881
  }
 
902
  "confidence": 0.45,
903
  "trust_before": 0.5,
904
  "trust_after": 0.207,
905
+ "trust_snapshot": {
906
+ "S0": 0.492,
907
+ "S1": 0.207,
908
+ "S2": 0.207,
909
+ "S3": 0.5,
910
+ "S4": 0.5
911
+ },
912
  "result_metadata": {
913
  "step_cost": 1,
914
  "in_domain": false,
 
937
  "confidence": null,
938
  "trust_before": null,
939
  "trust_after": null,
940
+ "trust_snapshot": {
941
+ "S0": 0.492,
942
+ "S1": 0.207,
943
+ "S2": 0.207,
944
+ "S3": 0.5,
945
+ "S4": 0.5
946
+ },
947
  "result_metadata": {}
948
  }
949
  ],
training/colab_notebook.ipynb CHANGED
@@ -7,7 +7,7 @@
7
  "source": [
8
  "# SENTINEL GRPO Training (Colab T4)\n",
9
  "\n",
10
- "This notebook trains a small GRPO LoRA, records a deterministic replay table, and generates the seven demo charts for the Hugging Face Space."
11
  ]
12
  },
13
  {
@@ -124,6 +124,11 @@
124
  " \"cluster_health_timeline.png\",\n",
125
  " \"task_radar.png\",\n",
126
  " \"ablation.png\",\n",
 
 
 
 
 
127
  "]:\n",
128
  " print(name)\n",
129
  " display(Image(f\"outputs/charts/{name}\"))"
@@ -150,13 +155,13 @@
150
  "metadata": {
151
  "accelerator": "GPU",
152
  "kernelspec": {
153
- "display_name": "Python 3",
154
  "language": "python",
155
  "name": "python3"
156
  },
157
  "language_info": {
158
  "name": "python",
159
- "version": "3.11"
160
  }
161
  },
162
  "nbformat": 4,
 
7
  "source": [
8
  "# SENTINEL GRPO Training (Colab T4)\n",
9
  "\n",
10
+ "This notebook trains a small GRPO LoRA, records a deterministic replay table, and generates the full SENTINEL demo chart bundle for the Hugging Face Space."
11
  ]
12
  },
13
  {
 
124
  " \"cluster_health_timeline.png\",\n",
125
  " \"task_radar.png\",\n",
126
  " \"ablation.png\",\n",
127
+ " \"baseline_delta_lines.png\",\n",
128
+ " \"cluster_health_policy_lines.png\",\n",
129
+ " \"trust_gap_over_time.png\",\n",
130
+ " \"reward_component_stacked_area.png\",\n",
131
+ " \"failure_fishbone_map.png\",\n",
132
  "]:\n",
133
  " print(name)\n",
134
  " display(Image(f\"outputs/charts/{name}\"))"
 
155
  "metadata": {
156
  "accelerator": "GPU",
157
  "kernelspec": {
158
+ "display_name": ".venv (3.13.7)",
159
  "language": "python",
160
  "name": "python3"
161
  },
162
  "language_info": {
163
  "name": "python",
164
+ "version": "3.13.7"
165
  }
166
  },
167
  "nbformat": 4,
training/plots.py CHANGED
@@ -80,6 +80,11 @@ def _write_matplotlib_bundle(
80
  _plot_cluster_health(plt, cluster_health, out_dir / "cluster_health_timeline.png")
81
  _plot_task_radar(plt, post, out_dir / "task_radar.png")
82
  _plot_ablation(plt, pre, post, out_dir / "ablation.png")
 
 
 
 
 
83
 
84
 
85
  def _plot_grouped_bars(plt, payload: dict[str, Any], path: Path) -> None:
@@ -124,8 +129,17 @@ def _plot_trust_evolution(plt, report: dict[str, Any], path: Path) -> None:
124
  events = report.get("events", [])
125
  fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
126
  for sid in ["S0", "S1", "S2", "S3", "S4"]:
127
- xs = [row.get("step_count", idx) for idx, row in enumerate(events) if sid in row.get("trust_snapshot", {})]
128
- ys = [row["trust_snapshot"][sid] for row in events if sid in row.get("trust_snapshot", {})]
 
 
 
 
 
 
 
 
 
129
  if xs:
130
  ax.plot(xs, ys, label=sid, linewidth=2)
131
  if not events:
@@ -222,6 +236,111 @@ def _plot_ablation(plt, pre: dict[str, Any], post: dict[str, Any], path: Path) -
222
  plt.close(fig)
223
 
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  def _write_fallback_bundle(
226
  pre: dict[str, Any],
227
  post: dict[str, Any],
@@ -243,9 +362,26 @@ def _write_fallback_bundle(
243
  "cluster_health_timeline.png": ("CLUSTER HEALTH TIMELINE", [f"series={len(cluster_health.get('series', {}))}"]),
244
  "task_radar.png": ("TASK CAPABILITY RADAR", lines),
245
  "ablation.png": ("REWARD ENGINE ABLATION", ["confidence + domain + verify signals"]),
 
 
 
 
246
  }
247
  for filename, (title, chart_lines) in charts.items():
248
- _write_text_png(out_dir / filename, title, chart_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
 
251
  def _policies_from_payload(payload: dict[str, Any]) -> list[str]:
@@ -260,6 +396,311 @@ def _policies_from_payload(payload: dict[str, Any]) -> list[str]:
260
  ] or ["random", "heuristic", "oracle_lite", "trained"]
261
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def _read_json(path: str | Path) -> dict[str, Any]:
264
  target = Path(path)
265
  if not target.exists():
 
80
  _plot_cluster_health(plt, cluster_health, out_dir / "cluster_health_timeline.png")
81
  _plot_task_radar(plt, post, out_dir / "task_radar.png")
82
  _plot_ablation(plt, pre, post, out_dir / "ablation.png")
83
+ _plot_baseline_delta_lines(plt, post, out_dir / "baseline_delta_lines.png")
84
+ _plot_cluster_health_policy_lines(plt, cluster_health, post, out_dir / "cluster_health_policy_lines.png")
85
+ _plot_trust_gap_over_time(plt, reward_report, out_dir / "trust_gap_over_time.png")
86
+ _plot_reward_component_stacked_area(plt, reward_report, out_dir / "reward_component_stacked_area.png")
87
+ _plot_failure_fishbone(plt, out_dir / "failure_fishbone_map.png")
88
 
89
 
90
  def _plot_grouped_bars(plt, payload: dict[str, Any], path: Path) -> None:
 
129
  events = report.get("events", [])
130
  fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
131
  for sid in ["S0", "S1", "S2", "S3", "S4"]:
132
+ xs = []
133
+ ys = []
134
+ last = 0.5
135
+ for idx, row in enumerate(events):
136
+ snapshot = row.get("trust_snapshot", {})
137
+ if sid in snapshot:
138
+ last = snapshot[sid]
139
+ elif row.get("specialist_id") == sid and row.get("trust_after") is not None:
140
+ last = row["trust_after"]
141
+ xs.append(row.get("step_count", idx))
142
+ ys.append(last)
143
  if xs:
144
  ax.plot(xs, ys, label=sid, linewidth=2)
145
  if not events:
 
236
  plt.close(fig)
237
 
238
 
239
+ def _plot_baseline_delta_lines(plt, payload: dict[str, Any], path: Path) -> None:
240
+ seeds, deltas = _baseline_delta_series(payload)
241
+ fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
242
+ for name, values in deltas.items():
243
+ color = {
244
+ "Heuristic - Random": PALETTE["heuristic"],
245
+ "GRPO - Random": PALETTE["trained"],
246
+ "Oracle-lite - Random": PALETTE["oracle_lite"],
247
+ "GRPO - Heuristic": "#f59e0b",
248
+ }.get(name, "#64748b")
249
+ ax.plot(seeds, values, label=name, linewidth=2.5, color=color)
250
+ ax.axhline(0, color="#0f172a", linewidth=1, alpha=0.55)
251
+ ax.set_title("Baseline Difference Over Evaluation Seeds")
252
+ ax.set_xlabel("Held-out seed")
253
+ ax.set_ylabel("Score delta")
254
+ ax.legend()
255
+ fig.tight_layout()
256
+ fig.savefig(path)
257
+ plt.close(fig)
258
+
259
+
260
+ def _plot_cluster_health_policy_lines(plt, cluster_payload: dict[str, Any], eval_payload: dict[str, Any], path: Path) -> None:
261
+ series = _cluster_policy_series(cluster_payload, eval_payload)
262
+ fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
263
+ for policy, values in series.items():
264
+ ax.plot(
265
+ range(len(values)),
266
+ values,
267
+ label=LABELS.get(policy, policy.title()),
268
+ color=PALETTE.get(policy, "#64748b"),
269
+ linewidth=2.5,
270
+ )
271
+ ax.set_title("Cluster Health by Policy")
272
+ ax.set_xlabel("Step bucket")
273
+ ax.set_ylabel("Cluster health / survivability proxy")
274
+ ax.set_ylim(0, 1)
275
+ ax.legend()
276
+ fig.tight_layout()
277
+ fig.savefig(path)
278
+ plt.close(fig)
279
+
280
+
281
+ def _plot_trust_gap_over_time(plt, report: dict[str, Any], path: Path) -> None:
282
+ xs, best, worst, gap = _trust_gap_series(report)
283
+ fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
284
+ ax.plot(xs, best, label="Highest trust", color="#22c55e", linewidth=2.4)
285
+ ax.plot(xs, worst, label="Lowest trust", color="#ef4444", linewidth=2.4)
286
+ ax.fill_between(xs, worst, best, color="#a855f7", alpha=0.14, label="Calibration gap")
287
+ ax.plot(xs, gap, label="Best - worst", color=PALETTE["trained"], linewidth=2, linestyle="--")
288
+ ax.set_title("Trust Calibration Gap Over Time")
289
+ ax.set_xlabel("Step")
290
+ ax.set_ylabel("Trust score")
291
+ ax.set_ylim(0, 1)
292
+ ax.legend()
293
+ fig.tight_layout()
294
+ fig.savefig(path)
295
+ plt.close(fig)
296
+
297
+
298
+ def _plot_reward_component_stacked_area(plt, report: dict[str, Any], path: Path) -> None:
299
+ xs, components = _reward_component_series(report)
300
+ fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
301
+ names = list(components)
302
+ values = [components[name] for name in names]
303
+ colors = ["#22c55e", "#3b82f6", "#a855f7", "#f59e0b", "#ef4444", "#64748b"]
304
+ ax.stackplot(xs, values, labels=[name.replace("_", " ") for name in names], colors=colors[:len(names)], alpha=0.78)
305
+ ax.set_title("Reward Components Over Episode")
306
+ ax.set_xlabel("Step")
307
+ ax.set_ylabel("Component contribution")
308
+ ax.set_ylim(0, max(1.0, max((sum(row) for row in zip(*values)), default=1.0)))
309
+ ax.legend(loc="upper left", ncols=2)
310
+ fig.tight_layout()
311
+ fig.savefig(path)
312
+ plt.close(fig)
313
+
314
+
315
+ def _plot_failure_fishbone(plt, path: Path) -> None:
316
+ bones = [
317
+ ("Long-horizon drift", "Plan coherence + delayed terminal score"),
318
+ ("Reward hacking", "AuditLedger + false-completion attacks"),
319
+ ("Agent trust failure", "Bayesian TrustLedger + profile shuffle"),
320
+ ("Evaluation collapse", "Seeds + scenario signatures + attack diversity"),
321
+ ("No self-improvement", "DifficultyController + adversary escalation"),
322
+ ("Context memory loss", "Persistent cluster-goal drift counter"),
323
+ ("Hallucination confidence", "Confidence-accuracy fingerprints"),
324
+ ("Agent loop failure", "Repeated-action penalty"),
325
+ ]
326
+ fig, ax = plt.subplots(figsize=(12, 7), dpi=200)
327
+ ax.axis("off")
328
+ ax.plot([0.08, 0.82], [0.5, 0.5], color="#1e293b", linewidth=3)
329
+ ax.text(0.86, 0.5, "AI Agent Failure\nin Long-Horizon GPU Ops", va="center", ha="left", fontsize=14, fontweight="bold")
330
+ for idx, (problem, solution) in enumerate(bones):
331
+ upper = idx % 2 == 0
332
+ slot = idx // 2
333
+ x = 0.18 + slot * 0.17
334
+ y = 0.74 if upper else 0.26
335
+ ax.plot([x, x + 0.10], [0.5, y], color="#475569", linewidth=2)
336
+ ax.text(x + 0.105, y + (0.025 if upper else -0.025), problem, ha="left", va="center", fontsize=10, fontweight="bold", color="#0f172a")
337
+ ax.text(x + 0.105, y - (0.025 if upper else 0.075), solution, ha="left", va="center", fontsize=8.5, color="#475569")
338
+ ax.set_title("SENTINEL Failure Fishbone Map", fontsize=18, fontweight="bold", pad=20)
339
+ fig.tight_layout()
340
+ fig.savefig(path)
341
+ plt.close(fig)
342
+
343
+
344
  def _write_fallback_bundle(
345
  pre: dict[str, Any],
346
  post: dict[str, Any],
 
362
  "cluster_health_timeline.png": ("CLUSTER HEALTH TIMELINE", [f"series={len(cluster_health.get('series', {}))}"]),
363
  "task_radar.png": ("TASK CAPABILITY RADAR", lines),
364
  "ablation.png": ("REWARD ENGINE ABLATION", ["confidence + domain + verify signals"]),
365
+ "baseline_delta_lines.png": ("BASELINE DELTA LINES", ["GRPO/heuristic/oracle minus baseline"]),
366
+ "cluster_health_policy_lines.png": ("CLUSTER HEALTH BY POLICY", ["survivability trend per policy"]),
367
+ "trust_gap_over_time.png": ("TRUST GAP OVER TIME", ["best trust minus worst trust"]),
368
+ "reward_component_stacked_area.png": ("REWARD COMPONENT AREA", ["accuracy + stakes + confidence + verify"]),
369
  }
370
  for filename, (title, chart_lines) in charts.items():
371
+ if filename == "baseline_delta_lines.png":
372
+ seeds, deltas = _baseline_delta_series(post)
373
+ _write_line_chart_png(out_dir / filename, title, deltas, x_values=seeds, y_min=-0.1, y_max=0.35)
374
+ elif filename == "cluster_health_policy_lines.png":
375
+ _write_line_chart_png(out_dir / filename, title, _cluster_policy_series(cluster_health, post), y_min=0.0, y_max=1.0)
376
+ elif filename == "trust_gap_over_time.png":
377
+ xs, best, worst, gap = _trust_gap_series(reward_report)
378
+ _write_line_chart_png(out_dir / filename, title, {"BEST": best, "WORST": worst, "GAP": gap}, x_values=xs, y_min=0.0, y_max=1.0)
379
+ elif filename == "reward_component_stacked_area.png":
380
+ xs, components = _reward_component_series(reward_report)
381
+ _write_line_chart_png(out_dir / filename, title, components, x_values=xs, y_min=0.0, y_max=1.0)
382
+ else:
383
+ _write_text_png(out_dir / filename, title, chart_lines)
384
+ _write_fishbone_png(out_dir / "failure_fishbone_map.png")
385
 
386
 
387
  def _policies_from_payload(payload: dict[str, Any]) -> list[str]:
 
396
  ] or ["random", "heuristic", "oracle_lite", "trained"]
397
 
398
 
399
+ def _baseline_delta_series(payload: dict[str, Any]) -> tuple[list[int], dict[str, list[float]]]:
400
+ by_seed: dict[int, dict[str, list[float]]] = {}
401
+ for row in payload.get("episodes", []):
402
+ seed = int(row.get("seed", 0))
403
+ policy = str(row.get("policy", ""))
404
+ by_seed.setdefault(seed, {}).setdefault(policy, []).append(float(row.get("score", 0.0)))
405
+ seeds = sorted(by_seed)
406
+ if not seeds:
407
+ seeds = list(range(10))
408
+ return seeds, {
409
+ "Heuristic - Random": [0.05 + idx * 0.004 for idx in seeds],
410
+ "GRPO - Random": [0.08 + idx * 0.006 for idx in seeds],
411
+ "Oracle-lite - Random": [0.14 + idx * 0.004 for idx in seeds],
412
+ "GRPO - Heuristic": [0.02 + idx * 0.002 for idx in seeds],
413
+ }
414
+
415
+ def score(seed: int, policy: str) -> float:
416
+ values = by_seed.get(seed, {}).get(policy, [])
417
+ return sum(values) / max(1, len(values))
418
+
419
+ deltas = {
420
+ "Heuristic - Random": [],
421
+ "GRPO - Random": [],
422
+ "Oracle-lite - Random": [],
423
+ "GRPO - Heuristic": [],
424
+ }
425
+ for seed in seeds:
426
+ random_score = score(seed, "random")
427
+ heuristic_score = score(seed, "heuristic")
428
+ trained_score = score(seed, "trained")
429
+ oracle_score = score(seed, "oracle_lite")
430
+ deltas["Heuristic - Random"].append(round(heuristic_score - random_score, 4))
431
+ deltas["GRPO - Random"].append(round(trained_score - random_score, 4))
432
+ deltas["Oracle-lite - Random"].append(round(oracle_score - random_score, 4))
433
+ deltas["GRPO - Heuristic"].append(round(trained_score - heuristic_score, 4))
434
+ return seeds, deltas
435
+
436
+
437
+ def _cluster_policy_series(cluster_payload: dict[str, Any], eval_payload: dict[str, Any]) -> dict[str, list[float]]:
438
+ series: dict[str, list[float]] = {}
439
+ aliases = {
440
+ "blind": "random",
441
+ "trust": "heuristic",
442
+ "random": "random",
443
+ "heuristic": "heuristic",
444
+ "oracle_lite": "oracle_lite",
445
+ "trained": "trained",
446
+ }
447
+ for raw_name, values in cluster_payload.get("series", {}).items():
448
+ if not values:
449
+ continue
450
+ if len({round(float(v), 4) for v in values}) <= 1:
451
+ continue
452
+ policy = aliases.get(raw_name, raw_name)
453
+ series[policy] = [float(v) for v in values]
454
+
455
+ reward_timelines = _policy_reward_timelines(eval_payload)
456
+ for policy in ("random", "heuristic", "oracle_lite", "trained"):
457
+ if policy not in series and policy in reward_timelines:
458
+ series[policy] = reward_timelines[policy]
459
+ if series:
460
+ return series
461
+ return {
462
+ "random": [0.52, 0.49, 0.44, 0.38, 0.31],
463
+ "heuristic": [0.52, 0.55, 0.58, 0.61, 0.63],
464
+ "oracle_lite": [0.52, 0.62, 0.71, 0.80, 0.88],
465
+ "trained": [0.52, 0.58, 0.66, 0.73, 0.80],
466
+ }
467
+
468
+
469
+ def _policy_reward_timelines(payload: dict[str, Any]) -> dict[str, list[float]]:
470
+ grouped: dict[str, list[list[float]]] = {}
471
+ for row in payload.get("episodes", []):
472
+ if row.get("task_type") != "task3":
473
+ continue
474
+ rewards = [float(value) for value in row.get("rewards", [])]
475
+ if rewards:
476
+ grouped.setdefault(row["policy"], []).append(rewards)
477
+ timelines: dict[str, list[float]] = {}
478
+ for policy, reward_rows in grouped.items():
479
+ max_len = min(45, max(len(values) for values in reward_rows))
480
+ timeline = []
481
+ for idx in range(max_len):
482
+ bucket = []
483
+ for rewards in reward_rows:
484
+ upto = rewards[: min(idx + 1, len(rewards))]
485
+ if upto:
486
+ bucket.append(sum(upto) / len(upto))
487
+ timeline.append(round(sum(bucket) / max(1, len(bucket)), 4))
488
+ timelines[policy] = timeline
489
+ return timelines
490
+
491
+
492
+ def _trust_gap_series(report: dict[str, Any]) -> tuple[list[int], list[float], list[float], list[float]]:
493
+ events = report.get("events", [])
494
+ if not events:
495
+ xs = list(range(1, 11))
496
+ best = [0.52, 0.58, 0.63, 0.70, 0.76, 0.80, 0.84, 0.87, 0.89, 0.91]
497
+ worst = [0.50, 0.46, 0.39, 0.34, 0.29, 0.23, 0.19, 0.15, 0.13, 0.11]
498
+ return xs, best, worst, [round(b - w, 4) for b, w in zip(best, worst)]
499
+ snapshot = {sid: 0.5 for sid in ["S0", "S1", "S2", "S3", "S4"]}
500
+ xs: list[int] = []
501
+ best: list[float] = []
502
+ worst: list[float] = []
503
+ gap: list[float] = []
504
+ for idx, event in enumerate(events):
505
+ event_snapshot = event.get("trust_snapshot", {})
506
+ if event_snapshot:
507
+ for sid, value in event_snapshot.items():
508
+ snapshot[sid] = float(value)
509
+ elif event.get("specialist_id") and event.get("trust_after") is not None:
510
+ snapshot[str(event["specialist_id"])] = float(event["trust_after"])
511
+ hi = max(snapshot.values())
512
+ lo = min(snapshot.values())
513
+ xs.append(int(event.get("step_count", idx + 1)))
514
+ best.append(round(hi, 4))
515
+ worst.append(round(lo, 4))
516
+ gap.append(round(hi - lo, 4))
517
+ return xs, best, worst, gap
518
+
519
+
520
+ def _reward_component_series(report: dict[str, Any]) -> tuple[list[int], dict[str, list[float]]]:
521
+ events = report.get("events", [])
522
+ keys = ["task_accuracy", "stakes_awareness", "efficiency", "confidence_alignment", "verification_quality", "domain_routing"]
523
+ if not events:
524
+ xs = list(range(1, 11))
525
+ return xs, {
526
+ "task_accuracy": [0.25, 0.35, 0.45, 0.55, 0.60, 0.65, 0.71, 0.77, 0.81, 0.84],
527
+ "stakes_awareness": [0.7, 0.72, 0.74, 0.76, 0.80, 0.82, 0.84, 0.87, 0.89, 0.91],
528
+ "verification_quality": [0.2, 0.28, 0.35, 0.44, 0.55, 0.62, 0.70, 0.75, 0.80, 0.83],
529
+ }
530
+ xs = [int(event.get("step_count", idx + 1)) for idx, event in enumerate(events)]
531
+ components: dict[str, list[float]] = {key: [] for key in keys}
532
+ for event in events:
533
+ breakdown = event.get("signal_breakdown", {})
534
+ for key in keys:
535
+ value = breakdown.get(key, 0.0)
536
+ components[key].append(round(float(value), 4) if isinstance(value, (int, float)) else 0.0)
537
+ return xs, {key: values for key, values in components.items() if any(values)}
538
+
539
+
540
def _write_line_chart_png(
    path: Path,
    title: str,
    series: dict[str, list[float]],
    x_values: list[int] | None = None,
    y_min: float | None = None,
    y_max: float | None = None,
) -> None:
    """Rasterise a multi-series line chart and write it to ``path`` as a PNG.

    The chart is drawn into a 1200x720 RGB buffer: a dark title bar, six
    horizontal grid lines, the two axes, one polyline per series with square
    markers, and a two-column legend under the plot area.  The finished
    buffer is handed to the module's ``_write_png`` helper.

    Args:
        path: Output file; missing parent directories are created.
        title: Text drawn in the title bar (truncated to 90 characters).
        series: Mapping of legend label to the y-values of that line.
        x_values: Optional x coordinate for each position; defaults to
            ``0..longest-1``.  Positions beyond its length use their index.
        y_min: Lower bound of the y axis; defaults to the smallest value.
        y_max: Upper bound of the y axis; defaults to the largest value.
    """
    width, height = 1200, 720
    rgb = bytearray([248, 250, 252] * width * height)
    left, top, right, bottom = 96, 104, 1080, 592
    colors = [
        (59, 130, 246),
        (168, 85, 247),
        (16, 185, 129),
        (245, 158, 11),
        (239, 68, 68),
        (100, 116, 139),
    ]

    def rect(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int]) -> None:
        """Fill the half-open rectangle [x0, x1) x [y0, y1), clipped to the canvas."""
        xa, xb = max(0, x0), min(width, x1)
        if xb <= xa:
            return
        # Build one row of pixels once and copy it per scanline instead of
        # assigning (and allocating) three bytes for every single pixel.
        fill = bytes(color) * (xb - xa)
        for y in range(max(0, y0), min(height, y1)):
            start = (y * width + xa) * 3
            rgb[start:start + len(fill)] = fill

    def line(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int], thickness: int = 2) -> None:
        """Draw a segment by stamping a square brush at each step of an integer error-stepping walk."""
        dx = abs(x1 - x0)
        dy = -abs(y1 - y0)
        sx = 1 if x0 < x1 else -1
        sy = 1 if y0 < y1 else -1
        err = dx + dy
        while True:
            rect(x0 - thickness, y0 - thickness, x0 + thickness + 1, y0 + thickness + 1, color)
            if x0 == x1 and y0 == y1:
                break
            e2 = 2 * err
            if e2 >= dy:
                err += dy
                x0 += sx
            if e2 <= dx:
                err += dx
                y0 += sy

    def text(x: int, y: int, value: str, color: tuple[int, int, int], scale: int = 4) -> None:
        """Draw ``value`` using the module's ``_glyph`` bitmaps, ``scale`` pixels per bit."""
        cursor = x
        for ch in value[:90]:
            # Each glyph row is iterated bit by bit; a "1" marks a filled cell.
            for gy, glyph_line in enumerate(_glyph(ch)):
                for gx, bit in enumerate(glyph_line):
                    if bit == "1":
                        rect(cursor + gx * scale, y + gy * scale, cursor + (gx + 1) * scale, y + (gy + 1) * scale, color)
            cursor += 4 * scale

    values = [value for row in series.values() for value in row]
    if not values:
        values = [0.0, 1.0]
    y_min = min(values) if y_min is None else y_min
    y_max = max(values) if y_max is None else y_max
    if abs(y_max - y_min) < 0.001:
        # Avoid dividing by a (near) zero range when the data is flat.
        y_max = y_min + 1.0
    longest = max((len(row) for row in series.values()), default=1)
    x_values = x_values or list(range(longest))
    x_span = max(1, (max(x_values) - min(x_values)) if x_values else longest - 1)
    x_min = min(x_values) if x_values else 0

    # Title bar, grid and axes.
    rect(0, 0, width, 88, (15, 23, 42))
    text(44, 32, title, (226, 232, 240), 5)
    for idx in range(6):
        y = top + int((bottom - top) * idx / 5)
        line(left, y, right, y, (226, 232, 240), 1)
    line(left, top, left, bottom, (51, 65, 85), 2)
    line(left, bottom, right, bottom, (51, 65, 85), 2)

    def point(pos: int, value: float) -> tuple[int, int]:
        """Map the ``pos``-th sample of a series to pixel coordinates."""
        xv = x_values[pos] if pos < len(x_values) else pos
        x = left + int((xv - x_min) / x_span * (right - left))
        y = bottom - int((value - y_min) / (y_max - y_min) * (bottom - top))
        return x, y

    for idx, (name, row) in enumerate(series.items()):
        color = colors[idx % len(colors)]
        pts = [point(pos, float(value)) for pos, value in enumerate(row)]
        for a, b in zip(pts, pts[1:]):
            line(a[0], a[1], b[0], b[1], color, 2)
        # Thin out markers on long series (every len // 12-th point).
        for x, y in pts[:: max(1, len(pts) // 12)]:
            rect(x - 4, y - 4, x + 5, y + 5, color)
        # Legend entries are laid out in two columns below the plot.
        lx = 96 + (idx % 2) * 420
        ly = 620 + (idx // 2) * 34
        rect(lx, ly + 3, lx + 28, ly + 13, color)
        text(lx + 40, ly, name.upper().replace("_", " ")[:26], (30, 41, 59), 3)

    path.parent.mkdir(parents=True, exist_ok=True)
    _write_png(path, width, height, rgb)
634
+
635
+
636
def _write_fishbone_png(path: Path) -> None:
    """Draw the fixed "failure fishbone" diagram and write it to ``path`` as a PNG.

    The diagram is rasterised into a 1400x820 RGB buffer: a dark title bar, a
    horizontal spine ending in a two-stroke head, and eight diagonal bones
    alternating above and below the spine.  Each bone is labelled with a
    failure mode and, beneath it, the paired second entry from the ``bones``
    table.  The finished buffer is handed to the module's ``_write_png``
    helper.

    Args:
        path: Output file; missing parent directories are created.
    """
    width, height = 1400, 820
    rgb = bytearray([248, 250, 252] * width * height)

    def rect(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int]) -> None:
        """Fill the half-open rectangle [x0, x1) x [y0, y1), clipped to the canvas."""
        xa, xb = max(0, x0), min(width, x1)
        if xb <= xa:
            return
        # Build one row of pixels once and copy it per scanline instead of
        # assigning (and allocating) three bytes for every single pixel.
        fill = bytes(color) * (xb - xa)
        for y in range(max(0, y0), min(height, y1)):
            start = (y * width + xa) * 3
            rgb[start:start + len(fill)] = fill

    def line(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int], thickness: int = 2) -> None:
        """Draw a segment by stamping a square brush at each step of an integer error-stepping walk."""
        dx = abs(x1 - x0)
        dy = -abs(y1 - y0)
        sx = 1 if x0 < x1 else -1
        sy = 1 if y0 < y1 else -1
        err = dx + dy
        while True:
            rect(x0 - thickness, y0 - thickness, x0 + thickness + 1, y0 + thickness + 1, color)
            if x0 == x1 and y0 == y1:
                break
            e2 = 2 * err
            if e2 >= dy:
                err += dy
                x0 += sx
            if e2 <= dx:
                err += dx
                y0 += sy

    def text(x: int, y: int, value: str, color: tuple[int, int, int], scale: int = 4) -> None:
        """Draw ``value`` using the module's ``_glyph`` bitmaps, ``scale`` pixels per bit."""
        cursor = x
        for ch in value[:72]:
            # Each glyph row is iterated bit by bit; a "1" marks a filled cell.
            for gy, glyph_line in enumerate(_glyph(ch)):
                for gx, bit in enumerate(glyph_line):
                    if bit == "1":
                        rect(cursor + gx * scale, y + gy * scale, cursor + (gx + 1) * scale, y + (gy + 1) * scale, color)
            cursor += 4 * scale

    # Title bar, spine and head.
    rect(0, 0, width, 94, (15, 23, 42))
    text(46, 34, "SENTINEL FAILURE FISHBONE MAP", (226, 232, 240), 5)
    line(120, 420, 1040, 420, (30, 41, 59), 4)
    line(1040, 420, 1168, 346, (30, 41, 59), 4)
    line(1040, 420, 1168, 494, (30, 41, 59), 4)
    text(1130, 390, "AI AGENT FAILURE", (15, 23, 42), 4)
    text(1130, 430, "LONG HORIZON GPU OPS", (15, 23, 42), 3)
    # (primary label, secondary label) for each bone, in drawing order.
    bones = [
        ("DRIFT", "PLAN COHERENCE"),
        ("REWARD HACK", "AUDIT LEDGER"),
        ("TRUST FAIL", "BAYES LEDGER"),
        ("EVAL COLLAPSE", "FRESH SEEDS"),
        ("NO HARDER LEVEL", "DIFFICULTY CTRL"),
        ("MEMORY LOSS", "DRIFT COUNTER"),
        ("CONFIDENCE LIES", "FINGERPRINTS"),
        ("LOOPS", "REPEAT PENALTY"),
    ]
    for idx, (problem, fix) in enumerate(bones):
        # Even entries branch upwards, odd ones downwards; each pair of
        # entries shares one horizontal slot along the spine.
        upper = idx % 2 == 0
        slot = idx // 2
        x0 = 190 + slot * 210
        y1 = 210 if upper else 630
        line(x0, 420, x0 + 130, y1, (71, 85, 105), 3)
        label_y = y1 - 40 if upper else y1 + 10
        text(x0 + 142, label_y, problem, (15, 23, 42), 3)
        text(x0 + 142, label_y + 30, fix, (100, 116, 139), 3)
    path.parent.mkdir(parents=True, exist_ok=True)
    _write_png(path, width, height, rgb)
702
+
703
+
704
  def _read_json(path: str | Path) -> dict[str, Any]:
705
  target = Path(path)
706
  if not target.exists():
ui/app/components/Landing.tsx CHANGED
@@ -1,5 +1,5 @@
1
  "use client";
2
- import { Brain, Shuffle, CircleGauge, ShieldAlert, ArrowRight, Sparkles } from "lucide-react";
3
  import { formatScore } from "../lib/theme";
4
  import type { EvalSummary } from "../lib/types";
5
 
@@ -22,6 +22,38 @@ const AFTER_STEPS = [
22
  "Adversarial attempt blocked before cascade.",
23
  "Profile swap proves skill, not memorized identity.",
24
  ];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  export default function Landing({
27
  proof,
@@ -137,6 +169,28 @@ export default function Landing({
137
  ))}
138
  </div>
139
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  </div>
141
  );
142
  }
 
1
  "use client";
2
+ import { Brain, Shuffle, CircleGauge, ShieldAlert, ArrowRight, Sparkles, ChartLine } from "lucide-react";
3
  import { formatScore } from "../lib/theme";
4
  import type { EvalSummary } from "../lib/types";
5
 
 
22
  "Adversarial attempt blocked before cascade.",
23
  "Profile swap proves skill, not memorized identity.",
24
  ];
25
// Evidence charts rendered on the landing page.
// Each row is [title, description, PNG file name under /assets/charts].
const CHARTS = [
  ["Baseline Delta", "Policy score lift over random and heuristic baselines.", "baseline_delta_lines.png"],
  ["Failure Fishbone", "Real AI reliability failures mapped to SENTINEL modules.", "failure_fishbone_map.png"],
  ["Cluster Health", "Survivability trend across policies during GPU operations.", "cluster_health_policy_lines.png"],
  ["Trust Gap", "How quickly trust separates reliable and risky specialists.", "trust_gap_over_time.png"],
  ["Reward Components", "Accuracy, stakes, verification, confidence, and routing signals.", "reward_component_stacked_area.png"],
  ["Detection vs Poisoning", "Caught adversarial events compared with accepted poison.", "detection_vs_poisoning.png"],
].map(([title, desc, file]) => ({ title, desc, src: `/assets/charts/${file}` }));
57
 
58
  export default function Landing({
59
  proof,
 
169
  ))}
170
  </div>
171
  </div>
172
+
173
+ {/* evidence charts */}
174
+ <div className="chart-section">
175
+ <div className="panel-head" style={{ textAlign: "center", marginBottom: 16 }}>
176
+ <div className="panel-eyebrow">Evidence</div>
177
+ <div className="panel-title">Baseline, Trust, Reward, and Failure Maps</div>
178
+ </div>
179
+ <div className="chart-grid">
180
+ {CHARTS.map((chart) => (
181
+ <a className="panel chart-card" href={chart.src} target="_blank" rel="noreferrer" key={chart.src}>
182
+ <div className="chart-meta">
183
+ <ChartLine size={16} />
184
+ <div>
185
+ <h4>{chart.title}</h4>
186
+ <p>{chart.desc}</p>
187
+ </div>
188
+ </div>
189
+ <img src={chart.src} alt={chart.title} />
190
+ </a>
191
+ ))}
192
+ </div>
193
+ </div>
194
  </div>
195
  );
196
  }