Spaces:
Running
Running
Add evidence chart outputs
Browse files- .gitignore +1 -0
- docs/TRAINING_RUNBOOK.md +5 -0
- environment.py +1 -0
- outputs/charts/baseline_delta_lines.png +3 -0
- outputs/charts/cluster_health_policy_lines.png +3 -0
- outputs/charts/failure_fishbone_map.png +3 -0
- outputs/charts/reward_component_stacked_area.png +3 -0
- outputs/charts/trust_gap_over_time.png +3 -0
- outputs/reward_report_task3_seed42.json +184 -2
- training/colab_notebook.ipynb +8 -3
- training/plots.py +444 -3
- ui/app/components/Landing.tsx +55 -1
- ui/app/globals.css +20 -1
.gitignore
CHANGED
|
@@ -71,6 +71,7 @@ datasets/
|
|
| 71 |
# =========================
|
| 72 |
# BUILD / OUTPUT
|
| 73 |
# =========================
|
|
|
|
| 74 |
dist/
|
| 75 |
build/
|
| 76 |
out/
|
|
|
|
| 71 |
# =========================
|
| 72 |
# BUILD / OUTPUT
|
| 73 |
# =========================
|
| 74 |
+
node_modules/
|
| 75 |
dist/
|
| 76 |
build/
|
| 77 |
out/
|
docs/TRAINING_RUNBOOK.md
CHANGED
|
@@ -173,6 +173,11 @@ outputs/charts/detection_vs_poisoning.png
|
|
| 173 |
outputs/charts/cluster_health_timeline.png
|
| 174 |
outputs/charts/task_radar.png
|
| 175 |
outputs/charts/ablation.png
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
```
|
| 177 |
|
| 178 |
Then verify:
|
|
|
|
| 173 |
outputs/charts/cluster_health_timeline.png
|
| 174 |
outputs/charts/task_radar.png
|
| 175 |
outputs/charts/ablation.png
|
| 176 |
+
outputs/charts/baseline_delta_lines.png
|
| 177 |
+
outputs/charts/cluster_health_policy_lines.png
|
| 178 |
+
outputs/charts/trust_gap_over_time.png
|
| 179 |
+
outputs/charts/reward_component_stacked_area.png
|
| 180 |
+
outputs/charts/failure_fishbone_map.png
|
| 181 |
```
|
| 182 |
|
| 183 |
Then verify:
|
environment.py
CHANGED
|
@@ -583,6 +583,7 @@ class SentinelEnv:
|
|
| 583 |
"confidence": round(confidence, 3) if confidence is not None else None,
|
| 584 |
"trust_before": round(trust_before, 3) if trust_before is not None else None,
|
| 585 |
"trust_after": self._ledger.snapshot().get(specialist_id) if specialist_id else None,
|
|
|
|
| 586 |
"result_metadata": result_metadata,
|
| 587 |
}
|
| 588 |
self._reward_trace.append(event)
|
|
|
|
| 583 |
"confidence": round(confidence, 3) if confidence is not None else None,
|
| 584 |
"trust_before": round(trust_before, 3) if trust_before is not None else None,
|
| 585 |
"trust_after": self._ledger.snapshot().get(specialist_id) if specialist_id else None,
|
| 586 |
+
"trust_snapshot": self._ledger.snapshot(),
|
| 587 |
"result_metadata": result_metadata,
|
| 588 |
}
|
| 589 |
self._reward_trace.append(event)
|
outputs/charts/baseline_delta_lines.png
ADDED
|
Git LFS Details
|
outputs/charts/cluster_health_policy_lines.png
ADDED
|
Git LFS Details
|
outputs/charts/failure_fishbone_map.png
ADDED
|
Git LFS Details
|
outputs/charts/reward_component_stacked_area.png
ADDED
|
Git LFS Details
|
outputs/charts/trust_gap_over_time.png
ADDED
|
Git LFS Details
|
outputs/reward_report_task3_seed42.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
-
"episode_id": "
|
| 3 |
-
"session_id": "
|
| 4 |
"task_type": "task3",
|
| 5 |
"score": 0.6759,
|
| 6 |
"total_reward": 17.5723,
|
|
@@ -41,6 +41,13 @@
|
|
| 41 |
"confidence": 0.88,
|
| 42 |
"trust_before": 0.5,
|
| 43 |
"trust_after": 0.706,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"result_metadata": {
|
| 45 |
"step_cost": 1,
|
| 46 |
"triggered": false,
|
|
@@ -70,6 +77,13 @@
|
|
| 70 |
"confidence": 0.88,
|
| 71 |
"trust_before": 0.706,
|
| 72 |
"trust_after": 0.795,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
"result_metadata": {
|
| 74 |
"step_cost": 1,
|
| 75 |
"triggered": false,
|
|
@@ -99,6 +113,13 @@
|
|
| 99 |
"confidence": 0.88,
|
| 100 |
"trust_before": 0.795,
|
| 101 |
"trust_after": 0.847,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
"result_metadata": {
|
| 103 |
"step_cost": 1,
|
| 104 |
"triggered": false,
|
|
@@ -128,6 +149,13 @@
|
|
| 128 |
"confidence": 0.88,
|
| 129 |
"trust_before": 0.847,
|
| 130 |
"trust_after": 0.878,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
"result_metadata": {
|
| 132 |
"step_cost": 1,
|
| 133 |
"triggered": false,
|
|
@@ -157,6 +185,13 @@
|
|
| 157 |
"confidence": 0.88,
|
| 158 |
"trust_before": 0.878,
|
| 159 |
"trust_after": 0.9,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
"result_metadata": {
|
| 161 |
"step_cost": 1,
|
| 162 |
"triggered": false,
|
|
@@ -186,6 +221,13 @@
|
|
| 186 |
"confidence": 0.88,
|
| 187 |
"trust_before": 0.9,
|
| 188 |
"trust_after": 0.913,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
"result_metadata": {
|
| 190 |
"step_cost": 1,
|
| 191 |
"triggered": false,
|
|
@@ -215,6 +257,13 @@
|
|
| 215 |
"confidence": 0.88,
|
| 216 |
"trust_before": 0.913,
|
| 217 |
"trust_after": 0.786,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
"result_metadata": {
|
| 219 |
"step_cost": 1,
|
| 220 |
"triggered": false,
|
|
@@ -244,6 +293,13 @@
|
|
| 244 |
"confidence": 0.88,
|
| 245 |
"trust_before": 0.786,
|
| 246 |
"trust_after": 0.812,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
"result_metadata": {
|
| 248 |
"step_cost": 1,
|
| 249 |
"triggered": false,
|
|
@@ -273,6 +329,13 @@
|
|
| 273 |
"confidence": 0.88,
|
| 274 |
"trust_before": 0.812,
|
| 275 |
"trust_after": 0.834,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
"result_metadata": {
|
| 277 |
"step_cost": 1,
|
| 278 |
"triggered": false,
|
|
@@ -302,6 +365,13 @@
|
|
| 302 |
"confidence": 0.88,
|
| 303 |
"trust_before": 0.834,
|
| 304 |
"trust_after": 0.852,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
"result_metadata": {
|
| 306 |
"step_cost": 1,
|
| 307 |
"triggered": false,
|
|
@@ -331,6 +401,13 @@
|
|
| 331 |
"confidence": 0.88,
|
| 332 |
"trust_before": 0.852,
|
| 333 |
"trust_after": 0.865,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
"result_metadata": {
|
| 335 |
"step_cost": 1,
|
| 336 |
"triggered": false,
|
|
@@ -360,6 +437,13 @@
|
|
| 360 |
"confidence": 0.88,
|
| 361 |
"trust_before": 0.865,
|
| 362 |
"trust_after": 0.878,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
"result_metadata": {
|
| 364 |
"step_cost": 1,
|
| 365 |
"triggered": false,
|
|
@@ -389,6 +473,13 @@
|
|
| 389 |
"confidence": 0.88,
|
| 390 |
"trust_before": 0.878,
|
| 391 |
"trust_after": 0.888,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
"result_metadata": {
|
| 393 |
"step_cost": 1,
|
| 394 |
"triggered": false,
|
|
@@ -418,6 +509,13 @@
|
|
| 418 |
"confidence": 0.88,
|
| 419 |
"trust_before": 0.888,
|
| 420 |
"trust_after": 0.897,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
"result_metadata": {
|
| 422 |
"step_cost": 1,
|
| 423 |
"triggered": false,
|
|
@@ -447,6 +545,13 @@
|
|
| 447 |
"confidence": 0.92,
|
| 448 |
"trust_before": 0.897,
|
| 449 |
"trust_after": 0.825,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
"result_metadata": {
|
| 451 |
"step_cost": 1,
|
| 452 |
"triggered": true,
|
|
@@ -476,6 +581,13 @@
|
|
| 476 |
"confidence": 0.92,
|
| 477 |
"trust_before": 0.825,
|
| 478 |
"trust_after": 0.763,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
"result_metadata": {
|
| 480 |
"step_cost": 1,
|
| 481 |
"triggered": true,
|
|
@@ -505,6 +617,13 @@
|
|
| 505 |
"confidence": 0.92,
|
| 506 |
"trust_before": 0.763,
|
| 507 |
"trust_after": 0.708,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
"result_metadata": {
|
| 509 |
"step_cost": 1,
|
| 510 |
"triggered": true,
|
|
@@ -534,6 +653,13 @@
|
|
| 534 |
"confidence": 0.92,
|
| 535 |
"trust_before": 0.708,
|
| 536 |
"trust_after": 0.659,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
"result_metadata": {
|
| 538 |
"step_cost": 1,
|
| 539 |
"triggered": true,
|
|
@@ -563,6 +689,13 @@
|
|
| 563 |
"confidence": 0.92,
|
| 564 |
"trust_before": 0.659,
|
| 565 |
"trust_after": 0.619,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
"result_metadata": {
|
| 567 |
"step_cost": 1,
|
| 568 |
"triggered": true,
|
|
@@ -592,6 +725,13 @@
|
|
| 592 |
"confidence": 0.92,
|
| 593 |
"trust_before": 0.619,
|
| 594 |
"trust_after": 0.583,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
"result_metadata": {
|
| 596 |
"step_cost": 1,
|
| 597 |
"triggered": true,
|
|
@@ -621,6 +761,13 @@
|
|
| 621 |
"confidence": 0.92,
|
| 622 |
"trust_before": 0.583,
|
| 623 |
"trust_after": 0.551,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
"result_metadata": {
|
| 625 |
"step_cost": 1,
|
| 626 |
"triggered": true,
|
|
@@ -650,6 +797,13 @@
|
|
| 650 |
"confidence": 0.92,
|
| 651 |
"trust_before": 0.551,
|
| 652 |
"trust_after": 0.52,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
"result_metadata": {
|
| 654 |
"step_cost": 1,
|
| 655 |
"triggered": true,
|
|
@@ -679,6 +833,13 @@
|
|
| 679 |
"confidence": 0.92,
|
| 680 |
"trust_before": 0.52,
|
| 681 |
"trust_after": 0.492,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 682 |
"result_metadata": {
|
| 683 |
"step_cost": 1,
|
| 684 |
"triggered": true,
|
|
@@ -708,6 +869,13 @@
|
|
| 708 |
"confidence": 0.95,
|
| 709 |
"trust_before": 0.5,
|
| 710 |
"trust_after": 0.207,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
"result_metadata": {
|
| 712 |
"step_cost": 1
|
| 713 |
}
|
|
@@ -734,6 +902,13 @@
|
|
| 734 |
"confidence": 0.45,
|
| 735 |
"trust_before": 0.5,
|
| 736 |
"trust_after": 0.207,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
"result_metadata": {
|
| 738 |
"step_cost": 1,
|
| 739 |
"in_domain": false,
|
|
@@ -762,6 +937,13 @@
|
|
| 762 |
"confidence": null,
|
| 763 |
"trust_before": null,
|
| 764 |
"trust_after": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 765 |
"result_metadata": {}
|
| 766 |
}
|
| 767 |
],
|
|
|
|
| 1 |
{
|
| 2 |
+
"episode_id": "7f3bd324-24c1-4ca3-a365-794120f3de9b",
|
| 3 |
+
"session_id": "9c27bdf1-9627-476c-ac9b-48e59d137638",
|
| 4 |
"task_type": "task3",
|
| 5 |
"score": 0.6759,
|
| 6 |
"total_reward": 17.5723,
|
|
|
|
| 41 |
"confidence": 0.88,
|
| 42 |
"trust_before": 0.5,
|
| 43 |
"trust_after": 0.706,
|
| 44 |
+
"trust_snapshot": {
|
| 45 |
+
"S0": 0.706,
|
| 46 |
+
"S1": 0.5,
|
| 47 |
+
"S2": 0.5,
|
| 48 |
+
"S3": 0.5,
|
| 49 |
+
"S4": 0.5
|
| 50 |
+
},
|
| 51 |
"result_metadata": {
|
| 52 |
"step_cost": 1,
|
| 53 |
"triggered": false,
|
|
|
|
| 77 |
"confidence": 0.88,
|
| 78 |
"trust_before": 0.706,
|
| 79 |
"trust_after": 0.795,
|
| 80 |
+
"trust_snapshot": {
|
| 81 |
+
"S0": 0.795,
|
| 82 |
+
"S1": 0.5,
|
| 83 |
+
"S2": 0.5,
|
| 84 |
+
"S3": 0.5,
|
| 85 |
+
"S4": 0.5
|
| 86 |
+
},
|
| 87 |
"result_metadata": {
|
| 88 |
"step_cost": 1,
|
| 89 |
"triggered": false,
|
|
|
|
| 113 |
"confidence": 0.88,
|
| 114 |
"trust_before": 0.795,
|
| 115 |
"trust_after": 0.847,
|
| 116 |
+
"trust_snapshot": {
|
| 117 |
+
"S0": 0.847,
|
| 118 |
+
"S1": 0.5,
|
| 119 |
+
"S2": 0.5,
|
| 120 |
+
"S3": 0.5,
|
| 121 |
+
"S4": 0.5
|
| 122 |
+
},
|
| 123 |
"result_metadata": {
|
| 124 |
"step_cost": 1,
|
| 125 |
"triggered": false,
|
|
|
|
| 149 |
"confidence": 0.88,
|
| 150 |
"trust_before": 0.847,
|
| 151 |
"trust_after": 0.878,
|
| 152 |
+
"trust_snapshot": {
|
| 153 |
+
"S0": 0.878,
|
| 154 |
+
"S1": 0.5,
|
| 155 |
+
"S2": 0.5,
|
| 156 |
+
"S3": 0.5,
|
| 157 |
+
"S4": 0.5
|
| 158 |
+
},
|
| 159 |
"result_metadata": {
|
| 160 |
"step_cost": 1,
|
| 161 |
"triggered": false,
|
|
|
|
| 185 |
"confidence": 0.88,
|
| 186 |
"trust_before": 0.878,
|
| 187 |
"trust_after": 0.9,
|
| 188 |
+
"trust_snapshot": {
|
| 189 |
+
"S0": 0.9,
|
| 190 |
+
"S1": 0.5,
|
| 191 |
+
"S2": 0.5,
|
| 192 |
+
"S3": 0.5,
|
| 193 |
+
"S4": 0.5
|
| 194 |
+
},
|
| 195 |
"result_metadata": {
|
| 196 |
"step_cost": 1,
|
| 197 |
"triggered": false,
|
|
|
|
| 221 |
"confidence": 0.88,
|
| 222 |
"trust_before": 0.9,
|
| 223 |
"trust_after": 0.913,
|
| 224 |
+
"trust_snapshot": {
|
| 225 |
+
"S0": 0.913,
|
| 226 |
+
"S1": 0.5,
|
| 227 |
+
"S2": 0.5,
|
| 228 |
+
"S3": 0.5,
|
| 229 |
+
"S4": 0.5
|
| 230 |
+
},
|
| 231 |
"result_metadata": {
|
| 232 |
"step_cost": 1,
|
| 233 |
"triggered": false,
|
|
|
|
| 257 |
"confidence": 0.88,
|
| 258 |
"trust_before": 0.913,
|
| 259 |
"trust_after": 0.786,
|
| 260 |
+
"trust_snapshot": {
|
| 261 |
+
"S0": 0.786,
|
| 262 |
+
"S1": 0.5,
|
| 263 |
+
"S2": 0.5,
|
| 264 |
+
"S3": 0.5,
|
| 265 |
+
"S4": 0.5
|
| 266 |
+
},
|
| 267 |
"result_metadata": {
|
| 268 |
"step_cost": 1,
|
| 269 |
"triggered": false,
|
|
|
|
| 293 |
"confidence": 0.88,
|
| 294 |
"trust_before": 0.786,
|
| 295 |
"trust_after": 0.812,
|
| 296 |
+
"trust_snapshot": {
|
| 297 |
+
"S0": 0.812,
|
| 298 |
+
"S1": 0.5,
|
| 299 |
+
"S2": 0.5,
|
| 300 |
+
"S3": 0.5,
|
| 301 |
+
"S4": 0.5
|
| 302 |
+
},
|
| 303 |
"result_metadata": {
|
| 304 |
"step_cost": 1,
|
| 305 |
"triggered": false,
|
|
|
|
| 329 |
"confidence": 0.88,
|
| 330 |
"trust_before": 0.812,
|
| 331 |
"trust_after": 0.834,
|
| 332 |
+
"trust_snapshot": {
|
| 333 |
+
"S0": 0.834,
|
| 334 |
+
"S1": 0.5,
|
| 335 |
+
"S2": 0.5,
|
| 336 |
+
"S3": 0.5,
|
| 337 |
+
"S4": 0.5
|
| 338 |
+
},
|
| 339 |
"result_metadata": {
|
| 340 |
"step_cost": 1,
|
| 341 |
"triggered": false,
|
|
|
|
| 365 |
"confidence": 0.88,
|
| 366 |
"trust_before": 0.834,
|
| 367 |
"trust_after": 0.852,
|
| 368 |
+
"trust_snapshot": {
|
| 369 |
+
"S0": 0.852,
|
| 370 |
+
"S1": 0.5,
|
| 371 |
+
"S2": 0.5,
|
| 372 |
+
"S3": 0.5,
|
| 373 |
+
"S4": 0.5
|
| 374 |
+
},
|
| 375 |
"result_metadata": {
|
| 376 |
"step_cost": 1,
|
| 377 |
"triggered": false,
|
|
|
|
| 401 |
"confidence": 0.88,
|
| 402 |
"trust_before": 0.852,
|
| 403 |
"trust_after": 0.865,
|
| 404 |
+
"trust_snapshot": {
|
| 405 |
+
"S0": 0.865,
|
| 406 |
+
"S1": 0.5,
|
| 407 |
+
"S2": 0.5,
|
| 408 |
+
"S3": 0.5,
|
| 409 |
+
"S4": 0.5
|
| 410 |
+
},
|
| 411 |
"result_metadata": {
|
| 412 |
"step_cost": 1,
|
| 413 |
"triggered": false,
|
|
|
|
| 437 |
"confidence": 0.88,
|
| 438 |
"trust_before": 0.865,
|
| 439 |
"trust_after": 0.878,
|
| 440 |
+
"trust_snapshot": {
|
| 441 |
+
"S0": 0.878,
|
| 442 |
+
"S1": 0.5,
|
| 443 |
+
"S2": 0.5,
|
| 444 |
+
"S3": 0.5,
|
| 445 |
+
"S4": 0.5
|
| 446 |
+
},
|
| 447 |
"result_metadata": {
|
| 448 |
"step_cost": 1,
|
| 449 |
"triggered": false,
|
|
|
|
| 473 |
"confidence": 0.88,
|
| 474 |
"trust_before": 0.878,
|
| 475 |
"trust_after": 0.888,
|
| 476 |
+
"trust_snapshot": {
|
| 477 |
+
"S0": 0.888,
|
| 478 |
+
"S1": 0.5,
|
| 479 |
+
"S2": 0.5,
|
| 480 |
+
"S3": 0.5,
|
| 481 |
+
"S4": 0.5
|
| 482 |
+
},
|
| 483 |
"result_metadata": {
|
| 484 |
"step_cost": 1,
|
| 485 |
"triggered": false,
|
|
|
|
| 509 |
"confidence": 0.88,
|
| 510 |
"trust_before": 0.888,
|
| 511 |
"trust_after": 0.897,
|
| 512 |
+
"trust_snapshot": {
|
| 513 |
+
"S0": 0.897,
|
| 514 |
+
"S1": 0.5,
|
| 515 |
+
"S2": 0.5,
|
| 516 |
+
"S3": 0.5,
|
| 517 |
+
"S4": 0.5
|
| 518 |
+
},
|
| 519 |
"result_metadata": {
|
| 520 |
"step_cost": 1,
|
| 521 |
"triggered": false,
|
|
|
|
| 545 |
"confidence": 0.92,
|
| 546 |
"trust_before": 0.897,
|
| 547 |
"trust_after": 0.825,
|
| 548 |
+
"trust_snapshot": {
|
| 549 |
+
"S0": 0.825,
|
| 550 |
+
"S1": 0.5,
|
| 551 |
+
"S2": 0.5,
|
| 552 |
+
"S3": 0.5,
|
| 553 |
+
"S4": 0.5
|
| 554 |
+
},
|
| 555 |
"result_metadata": {
|
| 556 |
"step_cost": 1,
|
| 557 |
"triggered": true,
|
|
|
|
| 581 |
"confidence": 0.92,
|
| 582 |
"trust_before": 0.825,
|
| 583 |
"trust_after": 0.763,
|
| 584 |
+
"trust_snapshot": {
|
| 585 |
+
"S0": 0.763,
|
| 586 |
+
"S1": 0.5,
|
| 587 |
+
"S2": 0.5,
|
| 588 |
+
"S3": 0.5,
|
| 589 |
+
"S4": 0.5
|
| 590 |
+
},
|
| 591 |
"result_metadata": {
|
| 592 |
"step_cost": 1,
|
| 593 |
"triggered": true,
|
|
|
|
| 617 |
"confidence": 0.92,
|
| 618 |
"trust_before": 0.763,
|
| 619 |
"trust_after": 0.708,
|
| 620 |
+
"trust_snapshot": {
|
| 621 |
+
"S0": 0.708,
|
| 622 |
+
"S1": 0.5,
|
| 623 |
+
"S2": 0.5,
|
| 624 |
+
"S3": 0.5,
|
| 625 |
+
"S4": 0.5
|
| 626 |
+
},
|
| 627 |
"result_metadata": {
|
| 628 |
"step_cost": 1,
|
| 629 |
"triggered": true,
|
|
|
|
| 653 |
"confidence": 0.92,
|
| 654 |
"trust_before": 0.708,
|
| 655 |
"trust_after": 0.659,
|
| 656 |
+
"trust_snapshot": {
|
| 657 |
+
"S0": 0.659,
|
| 658 |
+
"S1": 0.5,
|
| 659 |
+
"S2": 0.5,
|
| 660 |
+
"S3": 0.5,
|
| 661 |
+
"S4": 0.5
|
| 662 |
+
},
|
| 663 |
"result_metadata": {
|
| 664 |
"step_cost": 1,
|
| 665 |
"triggered": true,
|
|
|
|
| 689 |
"confidence": 0.92,
|
| 690 |
"trust_before": 0.659,
|
| 691 |
"trust_after": 0.619,
|
| 692 |
+
"trust_snapshot": {
|
| 693 |
+
"S0": 0.619,
|
| 694 |
+
"S1": 0.5,
|
| 695 |
+
"S2": 0.5,
|
| 696 |
+
"S3": 0.5,
|
| 697 |
+
"S4": 0.5
|
| 698 |
+
},
|
| 699 |
"result_metadata": {
|
| 700 |
"step_cost": 1,
|
| 701 |
"triggered": true,
|
|
|
|
| 725 |
"confidence": 0.92,
|
| 726 |
"trust_before": 0.619,
|
| 727 |
"trust_after": 0.583,
|
| 728 |
+
"trust_snapshot": {
|
| 729 |
+
"S0": 0.583,
|
| 730 |
+
"S1": 0.5,
|
| 731 |
+
"S2": 0.5,
|
| 732 |
+
"S3": 0.5,
|
| 733 |
+
"S4": 0.5
|
| 734 |
+
},
|
| 735 |
"result_metadata": {
|
| 736 |
"step_cost": 1,
|
| 737 |
"triggered": true,
|
|
|
|
| 761 |
"confidence": 0.92,
|
| 762 |
"trust_before": 0.583,
|
| 763 |
"trust_after": 0.551,
|
| 764 |
+
"trust_snapshot": {
|
| 765 |
+
"S0": 0.551,
|
| 766 |
+
"S1": 0.5,
|
| 767 |
+
"S2": 0.5,
|
| 768 |
+
"S3": 0.5,
|
| 769 |
+
"S4": 0.5
|
| 770 |
+
},
|
| 771 |
"result_metadata": {
|
| 772 |
"step_cost": 1,
|
| 773 |
"triggered": true,
|
|
|
|
| 797 |
"confidence": 0.92,
|
| 798 |
"trust_before": 0.551,
|
| 799 |
"trust_after": 0.52,
|
| 800 |
+
"trust_snapshot": {
|
| 801 |
+
"S0": 0.52,
|
| 802 |
+
"S1": 0.5,
|
| 803 |
+
"S2": 0.5,
|
| 804 |
+
"S3": 0.5,
|
| 805 |
+
"S4": 0.5
|
| 806 |
+
},
|
| 807 |
"result_metadata": {
|
| 808 |
"step_cost": 1,
|
| 809 |
"triggered": true,
|
|
|
|
| 833 |
"confidence": 0.92,
|
| 834 |
"trust_before": 0.52,
|
| 835 |
"trust_after": 0.492,
|
| 836 |
+
"trust_snapshot": {
|
| 837 |
+
"S0": 0.492,
|
| 838 |
+
"S1": 0.5,
|
| 839 |
+
"S2": 0.5,
|
| 840 |
+
"S3": 0.5,
|
| 841 |
+
"S4": 0.5
|
| 842 |
+
},
|
| 843 |
"result_metadata": {
|
| 844 |
"step_cost": 1,
|
| 845 |
"triggered": true,
|
|
|
|
| 869 |
"confidence": 0.95,
|
| 870 |
"trust_before": 0.5,
|
| 871 |
"trust_after": 0.207,
|
| 872 |
+
"trust_snapshot": {
|
| 873 |
+
"S0": 0.492,
|
| 874 |
+
"S1": 0.207,
|
| 875 |
+
"S2": 0.5,
|
| 876 |
+
"S3": 0.5,
|
| 877 |
+
"S4": 0.5
|
| 878 |
+
},
|
| 879 |
"result_metadata": {
|
| 880 |
"step_cost": 1
|
| 881 |
}
|
|
|
|
| 902 |
"confidence": 0.45,
|
| 903 |
"trust_before": 0.5,
|
| 904 |
"trust_after": 0.207,
|
| 905 |
+
"trust_snapshot": {
|
| 906 |
+
"S0": 0.492,
|
| 907 |
+
"S1": 0.207,
|
| 908 |
+
"S2": 0.207,
|
| 909 |
+
"S3": 0.5,
|
| 910 |
+
"S4": 0.5
|
| 911 |
+
},
|
| 912 |
"result_metadata": {
|
| 913 |
"step_cost": 1,
|
| 914 |
"in_domain": false,
|
|
|
|
| 937 |
"confidence": null,
|
| 938 |
"trust_before": null,
|
| 939 |
"trust_after": null,
|
| 940 |
+
"trust_snapshot": {
|
| 941 |
+
"S0": 0.492,
|
| 942 |
+
"S1": 0.207,
|
| 943 |
+
"S2": 0.207,
|
| 944 |
+
"S3": 0.5,
|
| 945 |
+
"S4": 0.5
|
| 946 |
+
},
|
| 947 |
"result_metadata": {}
|
| 948 |
}
|
| 949 |
],
|
training/colab_notebook.ipynb
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"source": [
|
| 8 |
"# SENTINEL GRPO Training (Colab T4)\n",
|
| 9 |
"\n",
|
| 10 |
-
"This notebook trains a small GRPO LoRA, records a deterministic replay table, and generates the
|
| 11 |
]
|
| 12 |
},
|
| 13 |
{
|
|
@@ -124,6 +124,11 @@
|
|
| 124 |
" \"cluster_health_timeline.png\",\n",
|
| 125 |
" \"task_radar.png\",\n",
|
| 126 |
" \"ablation.png\",\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
"]:\n",
|
| 128 |
" print(name)\n",
|
| 129 |
" display(Image(f\"outputs/charts/{name}\"))"
|
|
@@ -150,13 +155,13 @@
|
|
| 150 |
"metadata": {
|
| 151 |
"accelerator": "GPU",
|
| 152 |
"kernelspec": {
|
| 153 |
-
"display_name": "
|
| 154 |
"language": "python",
|
| 155 |
"name": "python3"
|
| 156 |
},
|
| 157 |
"language_info": {
|
| 158 |
"name": "python",
|
| 159 |
-
"version": "3.
|
| 160 |
}
|
| 161 |
},
|
| 162 |
"nbformat": 4,
|
|
|
|
| 7 |
"source": [
|
| 8 |
"# SENTINEL GRPO Training (Colab T4)\n",
|
| 9 |
"\n",
|
| 10 |
+
"This notebook trains a small GRPO LoRA, records a deterministic replay table, and generates the full SENTINEL demo chart bundle for the Hugging Face Space."
|
| 11 |
]
|
| 12 |
},
|
| 13 |
{
|
|
|
|
| 124 |
" \"cluster_health_timeline.png\",\n",
|
| 125 |
" \"task_radar.png\",\n",
|
| 126 |
" \"ablation.png\",\n",
|
| 127 |
+
" \"baseline_delta_lines.png\",\n",
|
| 128 |
+
" \"cluster_health_policy_lines.png\",\n",
|
| 129 |
+
" \"trust_gap_over_time.png\",\n",
|
| 130 |
+
" \"reward_component_stacked_area.png\",\n",
|
| 131 |
+
" \"failure_fishbone_map.png\",\n",
|
| 132 |
"]:\n",
|
| 133 |
" print(name)\n",
|
| 134 |
" display(Image(f\"outputs/charts/{name}\"))"
|
|
|
|
| 155 |
"metadata": {
|
| 156 |
"accelerator": "GPU",
|
| 157 |
"kernelspec": {
|
| 158 |
+
"display_name": ".venv (3.13.7)",
|
| 159 |
"language": "python",
|
| 160 |
"name": "python3"
|
| 161 |
},
|
| 162 |
"language_info": {
|
| 163 |
"name": "python",
|
| 164 |
+
"version": "3.13.7"
|
| 165 |
}
|
| 166 |
},
|
| 167 |
"nbformat": 4,
|
training/plots.py
CHANGED
|
@@ -80,6 +80,11 @@ def _write_matplotlib_bundle(
|
|
| 80 |
_plot_cluster_health(plt, cluster_health, out_dir / "cluster_health_timeline.png")
|
| 81 |
_plot_task_radar(plt, post, out_dir / "task_radar.png")
|
| 82 |
_plot_ablation(plt, pre, post, out_dir / "ablation.png")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
def _plot_grouped_bars(plt, payload: dict[str, Any], path: Path) -> None:
|
|
@@ -124,8 +129,17 @@ def _plot_trust_evolution(plt, report: dict[str, Any], path: Path) -> None:
|
|
| 124 |
events = report.get("events", [])
|
| 125 |
fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
|
| 126 |
for sid in ["S0", "S1", "S2", "S3", "S4"]:
|
| 127 |
-
xs = [
|
| 128 |
-
ys = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
if xs:
|
| 130 |
ax.plot(xs, ys, label=sid, linewidth=2)
|
| 131 |
if not events:
|
|
@@ -222,6 +236,111 @@ def _plot_ablation(plt, pre: dict[str, Any], post: dict[str, Any], path: Path) -
|
|
| 222 |
plt.close(fig)
|
| 223 |
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
def _write_fallback_bundle(
|
| 226 |
pre: dict[str, Any],
|
| 227 |
post: dict[str, Any],
|
|
@@ -243,9 +362,26 @@ def _write_fallback_bundle(
|
|
| 243 |
"cluster_health_timeline.png": ("CLUSTER HEALTH TIMELINE", [f"series={len(cluster_health.get('series', {}))}"]),
|
| 244 |
"task_radar.png": ("TASK CAPABILITY RADAR", lines),
|
| 245 |
"ablation.png": ("REWARD ENGINE ABLATION", ["confidence + domain + verify signals"]),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
}
|
| 247 |
for filename, (title, chart_lines) in charts.items():
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
|
| 251 |
def _policies_from_payload(payload: dict[str, Any]) -> list[str]:
|
|
@@ -260,6 +396,311 @@ def _policies_from_payload(payload: dict[str, Any]) -> list[str]:
|
|
| 260 |
] or ["random", "heuristic", "oracle_lite", "trained"]
|
| 261 |
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
def _read_json(path: str | Path) -> dict[str, Any]:
|
| 264 |
target = Path(path)
|
| 265 |
if not target.exists():
|
|
|
|
| 80 |
_plot_cluster_health(plt, cluster_health, out_dir / "cluster_health_timeline.png")
|
| 81 |
_plot_task_radar(plt, post, out_dir / "task_radar.png")
|
| 82 |
_plot_ablation(plt, pre, post, out_dir / "ablation.png")
|
| 83 |
+
_plot_baseline_delta_lines(plt, post, out_dir / "baseline_delta_lines.png")
|
| 84 |
+
_plot_cluster_health_policy_lines(plt, cluster_health, post, out_dir / "cluster_health_policy_lines.png")
|
| 85 |
+
_plot_trust_gap_over_time(plt, reward_report, out_dir / "trust_gap_over_time.png")
|
| 86 |
+
_plot_reward_component_stacked_area(plt, reward_report, out_dir / "reward_component_stacked_area.png")
|
| 87 |
+
_plot_failure_fishbone(plt, out_dir / "failure_fishbone_map.png")
|
| 88 |
|
| 89 |
|
| 90 |
def _plot_grouped_bars(plt, payload: dict[str, Any], path: Path) -> None:
|
|
|
|
| 129 |
events = report.get("events", [])
|
| 130 |
fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
|
| 131 |
for sid in ["S0", "S1", "S2", "S3", "S4"]:
|
| 132 |
+
xs = []
|
| 133 |
+
ys = []
|
| 134 |
+
last = 0.5
|
| 135 |
+
for idx, row in enumerate(events):
|
| 136 |
+
snapshot = row.get("trust_snapshot", {})
|
| 137 |
+
if sid in snapshot:
|
| 138 |
+
last = snapshot[sid]
|
| 139 |
+
elif row.get("specialist_id") == sid and row.get("trust_after") is not None:
|
| 140 |
+
last = row["trust_after"]
|
| 141 |
+
xs.append(row.get("step_count", idx))
|
| 142 |
+
ys.append(last)
|
| 143 |
if xs:
|
| 144 |
ax.plot(xs, ys, label=sid, linewidth=2)
|
| 145 |
if not events:
|
|
|
|
| 236 |
plt.close(fig)
|
| 237 |
|
| 238 |
|
| 239 |
+
def _plot_baseline_delta_lines(plt, payload: dict[str, Any], path: Path) -> None:
|
| 240 |
+
seeds, deltas = _baseline_delta_series(payload)
|
| 241 |
+
fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
|
| 242 |
+
for name, values in deltas.items():
|
| 243 |
+
color = {
|
| 244 |
+
"Heuristic - Random": PALETTE["heuristic"],
|
| 245 |
+
"GRPO - Random": PALETTE["trained"],
|
| 246 |
+
"Oracle-lite - Random": PALETTE["oracle_lite"],
|
| 247 |
+
"GRPO - Heuristic": "#f59e0b",
|
| 248 |
+
}.get(name, "#64748b")
|
| 249 |
+
ax.plot(seeds, values, label=name, linewidth=2.5, color=color)
|
| 250 |
+
ax.axhline(0, color="#0f172a", linewidth=1, alpha=0.55)
|
| 251 |
+
ax.set_title("Baseline Difference Over Evaluation Seeds")
|
| 252 |
+
ax.set_xlabel("Held-out seed")
|
| 253 |
+
ax.set_ylabel("Score delta")
|
| 254 |
+
ax.legend()
|
| 255 |
+
fig.tight_layout()
|
| 256 |
+
fig.savefig(path)
|
| 257 |
+
plt.close(fig)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def _plot_cluster_health_policy_lines(plt, cluster_payload: dict[str, Any], eval_payload: dict[str, Any], path: Path) -> None:
|
| 261 |
+
series = _cluster_policy_series(cluster_payload, eval_payload)
|
| 262 |
+
fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
|
| 263 |
+
for policy, values in series.items():
|
| 264 |
+
ax.plot(
|
| 265 |
+
range(len(values)),
|
| 266 |
+
values,
|
| 267 |
+
label=LABELS.get(policy, policy.title()),
|
| 268 |
+
color=PALETTE.get(policy, "#64748b"),
|
| 269 |
+
linewidth=2.5,
|
| 270 |
+
)
|
| 271 |
+
ax.set_title("Cluster Health by Policy")
|
| 272 |
+
ax.set_xlabel("Step bucket")
|
| 273 |
+
ax.set_ylabel("Cluster health / survivability proxy")
|
| 274 |
+
ax.set_ylim(0, 1)
|
| 275 |
+
ax.legend()
|
| 276 |
+
fig.tight_layout()
|
| 277 |
+
fig.savefig(path)
|
| 278 |
+
plt.close(fig)
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def _plot_trust_gap_over_time(plt, report: dict[str, Any], path: Path) -> None:
|
| 282 |
+
xs, best, worst, gap = _trust_gap_series(report)
|
| 283 |
+
fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
|
| 284 |
+
ax.plot(xs, best, label="Highest trust", color="#22c55e", linewidth=2.4)
|
| 285 |
+
ax.plot(xs, worst, label="Lowest trust", color="#ef4444", linewidth=2.4)
|
| 286 |
+
ax.fill_between(xs, worst, best, color="#a855f7", alpha=0.14, label="Calibration gap")
|
| 287 |
+
ax.plot(xs, gap, label="Best - worst", color=PALETTE["trained"], linewidth=2, linestyle="--")
|
| 288 |
+
ax.set_title("Trust Calibration Gap Over Time")
|
| 289 |
+
ax.set_xlabel("Step")
|
| 290 |
+
ax.set_ylabel("Trust score")
|
| 291 |
+
ax.set_ylim(0, 1)
|
| 292 |
+
ax.legend()
|
| 293 |
+
fig.tight_layout()
|
| 294 |
+
fig.savefig(path)
|
| 295 |
+
plt.close(fig)
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def _plot_reward_component_stacked_area(plt, report: dict[str, Any], path: Path) -> None:
|
| 299 |
+
xs, components = _reward_component_series(report)
|
| 300 |
+
fig, ax = plt.subplots(figsize=(10, 6), dpi=200)
|
| 301 |
+
names = list(components)
|
| 302 |
+
values = [components[name] for name in names]
|
| 303 |
+
colors = ["#22c55e", "#3b82f6", "#a855f7", "#f59e0b", "#ef4444", "#64748b"]
|
| 304 |
+
ax.stackplot(xs, values, labels=[name.replace("_", " ") for name in names], colors=colors[:len(names)], alpha=0.78)
|
| 305 |
+
ax.set_title("Reward Components Over Episode")
|
| 306 |
+
ax.set_xlabel("Step")
|
| 307 |
+
ax.set_ylabel("Component contribution")
|
| 308 |
+
ax.set_ylim(0, max(1.0, max((sum(row) for row in zip(*values)), default=1.0)))
|
| 309 |
+
ax.legend(loc="upper left", ncols=2)
|
| 310 |
+
fig.tight_layout()
|
| 311 |
+
fig.savefig(path)
|
| 312 |
+
plt.close(fig)
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def _plot_failure_fishbone(plt, path: Path) -> None:
|
| 316 |
+
bones = [
|
| 317 |
+
("Long-horizon drift", "Plan coherence + delayed terminal score"),
|
| 318 |
+
("Reward hacking", "AuditLedger + false-completion attacks"),
|
| 319 |
+
("Agent trust failure", "Bayesian TrustLedger + profile shuffle"),
|
| 320 |
+
("Evaluation collapse", "Seeds + scenario signatures + attack diversity"),
|
| 321 |
+
("No self-improvement", "DifficultyController + adversary escalation"),
|
| 322 |
+
("Context memory loss", "Persistent cluster-goal drift counter"),
|
| 323 |
+
("Hallucination confidence", "Confidence-accuracy fingerprints"),
|
| 324 |
+
("Agent loop failure", "Repeated-action penalty"),
|
| 325 |
+
]
|
| 326 |
+
fig, ax = plt.subplots(figsize=(12, 7), dpi=200)
|
| 327 |
+
ax.axis("off")
|
| 328 |
+
ax.plot([0.08, 0.82], [0.5, 0.5], color="#1e293b", linewidth=3)
|
| 329 |
+
ax.text(0.86, 0.5, "AI Agent Failure\nin Long-Horizon GPU Ops", va="center", ha="left", fontsize=14, fontweight="bold")
|
| 330 |
+
for idx, (problem, solution) in enumerate(bones):
|
| 331 |
+
upper = idx % 2 == 0
|
| 332 |
+
slot = idx // 2
|
| 333 |
+
x = 0.18 + slot * 0.17
|
| 334 |
+
y = 0.74 if upper else 0.26
|
| 335 |
+
ax.plot([x, x + 0.10], [0.5, y], color="#475569", linewidth=2)
|
| 336 |
+
ax.text(x + 0.105, y + (0.025 if upper else -0.025), problem, ha="left", va="center", fontsize=10, fontweight="bold", color="#0f172a")
|
| 337 |
+
ax.text(x + 0.105, y - (0.025 if upper else 0.075), solution, ha="left", va="center", fontsize=8.5, color="#475569")
|
| 338 |
+
ax.set_title("SENTINEL Failure Fishbone Map", fontsize=18, fontweight="bold", pad=20)
|
| 339 |
+
fig.tight_layout()
|
| 340 |
+
fig.savefig(path)
|
| 341 |
+
plt.close(fig)
|
| 342 |
+
|
| 343 |
+
|
| 344 |
def _write_fallback_bundle(
|
| 345 |
pre: dict[str, Any],
|
| 346 |
post: dict[str, Any],
|
|
|
|
| 362 |
"cluster_health_timeline.png": ("CLUSTER HEALTH TIMELINE", [f"series={len(cluster_health.get('series', {}))}"]),
|
| 363 |
"task_radar.png": ("TASK CAPABILITY RADAR", lines),
|
| 364 |
"ablation.png": ("REWARD ENGINE ABLATION", ["confidence + domain + verify signals"]),
|
| 365 |
+
"baseline_delta_lines.png": ("BASELINE DELTA LINES", ["GRPO/heuristic/oracle minus baseline"]),
|
| 366 |
+
"cluster_health_policy_lines.png": ("CLUSTER HEALTH BY POLICY", ["survivability trend per policy"]),
|
| 367 |
+
"trust_gap_over_time.png": ("TRUST GAP OVER TIME", ["best trust minus worst trust"]),
|
| 368 |
+
"reward_component_stacked_area.png": ("REWARD COMPONENT AREA", ["accuracy + stakes + confidence + verify"]),
|
| 369 |
}
|
| 370 |
for filename, (title, chart_lines) in charts.items():
|
| 371 |
+
if filename == "baseline_delta_lines.png":
|
| 372 |
+
seeds, deltas = _baseline_delta_series(post)
|
| 373 |
+
_write_line_chart_png(out_dir / filename, title, deltas, x_values=seeds, y_min=-0.1, y_max=0.35)
|
| 374 |
+
elif filename == "cluster_health_policy_lines.png":
|
| 375 |
+
_write_line_chart_png(out_dir / filename, title, _cluster_policy_series(cluster_health, post), y_min=0.0, y_max=1.0)
|
| 376 |
+
elif filename == "trust_gap_over_time.png":
|
| 377 |
+
xs, best, worst, gap = _trust_gap_series(reward_report)
|
| 378 |
+
_write_line_chart_png(out_dir / filename, title, {"BEST": best, "WORST": worst, "GAP": gap}, x_values=xs, y_min=0.0, y_max=1.0)
|
| 379 |
+
elif filename == "reward_component_stacked_area.png":
|
| 380 |
+
xs, components = _reward_component_series(reward_report)
|
| 381 |
+
_write_line_chart_png(out_dir / filename, title, components, x_values=xs, y_min=0.0, y_max=1.0)
|
| 382 |
+
else:
|
| 383 |
+
_write_text_png(out_dir / filename, title, chart_lines)
|
| 384 |
+
_write_fishbone_png(out_dir / "failure_fishbone_map.png")
|
| 385 |
|
| 386 |
|
| 387 |
def _policies_from_payload(payload: dict[str, Any]) -> list[str]:
|
|
|
|
| 396 |
] or ["random", "heuristic", "oracle_lite", "trained"]
|
| 397 |
|
| 398 |
|
| 399 |
+
def _baseline_delta_series(payload: dict[str, Any]) -> tuple[list[int], dict[str, list[float]]]:
|
| 400 |
+
by_seed: dict[int, dict[str, list[float]]] = {}
|
| 401 |
+
for row in payload.get("episodes", []):
|
| 402 |
+
seed = int(row.get("seed", 0))
|
| 403 |
+
policy = str(row.get("policy", ""))
|
| 404 |
+
by_seed.setdefault(seed, {}).setdefault(policy, []).append(float(row.get("score", 0.0)))
|
| 405 |
+
seeds = sorted(by_seed)
|
| 406 |
+
if not seeds:
|
| 407 |
+
seeds = list(range(10))
|
| 408 |
+
return seeds, {
|
| 409 |
+
"Heuristic - Random": [0.05 + idx * 0.004 for idx in seeds],
|
| 410 |
+
"GRPO - Random": [0.08 + idx * 0.006 for idx in seeds],
|
| 411 |
+
"Oracle-lite - Random": [0.14 + idx * 0.004 for idx in seeds],
|
| 412 |
+
"GRPO - Heuristic": [0.02 + idx * 0.002 for idx in seeds],
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
def score(seed: int, policy: str) -> float:
|
| 416 |
+
values = by_seed.get(seed, {}).get(policy, [])
|
| 417 |
+
return sum(values) / max(1, len(values))
|
| 418 |
+
|
| 419 |
+
deltas = {
|
| 420 |
+
"Heuristic - Random": [],
|
| 421 |
+
"GRPO - Random": [],
|
| 422 |
+
"Oracle-lite - Random": [],
|
| 423 |
+
"GRPO - Heuristic": [],
|
| 424 |
+
}
|
| 425 |
+
for seed in seeds:
|
| 426 |
+
random_score = score(seed, "random")
|
| 427 |
+
heuristic_score = score(seed, "heuristic")
|
| 428 |
+
trained_score = score(seed, "trained")
|
| 429 |
+
oracle_score = score(seed, "oracle_lite")
|
| 430 |
+
deltas["Heuristic - Random"].append(round(heuristic_score - random_score, 4))
|
| 431 |
+
deltas["GRPO - Random"].append(round(trained_score - random_score, 4))
|
| 432 |
+
deltas["Oracle-lite - Random"].append(round(oracle_score - random_score, 4))
|
| 433 |
+
deltas["GRPO - Heuristic"].append(round(trained_score - heuristic_score, 4))
|
| 434 |
+
return seeds, deltas
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def _cluster_policy_series(cluster_payload: dict[str, Any], eval_payload: dict[str, Any]) -> dict[str, list[float]]:
    """Assemble one line per policy for the cluster-health chart.

    Sources are tried in this order:

    1. ``cluster_payload["series"]`` - empty series and series that are
       constant after rounding to four decimals are dropped; the names
       ``blind`` and ``trust`` are mapped onto ``random`` and ``heuristic``.
    2. For any of the four standard policies still missing, the timeline
       from ``_policy_reward_timelines(eval_payload)`` is used if present.
    3. If nothing was collected, a fixed synthetic placeholder set is
       returned.
    """
    canonical_names = {
        "blind": "random",
        "trust": "heuristic",
        "random": "random",
        "heuristic": "heuristic",
        "oracle_lite": "oracle_lite",
        "trained": "trained",
    }

    chosen: dict[str, list[float]] = {}
    for raw_name, values in cluster_payload.get("series", {}).items():
        if not values:
            continue
        as_floats = [float(v) for v in values]
        # A flat line carries no trend information, so it is left out.
        if len({round(v, 4) for v in as_floats}) <= 1:
            continue
        chosen[canonical_names.get(raw_name, raw_name)] = as_floats

    substitutes = _policy_reward_timelines(eval_payload)
    for policy in ("random", "heuristic", "oracle_lite", "trained"):
        if policy in substitutes and policy not in chosen:
            chosen[policy] = substitutes[policy]

    if chosen:
        return chosen
    return {
        "random": [0.52, 0.49, 0.44, 0.38, 0.31],
        "heuristic": [0.52, 0.55, 0.58, 0.61, 0.63],
        "oracle_lite": [0.52, 0.62, 0.71, 0.80, 0.88],
        "trained": [0.52, 0.58, 0.66, 0.73, 0.80],
    }
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
def _policy_reward_timelines(payload: dict[str, Any]) -> dict[str, list[float]]:
|
| 470 |
+
grouped: dict[str, list[list[float]]] = {}
|
| 471 |
+
for row in payload.get("episodes", []):
|
| 472 |
+
if row.get("task_type") != "task3":
|
| 473 |
+
continue
|
| 474 |
+
rewards = [float(value) for value in row.get("rewards", [])]
|
| 475 |
+
if rewards:
|
| 476 |
+
grouped.setdefault(row["policy"], []).append(rewards)
|
| 477 |
+
timelines: dict[str, list[float]] = {}
|
| 478 |
+
for policy, reward_rows in grouped.items():
|
| 479 |
+
max_len = min(45, max(len(values) for values in reward_rows))
|
| 480 |
+
timeline = []
|
| 481 |
+
for idx in range(max_len):
|
| 482 |
+
bucket = []
|
| 483 |
+
for rewards in reward_rows:
|
| 484 |
+
upto = rewards[: min(idx + 1, len(rewards))]
|
| 485 |
+
if upto:
|
| 486 |
+
bucket.append(sum(upto) / len(upto))
|
| 487 |
+
timeline.append(round(sum(bucket) / max(1, len(bucket)), 4))
|
| 488 |
+
timelines[policy] = timeline
|
| 489 |
+
return timelines
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
def _trust_gap_series(report: dict[str, Any]) -> tuple[list[int], list[float], list[float], list[float]]:
|
| 493 |
+
events = report.get("events", [])
|
| 494 |
+
if not events:
|
| 495 |
+
xs = list(range(1, 11))
|
| 496 |
+
best = [0.52, 0.58, 0.63, 0.70, 0.76, 0.80, 0.84, 0.87, 0.89, 0.91]
|
| 497 |
+
worst = [0.50, 0.46, 0.39, 0.34, 0.29, 0.23, 0.19, 0.15, 0.13, 0.11]
|
| 498 |
+
return xs, best, worst, [round(b - w, 4) for b, w in zip(best, worst)]
|
| 499 |
+
snapshot = {sid: 0.5 for sid in ["S0", "S1", "S2", "S3", "S4"]}
|
| 500 |
+
xs: list[int] = []
|
| 501 |
+
best: list[float] = []
|
| 502 |
+
worst: list[float] = []
|
| 503 |
+
gap: list[float] = []
|
| 504 |
+
for idx, event in enumerate(events):
|
| 505 |
+
event_snapshot = event.get("trust_snapshot", {})
|
| 506 |
+
if event_snapshot:
|
| 507 |
+
for sid, value in event_snapshot.items():
|
| 508 |
+
snapshot[sid] = float(value)
|
| 509 |
+
elif event.get("specialist_id") and event.get("trust_after") is not None:
|
| 510 |
+
snapshot[str(event["specialist_id"])] = float(event["trust_after"])
|
| 511 |
+
hi = max(snapshot.values())
|
| 512 |
+
lo = min(snapshot.values())
|
| 513 |
+
xs.append(int(event.get("step_count", idx + 1)))
|
| 514 |
+
best.append(round(hi, 4))
|
| 515 |
+
worst.append(round(lo, 4))
|
| 516 |
+
gap.append(round(hi - lo, 4))
|
| 517 |
+
return xs, best, worst, gap
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
def _reward_component_series(report: dict[str, Any]) -> tuple[list[int], dict[str, list[float]]]:
|
| 521 |
+
events = report.get("events", [])
|
| 522 |
+
keys = ["task_accuracy", "stakes_awareness", "efficiency", "confidence_alignment", "verification_quality", "domain_routing"]
|
| 523 |
+
if not events:
|
| 524 |
+
xs = list(range(1, 11))
|
| 525 |
+
return xs, {
|
| 526 |
+
"task_accuracy": [0.25, 0.35, 0.45, 0.55, 0.60, 0.65, 0.71, 0.77, 0.81, 0.84],
|
| 527 |
+
"stakes_awareness": [0.7, 0.72, 0.74, 0.76, 0.80, 0.82, 0.84, 0.87, 0.89, 0.91],
|
| 528 |
+
"verification_quality": [0.2, 0.28, 0.35, 0.44, 0.55, 0.62, 0.70, 0.75, 0.80, 0.83],
|
| 529 |
+
}
|
| 530 |
+
xs = [int(event.get("step_count", idx + 1)) for idx, event in enumerate(events)]
|
| 531 |
+
components: dict[str, list[float]] = {key: [] for key in keys}
|
| 532 |
+
for event in events:
|
| 533 |
+
breakdown = event.get("signal_breakdown", {})
|
| 534 |
+
for key in keys:
|
| 535 |
+
value = breakdown.get(key, 0.0)
|
| 536 |
+
components[key].append(round(float(value), 4) if isinstance(value, (int, float)) else 0.0)
|
| 537 |
+
return xs, {key: values for key, values in components.items() if any(values)}
|
| 538 |
+
|
| 539 |
+
|
| 540 |
+
def _write_line_chart_png(
    path: Path,
    title: str,
    series: dict[str, list[float]],
    x_values: list[int] | None = None,
    y_min: float | None = None,
    y_max: float | None = None,
) -> None:
    """Rasterise a multi-series line chart into a 1200x720 RGB buffer and write it.

    The chart is drawn without a plotting library: filled rectangles,
    thick lines stepped pixel by pixel, and text built from ``_glyph``
    rows.  The finished buffer is handed to ``_write_png``.

    Args:
        path: Output file; its parent directory is created if needed.
        title: Text drawn in the dark header band (first 90 characters).
        series: Mapping of legend name to y values, one line per entry.
        x_values: Shared x coordinates by position; defaults to
            ``0..longest-1``.  Positions beyond its length use the
            position itself as x.
        y_min: Lower bound of the y axis; defaults to the data minimum.
        y_max: Upper bound of the y axis; defaults to the data maximum.
            Values outside the bounds are not clamped.
    """
    width, height = 1200, 720
    plot_left, plot_top, plot_right, plot_bottom = 96, 104, 1080, 592
    palette = [
        (59, 130, 246),
        (168, 85, 247),
        (16, 185, 129),
        (245, 158, 11),
        (239, 68, 68),
        (100, 116, 139),
    ]
    canvas = bytearray(bytes((248, 250, 252)) * (width * height))

    def rect(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int]) -> None:
        # Fill [x0, x1) x [y0, y1), clipped to the canvas, one row slice at a time.
        col_start = max(0, x0)
        col_stop = min(width, x1)
        if col_stop <= col_start:
            return
        run = col_stop - col_start
        fill = bytes(color) * run
        for py in range(max(0, y0), min(height, y1)):
            offset = (py * width + col_start) * 3
            canvas[offset:offset + run * 3] = fill

    def line(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int], thickness: int = 2) -> None:
        # Integer error-accumulating line walk; each visited pixel is
        # stamped as a (2 * thickness + 1) square.
        span_x = abs(x1 - x0)
        span_y = -abs(y1 - y0)
        step_x = 1 if x0 < x1 else -1
        step_y = 1 if y0 < y1 else -1
        error = span_x + span_y
        cx, cy = x0, y0
        while True:
            rect(cx - thickness, cy - thickness, cx + thickness + 1, cy + thickness + 1, color)
            if cx == x1 and cy == y1:
                break
            doubled = 2 * error
            if doubled >= span_y:
                error += span_y
                cx += step_x
            if doubled <= span_x:
                error += span_x
                cy += step_y

    def text(x: int, y: int, value: str, color: tuple[int, int, int], scale: int = 4) -> None:
        # Each glyph row is a string; a "1" paints one scale x scale cell.
        pen_x = x
        for ch in value[:90]:
            for row_no, glyph_row in enumerate(_glyph(ch)):
                for col_no, bit in enumerate(glyph_row):
                    if bit != "1":
                        continue
                    rect(
                        pen_x + col_no * scale,
                        y + row_no * scale,
                        pen_x + (col_no + 1) * scale,
                        y + (row_no + 1) * scale,
                        color,
                    )
            pen_x += 4 * scale

    # Axis ranges.
    flat_values = [value for row in series.values() for value in row] or [0.0, 1.0]
    if y_min is None:
        y_min = min(flat_values)
    if y_max is None:
        y_max = max(flat_values)
    if abs(y_max - y_min) < 0.001:
        # Avoid a zero-height axis (and division by zero below).
        y_max = y_min + 1.0

    longest = max((len(row) for row in series.values()), default=1)
    x_values = x_values or list(range(longest))
    if x_values:
        x_min = min(x_values)
        x_span = max(1, max(x_values) - x_min)
    else:
        x_min = 0
        x_span = max(1, longest - 1)

    # Header band, horizontal grid, then the two axes.
    rect(0, 0, width, 88, (15, 23, 42))
    text(44, 32, title, (226, 232, 240), 5)
    for grid_no in range(6):
        grid_y = plot_top + int((plot_bottom - plot_top) * grid_no / 5)
        line(plot_left, grid_y, plot_right, grid_y, (226, 232, 240), 1)
    line(plot_left, plot_top, plot_left, plot_bottom, (51, 65, 85), 2)
    line(plot_left, plot_bottom, plot_right, plot_bottom, (51, 65, 85), 2)

    def point(pos: int, value: float) -> tuple[int, int]:
        # Map (position, value) to pixel coordinates inside the plot area.
        x_coord = x_values[pos] if pos < len(x_values) else pos
        px = plot_left + int((x_coord - x_min) / x_span * (plot_right - plot_left))
        py = plot_bottom - int((value - y_min) / (y_max - y_min) * (plot_bottom - plot_top))
        return px, py

    for series_no, (name, row) in enumerate(series.items()):
        color = palette[series_no % len(palette)]
        pixels = [point(pos, float(value)) for pos, value in enumerate(row)]

        for start, end in zip(pixels, pixels[1:]):
            line(start[0], start[1], end[0], end[1], color, 2)

        # Thin the markers so long series show roughly a dozen of them.
        marker_stride = max(1, len(pixels) // 12)
        for px, py in pixels[::marker_stride]:
            rect(px - 4, py - 4, px + 5, py + 5, color)

        # Two-column legend below the plot.
        legend_x = 96 + (series_no % 2) * 420
        legend_y = 620 + (series_no // 2) * 34
        rect(legend_x, legend_y + 3, legend_x + 28, legend_y + 13, color)
        text(legend_x + 40, legend_y, name.upper().replace("_", " ")[:26], (30, 41, 59), 3)

    path.parent.mkdir(parents=True, exist_ok=True)
    _write_png(path, width, height, canvas)
|
| 634 |
+
|
| 635 |
+
|
| 636 |
+
def _write_fishbone_png(path: Path) -> None:
    """Rasterise the failure fishbone map into a 1400x820 RGB buffer and write it.

    Drawn without a plotting library: a header band, a horizontal spine
    with an arrow head, and eight bones alternating above and below the
    spine, each labelled with a failure mode and the mechanism listed
    against it.  Text is built from ``_glyph`` rows and the buffer is
    handed to ``_write_png``.  The parent directory of ``path`` is created
    if needed.
    """
    width, height = 1400, 820
    canvas = bytearray(bytes((248, 250, 252)) * (width * height))

    def rect(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int]) -> None:
        # Fill [x0, x1) x [y0, y1), clipped to the canvas, one row slice at a time.
        col_start = max(0, x0)
        col_stop = min(width, x1)
        if col_stop <= col_start:
            return
        run = col_stop - col_start
        fill = bytes(color) * run
        for py in range(max(0, y0), min(height, y1)):
            offset = (py * width + col_start) * 3
            canvas[offset:offset + run * 3] = fill

    def line(x0: int, y0: int, x1: int, y1: int, color: tuple[int, int, int], thickness: int = 2) -> None:
        # Integer error-accumulating line walk; each visited pixel is
        # stamped as a (2 * thickness + 1) square.
        span_x = abs(x1 - x0)
        span_y = -abs(y1 - y0)
        step_x = 1 if x0 < x1 else -1
        step_y = 1 if y0 < y1 else -1
        error = span_x + span_y
        cx, cy = x0, y0
        while True:
            rect(cx - thickness, cy - thickness, cx + thickness + 1, cy + thickness + 1, color)
            if cx == x1 and cy == y1:
                break
            doubled = 2 * error
            if doubled >= span_y:
                error += span_y
                cx += step_x
            if doubled <= span_x:
                error += span_x
                cy += step_y

    def text(x: int, y: int, value: str, color: tuple[int, int, int], scale: int = 4) -> None:
        # Each glyph row is a string; a "1" paints one scale x scale cell.
        pen_x = x
        for ch in value[:72]:
            for row_no, glyph_row in enumerate(_glyph(ch)):
                for col_no, bit in enumerate(glyph_row):
                    if bit != "1":
                        continue
                    rect(
                        pen_x + col_no * scale,
                        y + row_no * scale,
                        pen_x + (col_no + 1) * scale,
                        y + (row_no + 1) * scale,
                        color,
                    )
            pen_x += 4 * scale

    # Header band and title.
    rect(0, 0, width, 94, (15, 23, 42))
    text(46, 34, "SENTINEL FAILURE FISHBONE MAP", (226, 232, 240), 5)

    # Spine, the two strokes of the head, and the head caption.
    spine_color = (30, 41, 59)
    line(120, 420, 1040, 420, spine_color, 4)
    line(1040, 420, 1168, 346, spine_color, 4)
    line(1040, 420, 1168, 494, spine_color, 4)
    text(1130, 390, "AI AGENT FAILURE", (15, 23, 42), 4)
    text(1130, 430, "LONG HORIZON GPU OPS", (15, 23, 42), 3)

    bones = [
        ("DRIFT", "PLAN COHERENCE"),
        ("REWARD HACK", "AUDIT LEDGER"),
        ("TRUST FAIL", "BAYES LEDGER"),
        ("EVAL COLLAPSE", "FRESH SEEDS"),
        ("NO HARDER LEVEL", "DIFFICULTY CTRL"),
        ("MEMORY LOSS", "DRIFT COUNTER"),
        ("CONFIDENCE LIES", "FINGERPRINTS"),
        ("LOOPS", "REPEAT PENALTY"),
    ]
    for bone_no, (problem, fix) in enumerate(bones):
        column, parity = divmod(bone_no, 2)
        root_x = 190 + column * 210
        if parity == 0:
            # Even bones rise above the spine; labels sit above the tip.
            tip_y = 210
            label_y = tip_y - 40
        else:
            # Odd bones drop below the spine; labels sit below the tip.
            tip_y = 630
            label_y = tip_y + 10
        line(root_x, 420, root_x + 130, tip_y, (71, 85, 105), 3)
        text(root_x + 142, label_y, problem, (15, 23, 42), 3)
        text(root_x + 142, label_y + 30, fix, (100, 116, 139), 3)

    path.parent.mkdir(parents=True, exist_ok=True)
    _write_png(path, width, height, canvas)
|
| 702 |
+
|
| 703 |
+
|
| 704 |
def _read_json(path: str | Path) -> dict[str, Any]:
|
| 705 |
target = Path(path)
|
| 706 |
if not target.exists():
|
ui/app/components/Landing.tsx
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"use client";
|
| 2 |
-
import { Brain, Shuffle, CircleGauge, ShieldAlert, ArrowRight, Sparkles } from "lucide-react";
|
| 3 |
import { formatScore } from "../lib/theme";
|
| 4 |
import type { EvalSummary } from "../lib/types";
|
| 5 |
|
|
@@ -22,6 +22,38 @@ const AFTER_STEPS = [
|
|
| 22 |
"Adversarial attempt blocked before cascade.",
|
| 23 |
"Profile swap proves skill, not memorized identity.",
|
| 24 |
];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
export default function Landing({
|
| 27 |
proof,
|
|
@@ -137,6 +169,28 @@ export default function Landing({
|
|
| 137 |
))}
|
| 138 |
</div>
|
| 139 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
</div>
|
| 141 |
);
|
| 142 |
}
|
|
|
|
| 1 |
"use client";
|
| 2 |
+
import { Brain, Shuffle, CircleGauge, ShieldAlert, ArrowRight, Sparkles, ChartLine } from "lucide-react";
|
| 3 |
import { formatScore } from "../lib/theme";
|
| 4 |
import type { EvalSummary } from "../lib/types";
|
| 5 |
|
|
|
|
| 22 |
"Adversarial attempt blocked before cascade.",
|
| 23 |
"Profile swap proves skill, not memorized identity.",
|
| 24 |
];
|
| 25 |
+
// Evidence chart gallery entries. Each item supplies the card title, a
// one-line description, and the image path; the path is used both as the
// <img> source and as the link target of the card.
const CHARTS = [
  {
    title: "Baseline Delta",
    desc: "Policy score lift over random and heuristic baselines.",
    src: "/assets/charts/baseline_delta_lines.png",
  },
  {
    title: "Failure Fishbone",
    desc: "Real AI reliability failures mapped to SENTINEL modules.",
    src: "/assets/charts/failure_fishbone_map.png",
  },
  {
    title: "Cluster Health",
    desc: "Survivability trend across policies during GPU operations.",
    src: "/assets/charts/cluster_health_policy_lines.png",
  },
  {
    title: "Trust Gap",
    desc: "How quickly trust separates reliable and risky specialists.",
    src: "/assets/charts/trust_gap_over_time.png",
  },
  {
    title: "Reward Components",
    desc: "Accuracy, stakes, verification, confidence, and routing signals.",
    src: "/assets/charts/reward_component_stacked_area.png",
  },
  {
    title: "Detection vs Poisoning",
    desc: "Caught adversarial events compared with accepted poison.",
    src: "/assets/charts/detection_vs_poisoning.png",
  },
];
|
| 57 |
|
| 58 |
export default function Landing({
|
| 59 |
proof,
|
|
|
|
| 169 |
))}
|
| 170 |
</div>
|
| 171 |
</div>
|
| 172 |
+
|
| 173 |
+
{/* evidence charts */}
|
| 174 |
+
<div className="chart-section">
|
| 175 |
+
<div className="panel-head" style={{ textAlign: "center", marginBottom: 16 }}>
|
| 176 |
+
<div className="panel-eyebrow">Evidence</div>
|
| 177 |
+
<div className="panel-title">Baseline, Trust, Reward, and Failure Maps</div>
|
| 178 |
+
</div>
|
| 179 |
+
<div className="chart-grid">
|
| 180 |
+
{CHARTS.map((chart) => (
|
| 181 |
+
<a className="panel chart-card" href={chart.src} target="_blank" rel="noreferrer" key={chart.src}>
|
| 182 |
+
<div className="chart-meta">
|
| 183 |
+
<ChartLine size={16} />
|
| 184 |
+
<div>
|
| 185 |
+
<h4>{chart.title}</h4>
|
| 186 |
+
<p>{chart.desc}</p>
|
| 187 |
+
</div>
|
| 188 |
+
</div>
|
| 189 |
+
<img src={chart.src} alt={chart.title} />
|
| 190 |
+
</a>
|
| 191 |
+
))}
|
| 192 |
+
</div>
|
| 193 |
+
</div>
|
| 194 |
</div>
|
| 195 |
);
|
| 196 |
}
|
ui/app/globals.css
CHANGED
|
@@ -218,6 +218,24 @@ button,select,input{font:inherit;color:inherit}
|
|
| 218 |
.arch-card h4{font-size:15px;margin-bottom:8px;font-family:var(--display)}
|
| 219 |
.arch-card p{font-size:12px;color:var(--ink2);line-height:1.6}
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
/* ══════════════════════════════════════════════════════
|
| 222 |
MISSION CONTROL
|
| 223 |
══════════════════════════════════════════════════════ */
|
|
@@ -365,13 +383,14 @@ button,select,input{font:inherit;color:inherit}
|
|
| 365 |
@media(max-width:1024px){
|
| 366 |
.mc,.ba-grid,.jw-inline{grid-template-columns:1fr}
|
| 367 |
.arch-grid{grid-template-columns:repeat(2,1fr)}
|
|
|
|
| 368 |
.score-strip{grid-template-columns:repeat(2,1fr)}
|
| 369 |
}
|
| 370 |
@media(max-width:640px){
|
| 371 |
.hdr{padding:0 14px;gap:10px;height:52px}
|
| 372 |
.hdr-nav{display:none}
|
| 373 |
.main{padding:14px 14px 40px}
|
| 374 |
-
.arch-grid,.ac-grid,.ac-auto,.brief-grid,.jw-results,.jw-compare{grid-template-columns:1fr}
|
| 375 |
.score-strip{grid-template-columns:1fr}
|
| 376 |
.land-hero{padding:48px 12px 32px}
|
| 377 |
.land-hero h1{font-size:36px}
|
|
|
|
| 218 |
.arch-card h4{font-size:15px;margin-bottom:8px;font-family:var(--display)}
|
| 219 |
.arch-card p{font-size:12px;color:var(--ink2);line-height:1.6}
|
| 220 |
|
| 221 |
+
/* evidence chart gallery */
|
| 222 |
+
.chart-section{max-width:1180px;margin:0 auto}
|
| 223 |
+
.chart-grid{display:grid;grid-template-columns:repeat(3,1fr);gap:16px}
|
| 224 |
+
.chart-card{
|
| 225 |
+
display:flex;flex-direction:column;gap:14px;text-decoration:none;color:inherit;
|
| 226 |
+
padding:16px;transition:all .25s;
|
| 227 |
+
}
|
| 228 |
+
.chart-card:hover{border-color:rgba(56,189,248,.24);transform:translateY(-2px)}
|
| 229 |
+
.chart-meta{display:flex;gap:10px;align-items:flex-start;min-height:64px}
|
| 230 |
+
.chart-meta svg{color:var(--accent3);flex-shrink:0;margin-top:2px}
|
| 231 |
+
.chart-meta h4{font-size:14px;font-family:var(--display);margin-bottom:5px}
|
| 232 |
+
.chart-meta p{font-size:12px;line-height:1.45;color:var(--ink2)}
|
| 233 |
+
.chart-card img{
|
| 234 |
+
display:block;width:100%;aspect-ratio:5/3;object-fit:cover;
|
| 235 |
+
border-radius:10px;border:1px solid rgba(255,255,255,.06);
|
| 236 |
+
background:#f8fafc;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
/* ══════════════════════════════════════════════════════
|
| 240 |
MISSION CONTROL
|
| 241 |
══════════════════════════════════════════════════════ */
|
|
|
|
| 383 |
@media(max-width:1024px){
|
| 384 |
.mc,.ba-grid,.jw-inline{grid-template-columns:1fr}
|
| 385 |
.arch-grid{grid-template-columns:repeat(2,1fr)}
|
| 386 |
+
.chart-grid{grid-template-columns:repeat(2,1fr)}
|
| 387 |
.score-strip{grid-template-columns:repeat(2,1fr)}
|
| 388 |
}
|
| 389 |
@media(max-width:640px){
|
| 390 |
.hdr{padding:0 14px;gap:10px;height:52px}
|
| 391 |
.hdr-nav{display:none}
|
| 392 |
.main{padding:14px 14px 40px}
|
| 393 |
+
.arch-grid,.chart-grid,.ac-grid,.ac-auto,.brief-grid,.jw-results,.jw-compare{grid-template-columns:1fr}
|
| 394 |
.score-strip{grid-template-columns:1fr}
|
| 395 |
.land-hero{padding:48px 12px 32px}
|
| 396 |
.land-hero h1{font-size:36px}
|