Spaces:
Running
Running
Build SENTINEL phase 3 judge demo UI
Browse files- app.py +8 -0
- static/index.html +507 -22
app.py
CHANGED
|
@@ -88,6 +88,14 @@ def baseline_comparison_chart():
|
|
| 88 |
return FileResponse(chart_path, media_type="image/png")
|
| 89 |
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
@app.get("/api")
|
| 92 |
def api_root():
|
| 93 |
return {
|
|
|
|
| 88 |
return FileResponse(chart_path, media_type="image/png")
|
| 89 |
|
| 90 |
|
| 91 |
+
@app.get("/assets/evaluation_results.json")
def evaluation_results():
    """Serve the saved evaluation results as a JSON file.

    The file is looked up as ``evaluation_results.json`` inside
    ``_OUTPUTS_DIR``.

    Returns:
        FileResponse: the file, sent with media type ``application/json``.

    Raises:
        HTTPException: 404 when no regular file exists at the expected path.
    """
    results_path = _OUTPUTS_DIR / "evaluation_results.json"
    # is_file() rather than exists(): a directory (or other non-regular entry)
    # at this path should yield the same 404 as a missing file, instead of
    # passing the guard and failing later while the response is being sent.
    # NOTE(review): assumes _OUTPUTS_DIR is a pathlib.Path, as the `/` join
    # and the original .exists() call suggest — confirm at its definition.
    if not results_path.is_file():
        raise HTTPException(status_code=404, detail="Evaluation results not found.")
    return FileResponse(results_path, media_type="application/json")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
@app.get("/api")
|
| 100 |
def api_root():
|
| 101 |
return {
|
static/index.html
CHANGED
|
@@ -193,6 +193,50 @@
|
|
| 193 |
flex-wrap: wrap;
|
| 194 |
}
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
.console {
|
| 197 |
width: min(1540px, 100%);
|
| 198 |
margin: 0 auto;
|
|
@@ -201,12 +245,14 @@
|
|
| 201 |
gap: 14px;
|
| 202 |
grid-template-columns: minmax(420px, 1.35fr) minmax(340px, 0.85fr);
|
| 203 |
grid-template-areas:
|
|
|
|
| 204 |
"theater command"
|
| 205 |
"mission playground"
|
| 206 |
"trust playground"
|
| 207 |
-
"story
|
| 208 |
"proof events"
|
| 209 |
-
"flow themes"
|
|
|
|
| 210 |
align-items: start;
|
| 211 |
}
|
| 212 |
|
|
@@ -219,18 +265,24 @@
|
|
| 219 |
overflow: hidden;
|
| 220 |
}
|
| 221 |
|
|
|
|
| 222 |
.theater { grid-area: theater; }
|
| 223 |
.command { grid-area: command; }
|
| 224 |
.mission { grid-area: mission; }
|
| 225 |
.trust { grid-area: trust; }
|
| 226 |
.playground { grid-area: playground; }
|
| 227 |
.story { grid-area: story; }
|
|
|
|
| 228 |
.readiness { grid-area: readiness; }
|
| 229 |
.proof { grid-area: proof; }
|
| 230 |
.events { grid-area: events; }
|
| 231 |
.flow { grid-area: flow; }
|
| 232 |
.themes { grid-area: themes; }
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
.section-head {
|
| 235 |
min-height: 54px;
|
| 236 |
display: flex;
|
|
@@ -254,6 +306,92 @@
|
|
| 254 |
padding: 15px;
|
| 255 |
}
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
.chips {
|
| 258 |
display: flex;
|
| 259 |
flex-wrap: wrap;
|
|
@@ -904,6 +1042,57 @@
|
|
| 904 |
background: rgba(10, 12, 8, 0.35);
|
| 905 |
}
|
| 906 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 907 |
.readiness-list {
|
| 908 |
display: grid;
|
| 909 |
gap: 10px;
|
|
@@ -1062,12 +1251,14 @@
|
|
| 1062 |
.console {
|
| 1063 |
grid-template-columns: 1fr;
|
| 1064 |
grid-template-areas:
|
|
|
|
| 1065 |
"theater"
|
| 1066 |
"command"
|
| 1067 |
"mission"
|
| 1068 |
"trust"
|
| 1069 |
"playground"
|
| 1070 |
"story"
|
|
|
|
| 1071 |
"readiness"
|
| 1072 |
"proof"
|
| 1073 |
"events"
|
|
@@ -1095,10 +1286,14 @@
|
|
| 1095 |
|
| 1096 |
.stage-topline,
|
| 1097 |
.outcome-strip,
|
|
|
|
|
|
|
| 1098 |
.proof-grid,
|
| 1099 |
.json-grid,
|
| 1100 |
.playground-meta,
|
| 1101 |
.story-grid,
|
|
|
|
|
|
|
| 1102 |
.flow-line,
|
| 1103 |
.theme-grid,
|
| 1104 |
.stats-grid {
|
|
@@ -1146,6 +1341,20 @@
|
|
| 1146 |
width: 100%;
|
| 1147 |
}
|
| 1148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1149 |
.specialist-grid {
|
| 1150 |
grid-template-columns: 1fr;
|
| 1151 |
}
|
|
@@ -1180,11 +1389,86 @@
|
|
| 1180 |
<input id="seedInput" aria-label="Seed" type="number" value="42">
|
| 1181 |
<button id="resetBtn" class="primary" type="button">Reset Episode</button>
|
| 1182 |
<button id="swapBtn" class="warn" type="button">Swap Profiles</button>
|
| 1183 |
-
<button id="autoBtn" type="button">
|
| 1184 |
</div>
|
| 1185 |
</header>
|
| 1186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1187 |
<main class="console">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1188 |
<section class="theater">
|
| 1189 |
<div class="section-head">
|
| 1190 |
<h2>Live Trust Theater</h2>
|
|
@@ -1399,7 +1683,7 @@
|
|
| 1399 |
<div class="story-lane before">
|
| 1400 |
<div class="story-title">
|
| 1401 |
<strong>Without SENTINEL</strong>
|
| 1402 |
-
<span class="story-score">task3 random 0.699</span>
|
| 1403 |
</div>
|
| 1404 |
<div class="story-flow">
|
| 1405 |
<div class="story-step">All public slots start near the same trust. The orchestrator delegates with weak evidence.</div>
|
|
@@ -1412,7 +1696,7 @@
|
|
| 1412 |
<div class="story-lane after">
|
| 1413 |
<div class="story-title">
|
| 1414 |
<strong>With SENTINEL</strong>
|
| 1415 |
-
<span class="story-score">task3 heuristic 0.784</span>
|
| 1416 |
</div>
|
| 1417 |
<div class="story-flow">
|
| 1418 |
<div class="story-step">Behavior updates the TrustLedger after every step, so public slots diverge quickly.</div>
|
|
@@ -1426,6 +1710,56 @@
|
|
| 1426 |
</div>
|
| 1427 |
</section>
|
| 1428 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1429 |
<section class="readiness">
|
| 1430 |
<div class="section-head">
|
| 1431 |
<h2>Hackathon Readiness</h2>
|
|
@@ -1463,23 +1797,23 @@
|
|
| 1463 |
<div class="baseline-table">
|
| 1464 |
<div class="baseline-row">
|
| 1465 |
<span>Random</span>
|
| 1466 |
-
<div class="mini-bar"><span style="width:71.4%;background:#ff5f45"></span></div>
|
| 1467 |
-
<strong>0.714</strong>
|
| 1468 |
</div>
|
| 1469 |
<div class="baseline-row">
|
| 1470 |
<span>Heuristic</span>
|
| 1471 |
-
<div class="mini-bar"><span style="width:81.6%;background:#73a7ff"></span></div>
|
| 1472 |
-
<strong>0.816</strong>
|
| 1473 |
</div>
|
| 1474 |
<div class="baseline-row">
|
| 1475 |
<span>Oracle-lite</span>
|
| 1476 |
-
<div class="mini-bar"><span style="width:87.2%;background:#27e0a1"></span></div>
|
| 1477 |
-
<strong>0.872</strong>
|
| 1478 |
</div>
|
| 1479 |
<div class="baseline-row">
|
| 1480 |
<span>T3 detect</span>
|
| 1481 |
-
<div class="mini-bar"><span style="width:73.5%;background:#f5ba41"></span></div>
|
| 1482 |
-
<strong>0.735</strong>
|
| 1483 |
</div>
|
| 1484 |
</div>
|
| 1485 |
<div class="chart-frame">
|
|
@@ -1574,7 +1908,10 @@
|
|
| 1574 |
events: [],
|
| 1575 |
lastRequest: null,
|
| 1576 |
lastResult: null,
|
| 1577 |
-
lastMode: "reset()"
|
|
|
|
|
|
|
|
|
|
| 1578 |
};
|
| 1579 |
|
| 1580 |
const el = {
|
|
@@ -1585,6 +1922,13 @@
|
|
| 1585 |
swapBtn: document.getElementById("swapBtn"),
|
| 1586 |
swapPanelBtn: document.getElementById("swapPanelBtn"),
|
| 1587 |
autoBtn: document.getElementById("autoBtn"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1588 |
specialistSelect: document.getElementById("specialistSelect"),
|
| 1589 |
recommendChip: document.getElementById("recommendChip"),
|
| 1590 |
recommendText: document.getElementById("recommendText"),
|
|
@@ -1617,7 +1961,23 @@
|
|
| 1617 |
leadMove: document.getElementById("leadMove"),
|
| 1618 |
stageMove: document.getElementById("stageMove"),
|
| 1619 |
stageSignals: document.getElementById("stageSignals"),
|
| 1620 |
-
rewardText: document.getElementById("rewardText")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1621 |
};
|
| 1622 |
|
| 1623 |
function trustColor(value) {
|
|
@@ -1656,6 +2016,111 @@
|
|
| 1656 |
return {type: "delegate", specialist: best, trust};
|
| 1657 |
}
|
| 1658 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1659 |
function renderTrust() {
|
| 1660 |
const trust = state.observation?.trust_snapshot || Object.fromEntries(ids.map(id => [id, 0.5]));
|
| 1661 |
const values = ids.map(id => Number(trust[id] ?? 0.5));
|
|
@@ -1766,6 +2231,9 @@
|
|
| 1766 |
el.selfBtn.disabled = disabled;
|
| 1767 |
el.skipBtn.disabled = disabled;
|
| 1768 |
el.applyRecommendBtn.disabled = disabled;
|
|
|
|
|
|
|
|
|
|
| 1769 |
}
|
| 1770 |
|
| 1771 |
function render(result) {
|
|
@@ -1908,11 +2376,11 @@
|
|
| 1908 |
}
|
| 1909 |
}
|
| 1910 |
|
| 1911 |
-
async function autoRun() {
|
| 1912 |
if (!state.observation || state.done) await resetEpisode();
|
| 1913 |
let guard = 0;
|
| 1914 |
while (!state.done && guard < 70) {
|
| 1915 |
-
const move = recommendedMove();
|
| 1916 |
await stepEpisode(move.type, move.specialist);
|
| 1917 |
guard += 1;
|
| 1918 |
await new Promise(resolve => setTimeout(resolve, 150));
|
|
@@ -1924,23 +2392,40 @@
|
|
| 1924 |
await stepEpisode(move.type, move.specialist);
|
| 1925 |
}
|
| 1926 |
|
| 1927 |
-
async function
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1928 |
const nextSeed = Number(el.seedInput.value || 0) + 1;
|
| 1929 |
el.seedInput.value = String(nextSeed);
|
| 1930 |
await resetEpisode();
|
|
|
|
|
|
|
|
|
|
| 1931 |
}
|
| 1932 |
|
| 1933 |
el.resetBtn.addEventListener("click", resetEpisode);
|
| 1934 |
el.resetPanelBtn.addEventListener("click", resetEpisode);
|
| 1935 |
-
el.swapBtn.addEventListener("click", swapProfiles);
|
| 1936 |
-
el.swapPanelBtn.addEventListener("click", swapProfiles);
|
| 1937 |
el.delegateBtn.addEventListener("click", () => stepEpisode("delegate"));
|
| 1938 |
el.verifyBtn.addEventListener("click", () => stepEpisode("verify"));
|
| 1939 |
el.selfBtn.addEventListener("click", () => stepEpisode("solve_independently"));
|
| 1940 |
el.skipBtn.addEventListener("click", () => stepEpisode("skip"));
|
| 1941 |
-
el.autoBtn.addEventListener("click", autoRun);
|
| 1942 |
el.applyRecommendBtn.addEventListener("click", applyRecommendation);
|
| 1943 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1944 |
render();
|
| 1945 |
resetEpisode();
|
| 1946 |
</script>
|
|
|
|
| 193 |
flex-wrap: wrap;
|
| 194 |
}
|
| 195 |
|
| 196 |
+
.modebar {
|
| 197 |
+
padding: 0 22px 14px;
|
| 198 |
+
border-bottom: 1px solid #232920;
|
| 199 |
+
background: rgba(7, 8, 6, 0.9);
|
| 200 |
+
backdrop-filter: blur(14px);
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
.modebar-inner {
|
| 204 |
+
width: min(1540px, 100%);
|
| 205 |
+
margin: 0 auto;
|
| 206 |
+
display: flex;
|
| 207 |
+
align-items: center;
|
| 208 |
+
justify-content: space-between;
|
| 209 |
+
gap: 14px;
|
| 210 |
+
flex-wrap: wrap;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
.view-tabs {
|
| 214 |
+
display: flex;
|
| 215 |
+
align-items: center;
|
| 216 |
+
gap: 8px;
|
| 217 |
+
flex-wrap: wrap;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
.view-tab {
|
| 221 |
+
min-height: 38px;
|
| 222 |
+
border-radius: 999px;
|
| 223 |
+
padding: 0 14px;
|
| 224 |
+
background: #10140f;
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.view-tab.active {
|
| 228 |
+
border-color: rgba(39, 224, 161, 0.72);
|
| 229 |
+
background: linear-gradient(180deg, #1d6a53, #133c30);
|
| 230 |
+
color: #effff7;
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
.view-copy {
|
| 234 |
+
color: var(--muted);
|
| 235 |
+
font-size: 13px;
|
| 236 |
+
line-height: 1.4;
|
| 237 |
+
max-width: 760px;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
.console {
|
| 241 |
width: min(1540px, 100%);
|
| 242 |
margin: 0 auto;
|
|
|
|
| 245 |
gap: 14px;
|
| 246 |
grid-template-columns: minmax(420px, 1.35fr) minmax(340px, 0.85fr);
|
| 247 |
grid-template-areas:
|
| 248 |
+
"hero hero"
|
| 249 |
"theater command"
|
| 250 |
"mission playground"
|
| 251 |
"trust playground"
|
| 252 |
+
"story judge"
|
| 253 |
"proof events"
|
| 254 |
+
"flow themes"
|
| 255 |
+
"readiness readiness";
|
| 256 |
align-items: start;
|
| 257 |
}
|
| 258 |
|
|
|
|
| 265 |
overflow: hidden;
|
| 266 |
}
|
| 267 |
|
| 268 |
+
.hero { grid-area: hero; }
|
| 269 |
.theater { grid-area: theater; }
|
| 270 |
.command { grid-area: command; }
|
| 271 |
.mission { grid-area: mission; }
|
| 272 |
.trust { grid-area: trust; }
|
| 273 |
.playground { grid-area: playground; }
|
| 274 |
.story { grid-area: story; }
|
| 275 |
+
.judge { grid-area: judge; }
|
| 276 |
.readiness { grid-area: readiness; }
|
| 277 |
.proof { grid-area: proof; }
|
| 278 |
.events { grid-area: events; }
|
| 279 |
.flow { grid-area: flow; }
|
| 280 |
.themes { grid-area: themes; }
|
| 281 |
|
| 282 |
+
.section-hidden {
|
| 283 |
+
display: none;
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
.section-head {
|
| 287 |
min-height: 54px;
|
| 288 |
display: flex;
|
|
|
|
| 306 |
padding: 15px;
|
| 307 |
}
|
| 308 |
|
| 309 |
+
.hero-grid {
|
| 310 |
+
display: grid;
|
| 311 |
+
grid-template-columns: minmax(0, 1.1fr) minmax(320px, 0.9fr);
|
| 312 |
+
gap: 13px;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
.hero-panel {
|
| 316 |
+
min-height: 208px;
|
| 317 |
+
border: 1px solid #394132;
|
| 318 |
+
border-radius: 8px;
|
| 319 |
+
padding: 15px;
|
| 320 |
+
background: var(--panel-2);
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
.hero-panel.primary {
|
| 324 |
+
border-color: rgba(39, 224, 161, 0.42);
|
| 325 |
+
background:
|
| 326 |
+
linear-gradient(180deg, rgba(39, 224, 161, 0.12), transparent 45%),
|
| 327 |
+
var(--panel-2);
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.hero-panel h3,
|
| 331 |
+
.judge-card h3 {
|
| 332 |
+
font-size: 18px;
|
| 333 |
+
color: var(--cream);
|
| 334 |
+
margin: 0 0 10px 0;
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
.hero-panel p {
|
| 338 |
+
color: #e8e1ca;
|
| 339 |
+
font-size: 14px;
|
| 340 |
+
line-height: 1.55;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.hero-callouts,
|
| 344 |
+
.hero-steps,
|
| 345 |
+
.judge-list {
|
| 346 |
+
display: grid;
|
| 347 |
+
gap: 9px;
|
| 348 |
+
margin-top: 14px;
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
.hero-callout,
|
| 352 |
+
.hero-step,
|
| 353 |
+
.judge-step {
|
| 354 |
+
min-height: 52px;
|
| 355 |
+
border: 1px solid #394132;
|
| 356 |
+
border-radius: 8px;
|
| 357 |
+
padding: 11px 12px;
|
| 358 |
+
background: #0d100b;
|
| 359 |
+
color: #ebe5cf;
|
| 360 |
+
line-height: 1.42;
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
.hero-callout strong,
|
| 364 |
+
.hero-step strong,
|
| 365 |
+
.judge-step strong {
|
| 366 |
+
display: block;
|
| 367 |
+
margin-bottom: 4px;
|
| 368 |
+
color: var(--cream);
|
| 369 |
+
font-size: 13px;
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
.hero-stats {
|
| 373 |
+
margin-top: 14px;
|
| 374 |
+
display: grid;
|
| 375 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 376 |
+
gap: 10px;
|
| 377 |
+
}
|
| 378 |
+
|
| 379 |
+
.hero-stat {
|
| 380 |
+
min-height: 80px;
|
| 381 |
+
border: 1px solid #394132;
|
| 382 |
+
border-radius: 8px;
|
| 383 |
+
padding: 11px;
|
| 384 |
+
background: #0d100b;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
.hero-stat .label {
|
| 388 |
+
margin-bottom: 7px;
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
.hero-stat .value {
|
| 392 |
+
font-size: 22px;
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
.chips {
|
| 396 |
display: flex;
|
| 397 |
flex-wrap: wrap;
|
|
|
|
| 1042 |
background: rgba(10, 12, 8, 0.35);
|
| 1043 |
}
|
| 1044 |
|
| 1045 |
+
.judge-grid {
|
| 1046 |
+
display: grid;
|
| 1047 |
+
gap: 12px;
|
| 1048 |
+
}
|
| 1049 |
+
|
| 1050 |
+
.judge-stats {
|
| 1051 |
+
display: grid;
|
| 1052 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 1053 |
+
gap: 10px;
|
| 1054 |
+
}
|
| 1055 |
+
|
| 1056 |
+
.judge-card {
|
| 1057 |
+
min-height: 132px;
|
| 1058 |
+
border: 1px solid #394132;
|
| 1059 |
+
border-radius: 8px;
|
| 1060 |
+
padding: 13px;
|
| 1061 |
+
background: var(--panel-2);
|
| 1062 |
+
}
|
| 1063 |
+
|
| 1064 |
+
.judge-card.good {
|
| 1065 |
+
border-color: rgba(39, 224, 161, 0.4);
|
| 1066 |
+
background: var(--jade-soft);
|
| 1067 |
+
}
|
| 1068 |
+
|
| 1069 |
+
.judge-card.warn {
|
| 1070 |
+
border-color: rgba(245, 186, 65, 0.4);
|
| 1071 |
+
background: var(--amber-soft);
|
| 1072 |
+
}
|
| 1073 |
+
|
| 1074 |
+
.judge-card.bad {
|
| 1075 |
+
border-color: rgba(255, 95, 69, 0.4);
|
| 1076 |
+
background: var(--flame-soft);
|
| 1077 |
+
}
|
| 1078 |
+
|
| 1079 |
+
.judge-card .value {
|
| 1080 |
+
font-size: 28px;
|
| 1081 |
+
margin-top: 8px;
|
| 1082 |
+
}
|
| 1083 |
+
|
| 1084 |
+
.judge-card .muted {
|
| 1085 |
+
display: block;
|
| 1086 |
+
margin-top: 6px;
|
| 1087 |
+
line-height: 1.4;
|
| 1088 |
+
}
|
| 1089 |
+
|
| 1090 |
+
.judge-actions {
|
| 1091 |
+
display: grid;
|
| 1092 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 1093 |
+
gap: 10px;
|
| 1094 |
+
}
|
| 1095 |
+
|
| 1096 |
.readiness-list {
|
| 1097 |
display: grid;
|
| 1098 |
gap: 10px;
|
|
|
|
| 1251 |
.console {
|
| 1252 |
grid-template-columns: 1fr;
|
| 1253 |
grid-template-areas:
|
| 1254 |
+
"hero"
|
| 1255 |
"theater"
|
| 1256 |
"command"
|
| 1257 |
"mission"
|
| 1258 |
"trust"
|
| 1259 |
"playground"
|
| 1260 |
"story"
|
| 1261 |
+
"judge"
|
| 1262 |
"readiness"
|
| 1263 |
"proof"
|
| 1264 |
"events"
|
|
|
|
| 1286 |
|
| 1287 |
.stage-topline,
|
| 1288 |
.outcome-strip,
|
| 1289 |
+
.hero-grid,
|
| 1290 |
+
.hero-stats,
|
| 1291 |
.proof-grid,
|
| 1292 |
.json-grid,
|
| 1293 |
.playground-meta,
|
| 1294 |
.story-grid,
|
| 1295 |
+
.judge-stats,
|
| 1296 |
+
.judge-actions,
|
| 1297 |
.flow-line,
|
| 1298 |
.theme-grid,
|
| 1299 |
.stats-grid {
|
|
|
|
| 1341 |
width: 100%;
|
| 1342 |
}
|
| 1343 |
|
| 1344 |
+
.modebar {
|
| 1345 |
+
padding: 0 13px 12px;
|
| 1346 |
+
}
|
| 1347 |
+
|
| 1348 |
+
.view-tabs {
|
| 1349 |
+
width: 100%;
|
| 1350 |
+
flex-direction: column;
|
| 1351 |
+
align-items: stretch;
|
| 1352 |
+
}
|
| 1353 |
+
|
| 1354 |
+
.view-tab {
|
| 1355 |
+
width: 100%;
|
| 1356 |
+
}
|
| 1357 |
+
|
| 1358 |
.specialist-grid {
|
| 1359 |
grid-template-columns: 1fr;
|
| 1360 |
}
|
|
|
|
| 1389 |
<input id="seedInput" aria-label="Seed" type="number" value="42">
|
| 1390 |
<button id="resetBtn" class="primary" type="button">Reset Episode</button>
|
| 1391 |
<button id="swapBtn" class="warn" type="button">Swap Profiles</button>
|
| 1392 |
+
<button id="autoBtn" type="button">Heuristic Auto</button>
|
| 1393 |
</div>
|
| 1394 |
</header>
|
| 1395 |
|
| 1396 |
+
<div class="modebar">
|
| 1397 |
+
<div class="modebar-inner">
|
| 1398 |
+
<div class="view-tabs">
|
| 1399 |
+
<button id="viewOverviewBtn" class="view-tab active" type="button">Overview</button>
|
| 1400 |
+
<button id="viewPlaygroundBtn" class="view-tab" type="button">Playground</button>
|
| 1401 |
+
<button id="viewJudgeBtn" class="view-tab" type="button">Judge Demo</button>
|
| 1402 |
+
</div>
|
| 1403 |
+
<div id="viewCopy" class="view-copy">Overview turns the environment into a judge-readable system story: the problem, the learning signal, and the live failure mode it fixes.</div>
|
| 1404 |
+
</div>
|
| 1405 |
+
</div>
|
| 1406 |
+
|
| 1407 |
<main class="console">
|
| 1408 |
+
<section class="hero">
|
| 1409 |
+
<div class="section-head">
|
| 1410 |
+
<h2>System Overview</h2>
|
| 1411 |
+
<div class="chips">
|
| 1412 |
+
<span class="chip live">reset → step → state</span>
|
| 1413 |
+
<span class="chip">OpenEnv compatible</span>
|
| 1414 |
+
<span class="chip warn">skill, not identity</span>
|
| 1415 |
+
</div>
|
| 1416 |
+
</div>
|
| 1417 |
+
<div class="body">
|
| 1418 |
+
<div class="hero-grid">
|
| 1419 |
+
<div class="hero-panel primary">
|
| 1420 |
+
<h3>What SENTINEL actually teaches</h3>
|
| 1421 |
+
<p>SENTINEL is not training a specialist to solve one domain task. It trains the orchestrator to decide who to trust, when to verify, when to self-solve, and how to recover when one public slot turns unreliable or adversarial inside a long multi-agent task graph.</p>
|
| 1422 |
+
<div class="hero-callouts">
|
| 1423 |
+
<div class="hero-callout">
|
| 1424 |
+
<strong>Observation model</strong>
|
| 1425 |
+
The orchestrator only sees behavior: public slots, trust scores, stakes, step budget, and outcomes.
|
| 1426 |
+
</div>
|
| 1427 |
+
<div class="hero-callout">
|
| 1428 |
+
<strong>Core novelty</strong>
|
| 1429 |
+
Hidden specialist profiles reshuffle every reset, so the agent cannot memorize that S2 or S3 is dangerous.
|
| 1430 |
+
</div>
|
| 1431 |
+
<div class="hero-callout">
|
| 1432 |
+
<strong>Judge takeaway</strong>
|
| 1433 |
+
This environment turns blind agent-to-agent trust into a trainable oversight skill.
|
| 1434 |
+
</div>
|
| 1435 |
+
</div>
|
| 1436 |
+
<div class="hero-stats">
|
| 1437 |
+
<div class="hero-stat">
|
| 1438 |
+
<div class="label">Random overall</div>
|
| 1439 |
+
<div id="heroRandomScore" class="value">0.714</div>
|
| 1440 |
+
</div>
|
| 1441 |
+
<div class="hero-stat">
|
| 1442 |
+
<div class="label">Heuristic overall</div>
|
| 1443 |
+
<div id="heroHeuristicScore" class="value">0.816</div>
|
| 1444 |
+
</div>
|
| 1445 |
+
<div class="hero-stat">
|
| 1446 |
+
<div class="label">Task 3 detect</div>
|
| 1447 |
+
<div id="heroDetectionScore" class="value">0.735</div>
|
| 1448 |
+
</div>
|
| 1449 |
+
</div>
|
| 1450 |
+
</div>
|
| 1451 |
+
<div class="hero-panel">
|
| 1452 |
+
<h3>How to test this fast</h3>
|
| 1453 |
+
<div class="hero-steps">
|
| 1454 |
+
<div class="hero-step">
|
| 1455 |
+
<strong>1. Overview mode</strong>
|
| 1456 |
+
Read the before/after lanes and reward proof. This tells the story in judge language.
|
| 1457 |
+
</div>
|
| 1458 |
+
<div class="hero-step">
|
| 1459 |
+
<strong>2. Playground mode</strong>
|
| 1460 |
+
Reset an episode, click Auto Policy, and watch the API payloads, trust bars, and reward stream update.
|
| 1461 |
+
</div>
|
| 1462 |
+
<div class="hero-step">
|
| 1463 |
+
<strong>3. Judge Demo mode</strong>
|
| 1464 |
+
Run Random, then Heuristic, then Swap + Replay. That is the live finale sequence.
|
| 1465 |
+
</div>
|
| 1466 |
+
</div>
|
| 1467 |
+
</div>
|
| 1468 |
+
</div>
|
| 1469 |
+
</div>
|
| 1470 |
+
</section>
|
| 1471 |
+
|
| 1472 |
<section class="theater">
|
| 1473 |
<div class="section-head">
|
| 1474 |
<h2>Live Trust Theater</h2>
|
|
|
|
| 1683 |
<div class="story-lane before">
|
| 1684 |
<div class="story-title">
|
| 1685 |
<strong>Without SENTINEL</strong>
|
| 1686 |
+
<span id="storyBeforeScore" class="story-score">task3 random 0.699</span>
|
| 1687 |
</div>
|
| 1688 |
<div class="story-flow">
|
| 1689 |
<div class="story-step">All public slots start near the same trust. The orchestrator delegates with weak evidence.</div>
|
|
|
|
| 1696 |
<div class="story-lane after">
|
| 1697 |
<div class="story-title">
|
| 1698 |
<strong>With SENTINEL</strong>
|
| 1699 |
+
<span id="storyAfterScore" class="story-score">task3 heuristic 0.784</span>
|
| 1700 |
</div>
|
| 1701 |
<div class="story-flow">
|
| 1702 |
<div class="story-step">Behavior updates the TrustLedger after every step, so public slots diverge quickly.</div>
|
|
|
|
| 1710 |
</div>
|
| 1711 |
</section>
|
| 1712 |
|
| 1713 |
+
<section class="judge">
|
| 1714 |
+
<div class="section-head">
|
| 1715 |
+
<h2>Judge Demo Rail</h2>
|
| 1716 |
+
<div class="chips">
|
| 1717 |
+
<span class="chip live">3-minute flow</span>
|
| 1718 |
+
<span class="chip">one-click policies</span>
|
| 1719 |
+
</div>
|
| 1720 |
+
</div>
|
| 1721 |
+
<div class="body">
|
| 1722 |
+
<div class="judge-grid">
|
| 1723 |
+
<div class="judge-stats">
|
| 1724 |
+
<div class="judge-card bad">
|
| 1725 |
+
<div class="label">Random baseline</div>
|
| 1726 |
+
<div id="judgeRandomScore" class="value">0.714</div>
|
| 1727 |
+
<span class="muted">Blind delegation baseline. Good enough to move, weak at skepticism.</span>
|
| 1728 |
+
</div>
|
| 1729 |
+
<div class="judge-card warn">
|
| 1730 |
+
<div class="label">Heuristic policy</div>
|
| 1731 |
+
<div id="judgeHeuristicScore" class="value">0.816</div>
|
| 1732 |
+
<span class="muted">Trust-weighted routing plus verification at risky gates.</span>
|
| 1733 |
+
</div>
|
| 1734 |
+
<div class="judge-card good">
|
| 1735 |
+
<div class="label">Task 3 detection</div>
|
| 1736 |
+
<div id="judgeDetectionScore" class="value">0.735</div>
|
| 1737 |
+
<span class="muted">Adversarial detections before poison can cascade into later nodes.</span>
|
| 1738 |
+
</div>
|
| 1739 |
+
</div>
|
| 1740 |
+
<div class="judge-actions">
|
| 1741 |
+
<button id="randomPolicyBtn" class="danger" type="button">Run Random</button>
|
| 1742 |
+
<button id="heuristicPolicyBtn" class="primary" type="button">Run Heuristic</button>
|
| 1743 |
+
<button id="judgeSwapBtn" class="warn" type="button">Swap + Replay</button>
|
| 1744 |
+
</div>
|
| 1745 |
+
<div class="judge-list">
|
| 1746 |
+
<div class="judge-step">
|
| 1747 |
+
<strong>Step 1 — show the failure</strong>
|
| 1748 |
+
Run Random to show how similar-looking trust scores lead to brittle routing and weak detection.
|
| 1749 |
+
</div>
|
| 1750 |
+
<div class="judge-step">
|
| 1751 |
+
<strong>Step 2 — show the learned behavior</strong>
|
| 1752 |
+
Run Heuristic to show trust divergence, verification at risky gates, and cleaner recovery.
|
| 1753 |
+
</div>
|
| 1754 |
+
<div class="judge-step">
|
| 1755 |
+
<strong>Step 3 — show generalization</strong>
|
| 1756 |
+
Hit Swap + Replay so hidden roles reshuffle and the orchestrator has to learn from fresh evidence again.
|
| 1757 |
+
</div>
|
| 1758 |
+
</div>
|
| 1759 |
+
</div>
|
| 1760 |
+
</div>
|
| 1761 |
+
</section>
|
| 1762 |
+
|
| 1763 |
<section class="readiness">
|
| 1764 |
<div class="section-head">
|
| 1765 |
<h2>Hackathon Readiness</h2>
|
|
|
|
| 1797 |
<div class="baseline-table">
|
| 1798 |
<div class="baseline-row">
|
| 1799 |
<span>Random</span>
|
| 1800 |
+
<div class="mini-bar"><span id="proofRandomBar" style="width:71.4%;background:#ff5f45"></span></div>
|
| 1801 |
+
<strong id="proofRandomScore">0.714</strong>
|
| 1802 |
</div>
|
| 1803 |
<div class="baseline-row">
|
| 1804 |
<span>Heuristic</span>
|
| 1805 |
+
<div class="mini-bar"><span id="proofHeuristicBar" style="width:81.6%;background:#73a7ff"></span></div>
|
| 1806 |
+
<strong id="proofHeuristicScore">0.816</strong>
|
| 1807 |
</div>
|
| 1808 |
<div class="baseline-row">
|
| 1809 |
<span>Oracle-lite</span>
|
| 1810 |
+
<div class="mini-bar"><span id="proofOracleBar" style="width:87.2%;background:#27e0a1"></span></div>
|
| 1811 |
+
<strong id="proofOracleScore">0.872</strong>
|
| 1812 |
</div>
|
| 1813 |
<div class="baseline-row">
|
| 1814 |
<span>T3 detect</span>
|
| 1815 |
+
<div class="mini-bar"><span id="proofDetectBar" style="width:73.5%;background:#f5ba41"></span></div>
|
| 1816 |
+
<strong id="proofDetectScore">0.735</strong>
|
| 1817 |
</div>
|
| 1818 |
</div>
|
| 1819 |
<div class="chart-frame">
|
|
|
|
| 1908 |
events: [],
|
| 1909 |
lastRequest: null,
|
| 1910 |
lastResult: null,
|
| 1911 |
+
lastMode: "reset()",
|
| 1912 |
+
view: "overview",
|
| 1913 |
+
evaluation: null,
|
| 1914 |
+
demoPolicy: "heuristic"
|
| 1915 |
};
|
| 1916 |
|
| 1917 |
const el = {
|
|
|
|
| 1922 |
swapBtn: document.getElementById("swapBtn"),
|
| 1923 |
swapPanelBtn: document.getElementById("swapPanelBtn"),
|
| 1924 |
autoBtn: document.getElementById("autoBtn"),
|
| 1925 |
+
viewOverviewBtn: document.getElementById("viewOverviewBtn"),
|
| 1926 |
+
viewPlaygroundBtn: document.getElementById("viewPlaygroundBtn"),
|
| 1927 |
+
viewJudgeBtn: document.getElementById("viewJudgeBtn"),
|
| 1928 |
+
viewCopy: document.getElementById("viewCopy"),
|
| 1929 |
+
randomPolicyBtn: document.getElementById("randomPolicyBtn"),
|
| 1930 |
+
heuristicPolicyBtn: document.getElementById("heuristicPolicyBtn"),
|
| 1931 |
+
judgeSwapBtn: document.getElementById("judgeSwapBtn"),
|
| 1932 |
specialistSelect: document.getElementById("specialistSelect"),
|
| 1933 |
recommendChip: document.getElementById("recommendChip"),
|
| 1934 |
recommendText: document.getElementById("recommendText"),
|
|
|
|
| 1961 |
leadMove: document.getElementById("leadMove"),
|
| 1962 |
stageMove: document.getElementById("stageMove"),
|
| 1963 |
stageSignals: document.getElementById("stageSignals"),
|
| 1964 |
+
rewardText: document.getElementById("rewardText"),
|
| 1965 |
+
heroRandomScore: document.getElementById("heroRandomScore"),
|
| 1966 |
+
heroHeuristicScore: document.getElementById("heroHeuristicScore"),
|
| 1967 |
+
heroDetectionScore: document.getElementById("heroDetectionScore"),
|
| 1968 |
+
storyBeforeScore: document.getElementById("storyBeforeScore"),
|
| 1969 |
+
storyAfterScore: document.getElementById("storyAfterScore"),
|
| 1970 |
+
judgeRandomScore: document.getElementById("judgeRandomScore"),
|
| 1971 |
+
judgeHeuristicScore: document.getElementById("judgeHeuristicScore"),
|
| 1972 |
+
judgeDetectionScore: document.getElementById("judgeDetectionScore"),
|
| 1973 |
+
proofRandomBar: document.getElementById("proofRandomBar"),
|
| 1974 |
+
proofRandomScore: document.getElementById("proofRandomScore"),
|
| 1975 |
+
proofHeuristicBar: document.getElementById("proofHeuristicBar"),
|
| 1976 |
+
proofHeuristicScore: document.getElementById("proofHeuristicScore"),
|
| 1977 |
+
proofOracleBar: document.getElementById("proofOracleBar"),
|
| 1978 |
+
proofOracleScore: document.getElementById("proofOracleScore"),
|
| 1979 |
+
proofDetectBar: document.getElementById("proofDetectBar"),
|
| 1980 |
+
proofDetectScore: document.getElementById("proofDetectScore")
|
| 1981 |
};
|
| 1982 |
|
| 1983 |
function trustColor(value) {
|
|
|
|
| 2016 |
return {type: "delegate", specialist: best, trust};
|
| 2017 |
}
|
| 2018 |
|
| 2019 |
+
/**
 * Build a "delegate" move aimed at a randomly chosen specialist.
 *
 * The candidate pool is the current observation's `available_specialists`,
 * or the shared `ids` list when that field is missing. The returned `trust`
 * is the specialist's entry in `trust_snapshot`, defaulting to 0.5.
 * Without an observation the move targets "S0" with trust 0.5.
 */
function randomMove() {
  const observation = state.observation;
  if (!observation) {
    return {type: "delegate", specialist: "S0", trust: 0.5};
  }

  const pool = observation.available_specialists || ids;
  const pickIndex = Math.floor(Math.random() * pool.length);
  // An empty pool yields undefined here, so fall back to "S0".
  const specialist = pool[pickIndex] || "S0";
  const trust = observation.trust_snapshot?.[specialist] ?? 0.5;

  return {type: "delegate", specialist, trust};
}
|
| 2026 |
+
|
| 2027 |
+
/**
 * Switch the page to one of the view modes ("overview", "playground", "judge").
 *
 * Records the mode in `state.view`, hides every mapped <section> that does not
 * belong to the mode (via the "section-hidden" class), marks the matching tab
 * button as active, and swaps the explanatory copy under the tabs. An
 * unrecognised mode falls back to the overview copy.
 */
function setView(view) {
  state.view = view;

  // Section class name -> the view modes in which that section is shown.
  const visibleIn = {
    hero: ["overview", "judge"],
    theater: ["playground", "judge"],
    command: ["playground", "judge"],
    mission: ["playground", "judge"],
    trust: ["playground", "judge"],
    playground: ["playground", "judge"],
    story: ["overview", "judge"],
    judge: ["judge"],
    readiness: ["overview"],
    proof: ["overview", "judge"],
    events: ["playground", "judge"],
    flow: ["overview"],
    themes: ["overview"]
  };

  for (const [sectionName, allowedViews] of Object.entries(visibleIn)) {
    const section = document.querySelector(`section.${sectionName}`);
    // Sections missing from the DOM are simply skipped.
    if (section) {
      section.classList.toggle("section-hidden", !allowedViews.includes(view));
    }
  }

  // Exactly one tab carries "active": the one whose mode matches.
  const isOverview = view === "overview";
  const isPlayground = view === "playground";
  const isJudge = view === "judge";
  el.viewOverviewBtn.classList.toggle("active", isOverview);
  el.viewPlaygroundBtn.classList.toggle("active", isPlayground);
  el.viewJudgeBtn.classList.toggle("active", isJudge);

  const descriptions = {
    overview: "Overview turns the environment into a judge-readable system story: the problem, the learning signal, and the live failure mode it fixes.",
    playground: "Playground is the backend-visible mode: every reset() and step() payload is shown so you can understand exactly what the environment returns.",
    judge: "Judge Demo is the fast pitch mode: show baseline failure, show heuristic recovery, then swap profiles to prove the agent learned a skill instead of an identity."
  };
  if (el.viewCopy) {
    el.viewCopy.textContent = descriptions[view] || descriptions.overview;
  }
}
|
| 2064 |
+
|
| 2065 |
+
/**
 * Fetch the evaluation results JSON and render it into the page.
 *
 * On success the parsed payload is stored on `state.evaluation` and
 * `renderEvaluation()` is called. Any failure (network error, non-OK status,
 * invalid JSON, or an error thrown while rendering) is logged with
 * `console.warn` and otherwise ignored, so the rest of the UI keeps working.
 *
 * @returns {Promise<void>}
 */
async function loadEvaluation() {
  try {
    const reply = await fetch("/assets/evaluation_results.json");
    if (!reply.ok) {
      throw new Error("evaluation asset missing");
    }
    const payload = await reply.json();
    state.evaluation = payload;
    renderEvaluation();
  } catch (failure) {
    console.warn("Failed to load evaluation results", failure);
  }
}
|
| 2075 |
+
|
| 2076 |
+
/**
 * Write a numeric metric into a node as fixed-point text.
 *
 * Does nothing (leaving the node's current text in place) when the node is
 * missing, or when `value` is undefined, null, or does not convert to a
 * number.
 *
 * @param {?{textContent: string}} node - Target element, may be null.
 * @param {*} value - Metric value; converted with `Number()`.
 * @param {number} [digits=3] - Number of digits after the decimal point.
 */
function setMetricText(node, value, digits = 3) {
  if (!node) return;
  if (value === undefined || value === null) return;

  const numeric = Number(value);
  if (Number.isNaN(numeric)) return;

  node.textContent = numeric.toFixed(digits);
}
|
| 2080 |
+
|
| 2081 |
+
/**
 * Size a bar element from a metric expressed as a 0..1 fraction.
 *
 * The value is scaled to a percentage and clamped to the range 0..100 before
 * being written to `node.style.width`. Does nothing when the node is missing,
 * or when `value` is undefined, null, or does not convert to a number.
 *
 * @param {?{style: {width: string}}} node - Target element, may be null.
 * @param {*} value - Metric value; converted with `Number()`.
 */
function setMetricBar(node, value) {
  if (!node || value == null) return;

  const ratio = Number(value);
  if (Number.isNaN(ratio)) return;

  const percent = Math.min(100, Math.max(0, ratio * 100));
  node.style.width = `${percent}%`;
}
|
| 2085 |
+
|
| 2086 |
+
/**
 * Paint the loaded evaluation results into the hero, story, judge and proof
 * widgets.
 *
 * Reads `state.evaluation` and returns immediately when nothing is loaded.
 * Overall scores come from `summary.random`, `summary.heuristic` and
 * `summary.oracle_lite`; task-specific figures come from
 * `by_task.task3.random` and `by_task.task3.heuristic`. Any missing branch of
 * the payload is treated as an empty object, so absent metrics simply leave
 * the corresponding widget untouched.
 *
 * The two story labels apply the same "skip null / non-numeric" rule as
 * `setMetricText`. Previously they only checked for `undefined`, so a JSON
 * `null` rendered as "0.000" and a non-numeric value rendered as "NaN".
 */
function renderEvaluation() {
  const data = state.evaluation;
  if (!data) return;

  const overall = data.summary || {};
  const task3 = data.by_task?.task3 || {};
  const random = overall.random || {};
  const heuristic = overall.heuristic || {};
  const oracle = overall.oracle_lite || {};
  const task3Random = task3.random || {};
  const task3Heuristic = task3.heuristic || {};

  // Same acceptance rule that setMetricText / setMetricBar use.
  const isMetric = (value) =>
    value !== undefined && value !== null && !Number.isNaN(Number(value));

  // Story labels carry a textual prefix, so they cannot reuse setMetricText.
  const setStoryLabel = (node, prefix, value) => {
    if (!node || !isMetric(value)) return;
    node.textContent = `${prefix} ${Number(value).toFixed(3)}`;
  };

  // Hero summary tiles.
  setMetricText(el.heroRandomScore, random.avg_score);
  setMetricText(el.heroHeuristicScore, heuristic.avg_score);
  setMetricText(el.heroDetectionScore, task3Heuristic.avg_detection_rate);

  // Before / after story labels.
  setStoryLabel(el.storyBeforeScore, "task3 random", task3Random.avg_score);
  setStoryLabel(el.storyAfterScore, "task3 heuristic", task3Heuristic.avg_score);

  // Judge demo panel.
  setMetricText(el.judgeRandomScore, random.avg_score);
  setMetricText(el.judgeHeuristicScore, heuristic.avg_score);
  setMetricText(el.judgeDetectionScore, task3Heuristic.avg_detection_rate);

  // Proof panel: bar widths followed by their numeric captions.
  setMetricBar(el.proofRandomBar, random.avg_score);
  setMetricBar(el.proofHeuristicBar, heuristic.avg_score);
  setMetricBar(el.proofOracleBar, oracle.avg_score);
  setMetricBar(el.proofDetectBar, task3Heuristic.avg_detection_rate);

  setMetricText(el.proofRandomScore, random.avg_score);
  setMetricText(el.proofHeuristicScore, heuristic.avg_score);
  setMetricText(el.proofOracleScore, oracle.avg_score);
  setMetricText(el.proofDetectScore, task3Heuristic.avg_detection_rate);
}
|
| 2123 |
+
|
| 2124 |
function renderTrust() {
|
| 2125 |
const trust = state.observation?.trust_snapshot || Object.fromEntries(ids.map(id => [id, 0.5]));
|
| 2126 |
const values = ids.map(id => Number(trust[id] ?? 0.5));
|
|
|
|
| 2231 |
el.selfBtn.disabled = disabled;
|
| 2232 |
el.skipBtn.disabled = disabled;
|
| 2233 |
el.applyRecommendBtn.disabled = disabled;
|
| 2234 |
+
if (el.randomPolicyBtn) el.randomPolicyBtn.disabled = state.running;
|
| 2235 |
+
if (el.heuristicPolicyBtn) el.heuristicPolicyBtn.disabled = state.running;
|
| 2236 |
+
if (el.judgeSwapBtn) el.judgeSwapBtn.disabled = state.running;
|
| 2237 |
}
|
| 2238 |
|
| 2239 |
function render(result) {
|
|
|
|
| 2376 |
}
|
| 2377 |
}
|
| 2378 |
|
| 2379 |
+
async function autoRun(policy = state.demoPolicy) {
|
| 2380 |
if (!state.observation || state.done) await resetEpisode();
|
| 2381 |
let guard = 0;
|
| 2382 |
while (!state.done && guard < 70) {
|
| 2383 |
+
const move = policy === "random" ? randomMove() : recommendedMove();
|
| 2384 |
await stepEpisode(move.type, move.specialist);
|
| 2385 |
guard += 1;
|
| 2386 |
await new Promise(resolve => setTimeout(resolve, 150));
|
|
|
|
| 2392 |
await stepEpisode(move.type, move.specialist);
|
| 2393 |
}
|
| 2394 |
|
| 2395 |
+
/**
 * Run a full demo episode under the given policy.
 *
 * Stores the policy on `state.demoPolicy`, resets the episode, then hands
 * control to `autoRun` with that policy.
 *
 * @param {string} policy - Policy name forwarded to `autoRun`
 *   (the page wires this up with "random" and "heuristic").
 * @returns {Promise<void>}
 */
async function runPolicy(policy) {
  // Recorded before the reset so later controls can replay the same policy.
  state.demoPolicy = policy;

  await resetEpisode();
  await autoRun(policy);
}
|
| 2400 |
+
|
| 2401 |
+
/**
 * Advance the seed by one and start a fresh episode, optionally auto-playing it.
 *
 * The seed input is incremented and written back before `resetEpisode()` is
 * awaited. When `policy` is truthy the new episode is then played with
 * `autoRun(policy)`.
 *
 * A seed field that does not hold a finite number is treated as 0.
 * Previously such input produced `NaN + 1`, which wrote the string "NaN"
 * into the field before resetting.
 *
 * @param {?string} [policy=null] - Policy to auto-run after the reset, if any.
 * @returns {Promise<void>}
 */
async function swapProfiles(policy = null) {
  const currentSeed = Number(el.seedInput.value || 0);
  const nextSeed = (Number.isFinite(currentSeed) ? currentSeed : 0) + 1;
  el.seedInput.value = String(nextSeed);

  await resetEpisode();

  if (policy) {
    await autoRun(policy);
  }
}
|
| 2409 |
|
| 2410 |
// Click wiring for every control on the page, registered in a fixed order.
// `resetEpisode` and `applyRecommendation` are passed directly, so they
// receive the click event as their argument; the remaining handlers are
// wrapped so that the event object is never forwarded.
[
  [el.resetBtn, resetEpisode],
  [el.resetPanelBtn, resetEpisode],
  [el.swapBtn, () => swapProfiles()],
  [el.swapPanelBtn, () => swapProfiles()],
  [el.delegateBtn, () => stepEpisode("delegate")],
  [el.verifyBtn, () => stepEpisode("verify")],
  [el.selfBtn, () => stepEpisode("solve_independently")],
  [el.skipBtn, () => stepEpisode("skip")],
  [el.autoBtn, () => autoRun("heuristic")],
  [el.applyRecommendBtn, applyRecommendation],
  [el.viewOverviewBtn, () => setView("overview")],
  [el.viewPlaygroundBtn, () => setView("playground")],
  [el.viewJudgeBtn, () => setView("judge")],
  [el.randomPolicyBtn, () => runPolicy("random")],
  [el.heuristicPolicyBtn, () => runPolicy("heuristic")],
  // Re-uses whichever policy the judge last ran.
  [el.judgeSwapBtn, () => swapProfiles(state.demoPolicy)]
].forEach(([control, onClick]) => {
  control.addEventListener("click", onClick);
});

// Boot sequence: choose the initial view, start loading evaluation data in
// the background (not awaited), paint once, then start the first episode.
setView("overview");
loadEvaluation();
render();
resetEpisode();
|
| 2431 |
</script>
|