hallumaze / index.html
Be2Jay's picture
Fix AGGREGATE_STATS avg_hc values from analysis_final2.json
91cd961
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>HalluMaze Race — 10 Model Comparison</title>
<style>
@import url('https://fonts.googleapis.com/css2?family=DM+Mono:wght@300;400;500&family=Syne:wght@400;600;700;800&display=swap');
:root {
--bg: #080a10;
--bg2: #0e1118;
--bg3: #141720;
--border: #1e2235;
--border2: #252a3d;
--text: #e2e8f0;
--text2: #8892a4;
--text3: #535c6e;
--c0: #00d4aa; /* MiniMax teal */
--c1: #a78bfa; /* GLM purple */
--c2: #fbbf24; /* Llama amber */
--c3: #34d399; /* GPT emerald */
--c4: #60a5fa; /* Gemini blue */
--c5: #f472b6; /* Haiku pink */
--accent: #00d4aa;
--red: #f87171;
--orange: #fb923c;
--yellow: #fbbf24;
--gold: #ffd700;
}
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
height: 100%;
background: var(--bg);
color: var(--text);
font-family: 'Syne', sans-serif;
overflow: hidden;
}
body {
display: flex;
flex-direction: column;
height: 100vh;
user-select: none;
}
/* ── HEADER ── */
#header {
display: flex;
align-items: center;
gap: 20px;
padding: 10px 20px;
background: var(--bg2);
border-bottom: 1px solid var(--border);
flex-shrink: 0;
flex-wrap: wrap;
min-height: 52px;
}
#title {
font-size: 15px;
font-weight: 800;
letter-spacing: 0.12em;
text-transform: uppercase;
color: var(--accent);
white-space: nowrap;
}
#title span { color: var(--text3); font-weight: 400; }
.meta-badge {
font-family: 'DM Mono', monospace;
font-size: 11px;
color: var(--text3);
background: var(--bg3);
border: 1px solid var(--border);
padding: 2px 8px;
border-radius: 4px;
white-space: nowrap;
}
#controls {
display: flex;
align-items: center;
gap: 8px;
margin-left: auto;
}
.ctrl-btn {
background: var(--bg3);
border: 1px solid var(--border2);
color: var(--text);
padding: 5px 12px;
font-family: 'DM Mono', monospace;
font-size: 12px;
cursor: pointer;
border-radius: 5px;
transition: all 0.15s;
white-space: nowrap;
}
.ctrl-btn:hover { background: var(--border2); border-color: var(--accent); color: var(--accent); }
.ctrl-btn.active { background: var(--accent); color: #080a10; border-color: var(--accent); font-weight: 600; }
#play-btn {
background: var(--accent);
color: #080a10;
border-color: var(--accent);
font-weight: 700;
min-width: 72px;
}
#play-btn:hover { opacity: 0.85; }
.step-counter {
font-family: 'DM Mono', monospace;
font-size: 12px;
color: var(--text2);
white-space: nowrap;
}
.speed-label {
font-family: 'DM Mono', monospace;
font-size: 11px;
color: var(--text3);
}
#speed-slider {
width: 70px;
accent-color: var(--accent);
}
.view-group { display: flex; gap: 2px; }
/* ── PROGRESS BAR ── */
#progress-wrap {
padding: 0 20px 6px;
background: var(--bg2);
border-bottom: 1px solid var(--border);
flex-shrink: 0;
}
#progress-track {
height: 4px;
background: var(--bg3);
border-radius: 2px;
cursor: pointer;
position: relative;
}
#progress-fill {
height: 100%;
background: var(--accent);
border-radius: 2px;
transition: width 0.1s;
}
/* ── MAIN AREA ── */
#main {
flex: 1;
display: flex;
flex-direction: column;
overflow: hidden;
padding: 12px 16px 8px;
gap: 10px;
}
/* ── VIEWS ── */
#view-grid, #view-overlay, #view-timeline {
display: none;
flex: 1;
min-height: 0;
}
/* Grid view */
#view-grid {
flex-direction: column;
gap: 10px;
}
#grid-row {
display: grid;
grid-template-columns: repeat(6, 1fr);
gap: 8px;
flex: 1;
min-height: 0;
}
.model-card {
background: var(--bg2);
border: 1px solid var(--border);
border-radius: 10px;
display: flex;
flex-direction: column;
overflow: hidden;
transition: border-color 0.2s, box-shadow 0.2s;
min-width: 0;
}
.model-card.active-step {
border-color: var(--card-color, var(--accent));
box-shadow: 0 0 16px -4px var(--card-color, var(--accent));
}
.model-card.solved-glow {
border-color: var(--gold);
box-shadow: 0 0 20px -4px var(--gold);
}
.card-header {
display: flex;
align-items: center;
gap: 6px;
padding: 7px 10px 5px;
border-bottom: 1px solid var(--border);
}
.card-dot {
width: 8px; height: 8px;
border-radius: 50%;
flex-shrink: 0;
}
.card-name {
font-size: 11px;
font-weight: 700;
letter-spacing: 0.04em;
flex: 1;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
.card-badge {
font-family: 'DM Mono', monospace;
font-size: 9px;
padding: 2px 5px;
border-radius: 3px;
font-weight: 600;
white-space: nowrap;
}
.badge-solving { background: #1e3a2f; color: #34d399; }
.badge-solved { background: #2d2500; color: #ffd700; }
.badge-stuck { background: #2d1a1a; color: #f87171; }
.badge-waiting { background: var(--bg3); color: var(--text3); }
.card-canvas-wrap {
flex: 1;
display: flex;
align-items: center;
justify-content: center;
padding: 6px;
min-height: 0;
}
canvas.maze-canvas {
display: block;
max-width: 100%;
max-height: 100%;
}
.card-stats {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 0;
border-top: 1px solid var(--border);
}
.stat-cell {
padding: 4px 6px;
text-align: center;
border-right: 1px solid var(--border);
}
.stat-cell:last-child { border-right: none; }
.stat-val {
font-family: 'DM Mono', monospace;
font-size: 11px;
font-weight: 500;
display: block;
}
.stat-lbl {
font-size: 8px;
color: var(--text3);
text-transform: uppercase;
letter-spacing: 0.05em;
}
.card-mei {
padding: 4px 10px 6px;
border-top: 1px solid var(--border);
}
.mei-track {
height: 3px;
background: var(--bg3);
border-radius: 2px;
overflow: hidden;
}
.mei-fill {
height: 100%;
border-radius: 2px;
transition: width 0.3s;
}
/* ── OVERLAY VIEW ── */
#view-overlay {
align-items: center;
justify-content: center;
gap: 24px;
}
#overlay-canvas-wrap {
position: relative;
flex-shrink: 0;
}
canvas#overlay-canvas {
display: block;
}
#overlay-legend {
display: flex;
flex-direction: column;
gap: 10px;
min-width: 160px;
}
.legend-row {
display: flex;
align-items: center;
gap: 10px;
padding: 8px 12px;
background: var(--bg2);
border: 1px solid var(--border);
border-radius: 7px;
transition: border-color 0.2s;
}
.legend-row.active-step { border-color: var(--row-color); }
.legend-color {
width: 12px; height: 12px;
border-radius: 50%;
flex-shrink: 0;
}
.legend-name {
font-size: 12px;
font-weight: 600;
flex: 1;
}
.legend-steps {
font-family: 'DM Mono', monospace;
font-size: 11px;
color: var(--text3);
}
/* ── TIMELINE VIEW ── */
#view-timeline {
flex-direction: column;
gap: 0;
overflow-y: auto;
}
.timeline-row {
display: flex;
align-items: center;
gap: 12px;
padding: 8px 0;
border-bottom: 1px solid var(--border);
}
.tl-name {
font-size: 11px;
font-weight: 700;
width: 100px;
flex-shrink: 0;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
.tl-bar-wrap {
flex: 1;
position: relative;
height: 28px;
background: var(--bg3);
border-radius: 4px;
overflow: hidden;
cursor: pointer;
}
.tl-segment {
position: absolute;
top: 0; height: 100%;
border-right: 1px solid var(--bg);
transition: opacity 0.15s;
}
.tl-segment:hover { opacity: 0.8; }
.tl-playhead {
position: absolute;
top: -2px; bottom: -2px;
width: 2px;
background: white;
pointer-events: none;
transition: left 0.1s;
z-index: 10;
}
.tl-stats {
display: flex;
gap: 8px;
min-width: 120px;
}
.tl-stat {
font-family: 'DM Mono', monospace;
font-size: 10px;
color: var(--text3);
white-space: nowrap;
}
.tl-stat span { color: var(--text2); }
/* ── LEADERBOARD ── */
#leaderboard {
background: var(--bg2);
border: 1px solid var(--border);
border-radius: 8px;
overflow: hidden;
flex-shrink: 0;
}
#lb-table {
width: 100%;
border-collapse: collapse;
font-family: 'DM Mono', monospace;
font-size: 11px;
}
#lb-table thead tr {
background: var(--bg3);
}
#lb-table th {
padding: 5px 10px;
text-align: left;
color: var(--text3);
font-size: 10px;
letter-spacing: 0.08em;
text-transform: uppercase;
font-weight: 500;
white-space: nowrap;
}
#lb-table td {
padding: 4px 10px;
border-top: 1px solid var(--border);
white-space: nowrap;
}
#lb-table tr.active-row td { background: rgba(255,255,255,0.03); }
.lb-rank { color: var(--text3); font-size: 10px; }
.lb-model { font-weight: 600; font-size: 11px; }
.lb-dot { display: inline-block; width: 7px; height: 7px; border-radius: 50%; margin-right: 5px; }
.lb-mei-bar {
display: inline-block;
height: 3px;
border-radius: 2px;
vertical-align: middle;
transition: width 0.3s;
}
/* ── METRICS PANEL ── */
/* Radar standalone panel */
#radar-panel {
background: var(--bg2);
border: 1px solid var(--border);
border-radius: 8px;
padding: 14px 16px;
display: flex;
flex-direction: column;
align-items: center;
}
#radar-panel h3 {
font-family: 'Syne', sans-serif;
font-size: 11px;
letter-spacing: 0.1em;
text-transform: uppercase;
color: var(--text3);
margin-bottom: 10px;
font-weight: 700;
align-self: flex-start;
}
/* Performance comparison panel (grid tab only) */
#metrics-panel {
background: var(--bg2);
border: 1px solid var(--border);
border-radius: 8px;
padding: 14px 16px;
}
#metrics-panel h3 {
font-family: 'Syne', sans-serif;
font-size: 11px;
letter-spacing: 0.1em;
text-transform: uppercase;
color: var(--text3);
margin-bottom: 12px;
font-weight: 700;
}
.metrics-bars-section {
width: 100%;
}
.metric-group {
margin-bottom: 10px;
}
.metric-label {
font-family: 'DM Mono', monospace;
font-size: 9px;
text-transform: uppercase;
letter-spacing: 0.08em;
color: var(--text3);
margin-bottom: 4px;
}
.metric-row {
display: flex;
align-items: center;
gap: 6px;
margin-bottom: 3px;
}
.metric-model-name {
font-family: 'DM Mono', monospace;
font-size: 9px;
color: var(--text3);
width: 78px;
flex-shrink: 0;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.metric-bar-track {
flex: 1;
height: 8px;
background: var(--bg3);
border-radius: 4px;
overflow: hidden;
position: relative;
}
.metric-bar-fill {
height: 100%;
border-radius: 4px;
transition: width 0.4s ease;
}
.metric-val {
font-family: 'DM Mono', monospace;
font-size: 9px;
color: var(--text2);
width: 72px;
text-align: right;
flex-shrink: 0;
white-space: nowrap;
}
.radar-section {
flex-shrink: 0;
display: flex;
flex-direction: column;
align-items: center;
}
.radar-section h4 {
font-family: 'DM Mono', monospace;
font-size: 9px;
color: var(--text3);
text-transform: uppercase;
letter-spacing: 0.08em;
margin-bottom: 6px;
}
#radar-canvas {
display: block;
}
#radar-legend {
display: flex;
flex-wrap: wrap;
gap: 2px 8px;
margin-top: 8px;
justify-content: center;
max-width: 240px;
}
.radar-leg-item {
display: flex;
align-items: center;
gap: 4px;
cursor: pointer;
padding: 1px 4px;
border-radius: 3px;
transition: background 0.15s, opacity 0.15s;
}
.radar-leg-item:hover { background: rgba(255,255,255,0.06); }
.radar-leg-item.dimmed { opacity: 0.25; }
.radar-leg-item.selected { background: rgba(255,255,255,0.1); }
.radar-leg-swatch {
width: 8px; height: 8px; border-radius: 2px; flex-shrink: 0;
}
.radar-leg-name {
font-family: 'DM Mono', monospace;
font-size: 8px;
color: rgba(255,255,255,0.5);
white-space: nowrap;
}
/* ── HALL FLASH OVERLAY ── */
.hall-flash {
position: absolute;
inset: 0;
background: rgba(248,113,113,0.15);
border-radius: 8px;
pointer-events: none;
animation: flashAnim 0.6s ease-out forwards;
}
@keyframes flashAnim {
0% { opacity: 1; }
100% { opacity: 0; }
}
/* ── CROWN ── */
.crown-overlay {
position: absolute;
top: 4px; right: 6px;
font-size: 16px;
animation: crownPop 0.4s cubic-bezier(0.175,0.885,0.32,1.275) forwards;
}
@keyframes crownPop {
0% { transform: scale(0) rotate(-20deg); opacity: 0; }
100% { transform: scale(1) rotate(0); opacity: 1; }
}
/* Keyboard hint */
#kbd-hint {
position: fixed;
bottom: 10px; right: 14px;
font-family: 'DM Mono', monospace;
font-size: 10px;
color: var(--text3);
pointer-events: none;
}
kbd {
background: var(--bg3);
border: 1px solid var(--border2);
border-radius: 3px;
padding: 1px 4px;
}
/* ── Layout fix: maze grid fully visible, scroll allowed ── */
html { overflow: auto !important; }
body { height: auto !important; min-height: 100vh; overflow: auto !important; }
#main { overflow-y: visible; height: auto; min-height: 100vh; }
/* Grid row: take remaining space, minimum 320px for canvas visibility */
#grid-row { min-height: 320px; flex: 1 1 auto; }
/* Model card: allow full content visibility */
.model-card { overflow: visible; }
.card-canvas-wrap { min-height: 120px; }
/* Leaderboard: compact, scrollable */
#leaderboard { max-height: 180px; overflow-y: auto; flex-shrink: 0; }
/* Panels inside grid tab */
#radar-panel { flex-shrink: 0; }
#metrics-panel { flex-shrink: 0; }
/* Canvas wraps: ensure minimum height */
.card-canvas-wrap { min-height: 120px; }
</style>
</head>
<body>
<!-- HEADER -->
<div id="header">
<div id="title">HalluMaze <span>/</span> Race</div>
<div class="meta-badge">10 Models</div>
<div class="meta-badge">Seed 4004</div>
<div class="meta-badge">5&times;5 Maze</div>
<div class="view-group">
<button class="ctrl-btn active" id="btn-grid" onclick="setView('grid')">Grid</button>
<button class="ctrl-btn" id="btn-overlay" onclick="setView('overlay')">Overlay</button>
<button class="ctrl-btn" id="btn-timeline" onclick="setView('timeline')">Timeline</button>
</div>
<div id="controls">
<button class="ctrl-btn" onclick="stepBackward()">&#9664;</button>
<button class="ctrl-btn active" id="play-btn" onclick="togglePlay()">&#9654; Play</button>
<button class="ctrl-btn" onclick="stepForward()">&#9654;&#9654;</button>
<span class="step-counter" id="step-counter">Step 0 / 0</span>
<span class="speed-label">Speed</span>
<input type="range" id="speed-slider" min="1" max="10" value="5">
</div>
<a href="hallumaze_final.html" class="ctrl-btn" style="text-decoration:none;margin-left:8px;" target="_blank">Leaderboard ↗</a>
<a href="https://github.com/jaytoone/HalluMaze" class="ctrl-btn" style="text-decoration:none;" target="_blank">GitHub ↗</a>
</div>
<div id="progress-wrap">
<div id="progress-track" onclick="seekProgress(event)">
<div id="progress-fill" style="width:0%"></div>
</div>
</div>
<!-- MAIN -->
<div id="main">
<!-- GRID VIEW -->
<div id="view-grid" style="display:flex">
<div id="grid-row"></div>
<div id="leaderboard">
<table id="lb-table">
<thead>
<tr>
<th>Rank</th>
<th>Model</th>
<th>Steps</th>
<th>Halls</th>
<th>BT</th>
<th>MEI</th>
<th>Status</th>
</tr>
</thead>
<tbody id="lb-body"></tbody>
</table>
</div>
<!-- RADAR PANEL — standalone, above comparison -->
<div id="radar-panel">
<h3>5-Dimension Radar</h3>
<canvas id="radar-canvas" width="240" height="240"></canvas>
<div id="radar-legend"></div>
</div>
<!-- PERFORMANCE COMPARISON — bars only, grid tab only -->
<div id="metrics-panel">
<h3>Performance Metrics Comparison</h3>
<div class="metrics-bars-section">
<div class="metric-group">
<div class="metric-label">MEI — Metacognitive Escape Index</div>
<div id="metric-mei-rows"></div>
</div>
<div class="metric-group">
<div class="metric-label">Efficiency — Steps vs Optimal</div>
<div id="metric-eff-rows"></div>
</div>
<div class="metric-group">
<div class="metric-label">Hallucinations &amp; Backtracks (lower=better)</div>
<div id="metric-err-rows"></div>
</div>
</div>
</div>
</div>
<!-- OVERLAY VIEW -->
<div id="view-overlay">
<div id="overlay-canvas-wrap">
<canvas id="overlay-canvas"></canvas>
</div>
<div id="overlay-legend"></div>
</div>
<!-- TIMELINE VIEW -->
<div id="view-timeline" id="view-timeline"></div>
</div>
<div id="kbd-hint"><kbd>Space</kbd> play/pause &nbsp; <kbd>&larr;</kbd><kbd>&rarr;</kbd> step</div>
<script>
const RAW_DATA = {"seed":4004,"size":5,"timestamp":"2026-03-22 22:38","maze":{"N":5,"walls":[[{"N":true,"S":true,"E":false,"W":true},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":false,"E":true,"W":false},{"N":true,"S":false,"E":true,"W":true}],[{"N":true,"S":false,"E":false,"W":true},{"N":true,"S":false,"E":true,"W":false},{"N":true,"S":false,"E":true,"W":true},{"N":false,"S":true,"E":false,"W":true},{"N":false,"S":false,"E":true,"W":false}],[{"N":false,"S":false,"E":true,"W":true},{"N":false,"S":true,"E":false,"W":true},{"N":false,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":true,"W":false},{"N":false,"S":false,"E":true,"W":true}],[{"N":false,"S":false,"E":false,"W":true},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":false,"S":true,"E":true,"W":false}],[{"N":false,"S":true,"E":false,"W":true},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":true,"W":false}]],"start":[0,0],"end":[4,4],"solution":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"mirage_positions":[[1,4],[3,0]]},"results":[{"model":"MiniMax-M2.5","provider":"minimax","solved":true,"mei":0.593,"score":0.65,"hallucination_count":0,"backtrack_count":0,"loop_count":0,"brs":1.0,"latency_s":125.59,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":3,"r":0,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":4,"r":0,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":5,"r":1,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":6,"r":1,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":7,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":8,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":9,"r":3,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":10,"r":3,"c":2,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":11,"r":3,"c":1,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":12,"r":3,"c":0,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":13,"r":4,"c":0,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":14,"r":4,"c":1,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":15,"r":4,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":16,"r":4,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.533,"hrr":0.6,"avg_hc":0.1,"n_trials":60},{"model":"glm-4.7","provider":"glm","solved":true,"mei":0.615,"score":0.744,"hallucination_count":0,"backtrack_count":2,"loop_count":4,"brs":1.0,"latency_s":19.31,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":6,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":7,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":8,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":9,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":95},{"step":10,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":11,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":12,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":13,"r":3,"c":0,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":14,"r":2,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":15,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":16,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":17,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":18,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":19,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":20,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":21,"r":4,"c":4,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null}],"sr_agg":0.083,"hrr":0.718,"avg_hc":1.4,"n_trials":60},{"model":"Llama-4-Scout","solved":true,"mei":0.589,"score":0.604,"hallucination_count":2,"backtrack_count":2,"loop_count":1,"brs":1.0,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[0,4],[1,4],[2,4],[3,4],[3,3],[3,4],[2,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":77},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":61},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":78},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":94},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":6,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":86},{"step":7,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":50},{"step":8,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":9,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":94},{"step":10,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":74},{"step":11,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":45},{"step":12,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":73},{"step":13,"r":2,"c":4,"direction":"W\u2190","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":67},{"step":14,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":40},{"step":15,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":16,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":17,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":18,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":94},{"step":19,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":86},{"step":20,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":68},{"step":21,"r":4,"c":1,"direction":"S\u2193","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":22,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":86},{"step":23,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":91},{"step":24,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":84},{"step":25,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.083,"hrr":0.81,"avg_hc":2.0,"n_trials":60},{"model":"GPT-4o-mini","solved":true,"mei":0.345,"score":0.68,"hallucination_count":1,"backtrack_count":16,"loop_count":15,"brs":1.0,"path":[[0,0],[0,1],[0,0],[0,0],[0,1],[0,2],[0,3],[0,2],[0,3],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,2],[3,3],[3,2],[3,3],[3,4],[3,3],[3,4],[2,4],[3,4],[2,4],[1,4],[1,3],[1,4],[0,4],[1,4],[0,4],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":63},{"step":2,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":49},{"step":3,"r":0,"c":0,"direction":"W\u2190","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":64},{"step":4,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":71},{"step":5,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":68},{"step":6,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":76},{"step":7,"r":0,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":59},{"step":8,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":72},{"step":9,"r":0,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":59},{"step":10,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":42},{"step":11,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":12,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":66},{"step":13,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":14,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":83},{"step":15,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":97},{"step":16,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":17,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":18,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":55},{"step":19,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":59},{"step":20,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":34},{"step":21,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":54},{"step":22,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":52},{"step":23,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":32},{"step":24,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":54},{"step":25,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":48},{"step":26,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":22},{"step":27,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":58},{"step":28,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":59},{"step":29,"r":1,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":52},{"step":30,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":35},{"step":31,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":69},{"step":32,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":23},{"step":33,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":77},{"step":34,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":35,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":27},{"step":36,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":37,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":38,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":22},{"step":39,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":40,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":76},{"step":41,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":93},{"step":42,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":91},{"step":43,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":61},{"step":44,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":45,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":46,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.125,"hrr":0.312,"avg_hc":0.8,"n_trials":16},{"model":"Gemini-Flash","solved":true,"mei":0.432,"score":0.68,"hallucination_count":1,"backtrack_count":6,"loop_count":9,"brs":1.0,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,4],[3,3],[3,2],[3,1],[3,2],[3,3],[3,4],[2,4],[1,4],[0,4],[1,4],[1,4],[2,4],[3,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":79},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":64},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":6,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":7,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":8,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":9,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":54},{"step":10,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":62},{"step":11,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":12,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":76},{"step":13,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":55},{"step":14,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":56},{"step":15,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":49},{"step":16,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":63},{"step":17,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":51},{"step":18,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":73},{"step":19,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":53},{"step":20,"r":1,"c":4,"direction":"E\u2192","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":21,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":57},{"step":22,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":23,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":24,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":25,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":33},{"step":26,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":56},{"step":27,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":28,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":64},{"step":29,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":30,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":31,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":32,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":82},{"step":33,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":34,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.083,"hrr":0.403,"avg_hc":1.1,"n_trials":60},{"model":"Claude-Haiku","solved":false,"mei":0.398,"score":0.398,"hallucination_count":1,"backtrack_count":21,"loop_count":26,"brs":0.0,"path":[[0,0],[0,1],[0,0],[0,1],[0,0],[0,1],[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,3],[3,4],[3,4],[2,4],[3,4],[3,3],[3,4],[2,4],[3,4],[2,4],[1,4],[1,3],[1,4],[0,4],[1,4],[1,3],[0,3],[0,2],[0,3],[1,3],[0,3],[1,3],[1,4],[0,4],[1,4],[0,4],[1,4],[2,4],[3,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":2,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":49},{"step":3,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":64},{"step":4,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":54},{"step":5,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":50},{"step":6,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":7,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":55},{"step":8,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":9,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":81},{"step":10,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":81},{"step":11,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":95},{"step":12,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":83},{"step":13,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":77},{"step":14,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":84},{"step":15,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":16,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":61},{"step":17,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":81},{"step":18,"r":3,"c":4,"direction":"S\u2193","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":48},{"step":19,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":58},{"step":20,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":34},{"step":21,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":58},{"step":22,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":26},{"step":23,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":39},{"step":24,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":25,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":32},{"step":26,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":73},{"step":27,"r":1,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":28,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":66},{"step":29,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":83},{"step":30,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":50},{"step":31,"r":1,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":58},{"step":32,"r":1,"c":3,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":78},{"step":33,"r":0,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":34,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":43},{"step":35,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":44},{"step":36,"r":1,"c":3,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":27},{"step":37,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":38,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":27},{"step":39,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":45},{"step":40,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":41,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":42},{"step":42,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":43,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":33},{"step":44,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":45,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":46,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":47,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":42},{"step":48,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":78},{"step":49,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":96},{"step":50,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":74},{"step":51,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":52,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":89},{"step":53,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":77},{"step":54,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":66},{"step":55,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":56,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.05,"hrr":0.363,"avg_hc":3.93,"n_trials":60,"error":null},{"model":"Llama-4-Maverick","provider":"openrouter","solved":true,"mei":0.9,"score":0.9,"hallucination_count":0,"backtrack_count":5,"loop_count":0,"brs":1.0,"latency_s":92.52,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":6,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":7,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":8,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":9,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":10,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":11,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":12,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":13,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":14,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":15,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":16,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":17,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":18,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":30},{"step":19,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":20,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":21,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":22,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":23,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":24,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":25,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":26,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":27,"r":0,"c":0,"direction":"\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null}],"sr_agg":1.0,"hrr":1.0,"avg_hc":0,"n_trials":1},{"model":"Qwen-2.5-72B","provider":"openrouter","solved":false,"mei":0.476,"score":0.476,"hallucination_count":4,"backtrack_count":1,"loop_count":3,"brs":0.2,"latency_s":57.65,"path":[],"steps":[{"step":1,"r":0,"c":0,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":2,"r":0,"c":1,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":3,"r":0,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":4,"r":0,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":5,"r":1,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":6,"r":0,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":7,"r":0,"c":2,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":8,"r":1,"c":2,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":9,"r":2,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":10,"r":2,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":11,"r":2,"c":2,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":12,"r":1,"c":2,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":13,"r":0,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":14,"r":0,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":15,"r":1,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":16,"r":2,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":17,"r":1,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":18,"r":1,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":19,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":20,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":21,"r":3,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":22,"r":3,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":23,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":24,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":25,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":26,"r":3,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":27,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":28,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":29,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":30,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":31,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":32,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":33,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":34,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":35,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":36,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":37,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":38,"r":3,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":39,"r":3,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":40,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":41,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":42,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":43,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":44,"r":2,"c":4,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":45,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":46,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":47,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":48,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":49,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":50,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":51,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":52,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":53,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":54,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":55,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":56,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null}],"sr_agg":0.0,"hrr":0.2,"avg_hc":4,"n_trials":1},{"model":"Claude-3.7-Sonnet","provider":"openrouter","solved":true,"mei":0.9,"score":0.9,"hallucination_count":0,"backtrack_count":0,"loop_count":0,"brs":1.0,"latency_s":18.22,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":6,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":7,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":8,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":9,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":10,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":11,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":12,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":13,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":14,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":15,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":95},{"step":16,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":1.0,"hrr":1.0,"avg_hc":0,"n_trials":1},{"model":"GPT-4o","provider":"openrouter","solved":false,"mei":0.594,"score":0.594,"hallucination_count":0,"backtrack_count":0,"loop_count":0,"brs":1.0,"latency_s":14.25,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[2,3],[2,2],[1,2],[0,2],[1,3],[1,4],[0,4],[1,4],[2,4],[3,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":2,"r":0,"c":1,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":3,"r":0,"c":2,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":4,"r":0,"c":3,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":5,"r":1,"c":3,"direction":"E","confidence":80,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":6,"r":1,"c":4,"direction":"S","confidence":70,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":7,"r":2,"c":4,"direction":"S","confidence":70,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":3,"c":4,"direction":"N","confidence":90,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":3,"c":4,"direction":"W","confidence":80,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":7,"r":2,"c":3,"direction":"N","confidence":70,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":2,"c":2,"direction":"E","confidence":60,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":2,"c":2,"direction":"N","confidence":60,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":9,"r":1,"c":2,"direction":"N","confidence":60,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":10,"r":0,"c":2,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":11,"r":1,"c":3,"direction":"N","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":12,"r":1,"c":4,"direction":"N","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":13,"r":0,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":14,"r":1,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":15,"r":2,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":16,"r":3,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false}],"sr_agg":0.0,"hrr":1.0,"avg_hc":0,"n_trials":1}]};
// ── HELPERS ──
const SHORT_NAME_MAP = {
'Llama-4-Scout': 'Llama-4-Scout',
'Llama-4-Maverick': 'Llama-4-Maverick',
'Claude-3.7-Sonnet':'Claude-3.7',
'Claude-Haiku': 'Claude-Haiku',
'MiniMax-M2.5': 'MiniMax',
'GPT-4o-mini': 'GPT-4o-mini',
'GPT-4o': 'GPT-4o',
'Gemini-Flash': 'Gemini',
'Qwen-2.5-72B': 'Qwen-2.5',
'glm-4.7': 'GLM-4.7',
};
function getShortName(model) {
return SHORT_NAME_MAP[model] || model.split('-').slice(0,2).join('-');
}
// ── CONSTANTS ──
const MODEL_COLORS = [
'#00d4aa', '#a78bfa', '#fbbf24', '#34d399', '#60a5fa', '#f472b6',
'#fb923c', '#e879f9', '#38bdf8', '#f87171'
];
const MODELS = RAW_DATA.results;
// Aggregate stats (n=60 per model) — source: experiment_results/analysis_final2.json
const AGGREGATE_STATS = {
'Claude-3.7-Sonnet': { mei: 0.7742, hrr: 0.875, sr: 0.5667, avg_hc: 0.85, avg_bt: 0.2, n: 60 },
'glm-4.7': { mei: 0.6145, hrr: 0.7181, sr: 0.0833, avg_hc: 1.43, avg_bt: 2.0, n: 60 },
'Llama-4-Maverick': { mei: 0.6001, hrr: 0.8111, sr: 0.1333, avg_hc: 1.8, avg_bt: 5.0, n: 60 },
'MiniMax-M2.5': { mei: 0.5932, hrr: 0.6, sr: 0.5333, avg_hc: 0.07, avg_bt: 0.0, n: 60 },
'Llama-4-Scout': { mei: 0.5892, hrr: 0.81, sr: 0.0833, avg_hc: 2.05, avg_bt: 2.0, n: 60 },
'Qwen-2.5-72B': { mei: 0.5588, hrr: 0.607, sr: 0.1, avg_hc: 2.27, avg_bt: 1.0, n: 60 },
'Gemini-Flash': { mei: 0.4317, hrr: 0.4028, sr: 0.0833, avg_hc: 1.07, avg_bt: 6.0, n: 60 },
'Claude-Haiku': { mei: 0.3983, hrr: 0.3634, sr: 0.05, avg_hc: 3.93, avg_bt: 21.0, n: 60 },
'GPT-4o-mini': { mei: 0.3908, hrr: 0.3819, sr: 0.05, avg_hc: 1.28, avg_bt: 16.0, n: 60 },
'GPT-4o': { mei: 0.3146, hrr: 0.3528, sr: 0.0667, avg_hc: 0.7, avg_bt: 0.0, n: 60 },
};
const MAZE = RAW_DATA.maze;
const N = MAZE.N;
const WALLS = MAZE.walls;
const END = MAZE.end;
const SOLUTION_PATH = MAZE.solution;
const MAX_STEPS = Math.max(...MODELS.map(m => m.steps.length));
// Current state
let currentStep = 0;
let playing = false;
let playInterval = null;
let currentView = 'grid';
// Per-model canvases in grid view
const gridCanvases = [];
const gridCtxs = [];
// ── HELPERS ──
function canMove(r, c, dir) {
if (r < 0 || r >= N || c < 0 || c >= N) return false;
return !WALLS[r][c][dir];
}
function getStepsUpTo(modelIdx, step) {
const m = MODELS[modelIdx];
return m.steps.slice(0, Math.min(step, m.steps.length));
}
function getCurrentPos(modelIdx, step) {
const m = MODELS[modelIdx];
if (m.steps.length === 0) return [0, 0];
const s = m.steps[Math.min(step, m.steps.length) - 1];
if (!s) return [0, 0];
return [s.r, s.c];
}
function getModelStatus(modelIdx, step) {
const m = MODELS[modelIdx];
if (step === 0) return 'waiting';
const stepsAvail = Math.min(step, m.steps.length);
const pos = getCurrentPos(modelIdx, step);
if (pos[0] === END[0] && pos[1] === END[1]) return 'solved';
if (stepsAvail >= m.steps.length) return m.solved ? 'solved' : 'stuck';
return 'solving';
}
function meiAtStep(modelIdx, step) {
const m = MODELS[modelIdx];
const stepsUsed = Math.min(step, m.steps.length);
if (stepsUsed === 0) return 0;
const solLen = SOLUTION_PATH.length;
const progress = stepsUsed / m.steps.length;
return Math.min(m.mei, m.mei * Math.pow(progress, 0.5));
}
// ── MAZE DRAWING ──
function drawMaze(ctx, size, modelIdx, upToStep, options = {}) {
const { showTrail = true, trailAlpha = 1, agentSize = 0.35 } = options;
const cell = size / N;
const pad = cell * 0.08;
// Background
ctx.fillStyle = '#0e1118';
ctx.fillRect(0, 0, size, size);
// Solution path hint (very faint)
ctx.fillStyle = 'rgba(255,255,255,0.02)';
for (const [sr, sc] of SOLUTION_PATH) {
ctx.fillRect(sc * cell + 0.5, sr * cell + 0.5, cell - 1, cell - 1);
}
// Trail
const color = MODEL_COLORS[modelIdx];
const steps = getStepsUpTo(modelIdx, upToStep);
if (showTrail && steps.length > 0) {
const trailLen = steps.length;
for (let i = 0; i < trailLen; i++) {
const s = steps[i];
const alpha = trailAlpha * (0.15 + 0.65 * (i / trailLen));
ctx.fillStyle = hexToRgba(color, alpha);
ctx.fillRect(s.c * cell + pad, s.r * cell + pad, cell - 2*pad, cell - 2*pad);
}
// Hallucination flashes
for (const s of steps) {
if (s.is_hallucination) {
ctx.fillStyle = 'rgba(248,113,113,0.35)';
ctx.fillRect(s.c * cell + 1, s.r * cell + 1, cell - 2, cell - 2);
// lightning bolt
ctx.fillStyle = '#fbbf24';
ctx.font = `bold ${Math.floor(cell*0.45)}px sans-serif`;
ctx.textAlign = 'center';
ctx.textBaseline = 'middle';
ctx.fillText('!', s.c * cell + cell/2, s.r * cell + cell/2);
}
}
}
// Walls
ctx.strokeStyle = '#2a3048';
ctx.lineWidth = 1.5;
for (let r = 0; r < N; r++) {
for (let c = 0; c < N; c++) {
const w = WALLS[r][c];
const x = c * cell, y = r * cell;
ctx.beginPath();
if (w.N) { ctx.moveTo(x, y); ctx.lineTo(x + cell, y); }
ctx.stroke(); ctx.beginPath();
if (w.S) { ctx.moveTo(x, y+cell); ctx.lineTo(x+cell, y+cell); }
ctx.stroke(); ctx.beginPath();
if (w.W) { ctx.moveTo(x, y); ctx.lineTo(x, y+cell); }
ctx.stroke(); ctx.beginPath();
if (w.E) { ctx.moveTo(x+cell, y); ctx.lineTo(x+cell, y+cell); }
ctx.stroke();
}
}
// Border
ctx.strokeStyle = '#3a4060';
ctx.lineWidth = 2;
ctx.strokeRect(1, 1, size - 2, size - 2);
// Start marker
ctx.fillStyle = '#34d399';
ctx.font = `${Math.floor(cell*0.4)}px sans-serif`;
ctx.textAlign = 'center'; ctx.textBaseline = 'middle';
ctx.fillText('S', cell*0.5, cell*0.5);
// End marker
const status = getModelStatus(modelIdx, upToStep);
if (status === 'solved') {
ctx.fillStyle = '#ffd700';
ctx.font = `bold ${Math.floor(cell*0.5)}px sans-serif`;
ctx.fillText('G', END[1]*cell + cell/2, END[0]*cell + cell/2);
} else {
ctx.fillStyle = '#f87171';
ctx.fillText('G', END[1]*cell + cell/2, END[0]*cell + cell/2);
}
// Agent dot
if (upToStep > 0) {
const [ar, ac] = getCurrentPos(modelIdx, upToStep);
const cx = ac * cell + cell/2;
const cy = ar * cell + cell/2;
const r = cell * agentSize;
ctx.shadowColor = color;
ctx.shadowBlur = 10;
ctx.fillStyle = color;
ctx.beginPath();
ctx.arc(cx, cy, r, 0, Math.PI * 2);
ctx.fill();
ctx.shadowBlur = 0;
// Current step indicator (backtrack/hall)
if (steps.length > 0) {
const lastStep = steps[steps.length - 1];
if (lastStep.is_backtrack) {
ctx.strokeStyle = '#fb923c';
ctx.lineWidth = 2;
ctx.beginPath();
ctx.arc(cx, cy, r + 2, 0, Math.PI * 2);
ctx.stroke();
}
}
}
}
function hexToRgba(hex, alpha) {
const r = parseInt(hex.slice(1,3), 16);
const g = parseInt(hex.slice(3,5), 16);
const b = parseInt(hex.slice(5,7), 16);
return `rgba(${r},${g},${b},${alpha})`;
}
// ── BUILD GRID VIEW ──
function buildGridView() {
const container = document.getElementById('grid-row');
container.innerHTML = '';
gridCanvases.length = 0;
gridCtxs.length = 0;
MODELS.forEach((m, i) => {
const color = MODEL_COLORS[i];
const card = document.createElement('div');
card.className = 'model-card';
card.id = `card-${i}`;
card.style.setProperty('--card-color', color);
card.innerHTML = `
<div class="card-header">
<div class="card-dot" style="background:${color}"></div>
<div class="card-name">${m.model}</div>
<div class="card-badge badge-waiting" id="badge-${i}">WAITING</div>
</div>
<div class="card-canvas-wrap" id="canvas-wrap-${i}">
<canvas id="maze-${i}" class="maze-canvas"></canvas>
</div>
<div class="card-stats">
<div class="stat-cell">
<span class="stat-val" id="steps-${i}" style="color:${color}">0</span>
<span class="stat-lbl">Steps</span>
</div>
<div class="stat-cell">
<span class="stat-val" id="halls-${i}" style="color:var(--red)">0</span>
<span class="stat-lbl">Halls</span>
</div>
<div class="stat-cell">
<span class="stat-val" id="bt-${i}" style="color:var(--orange)">0</span>
<span class="stat-lbl">BT</span>
</div>
</div>
<div class="card-mei">
<div class="mei-track">
<div class="mei-fill" id="mei-fill-${i}" style="background:${color};width:0%"></div>
</div>
</div>
`;
container.appendChild(card);
const canvas = document.getElementById(`maze-${i}`);
gridCanvases.push(canvas);
gridCtxs.push(canvas.getContext('2d'));
});
resizeGridCanvases();
}
function resizeGridCanvases() {
MODELS.forEach((m, i) => {
const wrap = document.getElementById(`canvas-wrap-${i}`);
if (!wrap) return;
const w = wrap.clientWidth - 12;
const h = wrap.clientHeight - 12;
const s = Math.min(w, h, 240);
gridCanvases[i].width = s;
gridCanvases[i].height = s;
});
}
// ── BUILD OVERLAY VIEW ──
function buildOverlayView() {
const wrap = document.getElementById('overlay-canvas-wrap');
const canvas = document.getElementById('overlay-canvas');
const legend = document.getElementById('overlay-legend');
const mainArea = document.getElementById('main');
const avail = Math.min(mainArea.clientHeight - 60, 420);
const size = Math.max(avail, 260);
canvas.width = size;
canvas.height = size;
legend.innerHTML = '';
MODELS.forEach((m, i) => {
const row = document.createElement('div');
row.className = 'legend-row';
row.id = `legend-${i}`;
row.style.setProperty('--row-color', MODEL_COLORS[i]);
row.innerHTML = `
<div class="legend-color" style="background:${MODEL_COLORS[i]}"></div>
<div class="legend-name">${m.model}</div>
<div class="legend-steps" id="legend-steps-${i}">0 steps</div>
`;
legend.appendChild(row);
});
}
function drawOverlay(step) {
const canvas = document.getElementById('overlay-canvas');
if (!canvas) return;
const ctx = canvas.getContext('2d');
const size = canvas.width;
const cell = size / N;
// Background + walls (same as single maze)
ctx.fillStyle = '#0e1118';
ctx.fillRect(0, 0, size, size);
// Solution hint
ctx.fillStyle = 'rgba(255,255,255,0.025)';
for (const [sr, sc] of SOLUTION_PATH) {
ctx.fillRect(sc*cell+0.5, sr*cell+0.5, cell-1, cell-1);
}
// All trails first
MODELS.forEach((m, i) => {
const steps = getStepsUpTo(i, step);
const color = MODEL_COLORS[i];
steps.forEach((s, idx) => {
const alpha = 0.1 + 0.2 * (idx / Math.max(steps.length, 1));
ctx.fillStyle = hexToRgba(color, alpha);
const pad = cell * 0.08;
ctx.fillRect(s.c*cell+pad, s.r*cell+pad, cell-2*pad, cell-2*pad);
});
});
// Walls
ctx.strokeStyle = '#2a3048';
ctx.lineWidth = 1.5;
for (let r = 0; r < N; r++) {
for (let c = 0; c < N; c++) {
const w = WALLS[r][c];
const x = c*cell, y = r*cell;
['N','S','W','E'].forEach(dir => {
if (w[dir]) {
ctx.beginPath();
if (dir==='N') { ctx.moveTo(x,y); ctx.lineTo(x+cell,y); }
if (dir==='S') { ctx.moveTo(x,y+cell); ctx.lineTo(x+cell,y+cell); }
if (dir==='W') { ctx.moveTo(x,y); ctx.lineTo(x,y+cell); }
if (dir==='E') { ctx.moveTo(x+cell,y); ctx.lineTo(x+cell,y+cell); }
ctx.stroke();
}
});
}
}
ctx.strokeStyle = '#3a4060';
ctx.lineWidth = 2;
ctx.strokeRect(1, 1, size-2, size-2);
// Start / End
ctx.fillStyle = '#34d399';
ctx.font = `bold ${Math.floor(cell*0.4)}px sans-serif`;
ctx.textAlign = 'center'; ctx.textBaseline = 'middle';
ctx.fillText('S', cell*0.5, cell*0.5);
ctx.fillStyle = '#f87171';
ctx.fillText('G', END[1]*cell+cell/2, END[0]*cell+cell/2);
// Agent dots with initials
MODELS.forEach((m, i) => {
if (step === 0) return;
const [ar, ac] = getCurrentPos(i, step);
const cx = ac*cell + cell/2;
const cy = ar*cell + cell/2;
const rad = cell * 0.32;
const color = MODEL_COLORS[i];
ctx.shadowColor = color;
ctx.shadowBlur = 8;
ctx.fillStyle = hexToRgba(color, 0.9);
ctx.beginPath();
ctx.arc(cx, cy, rad, 0, Math.PI*2);
ctx.fill();
ctx.shadowBlur = 0;
// Initials
ctx.fillStyle = '#080a10';
ctx.font = `bold ${Math.floor(rad)}px DM Mono, monospace`;
ctx.textAlign = 'center'; ctx.textBaseline = 'middle';
ctx.fillText(m.model[0], cx, cy);
});
// Update legend
MODELS.forEach((m, i) => {
const el = document.getElementById(`legend-steps-${i}`);
if (el) el.textContent = `${Math.min(step, m.steps.length)} / ${m.steps.length}`;
const row = document.getElementById(`legend-${i}`);
if (row) row.classList.toggle('active-step', false);
});
}
// ── BUILD TIMELINE VIEW ──
function buildTimelineView() {
const container = document.getElementById('view-timeline');
container.innerHTML = '';
MODELS.forEach((m, i) => {
const color = MODEL_COLORS[i];
const row = document.createElement('div');
row.className = 'timeline-row';
const segW = 100 / m.steps.length;
let segsHTML = '';
m.steps.forEach((s, idx) => {
let bg = color;
let opacity = 0.7;
if (s.is_hallucination) { bg = '#f87171'; opacity = 1; }
else if (s.is_backtrack) { bg = '#fb923c'; opacity = 0.85; }
else if (s.is_loop) { bg = '#fbbf24'; opacity = 0.7; }
segsHTML += `<div class="tl-segment" style="left:${idx*segW}%;width:${segW}%;background:${bg};opacity:${opacity}" title="Step ${idx+1}: ${s.is_hallucination?'HALL':s.is_backtrack?'BT':s.is_loop?'LOOP':'OK'}"></div>`;
});
const halls = m.steps.filter(s => s.is_hallucination).length;
const bts = m.steps.filter(s => s.is_backtrack).length;
row.innerHTML = `
<div class="tl-name" style="color:${color}">${m.model}</div>
<div class="tl-bar-wrap" id="tl-bar-${i}" onclick="seekTimeline(${i}, event)">
${segsHTML}
<div class="tl-playhead" id="tl-ph-${i}" style="left:0%"></div>
</div>
<div class="tl-stats">
<div class="tl-stat"><span id="tl-step-${i}">0</span>/${m.steps.length}</div>
<div class="tl-stat">H:<span style="color:var(--red)">${halls}</span></div>
<div class="tl-stat">BT:<span style="color:var(--orange)">${bts}</span></div>
</div>
`;
container.appendChild(row);
});
}
function seekTimeline(modelIdx, event) {
const bar = event.currentTarget;
const rect = bar.getBoundingClientRect();
const pct = (event.clientX - rect.left) / rect.width;
const m = MODELS[modelIdx];
currentStep = Math.round(pct * m.steps.length);
updateAll();
}
// ── UPDATE ALL ──
function updateAll() {
const step = currentStep;
const maxS = MAX_STEPS;
// Progress bar
document.getElementById('progress-fill').style.width = `${(step / maxS) * 100}%`;
document.getElementById('step-counter').textContent = `Step ${step} / ${maxS}`;
if (currentView === 'grid') updateGrid(step);
if (currentView === 'overlay') drawOverlay(step);
if (currentView === 'timeline') updateTimeline(step);
updateLeaderboard(step);
}
function updateGrid(step) {
MODELS.forEach((m, i) => {
if (!gridCanvases[i]) return;
const size = gridCanvases[i].width;
if (size < 10) return;
drawMaze(gridCtxs[i], size, i, step);
const status = getModelStatus(i, step);
const badge = document.getElementById(`badge-${i}`);
const card = document.getElementById(`card-${i}`);
const stepsNow = Math.min(step, m.steps.length);
if (badge) {
badge.className = 'card-badge';
if (status === 'solved') { badge.classList.add('badge-solved'); badge.textContent = 'SOLVED'; }
else if (status === 'stuck') { badge.classList.add('badge-stuck'); badge.textContent = 'STUCK'; }
else if (status === 'waiting') { badge.classList.add('badge-waiting'); badge.textContent = 'WAIT'; }
else { badge.classList.add('badge-solving'); badge.textContent = 'SOLVING'; }
}
if (card) {
card.classList.toggle('solved-glow', status === 'solved');
// flash active on current step
if (step > 0 && step <= m.steps.length) {
card.classList.add('active-step');
setTimeout(() => card.classList.remove('active-step'), 200);
}
}
const hallsSoFar = m.steps.slice(0, stepsNow).filter(s => s.is_hallucination).length;
const btSoFar = m.steps.slice(0, stepsNow).filter(s => s.is_backtrack).length;
const el_s = document.getElementById(`steps-${i}`);
const el_h = document.getElementById(`halls-${i}`);
const el_b = document.getElementById(`bt-${i}`);
if (el_s) el_s.textContent = stepsNow;
if (el_h) el_h.textContent = hallsSoFar;
if (el_b) el_b.textContent = btSoFar;
const mei = meiAtStep(i, step);
const mf = document.getElementById(`mei-fill-${i}`);
if (mf) mf.style.width = `${mei * 100}%`;
});
}
function updateTimeline(step) {
MODELS.forEach((m, i) => {
const stepsNow = Math.min(step, m.steps.length);
const pct = m.steps.length > 0 ? (stepsNow / m.steps.length) * 100 : 0;
const ph = document.getElementById(`tl-ph-${i}`);
if (ph) ph.style.left = `${pct}%`;
const sc = document.getElementById(`tl-step-${i}`);
if (sc) sc.textContent = stepsNow;
});
}
// ── LEADERBOARD ──
function updateLeaderboard(step) {
const tbody = document.getElementById('lb-body');
if (!tbody) return;
const rows = MODELS.map((m, i) => {
const stepsNow = Math.min(step, m.steps.length);
const status = getModelStatus(i, step);
const mei = meiAtStep(i, step);
const halls = m.steps.slice(0, stepsNow).filter(s => s.is_hallucination).length;
const bts = m.steps.slice(0, stepsNow).filter(s => s.is_backtrack).length;
return { i, m, stepsNow, status, mei, halls, bts };
});
// Sort: solved first (by fewer steps), then by MEI desc
rows.sort((a, b) => {
if (a.status === 'solved' && b.status !== 'solved') return -1;
if (b.status === 'solved' && a.status !== 'solved') return 1;
if (a.status === 'solved' && b.status === 'solved') return a.stepsNow - b.stepsNow;
return b.mei - a.mei;
});
tbody.innerHTML = rows.map((row, rank) => {
const color = MODEL_COLORS[row.i];
const statusText = row.status === 'solved' ? '&#9733; DONE' :
row.status === 'stuck' ? 'STUCK' :
row.status === 'waiting'? '--' : `${row.stepsNow}/${row.m.steps.length}`;
const statusColor = row.status === 'solved' ? '#ffd700' :
row.status === 'stuck' ? '#f87171' : '';
const meiPct = (row.mei * 100).toFixed(0);
return `<tr>
<td class="lb-rank">${rank+1}</td>
<td class="lb-model"><span class="lb-dot" style="background:${color}"></span>${row.m.model}</td>
<td>${row.stepsNow}</td>
<td style="color:var(--red)">${row.halls}</td>
<td style="color:var(--orange)">${row.bts}</td>
<td>
<span style="margin-right:4px;font-size:10px">${meiPct}%</span>
<span class="lb-mei-bar" style="width:${meiPct}px;background:${color}"></span>
</td>
<td style="color:${statusColor};font-size:10px">${statusText}</td>
</tr>`;
}).join('');
}
// ── PLAYBACK ──
function getSpeed() {
const v = parseInt(document.getElementById('speed-slider').value);
// speed 1=slow(800ms), 10=fast(80ms)
return Math.round(800 - (v - 1) * 80);
}
function togglePlay() {
playing = !playing;
const btn = document.getElementById('play-btn');
if (playing) {
btn.textContent = '⏸ Pause';
scheduleNext();
} else {
btn.textContent = '▶ Play';
if (playInterval) clearTimeout(playInterval);
}
}
function scheduleNext() {
if (!playing) return;
playInterval = setTimeout(() => {
if (currentStep < MAX_STEPS) {
currentStep++;
updateAll();
scheduleNext();
} else {
playing = false;
document.getElementById('play-btn').textContent = '▶ Play';
}
}, getSpeed());
}
function stepForward() {
if (currentStep < MAX_STEPS) { currentStep++; updateAll(); }
}
function stepBackward() {
if (currentStep > 0) { currentStep--; updateAll(); }
}
function seekProgress(event) {
const track = document.getElementById('progress-track');
const rect = track.getBoundingClientRect();
const pct = (event.clientX - rect.left) / rect.width;
currentStep = Math.round(pct * MAX_STEPS);
updateAll();
}
// ── VIEW SWITCHING ──
function setView(v) {
currentView = v;
['grid','overlay','timeline'].forEach(name => {
document.getElementById(`view-${name}`).style.display = name === v ? 'flex' : 'none';
document.getElementById(`btn-${name}`).classList.toggle('active', name === v);
});
if (v === 'overlay') buildOverlayView();
updateAll();
}
// ── KEYBOARD ──
document.addEventListener('keydown', e => {
if (e.code === 'Space') { e.preventDefault(); togglePlay(); }
if (e.code === 'ArrowRight') { e.preventDefault(); stepForward(); }
if (e.code === 'ArrowLeft') { e.preventDefault(); stepBackward(); }
});
// ── RESIZE ──
let resizeTimer;
window.addEventListener('resize', () => {
clearTimeout(resizeTimer);
resizeTimer = setTimeout(() => {
resizeGridCanvases();
if (currentView === 'overlay') buildOverlayView();
updateAll();
}, 150);
});
// ── METRICS PANEL ──
const OPTIMAL_STEPS = SOLUTION_PATH.length;
function buildMetricsPanel() {
const meiRows = document.getElementById('metric-mei-rows');
const effRows = document.getElementById('metric-eff-rows');
const errRows = document.getElementById('metric-err-rows');
// Pre-compute values for sorting — aggregate stats (n=60) take priority
const modelData = MODELS.map((m, i) => {
const agg = AGGREGATE_STATS[m.model] || {};
const mei = agg.mei !== undefined ? agg.mei : m.mei;
const hrr = agg.hrr !== undefined ? agg.hrr : m.hrr;
const hasData = mei !== null && mei !== undefined;
const meiVal = hasData ? mei : 0;
const hrrVal = hrr !== null && hrr !== undefined ? hrr :
(m.solved ? Math.min(1, OPTIMAL_STEPS / m.steps.length) : 0);
const hcVal = agg.avg_hc !== undefined ? agg.avg_hc :
(m.avg_hc !== null && m.avg_hc !== undefined) ? m.avg_hc : m.hallucination_count;
const btVal = agg.avg_bt !== undefined ? agg.avg_bt : (m.backtrack_count || 0);
const errTotal = hcVal + btVal;
const n_trials = agg.n || m.n_trials;
return { i, m, agg, hasData, meiVal, hrrVal, hcVal, btVal, errTotal, n_trials };
});
// Sort indices descending by MEI
const meiOrder = [...modelData].sort((a, b) => b.meiVal - a.meiVal);
// Sort indices descending by HRR
const hrrOrder = [...modelData].sort((a, b) => b.hrrVal - a.hrrVal);
// Sort indices descending by error total (highest errors first = worst)
const errOrder = [...modelData].sort((a, b) => b.errTotal - a.errTotal);
const maxErr = Math.max(...modelData.map(d => d.errTotal)) || 1;
meiOrder.forEach(d => {
const { i, m, hasData, meiVal, n_trials } = d;
const color = MODEL_COLORS[i % MODEL_COLORS.length];
const shortName = getShortName(m.model);
const nLabel = n_trials ? ` n=${n_trials}` : '';
const errLabel = m.error ? ' !' : '';
meiRows.insertAdjacentHTML('beforeend', `
<div class="metric-row">
<div class="metric-model-name" style="color:${color}">${shortName}${errLabel}</div>
<div class="metric-bar-track">
<div class="metric-bar-fill" id="mei-bar-${i}" style="background:${color};width:${meiVal*100}%;opacity:${hasData?1:0.3}"></div>
</div>
<div class="metric-val" id="mei-val-${i}">${hasData ? meiVal.toFixed(3) : 'N/A'}${nLabel}</div>
</div>`);
});
hrrOrder.forEach(d => {
const { i, m, hasData, hrrVal } = d;
const color = MODEL_COLORS[i % MODEL_COLORS.length];
const shortName = getShortName(m.model);
const hrrLabel = (m.hrr !== null && m.hrr !== undefined) ? 'HRR' : 'Eff';
effRows.insertAdjacentHTML('beforeend', `
<div class="metric-row">
<div class="metric-model-name" style="color:${color}">${shortName}</div>
<div class="metric-bar-track">
<div class="metric-bar-fill" id="eff-bar-${i}" style="background:${color};width:${hrrVal*100}%;opacity:${hasData?1:0.3}"></div>
</div>
<div class="metric-val" id="eff-val-${i}">${hrrLabel}:${(hrrVal*100).toFixed(0)}%</div>
</div>`);
});
errOrder.forEach(d => {
const { i, m, hasData, hcVal } = d;
const color = MODEL_COLORS[i % MODEL_COLORS.length];
const shortName = getShortName(m.model);
const hallPct = (hcVal / maxErr * 100).toFixed(1);
const btPct = ((m.backtrack_count||0) / maxErr * 100).toFixed(1);
const hcDisplay = m.error ? 'Error' : `H${hcVal.toFixed(1)}/B${d.btVal.toFixed(0)}`;
errRows.insertAdjacentHTML('beforeend', `
<div class="metric-row">
<div class="metric-model-name" style="color:${color}">${shortName}</div>
<div class="metric-bar-track" style="position:relative">
<div style="position:absolute;left:0;top:0;height:100%;width:${hallPct}%;background:#f87171;border-radius:4px 0 0 4px;opacity:${hasData?1:0.3}"></div>
<div style="position:absolute;left:${hallPct}%;top:0;height:100%;width:${btPct}%;background:#fb923c;opacity:${hasData?1:0.3}"></div>
</div>
<div class="metric-val">${hcDisplay}</div>
</div>`);
});
drawRadar();
}
let selectedRadarModel = -1;
function buildRadarLegend() {
const legend = document.getElementById('radar-legend');
if (!legend) return;
legend.textContent = '';
MODELS.forEach((m, i) => {
const color = MODEL_COLORS[i % MODEL_COLORS.length];
const shortName = getShortName(m.model);
const item = document.createElement('div');
item.className = 'radar-leg-item' +
(selectedRadarModel >= 0 && selectedRadarModel !== i ? ' dimmed' : '') +
(selectedRadarModel === i ? ' selected' : '');
const swatch = document.createElement('span');
swatch.className = 'radar-leg-swatch';
swatch.style.background = color;
const name = document.createElement('span');
name.className = 'radar-leg-name';
name.textContent = shortName;
item.appendChild(swatch);
item.appendChild(name);
item.addEventListener('click', () => {
selectedRadarModel = selectedRadarModel === i ? -1 : i;
drawRadar();
});
legend.appendChild(item);
});
}
function drawRadar() {
const canvas = document.getElementById('radar-canvas');
if (!canvas) return;
const ctx = canvas.getContext('2d');
const W = canvas.width, H = canvas.height;
const cx = W/2, cy = H/2;
const R = Math.min(W,H)/2 - 24;
const axes = ['MEI','Recovery(HRR)','Confidence','Solve Rate','No-Error'];
const nAxes = axes.length;
ctx.clearRect(0,0,W,H);
// Background grid
for (let ring = 1; ring <= 4; ring++) {
ctx.beginPath();
for (let a = 0; a < nAxes; a++) {
const angle = (a/nAxes)*Math.PI*2 - Math.PI/2;
const r = R * ring/4;
const x = cx + Math.cos(angle)*r;
const y = cy + Math.sin(angle)*r;
a===0 ? ctx.moveTo(x,y) : ctx.lineTo(x,y);
}
ctx.closePath();
ctx.strokeStyle = 'rgba(255,255,255,0.07)';
ctx.lineWidth = 1;
ctx.stroke();
}
// Axis lines + labels
axes.forEach((label, a) => {
const angle = (a/nAxes)*Math.PI*2 - Math.PI/2;
const x = cx + Math.cos(angle)*R;
const y = cy + Math.sin(angle)*R;
ctx.beginPath();
ctx.moveTo(cx, cy);
ctx.lineTo(x, y);
ctx.strokeStyle = 'rgba(255,255,255,0.15)';
ctx.lineWidth = 1;
ctx.stroke();
// Label
const lx = cx + Math.cos(angle)*(R+14);
const ly = cy + Math.sin(angle)*(R+14);
ctx.fillStyle = 'rgba(255,255,255,0.4)';
ctx.font = '8px "DM Mono", monospace';
ctx.textAlign = 'center';
ctx.textBaseline = 'middle';
ctx.fillText(label, lx, ly);
});
// Model polygons
const maxBT = Math.max(...MODELS.map(m => m.backtrack_count)) || 1;
const maxHall = Math.max(...MODELS.map(m => {
const agg = AGGREGATE_STATS[m.model] || {};
return agg.avg_hc !== undefined ? agg.avg_hc : m.hallucination_count;
})) || 1;
const hasSel = selectedRadarModel >= 0;
MODELS.forEach((m, i) => {
const agg = AGGREGATE_STATS[m.model] || {};
const color = MODEL_COLORS[i % MODEL_COLORS.length];
const mei = agg.mei !== undefined ? agg.mei : m.mei;
const hasData = mei !== null && mei !== undefined;
const isSelected = selectedRadarModel === i;
const isDimmed = hasSel && !isSelected;
// Use aggregate stats (n=60) — fall back to single-run
const meiScore = hasData ? mei : 0;
const hrrScore = agg.hrr !== undefined ? agg.hrr :
(m.hrr !== null && m.hrr !== undefined) ? m.hrr :
(m.solved ? Math.min(1, OPTIMAL_STEPS / m.steps.length) : 0);
const srScore = agg.sr !== undefined ? agg.sr :
(m.sr_agg !== null && m.sr_agg !== undefined) ? m.sr_agg : (m.solved ? 1 : 0);
const hcVal = agg.avg_hc !== undefined ? agg.avg_hc :
(m.avg_hc !== null && m.avg_hc !== undefined) ? m.avg_hc : m.hallucination_count;
const noErrScore = hasData ? Math.max(0, 1 - (hcVal / (maxHall+1))) : 0;
// Compute avg confidence from steps (for display only)
const confSteps = m.steps.filter(s => s.confidence !== null && s.confidence !== undefined);
const avgConf = confSteps.length ? confSteps.reduce((s,x)=>s+x.confidence,0)/confSteps.length/100 : 0.5;
const scores = [
meiScore, // MEI
hrrScore, // Recovery Rate (HRR)
avgConf, // Avg Confidence
srScore, // Solve Rate (aggregate)
noErrScore // No-Error rate
];
ctx.beginPath();
scores.forEach((score, a) => {
const angle = (a/nAxes)*Math.PI*2 - Math.PI/2;
const r = R * score;
const x = cx + Math.cos(angle)*r;
const y = cy + Math.sin(angle)*r;
a===0 ? ctx.moveTo(x,y) : ctx.lineTo(x,y);
});
ctx.closePath();
if (isDimmed) {
ctx.fillStyle = color + '08';
ctx.fill();
ctx.strokeStyle = color + '30';
ctx.lineWidth = 1;
} else if (isSelected) {
ctx.fillStyle = color + '44';
ctx.fill();
ctx.shadowColor = color;
ctx.shadowBlur = 10;
ctx.strokeStyle = color;
ctx.lineWidth = 3;
} else {
ctx.fillStyle = color + '22';
ctx.fill();
ctx.strokeStyle = color + 'aa';
ctx.lineWidth = 1.5;
}
ctx.stroke();
ctx.shadowColor = 'transparent';
ctx.shadowBlur = 0;
});
// Update clickable legend below canvas
buildRadarLegend();
}
// ── INIT ──
buildGridView();
buildTimelineView();
buildMetricsPanel();
updateAll();
// Auto-size after layout — double RAF to ensure min-height CSS is computed
requestAnimationFrame(() => requestAnimationFrame(() => {
resizeGridCanvases();
updateAll();
}));
</script>
</body>
</html>