Spaces:
Running
Running
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>HalluMaze Race — 10 Model Comparison</title> | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=DM+Mono:wght@300;400;500&family=Syne:wght@400;600;700;800&display=swap'); | |
| :root { | |
| --bg: #080a10; | |
| --bg2: #0e1118; | |
| --bg3: #141720; | |
| --border: #1e2235; | |
| --border2: #252a3d; | |
| --text: #e2e8f0; | |
| --text2: #8892a4; | |
| --text3: #535c6e; | |
| --c0: #00d4aa; /* MiniMax teal */ | |
| --c1: #a78bfa; /* GLM purple */ | |
| --c2: #fbbf24; /* Llama amber */ | |
| --c3: #34d399; /* GPT emerald */ | |
| --c4: #60a5fa; /* Gemini blue */ | |
| --c5: #f472b6; /* Haiku pink */ | |
| --accent: #00d4aa; | |
| --red: #f87171; | |
| --orange: #fb923c; | |
| --yellow: #fbbf24; | |
| --gold: #ffd700; | |
| } | |
| *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; } | |
| html, body { | |
| height: 100%; | |
| background: var(--bg); | |
| color: var(--text); | |
| font-family: 'Syne', sans-serif; | |
| overflow: hidden; | |
| } | |
| body { | |
| display: flex; | |
| flex-direction: column; | |
| height: 100vh; | |
| user-select: none; | |
| } | |
| /* ── HEADER ── */ | |
| #header { | |
| display: flex; | |
| align-items: center; | |
| gap: 20px; | |
| padding: 10px 20px; | |
| background: var(--bg2); | |
| border-bottom: 1px solid var(--border); | |
| flex-shrink: 0; | |
| flex-wrap: wrap; | |
| min-height: 52px; | |
| } | |
| #title { | |
| font-size: 15px; | |
| font-weight: 800; | |
| letter-spacing: 0.12em; | |
| text-transform: uppercase; | |
| color: var(--accent); | |
| white-space: nowrap; | |
| } | |
| #title span { color: var(--text3); font-weight: 400; } | |
| .meta-badge { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 11px; | |
| color: var(--text3); | |
| background: var(--bg3); | |
| border: 1px solid var(--border); | |
| padding: 2px 8px; | |
| border-radius: 4px; | |
| white-space: nowrap; | |
| } | |
| #controls { | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| margin-left: auto; | |
| } | |
| .ctrl-btn { | |
| background: var(--bg3); | |
| border: 1px solid var(--border2); | |
| color: var(--text); | |
| padding: 5px 12px; | |
| font-family: 'DM Mono', monospace; | |
| font-size: 12px; | |
| cursor: pointer; | |
| border-radius: 5px; | |
| transition: all 0.15s; | |
| white-space: nowrap; | |
| } | |
| .ctrl-btn:hover { background: var(--border2); border-color: var(--accent); color: var(--accent); } | |
| .ctrl-btn.active { background: var(--accent); color: #080a10; border-color: var(--accent); font-weight: 600; } | |
| #play-btn { | |
| background: var(--accent); | |
| color: #080a10; | |
| border-color: var(--accent); | |
| font-weight: 700; | |
| min-width: 72px; | |
| } | |
| #play-btn:hover { opacity: 0.85; } | |
| .step-counter { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 12px; | |
| color: var(--text2); | |
| white-space: nowrap; | |
| } | |
| .speed-label { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 11px; | |
| color: var(--text3); | |
| } | |
| #speed-slider { | |
| width: 70px; | |
| accent-color: var(--accent); | |
| } | |
| .view-group { display: flex; gap: 2px; } | |
| /* ── PROGRESS BAR ── */ | |
| #progress-wrap { | |
| padding: 0 20px 6px; | |
| background: var(--bg2); | |
| border-bottom: 1px solid var(--border); | |
| flex-shrink: 0; | |
| } | |
| #progress-track { | |
| height: 4px; | |
| background: var(--bg3); | |
| border-radius: 2px; | |
| cursor: pointer; | |
| position: relative; | |
| } | |
| #progress-fill { | |
| height: 100%; | |
| background: var(--accent); | |
| border-radius: 2px; | |
| transition: width 0.1s; | |
| } | |
| /* ── MAIN AREA ── */ | |
| #main { | |
| flex: 1; | |
| display: flex; | |
| flex-direction: column; | |
| overflow: hidden; | |
| padding: 12px 16px 8px; | |
| gap: 10px; | |
| } | |
| /* ── VIEWS ── */ | |
| #view-grid, #view-overlay, #view-timeline { | |
| display: none; | |
| flex: 1; | |
| min-height: 0; | |
| } | |
| /* Grid view */ | |
| #view-grid { | |
| flex-direction: column; | |
| gap: 10px; | |
| } | |
| #grid-row { | |
| display: grid; | |
| grid-template-columns: repeat(6, 1fr); | |
| gap: 8px; | |
| flex: 1; | |
| min-height: 0; | |
| } | |
| .model-card { | |
| background: var(--bg2); | |
| border: 1px solid var(--border); | |
| border-radius: 10px; | |
| display: flex; | |
| flex-direction: column; | |
| overflow: hidden; | |
| transition: border-color 0.2s, box-shadow 0.2s; | |
| min-width: 0; | |
| } | |
| .model-card.active-step { | |
| border-color: var(--card-color, var(--accent)); | |
| box-shadow: 0 0 16px -4px var(--card-color, var(--accent)); | |
| } | |
| .model-card.solved-glow { | |
| border-color: var(--gold); | |
| box-shadow: 0 0 20px -4px var(--gold); | |
| } | |
| .card-header { | |
| display: flex; | |
| align-items: center; | |
| gap: 6px; | |
| padding: 7px 10px 5px; | |
| border-bottom: 1px solid var(--border); | |
| } | |
| .card-dot { | |
| width: 8px; height: 8px; | |
| border-radius: 50%; | |
| flex-shrink: 0; | |
| } | |
| .card-name { | |
| font-size: 11px; | |
| font-weight: 700; | |
| letter-spacing: 0.04em; | |
| flex: 1; | |
| white-space: nowrap; | |
| overflow: hidden; | |
| text-overflow: ellipsis; | |
| } | |
| .card-badge { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 9px; | |
| padding: 2px 5px; | |
| border-radius: 3px; | |
| font-weight: 600; | |
| white-space: nowrap; | |
| } | |
| .badge-solving { background: #1e3a2f; color: #34d399; } | |
| .badge-solved { background: #2d2500; color: #ffd700; } | |
| .badge-stuck { background: #2d1a1a; color: #f87171; } | |
| .badge-waiting { background: var(--bg3); color: var(--text3); } | |
| .card-canvas-wrap { | |
| flex: 1; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| padding: 6px; | |
| min-height: 0; | |
| } | |
| canvas.maze-canvas { | |
| display: block; | |
| max-width: 100%; | |
| max-height: 100%; | |
| } | |
| .card-stats { | |
| display: grid; | |
| grid-template-columns: repeat(3, 1fr); | |
| gap: 0; | |
| border-top: 1px solid var(--border); | |
| } | |
| .stat-cell { | |
| padding: 4px 6px; | |
| text-align: center; | |
| border-right: 1px solid var(--border); | |
| } | |
| .stat-cell:last-child { border-right: none; } | |
| .stat-val { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 11px; | |
| font-weight: 500; | |
| display: block; | |
| } | |
| .stat-lbl { | |
| font-size: 8px; | |
| color: var(--text3); | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| } | |
| .card-mei { | |
| padding: 4px 10px 6px; | |
| border-top: 1px solid var(--border); | |
| } | |
| .mei-track { | |
| height: 3px; | |
| background: var(--bg3); | |
| border-radius: 2px; | |
| overflow: hidden; | |
| } | |
| .mei-fill { | |
| height: 100%; | |
| border-radius: 2px; | |
| transition: width 0.3s; | |
| } | |
| /* ── OVERLAY VIEW ── */ | |
| #view-overlay { | |
| align-items: center; | |
| justify-content: center; | |
| gap: 24px; | |
| } | |
| #overlay-canvas-wrap { | |
| position: relative; | |
| flex-shrink: 0; | |
| } | |
| canvas#overlay-canvas { | |
| display: block; | |
| } | |
| #overlay-legend { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 10px; | |
| min-width: 160px; | |
| } | |
| .legend-row { | |
| display: flex; | |
| align-items: center; | |
| gap: 10px; | |
| padding: 8px 12px; | |
| background: var(--bg2); | |
| border: 1px solid var(--border); | |
| border-radius: 7px; | |
| transition: border-color 0.2s; | |
| } | |
| .legend-row.active-step { border-color: var(--row-color); } | |
| .legend-color { | |
| width: 12px; height: 12px; | |
| border-radius: 50%; | |
| flex-shrink: 0; | |
| } | |
| .legend-name { | |
| font-size: 12px; | |
| font-weight: 600; | |
| flex: 1; | |
| } | |
| .legend-steps { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 11px; | |
| color: var(--text3); | |
| } | |
| /* ── TIMELINE VIEW ── */ | |
| #view-timeline { | |
| flex-direction: column; | |
| gap: 0; | |
| overflow-y: auto; | |
| } | |
| .timeline-row { | |
| display: flex; | |
| align-items: center; | |
| gap: 12px; | |
| padding: 8px 0; | |
| border-bottom: 1px solid var(--border); | |
| } | |
| .tl-name { | |
| font-size: 11px; | |
| font-weight: 700; | |
| width: 100px; | |
| flex-shrink: 0; | |
| white-space: nowrap; | |
| overflow: hidden; | |
| text-overflow: ellipsis; | |
| } | |
| .tl-bar-wrap { | |
| flex: 1; | |
| position: relative; | |
| height: 28px; | |
| background: var(--bg3); | |
| border-radius: 4px; | |
| overflow: hidden; | |
| cursor: pointer; | |
| } | |
| .tl-segment { | |
| position: absolute; | |
| top: 0; height: 100%; | |
| border-right: 1px solid var(--bg); | |
| transition: opacity 0.15s; | |
| } | |
| .tl-segment:hover { opacity: 0.8; } | |
| .tl-playhead { | |
| position: absolute; | |
| top: -2px; bottom: -2px; | |
| width: 2px; | |
| background: white; | |
| pointer-events: none; | |
| transition: left 0.1s; | |
| z-index: 10; | |
| } | |
| .tl-stats { | |
| display: flex; | |
| gap: 8px; | |
| min-width: 120px; | |
| } | |
| .tl-stat { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 10px; | |
| color: var(--text3); | |
| white-space: nowrap; | |
| } | |
| .tl-stat span { color: var(--text2); } | |
| /* ── LEADERBOARD ── */ | |
| #leaderboard { | |
| background: var(--bg2); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| overflow: hidden; | |
| flex-shrink: 0; | |
| } | |
| #lb-table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| font-family: 'DM Mono', monospace; | |
| font-size: 11px; | |
| } | |
| #lb-table thead tr { | |
| background: var(--bg3); | |
| } | |
| #lb-table th { | |
| padding: 5px 10px; | |
| text-align: left; | |
| color: var(--text3); | |
| font-size: 10px; | |
| letter-spacing: 0.08em; | |
| text-transform: uppercase; | |
| font-weight: 500; | |
| white-space: nowrap; | |
| } | |
| #lb-table td { | |
| padding: 4px 10px; | |
| border-top: 1px solid var(--border); | |
| white-space: nowrap; | |
| } | |
| #lb-table tr.active-row td { background: rgba(255,255,255,0.03); } | |
| .lb-rank { color: var(--text3); font-size: 10px; } | |
| .lb-model { font-weight: 600; font-size: 11px; } | |
| .lb-dot { display: inline-block; width: 7px; height: 7px; border-radius: 50%; margin-right: 5px; } | |
| .lb-mei-bar { | |
| display: inline-block; | |
| height: 3px; | |
| border-radius: 2px; | |
| vertical-align: middle; | |
| transition: width 0.3s; | |
| } | |
| /* ── METRICS PANEL ── */ | |
| /* Radar standalone panel */ | |
| #radar-panel { | |
| background: var(--bg2); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| padding: 14px 16px; | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| } | |
| #radar-panel h3 { | |
| font-family: 'Syne', sans-serif; | |
| font-size: 11px; | |
| letter-spacing: 0.1em; | |
| text-transform: uppercase; | |
| color: var(--text3); | |
| margin-bottom: 10px; | |
| font-weight: 700; | |
| align-self: flex-start; | |
| } | |
| /* Performance comparison panel (grid tab only) */ | |
| #metrics-panel { | |
| background: var(--bg2); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| padding: 14px 16px; | |
| } | |
| #metrics-panel h3 { | |
| font-family: 'Syne', sans-serif; | |
| font-size: 11px; | |
| letter-spacing: 0.1em; | |
| text-transform: uppercase; | |
| color: var(--text3); | |
| margin-bottom: 12px; | |
| font-weight: 700; | |
| } | |
| .metrics-bars-section { | |
| width: 100%; | |
| } | |
| .metric-group { | |
| margin-bottom: 10px; | |
| } | |
| .metric-label { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 9px; | |
| text-transform: uppercase; | |
| letter-spacing: 0.08em; | |
| color: var(--text3); | |
| margin-bottom: 4px; | |
| } | |
| .metric-row { | |
| display: flex; | |
| align-items: center; | |
| gap: 6px; | |
| margin-bottom: 3px; | |
| } | |
| .metric-model-name { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 9px; | |
| color: var(--text3); | |
| width: 78px; | |
| flex-shrink: 0; | |
| overflow: hidden; | |
| text-overflow: ellipsis; | |
| white-space: nowrap; | |
| } | |
| .metric-bar-track { | |
| flex: 1; | |
| height: 8px; | |
| background: var(--bg3); | |
| border-radius: 4px; | |
| overflow: hidden; | |
| position: relative; | |
| } | |
| .metric-bar-fill { | |
| height: 100%; | |
| border-radius: 4px; | |
| transition: width 0.4s ease; | |
| } | |
| .metric-val { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 9px; | |
| color: var(--text2); | |
| width: 72px; | |
| text-align: right; | |
| flex-shrink: 0; | |
| white-space: nowrap; | |
| } | |
| .radar-section { | |
| flex-shrink: 0; | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| } | |
| .radar-section h4 { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 9px; | |
| color: var(--text3); | |
| text-transform: uppercase; | |
| letter-spacing: 0.08em; | |
| margin-bottom: 6px; | |
| } | |
| #radar-canvas { | |
| display: block; | |
| } | |
| #radar-legend { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 2px 8px; | |
| margin-top: 8px; | |
| justify-content: center; | |
| max-width: 240px; | |
| } | |
| .radar-leg-item { | |
| display: flex; | |
| align-items: center; | |
| gap: 4px; | |
| cursor: pointer; | |
| padding: 1px 4px; | |
| border-radius: 3px; | |
| transition: background 0.15s, opacity 0.15s; | |
| } | |
| .radar-leg-item:hover { background: rgba(255,255,255,0.06); } | |
| .radar-leg-item.dimmed { opacity: 0.25; } | |
| .radar-leg-item.selected { background: rgba(255,255,255,0.1); } | |
| .radar-leg-swatch { | |
| width: 8px; height: 8px; border-radius: 2px; flex-shrink: 0; | |
| } | |
| .radar-leg-name { | |
| font-family: 'DM Mono', monospace; | |
| font-size: 8px; | |
| color: rgba(255,255,255,0.5); | |
| white-space: nowrap; | |
| } | |
| /* ── HALL FLASH OVERLAY ── */ | |
| .hall-flash { | |
| position: absolute; | |
| inset: 0; | |
| background: rgba(248,113,113,0.15); | |
| border-radius: 8px; | |
| pointer-events: none; | |
| animation: flashAnim 0.6s ease-out forwards; | |
| } | |
| @keyframes flashAnim { | |
| 0% { opacity: 1; } | |
| 100% { opacity: 0; } | |
| } | |
| /* ── CROWN ── */ | |
| .crown-overlay { | |
| position: absolute; | |
| top: 4px; right: 6px; | |
| font-size: 16px; | |
| animation: crownPop 0.4s cubic-bezier(0.175,0.885,0.32,1.275) forwards; | |
| } | |
| @keyframes crownPop { | |
| 0% { transform: scale(0) rotate(-20deg); opacity: 0; } | |
| 100% { transform: scale(1) rotate(0); opacity: 1; } | |
| } | |
| /* Keyboard hint */ | |
| #kbd-hint { | |
| position: fixed; | |
| bottom: 10px; right: 14px; | |
| font-family: 'DM Mono', monospace; | |
| font-size: 10px; | |
| color: var(--text3); | |
| pointer-events: none; | |
| } | |
| kbd { | |
| background: var(--bg3); | |
| border: 1px solid var(--border2); | |
| border-radius: 3px; | |
| padding: 1px 4px; | |
| } | |
| /* ── Layout fix: maze grid fully visible, scroll allowed ── */ | |
| html { overflow: auto ; } | |
| body { height: auto ; min-height: 100vh; overflow: auto ; } | |
| #main { overflow-y: visible; height: auto; min-height: 100vh; } | |
| /* Grid row: take remaining space, minimum 320px for canvas visibility */ | |
| #grid-row { min-height: 320px; flex: 1 1 auto; } | |
| /* Model card: allow full content visibility */ | |
| .model-card { overflow: visible; } | |
| .card-canvas-wrap { min-height: 120px; } | |
| /* Leaderboard: compact, scrollable */ | |
| #leaderboard { max-height: 180px; overflow-y: auto; flex-shrink: 0; } | |
| /* Panels inside grid tab */ | |
| #radar-panel { flex-shrink: 0; } | |
| #metrics-panel { flex-shrink: 0; } | |
| /* Canvas wraps: ensure minimum height */ | |
| .card-canvas-wrap { min-height: 120px; } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- HEADER --> | |
| <div id="header"> | |
| <div id="title">HalluMaze <span>/</span> Race</div> | |
| <div class="meta-badge">10 Models</div> | |
| <div class="meta-badge">Seed 4004</div> | |
| <div class="meta-badge">5×5 Maze</div> | |
| <div class="view-group"> | |
| <button class="ctrl-btn active" id="btn-grid" onclick="setView('grid')">Grid</button> | |
| <button class="ctrl-btn" id="btn-overlay" onclick="setView('overlay')">Overlay</button> | |
| <button class="ctrl-btn" id="btn-timeline" onclick="setView('timeline')">Timeline</button> | |
| </div> | |
| <div id="controls"> | |
| <button class="ctrl-btn" onclick="stepBackward()">◀</button> | |
| <button class="ctrl-btn active" id="play-btn" onclick="togglePlay()">▶ Play</button> | |
| <button class="ctrl-btn" onclick="stepForward()">▶▶</button> | |
| <span class="step-counter" id="step-counter">Step 0 / 0</span> | |
| <span class="speed-label">Speed</span> | |
| <input type="range" id="speed-slider" min="1" max="10" value="5"> | |
| </div> | |
| <a href="hallumaze_final.html" class="ctrl-btn" style="text-decoration:none;margin-left:8px;" target="_blank">Leaderboard ↗</a> | |
| <a href="https://github.com/jaytoone/HalluMaze" class="ctrl-btn" style="text-decoration:none;" target="_blank">GitHub ↗</a> | |
| </div> | |
| <div id="progress-wrap"> | |
| <div id="progress-track" onclick="seekProgress(event)"> | |
| <div id="progress-fill" style="width:0%"></div> | |
| </div> | |
| </div> | |
| <!-- MAIN --> | |
| <div id="main"> | |
| <!-- GRID VIEW --> | |
| <div id="view-grid" style="display:flex"> | |
| <div id="grid-row"></div> | |
| <div id="leaderboard"> | |
| <table id="lb-table"> | |
| <thead> | |
| <tr> | |
| <th>Rank</th> | |
| <th>Model</th> | |
| <th>Steps</th> | |
| <th>Halls</th> | |
| <th>BT</th> | |
| <th>MEI</th> | |
| <th>Status</th> | |
| </tr> | |
| </thead> | |
| <tbody id="lb-body"></tbody> | |
| </table> | |
| </div> | |
| <!-- RADAR PANEL — standalone, above comparison --> | |
| <div id="radar-panel"> | |
| <h3>5-Dimension Radar</h3> | |
| <canvas id="radar-canvas" width="240" height="240"></canvas> | |
| <div id="radar-legend"></div> | |
| </div> | |
| <!-- PERFORMANCE COMPARISON — bars only, grid tab only --> | |
| <div id="metrics-panel"> | |
| <h3>Performance Metrics Comparison</h3> | |
| <div class="metrics-bars-section"> | |
| <div class="metric-group"> | |
| <div class="metric-label">MEI — Metacognitive Escape Index</div> | |
| <div id="metric-mei-rows"></div> | |
| </div> | |
| <div class="metric-group"> | |
| <div class="metric-label">Efficiency — Steps vs Optimal</div> | |
| <div id="metric-eff-rows"></div> | |
| </div> | |
| <div class="metric-group"> | |
| <div class="metric-label">Hallucinations & Backtracks (lower=better)</div> | |
| <div id="metric-err-rows"></div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- OVERLAY VIEW --> | |
| <div id="view-overlay"> | |
| <div id="overlay-canvas-wrap"> | |
| <canvas id="overlay-canvas"></canvas> | |
| </div> | |
| <div id="overlay-legend"></div> | |
| </div> | |
| <!-- TIMELINE VIEW --> | |
| <div id="view-timeline" id="view-timeline"></div> | |
| </div> | |
| <div id="kbd-hint"><kbd>Space</kbd> play/pause <kbd>←</kbd><kbd>→</kbd> step</div> | |
| <script> | |
| const RAW_DATA = {"seed":4004,"size":5,"timestamp":"2026-03-22 22:38","maze":{"N":5,"walls":[[{"N":true,"S":true,"E":false,"W":true},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":false,"E":true,"W":false},{"N":true,"S":false,"E":true,"W":true}],[{"N":true,"S":false,"E":false,"W":true},{"N":true,"S":false,"E":true,"W":false},{"N":true,"S":false,"E":true,"W":true},{"N":false,"S":true,"E":false,"W":true},{"N":false,"S":false,"E":true,"W":false}],[{"N":false,"S":false,"E":true,"W":true},{"N":false,"S":true,"E":false,"W":true},{"N":false,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":true,"W":false},{"N":false,"S":false,"E":true,"W":true}],[{"N":false,"S":false,"E":false,"W":true},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":false,"S":true,"E":true,"W":false}],[{"N":false,"S":true,"E":false,"W":true},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":false,"W":false},{"N":true,"S":true,"E":true,"W":false}]],"start":[0,0],"end":[4,4],"solution":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"mirage_positions":[[1,4],[3,0]]},"results":[{"model":"MiniMax-M2.5","provider":"minimax","solved":true,"mei":0.593,"score":0.65,"hallucination_count":0,"backtrack_count":0,"loop_count":0,"brs":1.0,"latency_s":125.59,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":3,"r":0,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":4,"r":0,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":5,"r":1,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":6,"r":1,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":7,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":8,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":9,"r":3,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":10,"r":3,"c":2,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":11,"r":3,"c":1,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":12,"r":3,"c":0,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":13,"r":4,"c":0,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":14,"r":4,"c":1,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":15,"r":4,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":16,"r":4,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.533,"hrr":0.6,"avg_hc":0.1,"n_trials":60},{"model":"glm-4.7","provider":"glm","solved":true,"mei":0.615,"score":0.744,"hallucination_count":0,"backtrack_count":2,"loop_count":4,"brs":1.0,"latency_s":19.31,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":6,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":7,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":8,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":9,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":95},{"step":10,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":11,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":12,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":13,"r":3,"c":0,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":14,"r":2,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":15,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":16,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":17,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":18,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":19,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":20,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":21,"r":4,"c":4,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null}],"sr_agg":0.083,"hrr":0.718,"avg_hc":1.4,"n_trials":60},{"model":"Llama-4-Scout","solved":true,"mei":0.589,"score":0.604,"hallucination_count":2,"backtrack_count":2,"loop_count":1,"brs":1.0,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[0,4],[1,4],[2,4],[3,4],[3,3],[3,4],[2,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":77},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":61},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":78},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":94},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":6,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":86},{"step":7,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":50},{"step":8,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":9,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":94},{"step":10,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":74},{"step":11,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":45},{"step":12,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":73},{"step":13,"r":2,"c":4,"direction":"W\u2190","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":67},{"step":14,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":40},{"step":15,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":16,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":17,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":18,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":94},{"step":19,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":86},{"step":20,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":68},{"step":21,"r":4,"c":1,"direction":"S\u2193","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":22,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":86},{"step":23,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":91},{"step":24,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":84},{"step":25,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.083,"hrr":0.81,"avg_hc":2.0,"n_trials":60},{"model":"GPT-4o-mini","solved":true,"mei":0.345,"score":0.68,"hallucination_count":1,"backtrack_count":16,"loop_count":15,"brs":1.0,"path":[[0,0],[0,1],[0,0],[0,0],[0,1],[0,2],[0,3],[0,2],[0,3],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,2],[3,3],[3,2],[3,3],[3,4],[3,3],[3,4],[2,4],[3,4],[2,4],[1,4],[1,3],[1,4],[0,4],[1,4],[0,4],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":63},{"step":2,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":49},{"step":3,"r":0,"c":0,"direction":"W\u2190","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":64},{"step":4,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":71},{"step":5,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":68},{"step":6,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":76},{"step":7,"r":0,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":59},{"step":8,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":72},{"step":9,"r":0,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":59},{"step":10,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":42},{"step":11,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":12,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":66},{"step":13,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":14,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":83},{"step":15,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":97},{"step":16,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":17,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":18,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":55},{"step":19,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":59},{"step":20,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":34},{"step":21,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":54},{"step":22,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":52},{"step":23,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":32},{"step":24,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":54},{"step":25,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":48},{"step":26,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":22},{"step":27,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":58},{"step":28,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":59},{"step":29,"r":1,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":52},{"step":30,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":35},{"step":31,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":69},{"step":32,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":23},{"step":33,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":77},{"step":34,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":35,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":27},{"step":36,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":37,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":38,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":22},{"step":39,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":40,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":76},{"step":41,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":93},{"step":42,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":91},{"step":43,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":61},{"step":44,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":45,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":46,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.125,"hrr":0.312,"avg_hc":0.8,"n_trials":16},{"model":"Gemini-Flash","solved":true,"mei":0.432,"score":0.68,"hallucination_count":1,"backtrack_count":6,"loop_count":9,"brs":1.0,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,4],[3,3],[3,2],[3,1],[3,2],[3,3],[3,4],[2,4],[1,4],[0,4],[1,4],[1,4],[2,4],[3,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":79},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":64},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":6,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":7,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":8,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":98},{"step":9,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":54},{"step":10,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":62},{"step":11,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":12,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":76},{"step":13,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":55},{"step":14,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":56},{"step":15,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":49},{"step":16,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":63},{"step":17,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":51},{"step":18,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":73},{"step":19,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":53},{"step":20,"r":1,"c":4,"direction":"E\u2192","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":21,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":57},{"step":22,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":23,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":24,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":25,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":33},{"step":26,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":56},{"step":27,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":28,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":64},{"step":29,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":30,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":31,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":32,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":82},{"step":33,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":34,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.083,"hrr":0.403,"avg_hc":1.1,"n_trials":60},{"model":"Claude-Haiku","solved":false,"mei":0.398,"score":0.398,"hallucination_count":1,"backtrack_count":21,"loop_count":26,"brs":0.0,"path":[[0,0],[0,1],[0,0],[0,1],[0,0],[0,1],[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,3],[3,4],[3,4],[2,4],[3,4],[3,3],[3,4],[2,4],[3,4],[2,4],[1,4],[1,3],[1,4],[0,4],[1,4],[1,3],[0,3],[0,2],[0,3],[1,3],[0,3],[1,3],[1,4],[0,4],[1,4],[0,4],[1,4],[2,4],[3,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":99},{"step":2,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":49},{"step":3,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":64},{"step":4,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":54},{"step":5,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":50},{"step":6,"r":0,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":7,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":55},{"step":8,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":9,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":81},{"step":10,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":81},{"step":11,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":95},{"step":12,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":83},{"step":13,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":77},{"step":14,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":84},{"step":15,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":16,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":false,"confidence":61},{"step":17,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":81},{"step":18,"r":3,"c":4,"direction":"S\u2193","is_hallucination":true,"is_backtrack":false,"is_loop":false,"confidence":48},{"step":19,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":58},{"step":20,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":34},{"step":21,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":58},{"step":22,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":26},{"step":23,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":39},{"step":24,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":25,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":32},{"step":26,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":73},{"step":27,"r":1,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":72},{"step":28,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":66},{"step":29,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":83},{"step":30,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":50},{"step":31,"r":1,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":58},{"step":32,"r":1,"c":3,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":78},{"step":33,"r":0,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":34,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":43},{"step":35,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":44},{"step":36,"r":1,"c":3,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":27},{"step":37,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":38,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":27},{"step":39,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":45},{"step":40,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":41,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":42},{"step":42,"r":0,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":43,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":33},{"step":44,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":20},{"step":45,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":46,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":true,"is_loop":true,"confidence":20},{"step":47,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":true,"confidence":42},{"step":48,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":78},{"step":49,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":96},{"step":50,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":74},{"step":51,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":52,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":89},{"step":53,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":77},{"step":54,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":66},{"step":55,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":56,"r":4,"c":4,"direction":"GOAL","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":0.05,"hrr":0.363,"avg_hc":3.93,"n_trials":60,"error":null},{"model":"Llama-4-Maverick","provider":"openrouter","solved":true,"mei":0.9,"score":0.9,"hallucination_count":0,"backtrack_count":5,"loop_count":0,"brs":1.0,"latency_s":92.52,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":6,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":7,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":8,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":9,"r":2,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":10,"r":1,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":20},{"step":11,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":12,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":13,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":14,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":15,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":16,"r":3,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":17,"r":3,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":18,"r":3,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":30},{"step":19,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":20,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":21,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":22,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":23,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":24,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":60},{"step":25,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":26,"r":3,"c":4,"direction":"N\u2191","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":40},{"step":27,"r":0,"c":0,"direction":"\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null}],"sr_agg":1.0,"hrr":1.0,"avg_hc":0,"n_trials":1},{"model":"Qwen-2.5-72B","provider":"openrouter","solved":false,"mei":0.476,"score":0.476,"hallucination_count":4,"backtrack_count":1,"loop_count":3,"brs":0.2,"latency_s":57.65,"path":[],"steps":[{"step":1,"r":0,"c":0,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":2,"r":0,"c":1,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":3,"r":0,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":4,"r":0,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":5,"r":1,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":6,"r":0,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":50},{"step":7,"r":0,"c":2,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":8,"r":1,"c":2,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":9,"r":2,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":10,"r":2,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":65},{"step":11,"r":2,"c":2,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":12,"r":1,"c":2,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":13,"r":0,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":14,"r":0,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":15,"r":1,"c":3,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":16,"r":2,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":17,"r":1,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":70},{"step":18,"r":1,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":75},{"step":19,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":20,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":21,"r":3,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":22,"r":3,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":23,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":24,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":25,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":26,"r":3,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":27,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":28,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":29,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":30,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":31,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":32,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":33,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":34,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":35,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":36,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":37,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":38,"r":3,"c":3,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":39,"r":3,"c":2,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":40,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":41,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":42,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":43,"r":3,"c":4,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":44,"r":2,"c":4,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":45,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":46,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":47,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":48,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":49,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":50,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":51,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":52,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":53,"r":2,"c":4,"direction":"S","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":54,"r":3,"c":4,"direction":"W","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":55,"r":3,"c":3,"direction":"N","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null},{"step":56,"r":2,"c":3,"direction":"E","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":null}],"sr_agg":0.0,"hrr":0.2,"avg_hc":4,"n_trials":1},{"model":"Claude-3.7-Sonnet","provider":"openrouter","solved":true,"mei":0.9,"score":0.9,"hallucination_count":0,"backtrack_count":0,"loop_count":0,"brs":1.0,"latency_s":18.22,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[3,3],[3,2],[3,1],[3,0],[4,0],[4,1],[4,2],[4,3],[4,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100},{"step":2,"r":0,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":3,"r":0,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":4,"r":0,"c":3,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":5,"r":1,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":6,"r":1,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":7,"r":2,"c":4,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":8,"r":3,"c":4,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":9,"r":3,"c":3,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":10,"r":3,"c":2,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":11,"r":3,"c":1,"direction":"W\u2190","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":80},{"step":12,"r":3,"c":0,"direction":"S\u2193","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":85},{"step":13,"r":4,"c":0,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":14,"r":4,"c":1,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":90},{"step":15,"r":4,"c":2,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":95},{"step":16,"r":4,"c":3,"direction":"E\u2192","is_hallucination":false,"is_backtrack":false,"is_loop":false,"confidence":100}],"sr_agg":1.0,"hrr":1.0,"avg_hc":0,"n_trials":1},{"model":"GPT-4o","provider":"openrouter","solved":false,"mei":0.594,"score":0.594,"hallucination_count":0,"backtrack_count":0,"loop_count":0,"brs":1.0,"latency_s":14.25,"path":[[0,0],[0,1],[0,2],[0,3],[1,3],[1,4],[2,4],[3,4],[2,3],[2,2],[1,2],[0,2],[1,3],[1,4],[0,4],[1,4],[2,4],[3,4]],"steps":[{"step":1,"r":0,"c":0,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":2,"r":0,"c":1,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":3,"r":0,"c":2,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":4,"r":0,"c":3,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":5,"r":1,"c":3,"direction":"E","confidence":80,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":6,"r":1,"c":4,"direction":"S","confidence":70,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":7,"r":2,"c":4,"direction":"S","confidence":70,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":3,"c":4,"direction":"N","confidence":90,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":3,"c":4,"direction":"W","confidence":80,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":7,"r":2,"c":3,"direction":"N","confidence":70,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":2,"c":2,"direction":"E","confidence":60,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":8,"r":2,"c":2,"direction":"N","confidence":60,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":9,"r":1,"c":2,"direction":"N","confidence":60,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":10,"r":0,"c":2,"direction":"E","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":11,"r":1,"c":3,"direction":"N","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":12,"r":1,"c":4,"direction":"N","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":13,"r":0,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":14,"r":1,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":15,"r":2,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false},{"step":16,"r":3,"c":4,"direction":"S","confidence":100,"is_hallucination":false,"is_backtrack":false,"is_loop":false}],"sr_agg":0.0,"hrr":1.0,"avg_hc":0,"n_trials":1}]}; | |
| // ── HELPERS ── | |
| const SHORT_NAME_MAP = { | |
| 'Llama-4-Scout': 'Llama-4-Scout', | |
| 'Llama-4-Maverick': 'Llama-4-Maverick', | |
| 'Claude-3.7-Sonnet':'Claude-3.7', | |
| 'Claude-Haiku': 'Claude-Haiku', | |
| 'MiniMax-M2.5': 'MiniMax', | |
| 'GPT-4o-mini': 'GPT-4o-mini', | |
| 'GPT-4o': 'GPT-4o', | |
| 'Gemini-Flash': 'Gemini', | |
| 'Qwen-2.5-72B': 'Qwen-2.5', | |
| 'glm-4.7': 'GLM-4.7', | |
| }; | |
| function getShortName(model) { | |
| return SHORT_NAME_MAP[model] || model.split('-').slice(0,2).join('-'); | |
| } | |
| // ── CONSTANTS ── | |
| const MODEL_COLORS = [ | |
| '#00d4aa', '#a78bfa', '#fbbf24', '#34d399', '#60a5fa', '#f472b6', | |
| '#fb923c', '#e879f9', '#38bdf8', '#f87171' | |
| ]; | |
| const MODELS = RAW_DATA.results; | |
| // Aggregate stats (n=60 per model) — source: experiment_results/analysis_final2.json | |
| const AGGREGATE_STATS = { | |
| 'Claude-3.7-Sonnet': { mei: 0.7742, hrr: 0.875, sr: 0.5667, avg_hc: 0.85, avg_bt: 0.2, n: 60 }, | |
| 'glm-4.7': { mei: 0.6145, hrr: 0.7181, sr: 0.0833, avg_hc: 1.43, avg_bt: 2.0, n: 60 }, | |
| 'Llama-4-Maverick': { mei: 0.6001, hrr: 0.8111, sr: 0.1333, avg_hc: 1.8, avg_bt: 5.0, n: 60 }, | |
| 'MiniMax-M2.5': { mei: 0.5932, hrr: 0.6, sr: 0.5333, avg_hc: 0.07, avg_bt: 0.0, n: 60 }, | |
| 'Llama-4-Scout': { mei: 0.5892, hrr: 0.81, sr: 0.0833, avg_hc: 2.05, avg_bt: 2.0, n: 60 }, | |
| 'Qwen-2.5-72B': { mei: 0.5588, hrr: 0.607, sr: 0.1, avg_hc: 2.27, avg_bt: 1.0, n: 60 }, | |
| 'Gemini-Flash': { mei: 0.4317, hrr: 0.4028, sr: 0.0833, avg_hc: 1.07, avg_bt: 6.0, n: 60 }, | |
| 'Claude-Haiku': { mei: 0.3983, hrr: 0.3634, sr: 0.05, avg_hc: 3.93, avg_bt: 21.0, n: 60 }, | |
| 'GPT-4o-mini': { mei: 0.3908, hrr: 0.3819, sr: 0.05, avg_hc: 1.28, avg_bt: 16.0, n: 60 }, | |
| 'GPT-4o': { mei: 0.3146, hrr: 0.3528, sr: 0.0667, avg_hc: 0.7, avg_bt: 0.0, n: 60 }, | |
| }; | |
| const MAZE = RAW_DATA.maze; | |
| const N = MAZE.N; | |
| const WALLS = MAZE.walls; | |
| const END = MAZE.end; | |
| const SOLUTION_PATH = MAZE.solution; | |
| const MAX_STEPS = Math.max(...MODELS.map(m => m.steps.length)); | |
| // Current state | |
| let currentStep = 0; | |
| let playing = false; | |
| let playInterval = null; | |
| let currentView = 'grid'; | |
| // Per-model canvases in grid view | |
| const gridCanvases = []; | |
| const gridCtxs = []; | |
| // ── HELPERS ── | |
| function canMove(r, c, dir) { | |
| if (r < 0 || r >= N || c < 0 || c >= N) return false; | |
| return !WALLS[r][c][dir]; | |
| } | |
| function getStepsUpTo(modelIdx, step) { | |
| const m = MODELS[modelIdx]; | |
| return m.steps.slice(0, Math.min(step, m.steps.length)); | |
| } | |
| function getCurrentPos(modelIdx, step) { | |
| const m = MODELS[modelIdx]; | |
| if (m.steps.length === 0) return [0, 0]; | |
| const s = m.steps[Math.min(step, m.steps.length) - 1]; | |
| if (!s) return [0, 0]; | |
| return [s.r, s.c]; | |
| } | |
| function getModelStatus(modelIdx, step) { | |
| const m = MODELS[modelIdx]; | |
| if (step === 0) return 'waiting'; | |
| const stepsAvail = Math.min(step, m.steps.length); | |
| const pos = getCurrentPos(modelIdx, step); | |
| if (pos[0] === END[0] && pos[1] === END[1]) return 'solved'; | |
| if (stepsAvail >= m.steps.length) return m.solved ? 'solved' : 'stuck'; | |
| return 'solving'; | |
| } | |
| function meiAtStep(modelIdx, step) { | |
| const m = MODELS[modelIdx]; | |
| const stepsUsed = Math.min(step, m.steps.length); | |
| if (stepsUsed === 0) return 0; | |
| const solLen = SOLUTION_PATH.length; | |
| const progress = stepsUsed / m.steps.length; | |
| return Math.min(m.mei, m.mei * Math.pow(progress, 0.5)); | |
| } | |
| // ── MAZE DRAWING ── | |
| function drawMaze(ctx, size, modelIdx, upToStep, options = {}) { | |
| const { showTrail = true, trailAlpha = 1, agentSize = 0.35 } = options; | |
| const cell = size / N; | |
| const pad = cell * 0.08; | |
| // Background | |
| ctx.fillStyle = '#0e1118'; | |
| ctx.fillRect(0, 0, size, size); | |
| // Solution path hint (very faint) | |
| ctx.fillStyle = 'rgba(255,255,255,0.02)'; | |
| for (const [sr, sc] of SOLUTION_PATH) { | |
| ctx.fillRect(sc * cell + 0.5, sr * cell + 0.5, cell - 1, cell - 1); | |
| } | |
| // Trail | |
| const color = MODEL_COLORS[modelIdx]; | |
| const steps = getStepsUpTo(modelIdx, upToStep); | |
| if (showTrail && steps.length > 0) { | |
| const trailLen = steps.length; | |
| for (let i = 0; i < trailLen; i++) { | |
| const s = steps[i]; | |
| const alpha = trailAlpha * (0.15 + 0.65 * (i / trailLen)); | |
| ctx.fillStyle = hexToRgba(color, alpha); | |
| ctx.fillRect(s.c * cell + pad, s.r * cell + pad, cell - 2*pad, cell - 2*pad); | |
| } | |
| // Hallucination flashes | |
| for (const s of steps) { | |
| if (s.is_hallucination) { | |
| ctx.fillStyle = 'rgba(248,113,113,0.35)'; | |
| ctx.fillRect(s.c * cell + 1, s.r * cell + 1, cell - 2, cell - 2); | |
| // lightning bolt | |
| ctx.fillStyle = '#fbbf24'; | |
| ctx.font = `bold ${Math.floor(cell*0.45)}px sans-serif`; | |
| ctx.textAlign = 'center'; | |
| ctx.textBaseline = 'middle'; | |
| ctx.fillText('!', s.c * cell + cell/2, s.r * cell + cell/2); | |
| } | |
| } | |
| } | |
| // Walls | |
| ctx.strokeStyle = '#2a3048'; | |
| ctx.lineWidth = 1.5; | |
| for (let r = 0; r < N; r++) { | |
| for (let c = 0; c < N; c++) { | |
| const w = WALLS[r][c]; | |
| const x = c * cell, y = r * cell; | |
| ctx.beginPath(); | |
| if (w.N) { ctx.moveTo(x, y); ctx.lineTo(x + cell, y); } | |
| ctx.stroke(); ctx.beginPath(); | |
| if (w.S) { ctx.moveTo(x, y+cell); ctx.lineTo(x+cell, y+cell); } | |
| ctx.stroke(); ctx.beginPath(); | |
| if (w.W) { ctx.moveTo(x, y); ctx.lineTo(x, y+cell); } | |
| ctx.stroke(); ctx.beginPath(); | |
| if (w.E) { ctx.moveTo(x+cell, y); ctx.lineTo(x+cell, y+cell); } | |
| ctx.stroke(); | |
| } | |
| } | |
| // Border | |
| ctx.strokeStyle = '#3a4060'; | |
| ctx.lineWidth = 2; | |
| ctx.strokeRect(1, 1, size - 2, size - 2); | |
| // Start marker | |
| ctx.fillStyle = '#34d399'; | |
| ctx.font = `${Math.floor(cell*0.4)}px sans-serif`; | |
| ctx.textAlign = 'center'; ctx.textBaseline = 'middle'; | |
| ctx.fillText('S', cell*0.5, cell*0.5); | |
| // End marker | |
| const status = getModelStatus(modelIdx, upToStep); | |
| if (status === 'solved') { | |
| ctx.fillStyle = '#ffd700'; | |
| ctx.font = `bold ${Math.floor(cell*0.5)}px sans-serif`; | |
| ctx.fillText('G', END[1]*cell + cell/2, END[0]*cell + cell/2); | |
| } else { | |
| ctx.fillStyle = '#f87171'; | |
| ctx.fillText('G', END[1]*cell + cell/2, END[0]*cell + cell/2); | |
| } | |
| // Agent dot | |
| if (upToStep > 0) { | |
| const [ar, ac] = getCurrentPos(modelIdx, upToStep); | |
| const cx = ac * cell + cell/2; | |
| const cy = ar * cell + cell/2; | |
| const r = cell * agentSize; | |
| ctx.shadowColor = color; | |
| ctx.shadowBlur = 10; | |
| ctx.fillStyle = color; | |
| ctx.beginPath(); | |
| ctx.arc(cx, cy, r, 0, Math.PI * 2); | |
| ctx.fill(); | |
| ctx.shadowBlur = 0; | |
| // Current step indicator (backtrack/hall) | |
| if (steps.length > 0) { | |
| const lastStep = steps[steps.length - 1]; | |
| if (lastStep.is_backtrack) { | |
| ctx.strokeStyle = '#fb923c'; | |
| ctx.lineWidth = 2; | |
| ctx.beginPath(); | |
| ctx.arc(cx, cy, r + 2, 0, Math.PI * 2); | |
| ctx.stroke(); | |
| } | |
| } | |
| } | |
| } | |
| function hexToRgba(hex, alpha) { | |
| const r = parseInt(hex.slice(1,3), 16); | |
| const g = parseInt(hex.slice(3,5), 16); | |
| const b = parseInt(hex.slice(5,7), 16); | |
| return `rgba(${r},${g},${b},${alpha})`; | |
| } | |
| // ── BUILD GRID VIEW ── | |
| function buildGridView() { | |
| const container = document.getElementById('grid-row'); | |
| container.innerHTML = ''; | |
| gridCanvases.length = 0; | |
| gridCtxs.length = 0; | |
| MODELS.forEach((m, i) => { | |
| const color = MODEL_COLORS[i]; | |
| const card = document.createElement('div'); | |
| card.className = 'model-card'; | |
| card.id = `card-${i}`; | |
| card.style.setProperty('--card-color', color); | |
| card.innerHTML = ` | |
| <div class="card-header"> | |
| <div class="card-dot" style="background:${color}"></div> | |
| <div class="card-name">${m.model}</div> | |
| <div class="card-badge badge-waiting" id="badge-${i}">WAITING</div> | |
| </div> | |
| <div class="card-canvas-wrap" id="canvas-wrap-${i}"> | |
| <canvas id="maze-${i}" class="maze-canvas"></canvas> | |
| </div> | |
| <div class="card-stats"> | |
| <div class="stat-cell"> | |
| <span class="stat-val" id="steps-${i}" style="color:${color}">0</span> | |
| <span class="stat-lbl">Steps</span> | |
| </div> | |
| <div class="stat-cell"> | |
| <span class="stat-val" id="halls-${i}" style="color:var(--red)">0</span> | |
| <span class="stat-lbl">Halls</span> | |
| </div> | |
| <div class="stat-cell"> | |
| <span class="stat-val" id="bt-${i}" style="color:var(--orange)">0</span> | |
| <span class="stat-lbl">BT</span> | |
| </div> | |
| </div> | |
| <div class="card-mei"> | |
| <div class="mei-track"> | |
| <div class="mei-fill" id="mei-fill-${i}" style="background:${color};width:0%"></div> | |
| </div> | |
| </div> | |
| `; | |
| container.appendChild(card); | |
| const canvas = document.getElementById(`maze-${i}`); | |
| gridCanvases.push(canvas); | |
| gridCtxs.push(canvas.getContext('2d')); | |
| }); | |
| resizeGridCanvases(); | |
| } | |
| function resizeGridCanvases() { | |
| MODELS.forEach((m, i) => { | |
| const wrap = document.getElementById(`canvas-wrap-${i}`); | |
| if (!wrap) return; | |
| const w = wrap.clientWidth - 12; | |
| const h = wrap.clientHeight - 12; | |
| const s = Math.min(w, h, 240); | |
| gridCanvases[i].width = s; | |
| gridCanvases[i].height = s; | |
| }); | |
| } | |
| // ── BUILD OVERLAY VIEW ── | |
| function buildOverlayView() { | |
| const wrap = document.getElementById('overlay-canvas-wrap'); | |
| const canvas = document.getElementById('overlay-canvas'); | |
| const legend = document.getElementById('overlay-legend'); | |
| const mainArea = document.getElementById('main'); | |
| const avail = Math.min(mainArea.clientHeight - 60, 420); | |
| const size = Math.max(avail, 260); | |
| canvas.width = size; | |
| canvas.height = size; | |
| legend.innerHTML = ''; | |
| MODELS.forEach((m, i) => { | |
| const row = document.createElement('div'); | |
| row.className = 'legend-row'; | |
| row.id = `legend-${i}`; | |
| row.style.setProperty('--row-color', MODEL_COLORS[i]); | |
| row.innerHTML = ` | |
| <div class="legend-color" style="background:${MODEL_COLORS[i]}"></div> | |
| <div class="legend-name">${m.model}</div> | |
| <div class="legend-steps" id="legend-steps-${i}">0 steps</div> | |
| `; | |
| legend.appendChild(row); | |
| }); | |
| } | |
| function drawOverlay(step) { | |
| const canvas = document.getElementById('overlay-canvas'); | |
| if (!canvas) return; | |
| const ctx = canvas.getContext('2d'); | |
| const size = canvas.width; | |
| const cell = size / N; | |
| // Background + walls (same as single maze) | |
| ctx.fillStyle = '#0e1118'; | |
| ctx.fillRect(0, 0, size, size); | |
| // Solution hint | |
| ctx.fillStyle = 'rgba(255,255,255,0.025)'; | |
| for (const [sr, sc] of SOLUTION_PATH) { | |
| ctx.fillRect(sc*cell+0.5, sr*cell+0.5, cell-1, cell-1); | |
| } | |
| // All trails first | |
| MODELS.forEach((m, i) => { | |
| const steps = getStepsUpTo(i, step); | |
| const color = MODEL_COLORS[i]; | |
| steps.forEach((s, idx) => { | |
| const alpha = 0.1 + 0.2 * (idx / Math.max(steps.length, 1)); | |
| ctx.fillStyle = hexToRgba(color, alpha); | |
| const pad = cell * 0.08; | |
| ctx.fillRect(s.c*cell+pad, s.r*cell+pad, cell-2*pad, cell-2*pad); | |
| }); | |
| }); | |
| // Walls | |
| ctx.strokeStyle = '#2a3048'; | |
| ctx.lineWidth = 1.5; | |
| for (let r = 0; r < N; r++) { | |
| for (let c = 0; c < N; c++) { | |
| const w = WALLS[r][c]; | |
| const x = c*cell, y = r*cell; | |
| ['N','S','W','E'].forEach(dir => { | |
| if (w[dir]) { | |
| ctx.beginPath(); | |
| if (dir==='N') { ctx.moveTo(x,y); ctx.lineTo(x+cell,y); } | |
| if (dir==='S') { ctx.moveTo(x,y+cell); ctx.lineTo(x+cell,y+cell); } | |
| if (dir==='W') { ctx.moveTo(x,y); ctx.lineTo(x,y+cell); } | |
| if (dir==='E') { ctx.moveTo(x+cell,y); ctx.lineTo(x+cell,y+cell); } | |
| ctx.stroke(); | |
| } | |
| }); | |
| } | |
| } | |
| ctx.strokeStyle = '#3a4060'; | |
| ctx.lineWidth = 2; | |
| ctx.strokeRect(1, 1, size-2, size-2); | |
| // Start / End | |
| ctx.fillStyle = '#34d399'; | |
| ctx.font = `bold ${Math.floor(cell*0.4)}px sans-serif`; | |
| ctx.textAlign = 'center'; ctx.textBaseline = 'middle'; | |
| ctx.fillText('S', cell*0.5, cell*0.5); | |
| ctx.fillStyle = '#f87171'; | |
| ctx.fillText('G', END[1]*cell+cell/2, END[0]*cell+cell/2); | |
| // Agent dots with initials | |
| MODELS.forEach((m, i) => { | |
| if (step === 0) return; | |
| const [ar, ac] = getCurrentPos(i, step); | |
| const cx = ac*cell + cell/2; | |
| const cy = ar*cell + cell/2; | |
| const rad = cell * 0.32; | |
| const color = MODEL_COLORS[i]; | |
| ctx.shadowColor = color; | |
| ctx.shadowBlur = 8; | |
| ctx.fillStyle = hexToRgba(color, 0.9); | |
| ctx.beginPath(); | |
| ctx.arc(cx, cy, rad, 0, Math.PI*2); | |
| ctx.fill(); | |
| ctx.shadowBlur = 0; | |
| // Initials | |
| ctx.fillStyle = '#080a10'; | |
| ctx.font = `bold ${Math.floor(rad)}px DM Mono, monospace`; | |
| ctx.textAlign = 'center'; ctx.textBaseline = 'middle'; | |
| ctx.fillText(m.model[0], cx, cy); | |
| }); | |
| // Update legend | |
| MODELS.forEach((m, i) => { | |
| const el = document.getElementById(`legend-steps-${i}`); | |
| if (el) el.textContent = `${Math.min(step, m.steps.length)} / ${m.steps.length}`; | |
| const row = document.getElementById(`legend-${i}`); | |
| if (row) row.classList.toggle('active-step', false); | |
| }); | |
| } | |
| // ── BUILD TIMELINE VIEW ── | |
| function buildTimelineView() { | |
| const container = document.getElementById('view-timeline'); | |
| container.innerHTML = ''; | |
| MODELS.forEach((m, i) => { | |
| const color = MODEL_COLORS[i]; | |
| const row = document.createElement('div'); | |
| row.className = 'timeline-row'; | |
| const segW = 100 / m.steps.length; | |
| let segsHTML = ''; | |
| m.steps.forEach((s, idx) => { | |
| let bg = color; | |
| let opacity = 0.7; | |
| if (s.is_hallucination) { bg = '#f87171'; opacity = 1; } | |
| else if (s.is_backtrack) { bg = '#fb923c'; opacity = 0.85; } | |
| else if (s.is_loop) { bg = '#fbbf24'; opacity = 0.7; } | |
| segsHTML += `<div class="tl-segment" style="left:${idx*segW}%;width:${segW}%;background:${bg};opacity:${opacity}" title="Step ${idx+1}: ${s.is_hallucination?'HALL':s.is_backtrack?'BT':s.is_loop?'LOOP':'OK'}"></div>`; | |
| }); | |
| const halls = m.steps.filter(s => s.is_hallucination).length; | |
| const bts = m.steps.filter(s => s.is_backtrack).length; | |
| row.innerHTML = ` | |
| <div class="tl-name" style="color:${color}">${m.model}</div> | |
| <div class="tl-bar-wrap" id="tl-bar-${i}" onclick="seekTimeline(${i}, event)"> | |
| ${segsHTML} | |
| <div class="tl-playhead" id="tl-ph-${i}" style="left:0%"></div> | |
| </div> | |
| <div class="tl-stats"> | |
| <div class="tl-stat"><span id="tl-step-${i}">0</span>/${m.steps.length}</div> | |
| <div class="tl-stat">H:<span style="color:var(--red)">${halls}</span></div> | |
| <div class="tl-stat">BT:<span style="color:var(--orange)">${bts}</span></div> | |
| </div> | |
| `; | |
| container.appendChild(row); | |
| }); | |
| } | |
| function seekTimeline(modelIdx, event) { | |
| const bar = event.currentTarget; | |
| const rect = bar.getBoundingClientRect(); | |
| const pct = (event.clientX - rect.left) / rect.width; | |
| const m = MODELS[modelIdx]; | |
| currentStep = Math.round(pct * m.steps.length); | |
| updateAll(); | |
| } | |
| // ── UPDATE ALL ── | |
| function updateAll() { | |
| const step = currentStep; | |
| const maxS = MAX_STEPS; | |
| // Progress bar | |
| document.getElementById('progress-fill').style.width = `${(step / maxS) * 100}%`; | |
| document.getElementById('step-counter').textContent = `Step ${step} / ${maxS}`; | |
| if (currentView === 'grid') updateGrid(step); | |
| if (currentView === 'overlay') drawOverlay(step); | |
| if (currentView === 'timeline') updateTimeline(step); | |
| updateLeaderboard(step); | |
| } | |
| function updateGrid(step) { | |
| MODELS.forEach((m, i) => { | |
| if (!gridCanvases[i]) return; | |
| const size = gridCanvases[i].width; | |
| if (size < 10) return; | |
| drawMaze(gridCtxs[i], size, i, step); | |
| const status = getModelStatus(i, step); | |
| const badge = document.getElementById(`badge-${i}`); | |
| const card = document.getElementById(`card-${i}`); | |
| const stepsNow = Math.min(step, m.steps.length); | |
| if (badge) { | |
| badge.className = 'card-badge'; | |
| if (status === 'solved') { badge.classList.add('badge-solved'); badge.textContent = 'SOLVED'; } | |
| else if (status === 'stuck') { badge.classList.add('badge-stuck'); badge.textContent = 'STUCK'; } | |
| else if (status === 'waiting') { badge.classList.add('badge-waiting'); badge.textContent = 'WAIT'; } | |
| else { badge.classList.add('badge-solving'); badge.textContent = 'SOLVING'; } | |
| } | |
| if (card) { | |
| card.classList.toggle('solved-glow', status === 'solved'); | |
| // flash active on current step | |
| if (step > 0 && step <= m.steps.length) { | |
| card.classList.add('active-step'); | |
| setTimeout(() => card.classList.remove('active-step'), 200); | |
| } | |
| } | |
| const hallsSoFar = m.steps.slice(0, stepsNow).filter(s => s.is_hallucination).length; | |
| const btSoFar = m.steps.slice(0, stepsNow).filter(s => s.is_backtrack).length; | |
| const el_s = document.getElementById(`steps-${i}`); | |
| const el_h = document.getElementById(`halls-${i}`); | |
| const el_b = document.getElementById(`bt-${i}`); | |
| if (el_s) el_s.textContent = stepsNow; | |
| if (el_h) el_h.textContent = hallsSoFar; | |
| if (el_b) el_b.textContent = btSoFar; | |
| const mei = meiAtStep(i, step); | |
| const mf = document.getElementById(`mei-fill-${i}`); | |
| if (mf) mf.style.width = `${mei * 100}%`; | |
| }); | |
| } | |
| function updateTimeline(step) { | |
| MODELS.forEach((m, i) => { | |
| const stepsNow = Math.min(step, m.steps.length); | |
| const pct = m.steps.length > 0 ? (stepsNow / m.steps.length) * 100 : 0; | |
| const ph = document.getElementById(`tl-ph-${i}`); | |
| if (ph) ph.style.left = `${pct}%`; | |
| const sc = document.getElementById(`tl-step-${i}`); | |
| if (sc) sc.textContent = stepsNow; | |
| }); | |
| } | |
| // ── LEADERBOARD ── | |
| function updateLeaderboard(step) { | |
| const tbody = document.getElementById('lb-body'); | |
| if (!tbody) return; | |
| const rows = MODELS.map((m, i) => { | |
| const stepsNow = Math.min(step, m.steps.length); | |
| const status = getModelStatus(i, step); | |
| const mei = meiAtStep(i, step); | |
| const halls = m.steps.slice(0, stepsNow).filter(s => s.is_hallucination).length; | |
| const bts = m.steps.slice(0, stepsNow).filter(s => s.is_backtrack).length; | |
| return { i, m, stepsNow, status, mei, halls, bts }; | |
| }); | |
| // Sort: solved first (by fewer steps), then by MEI desc | |
| rows.sort((a, b) => { | |
| if (a.status === 'solved' && b.status !== 'solved') return -1; | |
| if (b.status === 'solved' && a.status !== 'solved') return 1; | |
| if (a.status === 'solved' && b.status === 'solved') return a.stepsNow - b.stepsNow; | |
| return b.mei - a.mei; | |
| }); | |
| tbody.innerHTML = rows.map((row, rank) => { | |
| const color = MODEL_COLORS[row.i]; | |
| const statusText = row.status === 'solved' ? '★ DONE' : | |
| row.status === 'stuck' ? 'STUCK' : | |
| row.status === 'waiting'? '--' : `${row.stepsNow}/${row.m.steps.length}`; | |
| const statusColor = row.status === 'solved' ? '#ffd700' : | |
| row.status === 'stuck' ? '#f87171' : ''; | |
| const meiPct = (row.mei * 100).toFixed(0); | |
| return `<tr> | |
| <td class="lb-rank">${rank+1}</td> | |
| <td class="lb-model"><span class="lb-dot" style="background:${color}"></span>${row.m.model}</td> | |
| <td>${row.stepsNow}</td> | |
| <td style="color:var(--red)">${row.halls}</td> | |
| <td style="color:var(--orange)">${row.bts}</td> | |
| <td> | |
| <span style="margin-right:4px;font-size:10px">${meiPct}%</span> | |
| <span class="lb-mei-bar" style="width:${meiPct}px;background:${color}"></span> | |
| </td> | |
| <td style="color:${statusColor};font-size:10px">${statusText}</td> | |
| </tr>`; | |
| }).join(''); | |
| } | |
| // ── PLAYBACK ── | |
| function getSpeed() { | |
| const v = parseInt(document.getElementById('speed-slider').value); | |
| // speed 1=slow(800ms), 10=fast(80ms) | |
| return Math.round(800 - (v - 1) * 80); | |
| } | |
| function togglePlay() { | |
| playing = !playing; | |
| const btn = document.getElementById('play-btn'); | |
| if (playing) { | |
| btn.textContent = '⏸ Pause'; | |
| scheduleNext(); | |
| } else { | |
| btn.textContent = '▶ Play'; | |
| if (playInterval) clearTimeout(playInterval); | |
| } | |
| } | |
| function scheduleNext() { | |
| if (!playing) return; | |
| playInterval = setTimeout(() => { | |
| if (currentStep < MAX_STEPS) { | |
| currentStep++; | |
| updateAll(); | |
| scheduleNext(); | |
| } else { | |
| playing = false; | |
| document.getElementById('play-btn').textContent = '▶ Play'; | |
| } | |
| }, getSpeed()); | |
| } | |
| function stepForward() { | |
| if (currentStep < MAX_STEPS) { currentStep++; updateAll(); } | |
| } | |
| function stepBackward() { | |
| if (currentStep > 0) { currentStep--; updateAll(); } | |
| } | |
| function seekProgress(event) { | |
| const track = document.getElementById('progress-track'); | |
| const rect = track.getBoundingClientRect(); | |
| const pct = (event.clientX - rect.left) / rect.width; | |
| currentStep = Math.round(pct * MAX_STEPS); | |
| updateAll(); | |
| } | |
| // ── VIEW SWITCHING ── | |
| function setView(v) { | |
| currentView = v; | |
| ['grid','overlay','timeline'].forEach(name => { | |
| document.getElementById(`view-${name}`).style.display = name === v ? 'flex' : 'none'; | |
| document.getElementById(`btn-${name}`).classList.toggle('active', name === v); | |
| }); | |
| if (v === 'overlay') buildOverlayView(); | |
| updateAll(); | |
| } | |
| // ── KEYBOARD ── | |
| document.addEventListener('keydown', e => { | |
| if (e.code === 'Space') { e.preventDefault(); togglePlay(); } | |
| if (e.code === 'ArrowRight') { e.preventDefault(); stepForward(); } | |
| if (e.code === 'ArrowLeft') { e.preventDefault(); stepBackward(); } | |
| }); | |
| // ── RESIZE ── | |
| let resizeTimer; | |
| window.addEventListener('resize', () => { | |
| clearTimeout(resizeTimer); | |
| resizeTimer = setTimeout(() => { | |
| resizeGridCanvases(); | |
| if (currentView === 'overlay') buildOverlayView(); | |
| updateAll(); | |
| }, 150); | |
| }); | |
| // ── METRICS PANEL ── | |
| const OPTIMAL_STEPS = SOLUTION_PATH.length; | |
| function buildMetricsPanel() { | |
| const meiRows = document.getElementById('metric-mei-rows'); | |
| const effRows = document.getElementById('metric-eff-rows'); | |
| const errRows = document.getElementById('metric-err-rows'); | |
| // Pre-compute values for sorting — aggregate stats (n=60) take priority | |
| const modelData = MODELS.map((m, i) => { | |
| const agg = AGGREGATE_STATS[m.model] || {}; | |
| const mei = agg.mei !== undefined ? agg.mei : m.mei; | |
| const hrr = agg.hrr !== undefined ? agg.hrr : m.hrr; | |
| const hasData = mei !== null && mei !== undefined; | |
| const meiVal = hasData ? mei : 0; | |
| const hrrVal = hrr !== null && hrr !== undefined ? hrr : | |
| (m.solved ? Math.min(1, OPTIMAL_STEPS / m.steps.length) : 0); | |
| const hcVal = agg.avg_hc !== undefined ? agg.avg_hc : | |
| (m.avg_hc !== null && m.avg_hc !== undefined) ? m.avg_hc : m.hallucination_count; | |
| const btVal = agg.avg_bt !== undefined ? agg.avg_bt : (m.backtrack_count || 0); | |
| const errTotal = hcVal + btVal; | |
| const n_trials = agg.n || m.n_trials; | |
| return { i, m, agg, hasData, meiVal, hrrVal, hcVal, btVal, errTotal, n_trials }; | |
| }); | |
| // Sort indices descending by MEI | |
| const meiOrder = [...modelData].sort((a, b) => b.meiVal - a.meiVal); | |
| // Sort indices descending by HRR | |
| const hrrOrder = [...modelData].sort((a, b) => b.hrrVal - a.hrrVal); | |
| // Sort indices descending by error total (highest errors first = worst) | |
| const errOrder = [...modelData].sort((a, b) => b.errTotal - a.errTotal); | |
| const maxErr = Math.max(...modelData.map(d => d.errTotal)) || 1; | |
| meiOrder.forEach(d => { | |
| const { i, m, hasData, meiVal, n_trials } = d; | |
| const color = MODEL_COLORS[i % MODEL_COLORS.length]; | |
| const shortName = getShortName(m.model); | |
| const nLabel = n_trials ? ` n=${n_trials}` : ''; | |
| const errLabel = m.error ? ' !' : ''; | |
| meiRows.insertAdjacentHTML('beforeend', ` | |
| <div class="metric-row"> | |
| <div class="metric-model-name" style="color:${color}">${shortName}${errLabel}</div> | |
| <div class="metric-bar-track"> | |
| <div class="metric-bar-fill" id="mei-bar-${i}" style="background:${color};width:${meiVal*100}%;opacity:${hasData?1:0.3}"></div> | |
| </div> | |
| <div class="metric-val" id="mei-val-${i}">${hasData ? meiVal.toFixed(3) : 'N/A'}${nLabel}</div> | |
| </div>`); | |
| }); | |
| hrrOrder.forEach(d => { | |
| const { i, m, hasData, hrrVal } = d; | |
| const color = MODEL_COLORS[i % MODEL_COLORS.length]; | |
| const shortName = getShortName(m.model); | |
| const hrrLabel = (m.hrr !== null && m.hrr !== undefined) ? 'HRR' : 'Eff'; | |
| effRows.insertAdjacentHTML('beforeend', ` | |
| <div class="metric-row"> | |
| <div class="metric-model-name" style="color:${color}">${shortName}</div> | |
| <div class="metric-bar-track"> | |
| <div class="metric-bar-fill" id="eff-bar-${i}" style="background:${color};width:${hrrVal*100}%;opacity:${hasData?1:0.3}"></div> | |
| </div> | |
| <div class="metric-val" id="eff-val-${i}">${hrrLabel}:${(hrrVal*100).toFixed(0)}%</div> | |
| </div>`); | |
| }); | |
| errOrder.forEach(d => { | |
| const { i, m, hasData, hcVal } = d; | |
| const color = MODEL_COLORS[i % MODEL_COLORS.length]; | |
| const shortName = getShortName(m.model); | |
| const hallPct = (hcVal / maxErr * 100).toFixed(1); | |
| const btPct = ((m.backtrack_count||0) / maxErr * 100).toFixed(1); | |
| const hcDisplay = m.error ? 'Error' : `H${hcVal.toFixed(1)}/B${d.btVal.toFixed(0)}`; | |
| errRows.insertAdjacentHTML('beforeend', ` | |
| <div class="metric-row"> | |
| <div class="metric-model-name" style="color:${color}">${shortName}</div> | |
| <div class="metric-bar-track" style="position:relative"> | |
| <div style="position:absolute;left:0;top:0;height:100%;width:${hallPct}%;background:#f87171;border-radius:4px 0 0 4px;opacity:${hasData?1:0.3}"></div> | |
| <div style="position:absolute;left:${hallPct}%;top:0;height:100%;width:${btPct}%;background:#fb923c;opacity:${hasData?1:0.3}"></div> | |
| </div> | |
| <div class="metric-val">${hcDisplay}</div> | |
| </div>`); | |
| }); | |
| drawRadar(); | |
| } | |
| let selectedRadarModel = -1; | |
| function buildRadarLegend() { | |
| const legend = document.getElementById('radar-legend'); | |
| if (!legend) return; | |
| legend.textContent = ''; | |
| MODELS.forEach((m, i) => { | |
| const color = MODEL_COLORS[i % MODEL_COLORS.length]; | |
| const shortName = getShortName(m.model); | |
| const item = document.createElement('div'); | |
| item.className = 'radar-leg-item' + | |
| (selectedRadarModel >= 0 && selectedRadarModel !== i ? ' dimmed' : '') + | |
| (selectedRadarModel === i ? ' selected' : ''); | |
| const swatch = document.createElement('span'); | |
| swatch.className = 'radar-leg-swatch'; | |
| swatch.style.background = color; | |
| const name = document.createElement('span'); | |
| name.className = 'radar-leg-name'; | |
| name.textContent = shortName; | |
| item.appendChild(swatch); | |
| item.appendChild(name); | |
| item.addEventListener('click', () => { | |
| selectedRadarModel = selectedRadarModel === i ? -1 : i; | |
| drawRadar(); | |
| }); | |
| legend.appendChild(item); | |
| }); | |
| } | |
| function drawRadar() { | |
| const canvas = document.getElementById('radar-canvas'); | |
| if (!canvas) return; | |
| const ctx = canvas.getContext('2d'); | |
| const W = canvas.width, H = canvas.height; | |
| const cx = W/2, cy = H/2; | |
| const R = Math.min(W,H)/2 - 24; | |
| const axes = ['MEI','Recovery(HRR)','Confidence','Solve Rate','No-Error']; | |
| const nAxes = axes.length; | |
| ctx.clearRect(0,0,W,H); | |
| // Background grid | |
| for (let ring = 1; ring <= 4; ring++) { | |
| ctx.beginPath(); | |
| for (let a = 0; a < nAxes; a++) { | |
| const angle = (a/nAxes)*Math.PI*2 - Math.PI/2; | |
| const r = R * ring/4; | |
| const x = cx + Math.cos(angle)*r; | |
| const y = cy + Math.sin(angle)*r; | |
| a===0 ? ctx.moveTo(x,y) : ctx.lineTo(x,y); | |
| } | |
| ctx.closePath(); | |
| ctx.strokeStyle = 'rgba(255,255,255,0.07)'; | |
| ctx.lineWidth = 1; | |
| ctx.stroke(); | |
| } | |
| // Axis lines + labels | |
| axes.forEach((label, a) => { | |
| const angle = (a/nAxes)*Math.PI*2 - Math.PI/2; | |
| const x = cx + Math.cos(angle)*R; | |
| const y = cy + Math.sin(angle)*R; | |
| ctx.beginPath(); | |
| ctx.moveTo(cx, cy); | |
| ctx.lineTo(x, y); | |
| ctx.strokeStyle = 'rgba(255,255,255,0.15)'; | |
| ctx.lineWidth = 1; | |
| ctx.stroke(); | |
| // Label | |
| const lx = cx + Math.cos(angle)*(R+14); | |
| const ly = cy + Math.sin(angle)*(R+14); | |
| ctx.fillStyle = 'rgba(255,255,255,0.4)'; | |
| ctx.font = '8px "DM Mono", monospace'; | |
| ctx.textAlign = 'center'; | |
| ctx.textBaseline = 'middle'; | |
| ctx.fillText(label, lx, ly); | |
| }); | |
| // Model polygons | |
| const maxBT = Math.max(...MODELS.map(m => m.backtrack_count)) || 1; | |
| const maxHall = Math.max(...MODELS.map(m => { | |
| const agg = AGGREGATE_STATS[m.model] || {}; | |
| return agg.avg_hc !== undefined ? agg.avg_hc : m.hallucination_count; | |
| })) || 1; | |
| const hasSel = selectedRadarModel >= 0; | |
| MODELS.forEach((m, i) => { | |
| const agg = AGGREGATE_STATS[m.model] || {}; | |
| const color = MODEL_COLORS[i % MODEL_COLORS.length]; | |
| const mei = agg.mei !== undefined ? agg.mei : m.mei; | |
| const hasData = mei !== null && mei !== undefined; | |
| const isSelected = selectedRadarModel === i; | |
| const isDimmed = hasSel && !isSelected; | |
| // Use aggregate stats (n=60) — fall back to single-run | |
| const meiScore = hasData ? mei : 0; | |
| const hrrScore = agg.hrr !== undefined ? agg.hrr : | |
| (m.hrr !== null && m.hrr !== undefined) ? m.hrr : | |
| (m.solved ? Math.min(1, OPTIMAL_STEPS / m.steps.length) : 0); | |
| const srScore = agg.sr !== undefined ? agg.sr : | |
| (m.sr_agg !== null && m.sr_agg !== undefined) ? m.sr_agg : (m.solved ? 1 : 0); | |
| const hcVal = agg.avg_hc !== undefined ? agg.avg_hc : | |
| (m.avg_hc !== null && m.avg_hc !== undefined) ? m.avg_hc : m.hallucination_count; | |
| const noErrScore = hasData ? Math.max(0, 1 - (hcVal / (maxHall+1))) : 0; | |
| // Compute avg confidence from steps (for display only) | |
| const confSteps = m.steps.filter(s => s.confidence !== null && s.confidence !== undefined); | |
| const avgConf = confSteps.length ? confSteps.reduce((s,x)=>s+x.confidence,0)/confSteps.length/100 : 0.5; | |
| const scores = [ | |
| meiScore, // MEI | |
| hrrScore, // Recovery Rate (HRR) | |
| avgConf, // Avg Confidence | |
| srScore, // Solve Rate (aggregate) | |
| noErrScore // No-Error rate | |
| ]; | |
| ctx.beginPath(); | |
| scores.forEach((score, a) => { | |
| const angle = (a/nAxes)*Math.PI*2 - Math.PI/2; | |
| const r = R * score; | |
| const x = cx + Math.cos(angle)*r; | |
| const y = cy + Math.sin(angle)*r; | |
| a===0 ? ctx.moveTo(x,y) : ctx.lineTo(x,y); | |
| }); | |
| ctx.closePath(); | |
| if (isDimmed) { | |
| ctx.fillStyle = color + '08'; | |
| ctx.fill(); | |
| ctx.strokeStyle = color + '30'; | |
| ctx.lineWidth = 1; | |
| } else if (isSelected) { | |
| ctx.fillStyle = color + '44'; | |
| ctx.fill(); | |
| ctx.shadowColor = color; | |
| ctx.shadowBlur = 10; | |
| ctx.strokeStyle = color; | |
| ctx.lineWidth = 3; | |
| } else { | |
| ctx.fillStyle = color + '22'; | |
| ctx.fill(); | |
| ctx.strokeStyle = color + 'aa'; | |
| ctx.lineWidth = 1.5; | |
| } | |
| ctx.stroke(); | |
| ctx.shadowColor = 'transparent'; | |
| ctx.shadowBlur = 0; | |
| }); | |
| // Update clickable legend below canvas | |
| buildRadarLegend(); | |
| } | |
| // ── INIT ── | |
| buildGridView(); | |
| buildTimelineView(); | |
| buildMetricsPanel(); | |
| updateAll(); | |
| // Auto-size after layout — double RAF to ensure min-height CSS is computed | |
| requestAnimationFrame(() => requestAnimationFrame(() => { | |
| resizeGridCanvases(); | |
| updateAll(); | |
| })); | |
| </script> | |
| </body> | |
| </html> |