Spaces:

EXD-AI
/

inference-simulator-v2

Running

App Files Files Community

inference-simulator-v2 / index.html

EXDai

Upload index.html with huggingface_hub

1f8babb verified 10 days ago

Raw

History Blame Contribute Delete

42.4 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>LLM Inference — Pipeline Simulator</title>
	<style>
	/* Shared dark-theme palette; every colour in the stylesheet is one of these variables. */
	:root {
	/* Page background; also used for inset panels such as the swimlane and token area */
	--bg: #0d1117;
	/* Background of the main .sim card and other raised surfaces */
	--surface: #161b22;
	/* Default border / divider colour */
	--border: #30363d;
	/* Primary text colour */
	--text: #c9d1d9;
	/* Secondary, de-emphasised text (subtitles, labels, hints) */
	--dim: #8b949e;
	/* Accent for the active tab, primary buttons and step dots */
	--accent: #58a6ff;
	/* Per-resource colours for the CPU / RAM / GPU / VRAM panels */
	--cpu: #f0883e;
	--ram: #3fb950;
	--gpu: #bc8cff;
	--vram: #f778ba;
	/* Phase colours: prefill and decode */
	--prefill: #da3633;
	--decode: #d29922;
	/* Highlight for the currently active pipeline stage and status keywords */
	--active: #58a6ff;
	}
	* { margin: 0; padding: 0; box-sizing: border-box; }
	body {
	background: var(--bg);
	color: var(--text);
	font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', monospace;
	display: flex; justify-content: center; align-items: center;
	min-height: 100vh; padding: 2rem;
	}
	.sim {
	max-width: 820px; width: 100%;
	background: var(--surface);
	border: 1px solid var(--border);
	border-radius: 12px; padding: 2rem;
	}
	h1 { font-size: 1.25rem; font-weight: 600; margin-bottom: 0.25rem; color: var(--text); }
	.subtitle { font-size: 0.8rem; color: var(--dim); margin-bottom: 1.5rem; }

	/* Tabs */
	.tabs {
	display: flex; gap: 0; margin-bottom: 1.5rem;
	border-bottom: 2px solid var(--border);
	}
	.tab-btn {
	padding: 0.5rem 1.25rem; border: none; border-bottom: 2px solid transparent;
	background: transparent; color: var(--dim); font-size: 0.85rem;
	cursor: pointer; font-family: inherit; margin-bottom: -2px;
	transition: all 0.2s;
	}
	.tab-btn:hover { color: var(--text); }
	.tab-btn.active {
	color: var(--accent); border-bottom-color: var(--accent);
	background: transparent;
	}
	.tab-panel { display: none; }
	.tab-panel.visible { display: block; }

	/* ---- Concurrency Tab Styles ---- */
	.conc-header {
	display: flex; justify-content: space-between; align-items: flex-start;
	margin-bottom: 1rem; gap: 1rem;
	}
	.conc-meters {
	display: flex; gap: 1.5rem; flex-shrink: 0;
	}
	.conc-meter {
	text-align: center;
	}
	.conc-meter-label {
	font-size: 0.6rem; text-transform: uppercase;
	letter-spacing: 0.05em; color: var(--dim); margin-bottom: 0.2rem;
	}
	.conc-meter-bar {
	width: 60px; height: 8px; background: var(--bg);
	border: 1px solid var(--border); border-radius: 4px;
	overflow: hidden; margin-bottom: 0.15rem;
	}
	.conc-meter-fill {
	height: 100%; border-radius: 3px; transition: width 0.4s ease;
	}
	.conc-meter-fill.gpu-util { background: var(--gpu); }
	.conc-meter-fill.vram-usage { background: var(--vram); }
	.conc-meter-val {
	font-size: 0.7rem; font-weight: 600; font-family: monospace;
	}
	.conc-legend {
	display: flex; gap: 1rem; flex-wrap: wrap; font-size: 0.65rem;
	color: var(--dim);
	}
	.conc-legend span { display: flex; align-items: center; gap: 4px; }
	.conc-legend-dot {
	width: 10px; height: 10px; border-radius: 2px; flex-shrink: 0;
	}
	.conc-legend-dot.prefill-dot { background: var(--prefill); }
	.conc-legend-dot.decode-dot { background: var(--decode); }
	.conc-legend-dot.idle-dot { background: var(--border); }
	.conc-legend-dot.wait-dot { background: var(--dim); }

	/* Swimlane */
	.swimlane {
	background: var(--bg); border: 1px solid var(--border);
	border-radius: 8px; overflow: hidden; margin-bottom: 1rem;
	}
	.swimlane-row {
	display: flex; align-items: center;
	border-bottom: 1px solid var(--border);
	min-height: 36px;
	}
	.swimlane-row:last-child { border-bottom: none; }
	.swimlane-label {
	width: 100px; flex-shrink: 0; padding: 0.5rem 0.75rem;
	font-size: 0.7rem; font-weight: 600;
	border-right: 1px solid var(--border);
	background: var(--surface);
	}
	.swimlane-label .user-icon { margin-right: 4px; }
	.swimlane-track {
	flex: 1; display: flex; position: relative;
	height: 100%; min-height: 36px;
	}
	.swimlane-block {
	position: absolute; top: 3px; bottom: 3px;
	border-radius: 4px; display: flex; align-items: center;
	justify-content: center; font-size: 0.6rem; font-weight: 600;
	transition: all 0.3s ease; overflow: hidden;
	white-space: nowrap; text-overflow: ellipsis;
	}
	.swimlane-block.prefill-block {
	background: rgba(218,54,51,0.25);
	border: 1px solid var(--prefill);
	color: #f09090;
	}
	.swimlane-block.decode-block {
	background: rgba(210,153,34,0.2);
	border: 1px solid var(--decode);
	color: #e0c060;
	}
	.swimlane-block.wait-block {
	background: rgba(139,148,158,0.1);
	border: 1px dashed var(--dim);
	color: var(--dim);
	}
	.swimlane-block.input-block {
	background: rgba(240,136,62,0.15);
	border: 1px solid var(--cpu);
	color: var(--cpu);
	}

	/* GPU queue */
	.gpu-queue {
	background: var(--bg); border: 1px solid var(--border);
	border-radius: 8px; padding: 0.75rem 1rem; margin-bottom: 1rem;
	display: flex; align-items: center; gap: 0.75rem;
	}
	.gpu-queue-label {
	font-size: 0.65rem; text-transform: uppercase;
	letter-spacing: 0.05em; color: var(--dim); white-space: nowrap;
	}
	.gpu-queue-slots {
	display: flex; gap: 6px; flex-wrap: wrap;
	}
	.gpu-slot {
	padding: 3px 10px; border-radius: 4px;
	font-size: 0.65rem; border: 1px solid var(--border);
	background: var(--surface);
	}
	.gpu-slot.busy-prefill {
	border-color: var(--prefill); background: rgba(218,54,51,0.15);
	color: var(--prefill);
	}
	.gpu-slot.busy-decode {
	border-color: var(--decode); background: rgba(210,153,34,0.15);
	color: var(--decode);
	}

	.conc-status {
	font-size: 0.75rem; color: var(--dim); margin-bottom: 1rem;
	text-align: center; min-height: 1.2em; line-height: 1.5;
	}
	.conc-status .highlight { color: var(--active); }

	/* Pipeline */
	.pipeline {
	display: flex; align-items: center; justify-content: center;
	gap: 0; margin-bottom: 1.5rem; flex-wrap: wrap;
	}
	.stage {
	padding: 0.6rem 1rem; border-radius: 8px;
	border: 2px solid var(--border); background: var(--bg);
	font-size: 0.85rem; font-weight: 600;
	text-align: center; min-width: 80px;
	transition: all 0.3s ease;
	position: relative; z-index: 2;
	}
	.stage.active {
	border-color: var(--active);
	background: #1a2332;
	box-shadow: 0 0 16px rgba(88,166,255,0.25);
	color: var(--active);
	}
	.arrow {
	width: 32px; height: 2px; background: var(--border);
	margin: 0 4px; flex-shrink: 0; position: relative; z-index: 1;
	}
	.arrow::after {
	content: '▸'; position: absolute; right: -6px; top: -8px;
	font-size: 0.7rem; color: var(--border);
	}

	/* Decode loop indicator */
	.decode-loop {
	display: flex; align-items: center; gap: 6px;
	margin-top: 0.3rem; justify-content: center;
	font-size: 0.7rem; color: var(--dim);
	}
	.loop-arrow {
	width: 20px; height: 2px; background: var(--dim);
	}
	.loop-count {
	padding: 2px 8px; border: 1px solid var(--border);
	border-radius: 4px; font-size: 0.7rem;
	}
	.loop-count.active {
	border-color: var(--decode); color: var(--decode);
	background: rgba(210,153,34,0.1);
	}

	/* Resources */
	.resources {
	display: grid; grid-template-columns: repeat(4, 1fr);
	gap: 1rem; margin-bottom: 1.5rem;
	}
	.res {
	border: 2px solid var(--border); border-radius: 8px;
	padding: 0.6rem 0.4rem; background: var(--bg); text-align: center;
	transition: all 0.3s ease;
	}
	.res-label {
	font-size: 0.7rem; text-transform: uppercase;
	letter-spacing: 0.05em; margin-bottom: 0.15rem;
	}
	.res-status { font-size: 0.65rem; color: var(--dim); }
	.res.cpu .res-label { color: var(--cpu); }
	.res.ram .res-label { color: var(--ram); }
	.res.gpu .res-label { color: var(--gpu); }
	.res.vram .res-label { color: var(--vram); }
	.res.active.cpu { border-color: var(--cpu); box-shadow: 0 0 10px rgba(240,136,62,0.25); }
	.res.active.ram { border-color: var(--ram); box-shadow: 0 0 10px rgba(63,185,80,0.25); }
	.res.active.gpu { border-color: var(--gpu); box-shadow: 0 0 10px rgba(188,140,255,0.25); }
	.res.active.vram { border-color: var(--vram); box-shadow: 0 0 10px rgba(247,120,186,0.25); }
	.res.active .res-status { color: inherit; }

	/* Token display */
	.token-area {
	background: var(--bg); border: 1px solid var(--border);
	border-radius: 8px; padding: 1rem; margin-bottom: 1.5rem;
	min-height: 80px;
	}
	.token-label {
	font-size: 0.65rem; text-transform: uppercase;
	letter-spacing: 0.05em; color: var(--dim); margin-bottom: 0.5rem;
	}
	.token-row {
	display: flex; gap: 6px; flex-wrap: wrap; align-items: center;
	}
	.token {
	padding: 4px 10px; border-radius: 6px; font-size: 0.8rem;
	border: 1px solid var(--border); background: var(--surface);
	font-family: monospace; transition: all 0.3s ease;
	}
	.token.prefill { border-color: var(--prefill); background: rgba(218,54,51,0.1); }
	.token.decode { border-color: var(--decode); background: rgba(210,153,34,0.1); }
	.token.new { border-color: var(--active); background: rgba(88,166,255,0.1); animation: pulse 0.6s ease; }
	.token.cached { border-color: var(--ram); opacity: 0.6; font-size: 0.7rem; }
	@keyframes pulse {
	0%,100% { transform: scale(1); }
	50% { transform: scale(1.08); }
	}

	/* KV cache */
	.kvcache {
	background: var(--bg); border: 1px solid var(--border);
	border-radius: 8px; padding: 0.75rem 1rem; margin-bottom: 1.5rem;
	display: flex; align-items: center; gap: 0.75rem;
	}
	.kvcache-label {
	font-size: 0.65rem; text-transform: uppercase;
	letter-spacing: 0.05em; color: var(--dim); white-space: nowrap;
	}
	.kvcache-entries {
	display: flex; gap: 4px; flex-wrap: wrap;
	}
	.kv-cell {
	width: 14px; height: 14px; border-radius: 3px;
	border: 1px solid var(--border); background: var(--surface);
	transition: all 0.3s ease;
	}
	.kv-cell.filled { background: var(--vram); border-color: var(--vram); }

	/* Status */
	.status {
	font-size: 0.8rem; color: var(--text); margin-bottom: 1rem;
	text-align: center; min-height: 1.2em;
	}
	.status .highlight { color: var(--active); }

	/* Controls */
	.controls {
	display: flex; gap: 0.75rem; justify-content: center;
	}
	button {
	padding: 0.5rem 1.25rem; border-radius: 6px; border: 1px solid var(--border);
	background: var(--bg); color: var(--text); font-size: 0.8rem;
	cursor: pointer; font-family: inherit; transition: all 0.2s;
	}
	button:hover { border-color: var(--dim); background: #1c2128; }
	button.primary {
	border-color: var(--accent); color: var(--accent);
	background: rgba(88,166,255,0.1);
	}
	button.primary:hover { background: rgba(88,166,255,0.2); }
	button:disabled { opacity: 0.4; cursor: not-allowed; }

	/* Step indicator dots */
	.steps {
	display: flex; gap: 6px; justify-content: center; margin-bottom: 1rem;
	}
	.step-dot {
	width: 8px; height: 8px; border-radius: 50%;
	background: var(--border); transition: background 0.3s;
	}
	.step-dot.done { background: var(--accent); }
	.step-dot.current { background: var(--accent); box-shadow: 0 0 6px var(--accent); }
	</style>
	</head>
	<body>
	<div class="sim">
	<h1>LLM Inference — Interactive Simulator</h1>

	<!-- Tabs -->
	<div class="tabs">
	<button class="tab-btn active" onclick="switchTab('pipeline')">🔬 Pipeline View</button>
	<button class="tab-btn" onclick="switchTab('concurrency')">👥 Multi-User Concurrency</button>
	</div>

	<!-- ===== TAB 1: Pipeline View ===== -->
	<div class="tab-panel visible" id="tab-pipeline">
	<p class="subtitle">Prompt: <strong>"The cat sat"</strong> — watch how it flows through the system</p>

	<!-- Step dots -->
	<div class="steps" id="stepDots">
	<div class="step-dot"></div>
	<div class="step-dot"></div>
	<div class="step-dot"></div>
	<div class="step-dot"></div>
	<div class="step-dot"></div>
	</div>

	<!-- Pipeline stages -->
	<div class="pipeline">
	<div class="stage" id="s_input">📥 Input</div>
	<div class="arrow"></div>
	<div class="stage" id="s_tokenize">🔤 Tokenize</div>
	<div class="arrow"></div>
	<div class="stage" id="s_prefill">⚡ Prefill</div>
	<div class="arrow"></div>
	<div class="stage" id="s_decode">🔄 Decode</div>
	<div class="arrow"></div>
	<div class="stage" id="s_output">📤 Output</div>
	</div>
	<div class="decode-loop" id="loopInfo">
	<span class="loop-arrow"></span>
	<span>loop</span>
	<span class="loop-count" id="loopCount">0 / 3</span>
	</div>

	<!-- Resources -->
	<div class="resources">
	<div class="res cpu" id="resCpu">
	<div class="res-label">CPU</div>
	<div class="res-status">idle</div>
	</div>
	<div class="res ram" id="resRam">
	<div class="res-label">RAM</div>
	<div class="res-status">idle</div>
	</div>
	<div class="res gpu" id="resGpu">
	<div class="res-label">GPU Compute</div>
	<div class="res-status">idle</div>
	</div>
	<div class="res vram" id="resVram">
	<div class="res-label">VRAM / KV Cache</div>
	<div class="res-status">idle</div>
	</div>
	</div>

	<!-- Tokens -->
	<div class="token-area">
	<div class="token-label">Tokens</div>
	<div class="token-row" id="tokenRow">
	<span style="color:var(--dim); font-size:0.8rem;">Waiting for input...</span>
	</div>
	</div>

	<!-- KV Cache -->
	<div class="kvcache">
	<span class="kvcache-label">KV Cache entries:</span>
	<span class="kvcache-entries" id="kvEntries">
	<span style="color:var(--dim); font-size:0.75rem;">empty</span>
	</span>
	</div>

	<!-- Status -->
	<div class="status" id="status">Click <span class="highlight">Auto</span> to run the full pipeline, or use <span class="highlight">Forward</span> / <span class="highlight">Back</span> to walk through one stage at a time.</div>

	<!-- Controls -->
	<div class="controls">
	<button class="primary" id="btnAuto" onclick="toggleAuto()">Auto ▶</button>
	<button id="btnBack" onclick="back()">◀ Back</button>
	<button id="btnStep" onclick="step()">Forward ▶</button>
	<button id="btnReset" onclick="reset()">↺ Reset</button>
	</div>
	</div><!-- end tab-pipeline -->

	<!-- ===== TAB 2: Multi-User Concurrency ===== -->
	<div class="tab-panel" id="tab-concurrency">
	<p class="subtitle">4 concurrent users sharing one GPU — watch batching, queuing & KV cache pressure</p>

	<!-- Meters -->
	<div class="conc-header">
	<div class="conc-meters">
	<div class="conc-meter">
	<div class="conc-meter-label">GPU Util</div>
	<div class="conc-meter-bar"><div class="conc-meter-fill gpu-util" id="gpuUtilBar" style="width:0%"></div></div>
	<div class="conc-meter-val" id="gpuUtilVal">0%</div>
	</div>
	<div class="conc-meter">
	<div class="conc-meter-label">VRAM / KV Cache</div>
	<div class="conc-meter-bar"><div class="conc-meter-fill vram-usage" id="vramBar" style="width:0%"></div></div>
	<div class="conc-meter-val" id="vramVal">0 GB</div>
	</div>
	</div>
	<div class="conc-legend">
	<span><span class="conc-legend-dot prefill-dot"></span> Prefill (compute)</span>
	<span><span class="conc-legend-dot decode-dot"></span> Decode (memory)</span>
	<span><span class="conc-legend-dot input-dot" style="background:var(--cpu)"></span> Tokenize</span>
	<span><span class="conc-legend-dot wait-dot"></span> Queued</span>
	</div>
	</div>

	<!-- Swimlane -->
	<div class="swimlane" id="swimlane">
	</div>

	<!-- GPU Queue -->
	<div class="gpu-queue">
	<span class="gpu-queue-label">GPU Scheduler:</span>
	<span class="gpu-queue-slots" id="gpuSlots">
	<span class="gpu-slot" style="color:var(--dim)">idle</span>
	</span>
	</div>

	<!-- Status -->
	<div class="conc-status" id="concStatus">
	Click <span class="highlight">Start</span> to watch 4 users stream through the pipeline concurrently.
	</div>

	<!-- Controls -->
	<div class="controls">
	<button class="primary" id="concBtnAuto" onclick="concToggleAuto()">▶ Start</button>
	<button id="concBtnBack" onclick="concBack()">◀ Back</button>
	<button id="concBtnStep" onclick="concStep()">Forward ▶</button>
	<button id="concBtnReset" onclick="concReset()">↺ Reset</button>
	</div>
	</div><!-- end tab-concurrency -->

	</div>

	<script>
	// --- State ---
	// Prompt tokens shown in the Pipeline tab: `id` is the token ID that is displayed,
	// `text` is the piece of prompt text it stands for (leading spaces are part of the token).
	const TOKENS = [
	{ id: 576, text: 'The' },
	{ id: 3797, text: ' cat' },
	{ id: 7236, text: ' sat' },
	];
	// Tokens the simulated model generates, one per decode step, in order.
	const OUTPUT = [
	{ id: 389, text: ' on' },
	{ id: 278, text: ' the' },
	{ id: 3098, text: ' mat' },
	];

	// Index of the last step: 0:idle, 1:input, 2:tokenize, 3:prefill, 4-6:decode steps, 7:detokenize/output
	const TOTAL_STEPS = 7;
	// Index of the step currently displayed (0 = idle).
	let currentStep = 0;
	// Handle of the pending auto-play timeout, or null when auto-play is not running.
	let autoTimer = null;
	// NOTE(review): only ever reset to 0 in the visible code and never read — possibly a leftover; confirm before removing.
	let decodeIdx = 0;

	// --- DOM refs ---
	// Element handles for the Pipeline tab, looked up once and keyed by element id.
	const els = (() => {
	  const ids = [
	    's_input', 's_tokenize', 's_prefill', 's_decode', 's_output',
	    'resCpu', 'resRam', 'resGpu', 'resVram',
	    'tokenRow', 'kvEntries', 'status',
	    'loopCount', 'loopInfo',
	    'btnAuto', 'btnStep', 'btnBack',
	  ];
	  const refs = {};
	  for (const id of ids) {
	    refs[id] = document.getElementById(id);
	  }
	  // stepDots stores the container's child elements (the dots), not the container itself.
	  refs.stepDots = document.getElementById('stepDots').children;
	  return refs;
	})();

	// Remove the `active` class from every cached element in `els`.
	// Entries that are missing or expose no classList (e.g. the stepDots collection) are skipped.
	function clearActive() {
	  for (const node of Object.values(els)) {
	    if (!node || !node.classList) continue;
	    node.classList.remove('active');
	  }
	}

	// Highlight the resource panels listed in `names` and mark all others idle.
	// `names` is an array containing any of 'cpu', 'ram', 'gpu', 'vram'.
	function setActiveResources(names) {
	  const panels = [
	    ['cpu', els.resCpu],
	    ['ram', els.resRam],
	    ['gpu', els.resGpu],
	    ['vram', els.resVram],
	  ];
	  panels.forEach(([key, panel]) => {
	    const isOn = names.includes(key);
	    if (isOn) {
	      panel.classList.add('active');
	    } else {
	      panel.classList.remove('active');
	    }
	    // The status line inside each panel mirrors the highlight state.
	    panel.querySelector('.res-status').textContent = isOn ? 'active' : 'idle';
	  });
	}

	// Thin setters that replace the markup of the three Pipeline-tab output areas.
	// Each argument is an HTML string and is assigned to innerHTML as-is.

	// Replace the status line below the KV cache row.
	function setStatus(msg) {
	  els.status.innerHTML = msg;
	}

	// Replace the contents of the token row.
	function setTokens(html) {
	  els.tokenRow.innerHTML = html;
	}

	// Replace the contents of the KV cache entries strip.
	function setKV(html) {
	  els.kvEntries.innerHTML = html;
	}

	// Update the step-indicator dots: dots before index `n` are marked `done`,
	// the dot at index `n` is marked `current`, and all later dots are left plain.
	function setDots(n) {
	  Array.from(els.stepDots).forEach((dot, idx) => {
	    dot.classList.remove('done', 'current');
	    if (idx === n) {
	      dot.classList.add('current');
	    } else if (idx < n) {
	      dot.classList.add('done');
	    }
	  });
	}

	// --- Step implementations ---
	// Step 0 — idle view: nothing highlighted, placeholder text in the token and
	// KV areas, the usage hint in the status line, and the loop counter back at zero.
	function doIdle() {
	  const dimSpan = (fontSize, text) =>
	    `<span style="color:var(--dim); font-size:${fontSize};">${text}</span>`;
	  const { loopInfo, loopCount } = els;

	  clearActive();
	  setActiveResources([]);
	  setTokens(dimSpan('0.8rem', 'Waiting for input...'));
	  setKV(dimSpan('0.75rem', 'empty'));
	  setStatus('Click <span class="highlight">Auto</span> to run the full pipeline, or use <span class="highlight">Forward</span> / <span class="highlight">Back</span> to walk through one stage at a time.');

	  // The decode-loop indicator is dimmed whenever the decode stage is not active.
	  loopInfo.style.opacity = '0.3';
	  loopCount.textContent = `0 / ${OUTPUT.length}`;
	  loopCount.classList.remove('active');
	  setDots(0);
	}

	// Step 1 — Input: highlight the Input stage and show the raw prompt text.
	// The status text states that CPU and RAM hold the raw string, so both panels
	// are activated (previously only CPU was, leaving RAM labelled "idle").
	function doInput() {
	  clearActive();
	  els.s_input.classList.add('active');
	  setActiveResources(['cpu', 'ram']);
	  setTokens('<span style="color:var(--text); font-size:1rem; font-style:italic;">"The cat sat"</span>');
	  setKV('<span style="color:var(--dim); font-size:0.75rem;">empty</span>');
	  setStatus('📥 <span class="highlight">Input</span> — user submits the prompt text.<br><span style="color:var(--dim); font-size:0.75rem;">CPU & RAM hold the raw string. GPU is idle.</span>');
	  // Decode loop indicator stays dimmed outside the decode stage.
	  els.loopInfo.style.opacity = '0.3';
	  setDots(1);
	}

	// Step 2 — Tokenize: highlight the Tokenize stage and show one chip per prompt token ID.
	// The prompt text and ID list in the status line are derived from TOKENS so the
	// explanation cannot drift from the chips rendered above it.
	function doTokenize() {
	  clearActive();
	  els.s_tokenize.classList.add('active');
	  setActiveResources(['cpu', 'ram']);

	  const chips = TOKENS
	    .map((t) => `<span class="token" style="border-color:var(--cpu);">${t.id}</span>`)
	    .join('');
	  setTokens(chips + '<span style="color:var(--dim); margin-left:6px; font-size:0.7rem;">← token IDs</span>');
	  setKV('<span style="color:var(--dim); font-size:0.75rem;">empty</span>');

	  const promptText = TOKENS.map((t) => t.text).join('');
	  const idList = TOKENS.map((t) => t.id).join(', ');
	  setStatus(`🔤 <span class="highlight">Tokenize</span> — CPU converts text to token IDs: "${promptText}" → [${idList}]<br><span style="color:var(--dim); font-size:0.75rem;">This is a fast CPU-bound step. Tokenizer runs on CPU.</span>`);

	  // Decode loop indicator stays dimmed outside the decode stage.
	  els.loopInfo.style.opacity = '0.3';
	  setDots(2);
	}

	// Step 3 — Prefill: highlight the Prefill stage, show every prompt token next to the
	// first generated token, and fill one KV cache cell per prompt token.
	// Counts and the first-token text come from TOKENS / OUTPUT rather than literals,
	// so the labels always agree with the cells and chips actually rendered.
	function doPrefill() {
	  clearActive();
	  els.s_prefill.classList.add('active');
	  setActiveResources(['gpu', 'vram']);

	  const promptCount = TOKENS.length;
	  const firstOut = OUTPUT[0];

	  // Prompt tokens, an arrow, then the first output token.
	  const promptChips = TOKENS
	    .map((t) => `<span class="token prefill">${t.id}<br><small>${t.text}</small></span>`)
	    .join('');
	  setTokens(
	    promptChips +
	    '<span style="font-size:1.5rem; margin:0 4px;">→</span>' +
	    `<span class="token new">${firstOut.id}<br><small>${firstOut.text}</small></span>`
	  );

	  // KV cache built: one filled cell per prompt token.
	  const kvCells = TOKENS
	    .map((_, i) => `<div class="kv-cell filled" title="K${i + 1} V${i + 1}"></div>`)
	    .join('');
	  setKV(`${kvCells}<span style="color:var(--dim); margin-left:4px; font-size:0.7rem;">${promptCount} entries built</span>`);

	  setStatus(`⚡ <span class="highlight">Prefill</span> — all ${promptCount} tokens run through the model <em>simultaneously</em>.<br><span style="color:var(--dim); font-size:0.75rem;">GPU compute spikes. KV cache is built (stored in VRAM). First output token appears: "${firstOut.text}".<br><strong>Compute-bound:</strong> GPU is doing heavy matrix math.</span>`);

	  // Loop indicator stays dimmed, but the counter already shows the first token.
	  els.loopInfo.style.opacity = '0.3';
	  els.loopCount.textContent = `1 / ${OUTPUT.length}`;
	  setDots(3);
	}

	// Decode step for OUTPUT[idx] (0-based): highlight the Decode stage, show every
	// earlier token as cached plus the token being generated, draw one KV cell per
	// earlier token, and light up the loop counter.
	function doDecodeStep(idx) {
	  const current = OUTPUT[idx];
	  const stepNo = idx + 1;
	  const cachedChip = (t) => `<span class="token cached">${t.id}</span>`;

	  clearActive();
	  els.s_decode.classList.add('active');

	  // GPU and VRAM stay highlighted for the whole decode phase.
	  setActiveResources(['gpu', 'vram']);

	  // Token row: prompt tokens and already-generated tokens are shown as cached,
	  // followed by the new token and a hint that the KV cache is being read.
	  const cachedChips = TOKENS.concat(OUTPUT.slice(0, idx)).map(cachedChip).join('');
	  setTokens(
	    cachedChips +
	    `<span class="token new">${current.id}<br><small>${current.text}</small></span>` +
	    '<span style="color:var(--dim); margin-left:8px; font-size:0.7rem;">← reads KV cache</span>'
	  );

	  // KV cache strip: one filled cell per prompt token plus one per earlier output token.
	  const totalKV = TOKENS.length + idx;
	  const kvCells = Array.from(
	    { length: totalKV },
	    (_, i) => `<div class="kv-cell filled" title="K${i + 1} V${i + 1}"></div>`
	  ).join('');
	  setKV(`${kvCells}<span style="color:var(--dim); margin-left:4px; font-size:0.7rem;">${totalKV} entries — growing each step</span>`);

	  setStatus(`🔄 <span class="highlight">Decode step ${stepNo}/${OUTPUT.length}</span> — generating token: <span style="color:var(--active);">"${current.text.trim()}"</span><br><span style="color:var(--dim); font-size:0.75rem;">Only the NEW token needs fresh GPU compute. All previous tokens reuse their KV cache entries.<br><strong>Memory-bound:</strong> GPU spends most time reading the growing KV cache from VRAM.</span>`);

	  // Un-dim the loop indicator and colour it with the decode colour.
	  const { loopInfo, loopCount } = els;
	  loopInfo.style.opacity = '1';
	  loopCount.textContent = `${stepNo} / ${OUTPUT.length}`;
	  loopCount.classList.add('active');
	  loopInfo.querySelector('.loop-arrow').style.background = 'var(--decode)';
	  setDots(3 + idx);
	}

	// Final step — Detokenize & Output: highlight the Output stage, show every token
	// as cached next to the reassembled sentence, and draw the full KV cache.
	// The entry count and the final sentence are computed from TOKENS and OUTPUT
	// (previously the literals "6" and "The cat sat on the mat"), so they stay in
	// step with the cells and chips rendered by the same function.
	function doDetokenize() {
	  clearActive();
	  els.s_output.classList.add('active');
	  setActiveResources(['cpu']);

	  const allTokens = TOKENS.concat(OUTPUT);
	  const finalText = allTokens.map((t) => t.text).join('');

	  // Show all tokens, then the detokenized text.
	  const chips = allTokens.map((t) => `<span class="token cached">${t.id}</span>`).join('');
	  setTokens(
	    chips +
	    '<span style="font-size:1.5rem; margin:0 4px;">→</span>' +
	    `<span style="color:var(--ram); font-size:1rem; font-style:italic;">"${finalText}"</span>`
	  );

	  // Final KV cache: one cell per prompt and output token.
	  const kvCells = allTokens
	    .map((_, i) => `<div class="kv-cell filled" title="K${i + 1} V${i + 1}"></div>`)
	    .join('');
	  setKV(`${kvCells}<span style="color:var(--dim); margin-left:4px; font-size:0.7rem;">${allTokens.length} total entries (freed after request)</span>`);

	  setStatus(`📤 <span class="highlight">Detokenize & Output</span> — CPU converts token IDs back to text.<br><span style="color:var(--dim); font-size:0.75rem;">Final: "${finalText}" — KV cache is freed, GPU goes idle.</span>`);
	  // Decode loop indicator is dimmed again once decoding is over.
	  els.loopInfo.style.opacity = '0.3';
	  setDots(6);
	}

	// --- Step state machine ---
	// Advance the pipeline view by one step (Forward button).
	// Ignored while auto-play is running so manual and automatic stepping never mix.
	function step() {
	  if (autoTimer) return;

	  // Step order: 0 idle (only reached via reset), 1 input, 2 tokenize, 3 prefill,
	  // then one decode step per OUTPUT token, and finally detokenize.
	  const decodeActions = OUTPUT.map((_, i) => () => doDecodeStep(i));
	  const actions = [doIdle, doInput, doTokenize, doPrefill].concat(decodeActions, [doDetokenize]);
	  const lastIdx = actions.length - 1;

	  if (currentStep < lastIdx) {
	    currentStep += 1;
	    actions[currentStep]();
	  }

	  // Forward is disabled on the last step; Back is disabled only on the idle step.
	  if (currentStep >= lastIdx) {
	    els.btnStep.disabled = true;
	  }
	  els.btnBack.disabled = currentStep <= 0;
	}

	// Go back one step in the pipeline view (Back button).
	// Does nothing while auto-play is running or when already on the idle step.
	function back() {
	  if (autoTimer || currentStep <= 0) return;

	  // Same step table as the forward direction: idle, input, tokenize, prefill,
	  // one decode step per OUTPUT token, detokenize.
	  const decodeActions = OUTPUT.map((_, i) => () => doDecodeStep(i));
	  const actions = [doIdle, doInput, doTokenize, doPrefill].concat(decodeActions, [doDetokenize]);

	  currentStep -= 1;
	  actions[currentStep]();

	  // After moving back there is always a next step to go forward to.
	  els.btnStep.disabled = false;
	  els.btnBack.disabled = currentStep <= 0;
	}

	// Return the pipeline view to the idle step: stop auto-play, zero the step
	// counters, redraw the idle view, and re-arm the Forward / Back buttons.
	function reset() {
	  // stopAuto runs first, while currentStep still holds the pre-reset value.
	  stopAuto();
	  currentStep = 0;
	  decodeIdx = 0;
	  doIdle();

	  const { btnStep, btnBack } = els;
	  btnStep.disabled = false;
	  btnBack.disabled = true;
	}

	// --- Auto play ---
	// Start or stop automatic playback of the whole pipeline (Auto button).
	// If playback is already running it is stopped; otherwise the view is reset and
	// every step is shown in turn on a timer, with longer pauses on prefill and decode.
	function toggleAuto() {
	  if (autoTimer) { stopAuto(); return; }

	  reset();
	  els.btnAuto.textContent = '⏸ Stop';
	  els.btnAuto.classList.remove('primary');
	  // Manual stepping is locked out for the duration of auto-play.
	  els.btnStep.disabled = true;
	  els.btnBack.disabled = true;

	  const actions = [doIdle, doInput, doTokenize, doPrefill, ...OUTPUT.map((_, i) => () => doDecodeStep(i)), doDetokenize];
	  const PREFILL_STEP = 3;
	  const lastStep = actions.length - 1; // detokenize

	  let next = 0;
	  function tick() {
	    if (next >= actions.length) { stopAuto(); return; }
	    const shown = next;
	    actions[shown]();
	    currentStep = shown;
	    next++;

	    // Delay is chosen from the step that was just shown. Every decode step
	    // (everything between prefill and detokenize) gets the long pause; the
	    // earlier bound check excluded the last decode step by one.
	    let delay = 1200;
	    if (shown === PREFILL_STEP) {
	      delay = 1500; // prefill — let it sink in
	    } else if (shown > PREFILL_STEP && shown < lastStep) {
	      delay = 1800; // decode steps — slower
	    }

	    autoTimer = setTimeout(tick, delay);
	  }
	  // Small initial delay so viewer sees the fresh start
	  autoTimer = setTimeout(tick, 400);
	}

	// Stop auto-play (if running) and hand control back to the manual buttons.
	// The Auto button gets back the same label it has in the initial markup, and
	// Forward is re-enabled unless the view is already on the final step.
	function stopAuto() {
	  if (autoTimer) clearTimeout(autoTimer);
	  autoTimer = null;
	  els.btnAuto.textContent = 'Auto ▶';
	  els.btnAuto.classList.add('primary');
	  // Last step index: idle, input, tokenize, prefill (0-3), one decode step per
	  // OUTPUT token, then detokenize.
	  const lastStep = 4 + OUTPUT.length;
	  if (currentStep < lastStep) els.btnStep.disabled = false;
	  els.btnBack.disabled = (currentStep <= 0);
	}

	// --- Init ---
	// Draw the idle view once at load so the loop indicator, counter and step dots
	// start from the same state that reset() produces.
	doIdle();

	// ============================================================
	// CONCURRENCY TAB — Multi-User Timeline Simulator
	// ============================================================

	// Number of swimlane rows drawn (one per entry of USER_SCHEDULES).
	const NUM_USERS = 4;
	// Width of the timeline in ticks; block positions and widths are percentages of this.
	const TOTAL_TICKS = 17;

	// Each user: { name, icon, prompt, outputTokens, phases: [{ tick, type, label, tokenIdx? }] }
	// Phase types used below: 'tokenize', 'wait', 'prefill', 'decode', 'output'.
	// `tick` is the tick at which the phase starts; `tokenIdx` (decode only) indexes outputTokens.
	const USER_SCHEDULES = [
	{ // User A — arrives immediately, 6 output tokens
	name: 'A', icon: '🟦', prompt: '"The cat sat"',
	outputTokens: [' on', ' the', ' mat', ' by', ' the', ' fire'],
	phases: [
	{ tick: 0, type: 'tokenize', label: 'T' },
	{ tick: 1, type: 'prefill', label: 'PF' },
	{ tick: 3, type: 'decode', label: 'D1', tokenIdx: 0 },
	{ tick: 4, type: 'decode', label: 'D2', tokenIdx: 1 },
	{ tick: 5, type: 'decode', label: 'D3', tokenIdx: 2 },
	{ tick: 6, type: 'decode', label: 'D4', tokenIdx: 3 },
	{ tick: 7, type: 'decode', label: 'D5', tokenIdx: 4 },
	{ tick: 8, type: 'decode', label: 'D6', tokenIdx: 5 },
	{ tick: 9, type: 'output', label: 'O' },
	]
	},
	{ // User B — arrives at t=1, 5 output tokens
	name: 'B', icon: '🟩', prompt: '"What is the"',
	outputTokens: [' capital', ' of', ' France', ' today', '?'],
	phases: [
	{ tick: 1, type: 'tokenize', label: 'T' },
	{ tick: 2, type: 'wait', label: '⋯' },
	{ tick: 3, type: 'prefill', label: 'PF' },
	{ tick: 5, type: 'decode', label: 'D1', tokenIdx: 0 },
	{ tick: 6, type: 'decode', label: 'D2', tokenIdx: 1 },
	{ tick: 7, type: 'decode', label: 'D3', tokenIdx: 2 },
	{ tick: 8, type: 'decode', label: 'D4', tokenIdx: 3 },
	{ tick: 9, type: 'decode', label: 'D5', tokenIdx: 4 },
	{ tick: 10, type: 'output', label: 'O' },
	]
	},
	{ // User C — arrives at t=3, 5 output tokens
	name: 'C', icon: '🟨', prompt: '"Write a haiku"',
	outputTokens: [' about', ' spring', ' blossoms', ' falling', ' down'],
	phases: [
	{ tick: 3, type: 'tokenize', label: 'T' },
	{ tick: 4, type: 'wait', label: '⋯' },
	{ tick: 5, type: 'wait', label: '⋯' },
	{ tick: 6, type: 'prefill', label: 'PF' },
	{ tick: 8, type: 'decode', label: 'D1', tokenIdx: 0 },
	{ tick: 9, type: 'decode', label: 'D2', tokenIdx: 1 },
	{ tick: 10, type: 'decode', label: 'D3', tokenIdx: 2 },
	{ tick: 11, type: 'decode', label: 'D4', tokenIdx: 3 },
	{ tick: 12, type: 'decode', label: 'D5', tokenIdx: 4 },
	{ tick: 13, type: 'output', label: 'O' },
	]
	},
	{ // User D — arrives at t=5, 4 output tokens
	name: 'D', icon: '🟪', prompt: '"Explain quantum"',
	outputTokens: [' computing', ' in', ' simple', ' terms'],
	phases: [
	{ tick: 5, type: 'tokenize', label: 'T' },
	{ tick: 6, type: 'wait', label: '⋯' },
	{ tick: 7, type: 'wait', label: '⋯' },
	{ tick: 8, type: 'prefill', label: 'PF' },
	{ tick: 10, type: 'decode', label: 'D1', tokenIdx: 0 },
	{ tick: 11, type: 'decode', label: 'D2', tokenIdx: 1 },
	{ tick: 12, type: 'decode', label: 'D3', tokenIdx: 2 },
	{ tick: 13, type: 'decode', label: 'D4', tokenIdx: 3 },
	{ tick: 14, type: 'output', label: 'O' },
	]
	},
	];

	// Per-tick GPU state.
	// For the given tick, returns:
	//   prefilling — names of users whose current phase is 'prefill'
	//   decoding   — names of users whose current phase is 'decode'
	//   totalKV    — summed KV cache entries of every user whose prefill has started
	//                (prompt tokens plus one entry per decode phase started so far)
	// Note: a user's entries keep counting after their 'output' phase; nothing here frees them.
	function getGPUActivity(tick) {
	  const inputTokenCounts = { A: 3, B: 3, C: 3, D: 2 }; // rough token counts per prompt
	  const prefilling = [];
	  const decoding = [];
	  let totalKV = 0;

	  for (const user of USER_SCHEDULES) {
	    const phase = getUserPhaseAtTick(user, tick);
	    if (phase) {
	      if (phase.type === 'prefill') prefilling.push(user.name);
	      if (phase.type === 'decode') decoding.push(user.name);
	    }

	    // Users missing from the table fall back to 2 prompt tokens.
	    const inputTokens = inputTokenCounts[user.name] || 2;

	    // Phases that have started at or before this tick.
	    const started = user.phases.filter((p) => p.tick <= tick);
	    const decodeDone = started.filter((p) => p.type === 'decode').length;
	    // KV entries exist as soon as prefill has started.
	    const prefillStarted = started.some((p) => p.type === 'prefill');

	    if (prefillStarted || decodeDone > 0) {
	      totalKV += inputTokens + decodeDone;
	    }
	  }
	  return { prefilling, decoding, totalKV };
	}

	// Return the phase a user is in at `tick`: the last entry of user.phases whose
	// start tick is not after `tick` (a phase lasts until the next one starts).
	// Returns null when no phase has started yet.
	function getUserPhaseAtTick(user, tick) {
	  let current = null;
	  for (const phase of user.phases) {
	    if (phase.tick <= tick) {
	      current = phase;
	    }
	  }
	  return current;
	}

	// For rendering we need contiguous blocks, not just per-tick start points.
	// Turns each phase of `user` into a copy with inclusive `startTick` / `endTick`:
	//   - tokenize, decode, output and wait phases are exactly one tick long
	//     (adjacent waits are joined later by mergeBlocks);
	//   - prefill runs until the tick before the next non-wait phase;
	//   - any other type runs until the tick before the next phase;
	//   - a phase with nothing after it to bound it covers only its own tick.
	function buildDisplayBlocks(user) {
	  const SINGLE_TICK_TYPES = ['decode', 'tokenize', 'output', 'wait'];

	  return user.phases.map((phase, i) => {
	    const startTick = phase.tick;
	    const following = user.phases[i + 1];

	    // Default: last up to the tick before the next phase (or a single tick if last).
	    let endTick = following ? following.tick - 1 : startTick;

	    if (SINGLE_TICK_TYPES.includes(phase.type)) {
	      endTick = startTick;
	    } else if (phase.type === 'prefill') {
	      // Skip over wait phases; if only waits follow, the default above stands.
	      const nextReal = user.phases.slice(i + 1).find((p) => p.type !== 'wait');
	      if (nextReal) endTick = nextReal.tick - 1;
	    }

	    return { ...phase, startTick, endTick };
	  });
	}

	// Join runs of same-type blocks that touch (one ends on the tick right before the
	// next begins) into a single block. The joined block keeps the first block's other
	// fields and takes the last block's endTick. Input blocks are copied, not modified.
	function mergeBlocks(blocks) {
	  return blocks.reduce((merged, block) => {
	    const prev = merged[merged.length - 1];
	    const continuesPrev =
	      Boolean(prev) && prev.type === block.type && prev.endTick + 1 === block.startTick;
	    if (continuesPrev) {
	      prev.endTick = block.endTick;
	    } else {
	      merged.push({ ...block });
	    }
	    return merged;
	  }, []);
	}

	// Display blocks for every user, computed once at load time and indexed in the
	// same order as USER_SCHEDULES.
	const USER_BLOCKS = USER_SCHEDULES.map((user) => mergeBlocks(buildDisplayBlocks(user)));

	// --- Concurrency state ---
	// Tick currently shown in the Concurrency tab; starts at -1.
	// NOTE(review): -1 presumably means "not started yet" — confirm against the stepping code.
	let concTick = -1;
	// Timer handle for the Concurrency tab, initially null (presumably the auto-play timeout — verify at its use sites).
	let concAutoTimer = null;
	// Running flag for the Concurrency tab, initially false.
	let concRunning = false;

	// DOM refs for concurrency tab.
	// Looks the elements up on every call and returns them under short keys;
	// the status and button keys map to the `conc`-prefixed element ids.
	function getConcEls() {
	  const lookup = [
	    ['swimlane', 'swimlane'],
	    ['gpuSlots', 'gpuSlots'],
	    ['gpuUtilBar', 'gpuUtilBar'],
	    ['gpuUtilVal', 'gpuUtilVal'],
	    ['vramBar', 'vramBar'],
	    ['vramVal', 'vramVal'],
	    ['status', 'concStatus'],
	    ['btnAuto', 'concBtnAuto'],
	    ['btnBack', 'concBtnBack'],
	    ['btnStep', 'concBtnStep'],
	    ['btnReset', 'concBtnReset'],
	  ];
	  const refs = {};
	  for (const [key, id] of lookup) {
	    refs[key] = document.getElementById(id);
	  }
	  return refs;
	}

	/**
	 * Render the whole concurrency tab for one tick: the per-user swimlanes,
	 * the GPU utilization and VRAM bars, the GPU scheduler slots, the status
	 * text, and the enabled state of the Step/Back buttons.
	 *
	 * @param {number} tick - tick to display; a negative value renders the
	 *   "not started" prompt.
	 */
	function renderConcTick(tick) {
		const els = getConcEls();
		const gpu = getGPUActivity(tick);
		const hasPrefill = gpu.prefilling.length > 0;
		const hasDecode = gpu.decoding.length > 0;

		// --- Build swimlane HTML ---
		let html = '';
		for (let ui = 0; ui < NUM_USERS; ui++) {
			const user = USER_SCHEDULES[ui];
			const blocks = USER_BLOCKS[ui];

			html += '<div class="swimlane-row">';
			html += `<div class="swimlane-label"><span class="user-icon">${user.icon}</span> ${user.name}: ${user.prompt}</div>`;
			html += '<div class="swimlane-track" style="position:relative;">';

			// Draw blocks, positioned as percentages of the full timeline
			for (const block of blocks) {
				const leftPct = (block.startTick / TOTAL_TICKS) * 100;
				const widthPct = ((block.endTick - block.startTick + 1) / TOTAL_TICKS) * 100;

				let cls = '';
				if (block.type === 'prefill') cls = 'prefill-block';
				else if (block.type === 'decode') cls = 'decode-block';
				else if (block.type === 'wait') cls = 'wait-block';
				else if (block.type === 'tokenize' || block.type === 'output') cls = 'input-block';
				else continue; // block types without a visual style are not drawn

				// Active block is fully opaque and on top; past blocks are
				// dimmed, future blocks dimmed further.
				const isActive = tick >= block.startTick && tick <= block.endTick;
				const opacity = isActive ? '1' : (tick > block.endTick ? '0.5' : '0.35');
				const zIdx = isActive ? '3' : '1';

				// Label for decode: show which token
				let label = block.label || '';
				if (block.type === 'decode' && block.tokenIdx !== undefined) {
					label = `D${block.tokenIdx + 1}`;
				}
				if (block.type === 'prefill') label = 'PF';
				if (block.type === 'tokenize') label = 'T';
				if (block.type === 'output') label = 'O';
				if (block.type === 'wait') label = '';

				// Blocks narrower than 1.5% of the track are skipped
				if (widthPct > 1.5) {
					html += `<div class="swimlane-block ${cls}" style="left:${leftPct}%;width:${widthPct}%;opacity:${opacity};z-index:${zIdx};">${label}</div>`;
				}
			}

			// Tick cursor line if user is active (centered within the tick)
			const phase = getUserPhaseAtTick(user, tick);
			if (phase && phase.type !== 'done') {
				const cursorPct = ((tick + 0.5) / TOTAL_TICKS) * 100;
				html += `<div style="position:absolute;left:${cursorPct}%;top:0;bottom:0;width:2px;background:var(--active);z-index:10;opacity:0.8;"></div>`;
			}

			html += '</div></div>';
		}
		els.swimlane.innerHTML = html;

		// --- GPU utilization ---
		// Any prefill pins the bar at 90%; decode-only scales with batch size,
		// capped at 85%.
		let gpuPct = 0;
		if (hasPrefill) gpuPct = 90;
		else if (hasDecode) gpuPct = Math.min(30 + gpu.decoding.length * 15, 85);
		els.gpuUtilBar.style.width = gpuPct + '%';
		els.gpuUtilVal.textContent = gpuPct + '%';

		// --- VRAM usage (rough: 0.4 GB per KV entry; bar is full at 25 entries) ---
		const kvGB = (gpu.totalKV * 0.4).toFixed(1);
		const vramPct = Math.min(gpu.totalKV * 4, 100);
		els.vramBar.style.width = vramPct + '%';
		els.vramVal.textContent = kvGB + ' GB';

		// --- GPU scheduler slots ---
		// Prefill and decode can share a tick (the status text below says so),
		// so show the decode batch alongside any prefill slots instead of
		// dropping it.
		const slotParts = gpu.prefilling.map(u => `<span class="gpu-slot busy-prefill">⚡ ${u} prefill</span>`);
		if (hasDecode) {
			slotParts.push(`<span class="gpu-slot busy-decode">🔄 batch decode (${gpu.decoding.join(',')})</span>`);
		}
		if (slotParts.length === 0) {
			slotParts.push('<span class="gpu-slot" style="color:var(--dim)">idle</span>');
		}
		els.gpuSlots.innerHTML = slotParts.join('');

		// --- Status text ---
		let statusMsg = '';
		const tickLabel = tick >= 0 ? `Tick ${tick}` : '';
		if (tick < 0) {
			statusMsg = 'Click <span class="highlight">Start</span> to watch 4 users stream through the pipeline concurrently.';
		} else if (hasPrefill && hasDecode) {
			statusMsg = `<span class="highlight">${tickLabel}:</span> GPU <span style="color:var(--prefill);">prefills</span> for ${gpu.prefilling.join(', ')} <strong>while</strong> <span style="color:var(--decode);">batching decode</span> for ${gpu.decoding.join(', ')} — continuous batching in action!`;
		} else if (hasPrefill) {
			statusMsg = `<span class="highlight">${tickLabel}:</span> GPU is <span style="color:var(--prefill);">prefilling</span> for ${gpu.prefilling.join(', ')} — compute-bound, KV cache building.`;
		} else if (hasDecode) {
			statusMsg = `<span class="highlight">${tickLabel}:</span> GPU is <span style="color:var(--decode);">batching decode</span> for ${gpu.decoding.join(', ')} — memory-bound, reading KV cache.`;
		} else if (tick >= 15) {
			statusMsg = `<span class="highlight">${tickLabel}:</span> All requests complete. GPU idle. KV cache freed.`;
		} else {
			statusMsg = `<span class="highlight">${tickLabel}:</span> GPU idle — requests tokenizing or queued.`;
		}

		// Add throughput note at end: a user counts as completed once the
		// current tick has reached that user's last scheduled phase.
		const completed = USER_SCHEDULES.filter(u => {
			const last = u.phases[u.phases.length - 1];
			return tick >= last.tick;
		}).length;
		if (tick >= 14 && completed === 4) {
			statusMsg += '<br><span style="color:var(--ram);">✓ All 4 requests completed in 14 ticks. Serially would take ~36 ticks. <strong>~2.6× throughput</strong> via concurrency.</span>';
		}

		els.status.innerHTML = statusMsg;

		// --- Update buttons ---
		// While auto-play is running, manual stepping is ignored by
		// concStep/concBack, so keep both buttons disabled rather than
		// re-enabling them on every rendered frame.
		els.btnStep.disabled = concRunning || tick >= TOTAL_TICKS - 1;
		els.btnBack.disabled = concRunning || tick <= 0;
	}

	/**
	 * Advance the concurrency view by one tick and re-render.
	 * Does nothing while auto-play is scheduled or when already on the last tick.
	 */
	function concStep() {
		const atLastTick = concTick >= TOTAL_TICKS - 1;
		if (concAutoTimer || atLastTick) {
			return;
		}
		concTick += 1;
		renderConcTick(concTick);
	}

	/**
	 * Move the concurrency view back by one tick and re-render.
	 * Does nothing while auto-play is scheduled or when at tick 0 or earlier.
	 */
	function concBack() {
		const atFirstTick = concTick <= 0;
		if (concAutoTimer || atFirstTick) {
			return;
		}
		concTick -= 1;
		renderConcTick(concTick);
	}

	/**
	 * Return the concurrency tab to its pre-start state: stop auto-play,
	 * set the tick to -1, zero the GPU/VRAM readouts, restore the idle slot
	 * and the start prompt, reset the Step/Back buttons, then render tick -1.
	 */
	function concReset() {
		concStopAuto();
		concTick = -1;

		const {
			gpuUtilBar, gpuUtilVal, vramBar, vramVal,
			gpuSlots, status, btnStep, btnBack,
		} = getConcEls();

		// Readouts back to zero
		gpuUtilBar.style.width = '0%';
		vramBar.style.width = '0%';
		gpuUtilVal.textContent = '0%';
		vramVal.textContent = '0 GB';

		// Idle scheduler and the initial prompt
		gpuSlots.innerHTML = '<span class="gpu-slot" style="color:var(--dim)">idle</span>';
		status.innerHTML = 'Click <span class="highlight">Start</span> to watch 4 users stream through the pipeline concurrently.';

		// Nothing to step back to yet; stepping forward is allowed
		btnBack.disabled = true;
		btnStep.disabled = false;

		renderConcTick(-1);
	}

	/**
	 * Start/Stop handler for auto-play.
	 * If an auto-play step is already scheduled, stop it. Otherwise reset the
	 * view, switch the button to its "Stop" state, disable manual stepping and
	 * play every tick from 0 with a per-tick delay.
	 */
	function concToggleAuto() {
		// A click while auto-play is scheduled means "stop".
		if (concAutoTimer) {
			concStopAuto();
			return;
		}

		concReset();
		concTick = -1;

		const { btnAuto, btnStep, btnBack, btnReset } = getConcEls();
		btnAuto.textContent = '⏸ Stop';
		btnAuto.classList.remove('primary');
		btnStep.disabled = true;
		btnBack.disabled = true;
		btnReset.disabled = false;
		concRunning = true;

		// How long each tick stays on screen (ms), indexed by tick.
		const tickDelaysMs = [
			600, // t0: A tokenizes
			1000, // t1: A prefill (compute-heavy)
			1000, // t2: A prefill cont, B waits
			900, // t3: A decode1, B prefill, C arrives
			700, // t4: A decode2, B prefill cont
			700, // t5: A decode3, B decode1, D arrives
			900, // t6: A decode4, B decode2, C prefill
			800, // t7: A decode5, B decode3, C prefill cont
			1000, // t8: A decode6, B decode4, C decode1, D prefill (4 users active!)
			800, // t9: A output, B decode5, C decode2, D prefill cont
			900, // t10: B output, C decode3, D decode1
			700, // t11: C decode4, D decode2
			700, // t12: C decode5, D decode3
			800, // t13: C output, D decode4
			900, // t14: D output — all done
			1800, // t15-16: idle, show summary
		];

		let nextTick = 0;
		const advance = () => {
			// Past the last tick: auto-play is finished.
			if (nextTick > TOTAL_TICKS - 1) {
				concStopAuto();
				return;
			}
			const shownTick = nextTick;
			concTick = shownTick;
			renderConcTick(shownTick);
			nextTick += 1;
			// Ticks without an entry in the table fall back to 600 ms.
			const holdMs = tickDelaysMs[shownTick] || 600;
			concAutoTimer = setTimeout(advance, holdMs);
		};
		concAutoTimer = setTimeout(advance, 400);
	}

	/**
	 * Stop auto-play: cancel any pending step, clear the auto-play state and
	 * put the buttons back into their manual-stepping configuration.
	 */
	function concStopAuto() {
		if (concAutoTimer) {
			clearTimeout(concAutoTimer);
		}
		concAutoTimer = null;
		concRunning = false;

		const { btnAuto, btnStep, btnBack, btnReset } = getConcEls();
		btnAuto.textContent = '▶ Start';
		btnAuto.classList.add('primary');

		// Step is only re-enabled when there is a tick left to step to;
		// on the last tick its current state is left as is.
		const hasNextTick = concTick < TOTAL_TICKS - 1;
		if (hasNextTick) {
			btnStep.disabled = false;
		}
		btnBack.disabled = concTick <= 0;
		btnReset.disabled = false;
	}

	// --- Tab switching ---
	/**
	 * Show one of the two tabs and mark its button active.
	 *
	 * @param {string} tab - 'pipeline' selects the first tab; any other value
	 *   selects the concurrency tab.
	 *
	 * Tab buttons are addressed by position: the first `.tab-btn` belongs to
	 * the pipeline tab, the second to the concurrency tab.
	 */
	function switchTab(tab) {
		const tabButtons = document.querySelectorAll('.tab-btn');
		tabButtons.forEach(b => b.classList.remove('active'));
		document.querySelectorAll('.tab-panel').forEach(p => p.classList.remove('visible'));

		if (tab === 'pipeline') {
			// Leaving the concurrency tab: cancel its auto-play so the timer
			// does not keep re-rendering a hidden panel. Nothing is lost,
			// because returning to the tab resets it anyway.
			concStopAuto();
			tabButtons[0].classList.add('active');
			document.getElementById('tab-pipeline').classList.add('visible');
		} else {
			tabButtons[1].classList.add('active');
			document.getElementById('tab-concurrency').classList.add('visible');
			// Always enter the concurrency tab in its pre-start state.
			concReset();
		}
	}

	// Init concurrency tab: runs once when the script loads, putting the tab
	// in its pre-start state (tick -1) and rendering it.
	concReset();
	</script>
	</body>
	</html>