Spaces:

bobber
/

routangseng-chat

Running

App Files Files Community

routangseng-chat / index.html

bobber

WASM streaming: requestAnimationFrame for batched DOM updates

dba13e2 verified 26 days ago

raw

history blame contribute delete

22.8 kB

	<!DOCTYPE html>
	<html lang="zh">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>肉糖生 Chat · WebGPU</title>
	<style>
	* { margin: 0; padding: 0; box-sizing: border-box; }
	body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; background: #1a1a2e; color: #e0e0e0; height: 100vh; display: flex; flex-direction: column; }
	header { background: #16213e; padding: 12px 20px; display: flex; align-items: center; gap: 12px; border-bottom: 1px solid #0f3460; }
	header h1 { font-size: 18px; color: #e94560; }
	.badge { background: #0f3460; color: #53a8b6; padding: 3px 10px; border-radius: 12px; font-size: 12px; }
	#status { font-size: 12px; color: #888; margin-left: auto; }
	#chat { flex: 1; overflow-y: auto; padding: 16px; display: flex; flex-direction: column; gap: 10px; }
	.msg { padding: 10px 14px; border-radius: 12px; max-width: 85%; line-height: 1.5; white-space: pre-wrap; word-break: break-word; }
	.msg.system { background: #16213e; color: #53a8b6; align-self: center; font-size: 13px; text-align: center; max-width: 90%; }
	.msg.user { background: #0f3460; color: #e0e0e0; align-self: flex-end; }
	.msg.assistant { background: #1a1a2e; border: 1px solid #333; color: #e0e0e0; align-self: flex-start; }
	footer { background: #16213e; padding: 12px 16px; border-top: 1px solid #0f3460; display: flex; gap: 8px; }
	#input { flex: 1; background: #1a1a2e; border: 1px solid #333; color: #e0e0e0; padding: 10px 14px; border-radius: 8px; font-size: 14px; outline: none; resize: none; overflow-y: hidden; font-family: inherit; line-height: 1.5; max-height: 120px; }
	#input:focus { border-color: #e94560; }
	button { background: #e94560; color: white; border: none; padding: 10px 20px; border-radius: 8px; cursor: pointer; font-size: 14px; }
	button:disabled { opacity: 0.5; cursor: not-allowed; }
	button:hover:not(:disabled) { background: #c73e54; }
	#progress-bar { height: 3px; background: #e94560; width: 0%; transition: width 0.3s; position: fixed; top: 0; left: 0; z-index: 100; }
	.typing { color: #888; font-style: italic; }
	.typing .dots { animation: blink 1.4s infinite; }
	@keyframes blink { 0%,20% { opacity: 1; } 50% { opacity: 0.3; } 80%,100% { opacity: 1; } }
	</style>
	</head>
	<body>
	<div id="progress-bar"></div>
	<header>
	<h1>🥩 肉糖生</h1>
	<span class="badge" id="model-badge">肉糖生 · ONNX · WebGPU</span>
	<span id="status">Initializing...</span>
	</header>
	<div id="chat">
	<div class="msg system">正在加载模型，请稍候... (浏览器本地 WebGPU，首次加载需下载约 1GB 权重)</div>
	</div>
	<footer>
	<button id="clear" title="新对话" style="background:#0f3460;padding:10px 12px;">🗑</button>
	<textarea id="input" rows="1" placeholder="输入消息..." disabled></textarea>
	<button id="send" disabled>发送</button>
	</footer>

	<!-- CDN override: ALL model files served from GitHub Pages -->
	<script>
	(function() {
	// Root of the GitHub Pages mirror; the ONNX graphs and weight chunks live under /onnx.
	const GITHUB_BASE = 'https://bobbercheng.github.io/routangseng-models';
	const ONNX_BASE = `${GITHUB_BASE}/onnx`;

	// Version tag baked into the Cache Storage name (`onnx-${MODEL_VERSION}`).
	// Bump it whenever the weights change so previously cached chunks stop matching.
	const MODEL_VERSION = 'v3-2026-03-18';

	// How many chunk requests each chunked file keeps in flight at once.
	const PARALLEL_DOWNLOADS = 4;

	// External-data files that are published as numbered chunks:
	// `chunks` is the number of pieces, `totalMB` the size of the assembled file.
	const CHUNK_MANIFEST = {
	  'decoder_model_merged_quantized.onnx_data': { chunks: 15, totalMB: 721 },
	  'embed_tokens_quantized.onnx_data': { chunks: 5, totalMB: 243 },
	  'vision_encoder_quantized.onnx_data': { chunks: 2, totalMB: 97 },
	};

	// Allow-list of whole files mirrored on GitHub Pages
	// (key: requested filename, value: path relative to GITHUB_BASE).
	// Model-file requests whose name is in neither table are answered with a
	// synthetic 404 by the fetch override, so GitHub's HTML 404 page is never parsed.
	const GITHUB_FILES = {
	  'config.json': 'config.json',
	  'generation_config.json': 'generation_config.json',
	  'tokenizer.json': 'tokenizer.json',
	  'tokenizer_config.json': 'tokenizer_config.json',
	  'preprocessor_config.json': 'preprocessor_config.json',
	  'processor_config.json': 'processor_config.json',
	  'chat_template.jinja': 'chat_template.jinja',
	  'decoder_model_merged_quantized.onnx': 'onnx/decoder_model_merged_quantized.onnx',
	  'embed_tokens_quantized.onnx': 'onnx/embed_tokens_quantized.onnx',
	  'vision_encoder_quantized.onnx': 'onnx/vision_encoder_quantized.onnx',
	};

	// Download size shown in the progress text: the sum of every chunked file.
	const TOTAL_MODEL_MB = Object.values(CHUNK_MANIFEST)
	  .reduce((sum, entry) => sum + entry.totalMB, 0);

	// Running progress counters shared by the chunk downloader and the status line.
	let dlStartTime = 0;     // Date.now() of the first network download; 0 until one starts
	let globalCachedMB = 0;  // megabytes served from Cache Storage
	let globalLoadedMB = 0;  // megabytes fetched over the network

	/**
	 * Redraw the header status text and the top progress bar from the shared
	 * download counters (globalLoadedMB, globalCachedMB, dlStartTime).
	 *
	 * Does nothing when either DOM element is missing. The percentage is capped
	 * at 95 and mapped onto the 5%..90.5% range of the bar.
	 */
	function updateDownloadProgress() {
	  const statusEl = document.getElementById('status');
	  const progressBar = document.getElementById('progress-bar');
	  if (!statusEl || !progressBar) return;

	  const totalDone = globalLoadedMB + globalCachedMB;
	  const pct = Math.min(95, Math.round((totalDone / TOTAL_MODEL_MB) * 100));
	  progressBar.style.width = (5 + pct * 0.9) + '%';

	  const summary = `${totalDone.toFixed(0)} / ${TOTAL_MODEL_MB} MB (${pct}%)`;

	  // Nothing has come over the network yet: everything so far was a cache hit.
	  if (globalCachedMB > 0 && globalLoadedMB === 0) {
	    statusEl.textContent = `从缓存加载... ${summary}`;
	    return;
	  }

	  let info = summary;
	  if (dlStartTime && globalLoadedMB > 0) {
	    const elapsed = (Date.now() - dlStartTime) / 1000;
	    // Skip the first second: the speed estimate is too noisy to show.
	    if (elapsed > 1) {
	      // Speed counts network bytes only; cached bytes cost no download time.
	      const speed = globalLoadedMB / elapsed;
	      const remaining = TOTAL_MODEL_MB - totalDone;
	      const eta = remaining > 0 ? Math.round(remaining / speed) : 0;
	      info += ` · ${speed.toFixed(1)} MB/s`;
	      if (eta > 0) info += ` · ${eta}s`;
	    }
	  }
	  if (globalCachedMB > 0) info += ` (${globalCachedMB.toFixed(0)} MB cached)`;
	  statusEl.textContent = `下载模型权重... ${info}`;
	}

	// Cache Storage bucket for weight chunks; the name changes with MODEL_VERSION.
	const CACHE_NAME = `onnx-${MODEL_VERSION}`;

	// Best-effort cleanup: a MODEL_VERSION bump only changes the bucket name, so
	// buckets from earlier versions would otherwise stay on disk indefinitely.
	if (typeof caches !== 'undefined') {
	  caches.keys()
	    .then((names) => Promise.all(
	      names
	        .filter((name) => name.startsWith('onnx-') && name !== CACHE_NAME)
	        .map((name) => caches.delete(name)),
	    ))
	    .catch((err) => console.warn('[GitHub] stale cache cleanup failed:', err));
	}

	// Memoized handle to the chunk cache; resolves to null when Cache Storage is unusable.
	let chunkCachePromise = null;

	/** Open the chunk cache once and reuse it; never rejects. */
	function openChunkCache() {
	  if (!chunkCachePromise) {
	    chunkCachePromise = (async () => {
	      try {
	        return await caches.open(CACHE_NAME);
	      } catch (err) {
	        console.warn('[GitHub] Cache Storage unavailable; chunks will not be cached:', err);
	        return null;
	      }
	    })();
	  }
	  return chunkCachePromise;
	}

	/**
	 * Load chunk number `i` of `filename`, preferring Cache Storage over the network.
	 *
	 * Adds the chunk size to globalCachedMB or globalLoadedMB and refreshes the
	 * progress display. Cache reads and writes are best-effort: any cache failure
	 * falls back to a plain download.
	 *
	 * @param {string} filename - name of the assembled file (a CHUNK_MANIFEST key)
	 * @param {number} i - zero-based chunk index (zero-padded to two digits in the URL)
	 * @returns {Promise<ArrayBuffer>} the chunk bytes
	 * @throws {Error} when the network response is not OK
	 */
	async function fetchOneChunk(filename, i) {
	  const chunkName = `${filename}.${String(i).padStart(2, '0')}.chunk`;
	  const url = `${ONNX_BASE}/${chunkName}`;
	  const cache = await openChunkCache();

	  let cachedBuf = null;
	  if (cache) {
	    try {
	      const hit = await cache.match(url);
	      if (hit) cachedBuf = await hit.arrayBuffer();
	    } catch (err) {
	      console.warn(`[GitHub] ${chunkName}: cache read failed, downloading instead`, err);
	    }
	  }
	  if (cachedBuf) {
	    const mb = cachedBuf.byteLength / 1048576;
	    globalCachedMB += mb;
	    updateDownloadProgress();
	    console.log(`[GitHub] ${chunkName}: ${mb.toFixed(0)} MB (cached)`);
	    return cachedBuf;
	  }

	  // The first real download starts the clock used for the speed/ETA estimate.
	  if (!dlStartTime) dlStartTime = Date.now();
	  const resp = await _origFetch(url);
	  if (!resp.ok) throw new Error(`Failed: ${chunkName} ${resp.status}`);

	  if (cache) {
	    // Store a clone in the background; the original body is consumed below.
	    cache.put(url, resp.clone()).catch((err) => {
	      console.warn(`[GitHub] ${chunkName}: cache write failed`, err);
	    });
	  }

	  const buf = await resp.arrayBuffer();
	  const mb = buf.byteLength / 1048576;
	  globalLoadedMB += mb;
	  updateDownloadProgress();
	  console.log(`[GitHub] ${chunkName}: ${mb.toFixed(0)} MB (downloaded)`);
	  return buf;
	}

	/**
	 * Download every chunk of `filename` with a small worker pool and return the
	 * concatenated bytes as a synthetic 200 Response.
	 *
	 * @param {string} filename - a key of CHUNK_MANIFEST
	 * @returns {Promise<Response>} octet-stream response carrying the assembled file
	 * @throws {Error} when `filename` has no manifest entry or any chunk fails
	 */
	async function fetchChunked(filename) {
	  const manifest = CHUNK_MANIFEST[filename];
	  if (!manifest) throw new Error(`No chunk manifest for ${filename}`);
	  const numChunks = manifest.chunks;
	  console.log(`[GitHub] Fetching ${filename}: ${numChunks} chunks (parallel=${PARALLEL_DOWNLOADS})...`);

	  const results = new Array(numChunks);
	  let nextIdx = 0;
	  let failed = false;

	  // Each worker claims the next unclaimed index until none remain. Once any
	  // chunk fails, the remaining workers stop claiming new ones: the assembled
	  // file could not be completed anyway.
	  async function worker() {
	    while (!failed && nextIdx < numChunks) {
	      const idx = nextIdx++;
	      try {
	        results[idx] = await fetchOneChunk(filename, idx);
	      } catch (err) {
	        failed = true;
	        throw err;
	      }
	    }
	  }

	  const workerCount = Math.min(PARALLEL_DOWNLOADS, numChunks);
	  await Promise.all(Array.from({ length: workerCount }, () => worker()));

	  // `results` is indexed by chunk number, so the Blob is in file order
	  // regardless of which download finished first.
	  const blob = new Blob(results);
	  console.log(`[GitHub] ${filename}: ${(blob.size/1048576).toFixed(0)} MB assembled`);

	  return new Response(blob, {
	    status: 200,
	    headers: {
	      'Content-Type': 'application/octet-stream',
	      'Content-Length': blob.size.toString(),
	    },
	  });
	}

	// Keep the browser's real fetch; the override below delegates to it.
	const _origFetch = window.fetch.bind(window);

	// Own-property lookup, so names inherited from Object.prototype
	// (e.g. a URL ending in "/constructor") never count as table entries.
	const hasOwnKey = (table, key) => Object.prototype.hasOwnProperty.call(table, key);

	// Wrap a copy of `buf` in a new 200 Response. A copy is used so the buffer kept
	// in the memory cache stays intact for later requests.
	function mirroredResponse(filename, buf) {
	  const contentType = filename.endsWith('.json') ? 'application/json' : 'application/octet-stream';
	  return new Response(buf.slice(0), { status: 200, headers: { 'Content-Type': contentType } });
	}

	// Serve one allow-listed file from GitHub Pages. The bytes are remembered in
	// window._githubFileCache so repeated requests for the same file skip the network.
	async function fetchMirrored(filename) {
	  const githubUrl = `${GITHUB_BASE}/${GITHUB_FILES[filename]}`;
	  console.log(`[GitHub] ${filename} → ${githubUrl}`);

	  if (!window._githubFileCache) window._githubFileCache = {};
	  const remembered = window._githubFileCache[filename];
	  if (remembered) {
	    console.log(`[GitHub] ${filename} → from memory cache (${remembered.byteLength} bytes)`);
	    return mirroredResponse(filename, remembered);
	  }

	  const resp = await _origFetch(githubUrl);
	  if (!resp.ok) throw new Error(`GitHub fetch failed: ${filename} ${resp.status}`);
	  const buf = await resp.arrayBuffer();
	  window._githubFileCache[filename] = buf;
	  console.log(`[GitHub] ${filename} → fetched & cached (${buf.byteLength} bytes)`);
	  return mirroredResponse(filename, buf);
	}

	/**
	 * fetch() replacement that reroutes model-file downloads to the GitHub Pages mirror.
	 *
	 * A request is treated as a model-file download when its URL mentions the model
	 * name and is not a Hugging Face `/api/` metadata call. Such requests are resolved
	 * by the last path segment (query string stripped):
	 *   - CHUNK_MANIFEST entry → assembled from chunks
	 *   - GITHUB_FILES entry   → fetched from the mirror
	 *   - anything else        → synthetic 404 (optional files such as added_tokens.json)
	 * Every other request goes to the original fetch untouched.
	 */
	window.fetch = function(input, init) {
	  const url = typeof input === 'string' ? input : (input instanceof Request ? input.url : String(input));

	  // 'routangseng-0.8b' is a prefix of 'routangseng-0.8b-hottake-onnx', so this
	  // single substring test covers both model names.
	  const isModelFile = url.includes('routangseng-0.8b') && !url.includes('/api/');
	  if (!isModelFile) return _origFetch(input, init);

	  const filename = url.split('/').pop().split('?')[0];

	  if (hasOwnKey(CHUNK_MANIFEST, filename)) return fetchChunked(filename);
	  if (hasOwnKey(GITHUB_FILES, filename)) return fetchMirrored(filename);

	  // Not mirrored: answer with a real 404 so the caller can treat the file as
	  // absent instead of receiving an HTML error page.
	  console.log(`[GitHub] ${filename} → 404 (not in whitelist)`);
	  return Promise.resolve(new Response('Not Found', { status: 404, statusText: 'Not Found' }));
	};
	console.log('[GitHub] Fetch override installed — all model files served from GitHub Pages');
	})();
	</script>

	<script type="module">
	import {
	AutoProcessor,
	Qwen3_5ForConditionalGeneration,
	TextStreamer,
	} from '/assets/transformers.js';

	// Still use the HF model ID so transformers.js knows the architecture,
	// but all actual file fetches are intercepted and redirected to GitHub Pages
	const DEFAULT_MODEL = 'bobber/routangseng-0.8b-hottake-onnx';

	// Optional ?model= override. A bare name (no "/") is resolved under "bobber/".
	const params = new URLSearchParams(window.location.search);
	const modelParam = params.get('model');
	const MODEL_ID = modelParam ? (modelParam.includes('/') ? modelParam : `bobber/${modelParam}`) : DEFAULT_MODEL;
	if (modelParam) {
	  // Show the overridden model's short name in the header badge.
	  const shortName = MODEL_ID.split('/').pop();
	  document.getElementById('model-badge').textContent = shortName + ' · ONNX · WebGPU';
	}

	// System turn prepended to every conversation.
	const SYSTEM_PROMPT = '你是肉糖生，一个接地气的中文时政分析者。风格：结论先行，再用结构化分析展开；敢于质疑主流叙事，不和稀泥；用类比和现实例子把复杂问题讲透；语气直率但逻辑严密。回答时先给核心判断，再分层拆解，最后给出预测或建议。直接给出分析，不要先描述用户的问题或你的计划。';

	// User-agent sniff for phones/tablets; mobile gets a smaller generation budget
	// and a longer first-token watchdog.
	const isMobile = /Android|iPhone|iPad|iPod/i.test(navigator.userAgent);
	const MAX_TOKENS = isMobile ? 512 : 1024;

	// DOM handles used throughout the chat UI.
	const chat = document.getElementById('chat');
	const input = document.getElementById('input');
	const sendBtn = document.getElementById('send');
	const statusEl = document.getElementById('status');
	const progressBar = document.getElementById('progress-bar');

	// Mutable session state.
	let processor = null;        // set by init()
	let model = null;            // set by init()
	let messages = [];           // { role, content } turns, without the system prompt
	let generating = false;      // true while a reply is being produced
	let activeDevice = 'webgpu'; // backend the model was actually loaded on

	/** Show `text` in the header status label. */
	function updateStatus(text) {
	  statusEl.textContent = text;
	}

	/** Set the width of the top progress bar to `pct` percent. */
	function setProgress(pct) {
	  progressBar.style.width = `${pct}%`;
	}

	/**
	 * Append a chat bubble and scroll the transcript to the bottom.
	 *
	 * @param {string} role - CSS modifier added after "msg" (the stylesheet defines system, user, assistant)
	 * @param {string} text - bubble content, inserted as plain text
	 * @returns {HTMLDivElement} the new bubble, so callers can update it later
	 */
	function addMsg(role, text) {
	  const bubble = Object.assign(document.createElement('div'), {
	    className: 'msg ' + role,
	    textContent: text,
	  });
	  chat.appendChild(bubble);
	  // Keep the newest message in view.
	  chat.scrollTop = chat.scrollHeight;
	  return bubble;
	}

	/**
	 * Remove reasoning markup from model output.
	 *
	 * Complete <think>…</think> sections are dropped together with their content;
	 * any unmatched opening or closing tag that remains is removed on its own
	 * (its surrounding text is kept). The result is trimmed.
	 *
	 * @param {string} text - raw generated text
	 * @returns {string} text without think tags, trimmed
	 */
	function stripThinkTags(text) {
	  const withoutBlocks = text.replace(/<think>[\s\S]*?<\/think>/g, '');
	  const withoutStrayTags = withoutBlocks.replace(/<\/?think>/g, '');
	  return withoutStrayTags.trim();
	}

	/**
	 * Pick the inference backend for this browser.
	 *
	 * @returns {Promise<'webgpu'|'wasm'>} 'webgpu' only when the browser is not
	 *   Safari, exposes navigator.gpu and hands out an adapter; 'wasm' otherwise.
	 *   Never rejects: adapter errors are logged and treated as "no WebGPU".
	 */
	async function getBestDevice() {
	  // Safari reports navigator.gpu but ONNX Runtime's WebGPU backend fails.
	  // The lookahead excludes Chrome/Android user agents, which also contain "Safari".
	  const isSafari = /^((?!chrome|android).)*safari/i.test(navigator.userAgent);
	  if (isSafari) {
	    console.warn("Safari detected. ONNX WebGPU backend not supported — using WASM.");
	    return 'wasm';
	  }
	  if (!navigator.gpu) {
	    console.warn("WebGPU not supported. Falling back to WASM.");
	    return 'wasm';
	  }
	  try {
	    const adapter = await navigator.gpu.requestAdapter();
	    if (!adapter) {
	      console.warn("No WebGPU adapter found. Falling back to WASM.");
	      return 'wasm';
	    }
	    return 'webgpu';
	  } catch (err) {
	    console.warn("WebGPU initialization failed. Falling back to WASM.", err);
	    return 'wasm';
	  }
	}

	/**
	 * Load the processor and the model, then unlock the chat UI.
	 *
	 * The backend comes from getBestDevice(); if loading on WebGPU throws, the
	 * model is loaded once more on WASM. Browsers without navigator.gpu are not
	 * turned away: they get a warning and take the WASM path that
	 * getBestDevice() already selects for them.
	 *
	 * Never rejects: any load failure is shown in the transcript and the status line.
	 */
	async function init() {
	  try {
	    if (navigator.gpu) {
	      // Purely informational: show which GPU the browser exposes.
	      try {
	        const adapter = await navigator.gpu.requestAdapter();
	        if (adapter) {
	          const info = adapter.info || {};
	          const limits = adapter.limits || {};
	          const gpuDesc = info.description || info.device || info.architecture || 'unknown';
	          const gpuVendor = info.vendor || 'unknown';
	          const maxBuf = limits.maxBufferSize ? (limits.maxBufferSize / 1048576).toFixed(0) + 'MB' : '?';
	          const gpuInfo = `GPU: ${gpuVendor} ${gpuDesc} · maxBuffer: ${maxBuf}`;
	          // Stashed on window so the telemetry payload can include it.
	          window._gpuInfo = `${gpuVendor} ${gpuDesc} maxBuf=${maxBuf}`;
	          console.log('[webgpu]', gpuInfo);
	          addMsg('system', `🖥️ ${gpuInfo}`);
	        }
	      } catch (e) { console.warn('GPU info failed:', e); }
	    } else {
	      addMsg('system', '⚠️ 此浏览器不支持 WebGPU，将改用 WASM (CPU) 后端，速度会慢很多。建议使用 Chrome 或 Edge。');
	    }

	    updateStatus('Loading processor...');
	    setProgress(5);
	    processor = await AutoProcessor.from_pretrained(MODEL_ID);

	    updateStatus('下载模型权重... 0 / 1061 MB (0%)');
	    setProgress(5);

	    let targetDevice = await getBestDevice();

	    const modelOpts = {
	      dtype: {
	        embed_tokens: 'q8',
	        vision_encoder: 'q8',
	        decoder_model_merged: 'q8',
	      },
	      device: targetDevice,
	    };

	    try {
	      model = await Qwen3_5ForConditionalGeneration.from_pretrained(MODEL_ID, modelOpts);
	    } catch (e) {
	      // Only a WebGPU failure has somewhere to fall back to.
	      if (targetDevice !== 'webgpu') throw e;
	      console.warn('[init] WebGPU failed, falling back to WASM:', e.message);
	      addMsg('system', '⚠️ WebGPU 初始化失败，正在使用 WASM 后端加载...');
	      targetDevice = 'wasm';
	      modelOpts.device = 'wasm';
	      model = await Qwen3_5ForConditionalGeneration.from_pretrained(MODEL_ID, modelOpts);
	    }

	    activeDevice = targetDevice;
	    const backendLabel = targetDevice === 'webgpu' ? 'WebGPU' : 'WASM (CPU)';
	    // Replace the loading notices with a single "ready" message.
	    chat.innerHTML = '';
	    addMsg('system', `模型加载完成！当前后端：${backendLabel} · 肉糖生 Phase 11 Hot-Take\n可以开始对话了。`);
	    // Report the backend that was actually loaded, not always "WebGPU".
	    updateStatus(`Ready · 肉糖生 · ${backendLabel}`);
	    setProgress(0);
	    input.disabled = false;
	    sendBtn.disabled = false;
	    // Pre-fill a sample question so the first message is one keypress away.
	    input.value = 'AI发展很快，大家也拼命跟上快速发展，白领工作时间变长可是失业率上升工资也没有上涨，到底AI的快速发展谁受益？';
	    input.focus();
	  } catch (e) {
	    updateStatus('Error');
	    addMsg('system', '❌ 模型加载失败: ' + e.message);
	    console.error(e);
	  }
	}

	/**
	 * Start a fresh conversation: forget the history, empty the transcript and
	 * show a "new conversation" notice. Ignored while a reply is being generated.
	 */
	window.clearChat = function() {
	  if (!generating) {
	    messages = [];
	    chat.innerHTML = '';
	    addMsg('system', '新对话已开始。');
	    input.focus();
	  }
	};

	/**
	 * Fire-and-forget upload of one completed exchange to the telemetry endpoint.
	 * Failures are logged and never affect the chat.
	 *
	 * NOTE(review): the payload contains the full question and answer text plus
	 * the user agent — confirm users are told about this collection.
	 *
	 * @param {object} record - fields to report (serialized as a JSON string inside `data`)
	 */
	function sendTelemetry(record) {
	  const TELEMETRY_API = 'https://bobber-routangseng-telemetry-api.hf.space/gradio_api/call/collect';
	  try {
	    fetch(TELEMETRY_API, {
	      method: 'POST',
	      headers: { 'Content-Type': 'application/json' },
	      body: JSON.stringify({ data: [JSON.stringify(record)] }),
	    }).then(r => r.json()).then(d => {
	      if (d.event_id) {
	        console.log(`[telemetry ${new Date().toISOString()}] sent (event: ${d.event_id})`);
	      }
	    }).catch(e => {
	      console.warn('[telemetry] failed:', e.message);
	    });
	  } catch (e) {
	    // Best-effort: building or dispatching the request must never break the chat.
	    console.warn('[telemetry] failed:', e.message);
	  }
	}

	/**
	 * Send the text in the input box to the model and stream the reply into a new
	 * assistant bubble.
	 *
	 * Does nothing when the input is empty, a reply is already being generated, or
	 * the model/processor are not loaded yet. The input and send button are locked
	 * for the duration and are always unlocked again, whatever the outcome
	 * (success, empty reply or exception).
	 */
	window.sendMessage = async function() {
	  const text = input.value.trim();
	  console.log(`[sendMessage] called, text="${text.substring(0,30)}", generating=${generating}, model=${!!model}, processor=${!!processor}`);
	  if (!text || generating || !model || !processor) {
	    console.log(`[sendMessage] blocked: text=${!!text}, generating=${generating}, model=${!!model}, processor=${!!processor}`);
	    return;
	  }

	  const ts = () => new Date().toISOString();
	  generating = true;
	  input.value = '';
	  input.style.height = 'auto';
	  sendBtn.disabled = true;
	  input.disabled = true;
	  addMsg('user', text);
	  messages.push({ role: 'user', content: text });

	  console.log(`[chat ${ts()}] USER: ${text}`);

	  const assistantDiv = addMsg('assistant', '');
	  assistantDiv.innerHTML = '<span class="typing">思考中<span class="dots">...</span></span>';
	  updateStatus('Generating...');

	  // Yield to browser so DOM updates render before heavy computation
	  await new Promise(r => setTimeout(r, 50));

	  let watchdog = null;
	  // Set once the final text (or an error) has been rendered, so late timer and
	  // animation-frame callbacks cannot overwrite it.
	  let finished = false;

	  try {
	    const allMessages = [
	      {
	        role: 'system',
	        content: [{ type: 'text', text: SYSTEM_PROMPT }],
	      },
	      ...messages.map(m => ({
	        role: m.role,
	        content: [{ type: 'text', text: m.content }],
	      })),
	    ];

	    const promptText = processor.apply_chat_template(allMessages, {
	      add_generation_prompt: true,
	      tokenize: false,
	      tokenizer_kwargs: { enable_thinking: false },
	    });

	    // The figure logged here is the prompt's character count, used as a rough proxy.
	    console.log(`[chat ${ts()}] Prompt tokens (approx): ${promptText.length}`);
	    const inputs = await processor(promptText);

	    let fullText = '';
	    let tokenCount = 0;
	    let firstTokenSeen = false;
	    const genStart = Date.now();

	    // Warn the user if no token has arrived after a generous delay.
	    const watchdogMs = isMobile ? 40000 : 20000;
	    watchdog = setTimeout(() => {
	      if (!firstTokenSeen && !finished) {
	        assistantDiv.textContent = '⚠️ 还没收到首个 token。可能是 WebGPU 卡住了，或浏览器缓存/显存出了问题。请打开浏览器控制台查看报错，或先切回基座模型排查。';
	        updateStatus('Generating... (waiting for first token)');
	        console.warn(`[chat ${ts()}] WARNING: no token after ${watchdogMs / 1000}s`, { MODEL_ID, MAX_TOKENS });
	      }
	    }, watchdogMs);

	    // Use TextStreamer for both backends.
	    // On WASM, the callback still fires per token but the browser won't repaint
	    // until generate() returns. This is a known WASM limitation — the main thread
	    // is blocked. We still get the streaming text at the end.
	    let pendingUpdate = false;
	    const streamer = new TextStreamer(processor.tokenizer, {
	      skip_prompt: true,
	      skip_special_tokens: true,
	      callback_function: (token) => {
	        if (!firstTokenSeen) {
	          firstTokenSeen = true;
	          clearTimeout(watchdog);
	          const ttft = ((Date.now() - genStart) / 1000).toFixed(1);
	          window._lastTTFT = parseFloat(ttft);
	          console.log(`[chat ${ts()}] First token after ${ttft}s (TTFT)`);
	        }
	        tokenCount++;
	        fullText += token;
	        // Batch DOM updates — schedule one update per animation frame
	        if (!pendingUpdate) {
	          pendingUpdate = true;
	          requestAnimationFrame(() => {
	            pendingUpdate = false;
	            if (finished) return; // final text or error is already on screen
	            assistantDiv.textContent = stripThinkTags(fullText);
	            chat.scrollTop = chat.scrollHeight;
	          });
	        }
	      }
	    });

	    await model.generate({
	      ...inputs,
	      max_new_tokens: MAX_TOKENS,
	      do_sample: true,
	      temperature: 0.7,
	      top_p: 0.9,
	      streamer,
	    });

	    const cleanText = stripThinkTags(fullText);
	    assistantDiv.textContent = cleanText;
	    chat.scrollTop = chat.scrollHeight;

	    const elapsed = ((Date.now() - genStart) / 1000).toFixed(1);
	    const tokPerSec = (tokenCount / ((Date.now() - genStart) / 1000)).toFixed(1);

	    if (!cleanText.trim()) {
	      assistantDiv.textContent = '❌ 模型执行完成，但没有输出可见文本。这通常说明 WebGPU 执行异常，或模型只产出了特殊 token。请打开控制台查看报错。';
	      console.error(`[chat ${ts()}] EMPTY response after ${elapsed}s, ${tokenCount} tokens`);
	      updateStatus('Error');
	      return; // the finally block below still unlocks the UI
	    }

	    messages.push({ role: 'assistant', content: cleanText });
	    console.log(`[chat ${ts()}] ASSISTANT (${tokenCount} tokens, ${elapsed}s, ${tokPerSec} tok/s): ${cleanText.substring(0, 200)}${cleanText.length > 200 ? '...' : ''}`);
	    updateStatus(`Ready · ${tokPerSec} tok/s · ${tokenCount} tokens`);

	    // Telemetry
	    const sessionId = window._sessionId || (window._sessionId = Math.random().toString(36).slice(2, 10));
	    sendTelemetry({
	      ts: ts(),
	      session_id: sessionId,
	      model: MODEL_ID,
	      question: text,
	      answer: cleanText,
	      tokens: tokenCount,
	      elapsed_s: parseFloat(elapsed),
	      tok_per_sec: parseFloat(tokPerSec),
	      ttft_s: window._lastTTFT || null,
	      device: isMobile ? 'mobile' : 'desktop',
	      user_agent: navigator.userAgent,
	      gpu: window._gpuInfo || null,
	      webgpu: !!navigator.gpu,
	      screen: `${screen.width}x${screen.height}`,
	      lang: navigator.language,
	    });
	  } catch (e) {
	    assistantDiv.textContent = '❌ 生成失败: ' + e.message;
	    console.error(`[chat ${ts()}] ERROR:`, e);
	    updateStatus('Error');
	  } finally {
	    // Runs on success, empty reply and failure alike, so the UI can never stay locked.
	    finished = true;
	    clearTimeout(watchdog);
	    generating = false;
	    sendBtn.disabled = false;
	    input.disabled = false;
	    input.focus();
	  }
	};

	// Enter sends the message; Shift+Enter inserts a newline.
	input.addEventListener('keydown', (e) => {
	  if (e.key !== 'Enter' || e.shiftKey) return;
	  // While an IME composition is active (e.g. typing Chinese with pinyin), Enter
	  // only confirms the candidate text and must not send the message. keyCode 229
	  // covers browsers that report composition that way instead of isComposing.
	  if (e.isComposing || e.keyCode === 229) return;
	  e.preventDefault();
	  window.sendMessage();
	});

	// Grow the textarea with its content, up to 120px (matches the CSS max-height).
	input.addEventListener('input', () => {
	  input.style.height = 'auto';
	  input.style.height = Math.min(input.scrollHeight, 120) + 'px';
	});

	document.getElementById('send').addEventListener('click', () => { window.sendMessage(); });
	document.getElementById('clear').addEventListener('click', () => { window.clearChat(); });

	// Kick off processor/model loading as soon as the handlers are wired.
	init();
	</script>
	</body>
	</html>