Spaces:

mshz88
/

FADA-Mobile

Running

App Files Files Community

FADA-Mobile / js /model.js

mshz88's picture

Upload js/model.js with huggingface_hub

e4e4d61 verified 28 days ago

history blame contribute delete

23.7 kB

	/**
	* Model loader and inference using transformers.js v4.2.0 + WebGPU.
	* Follows the proven pattern from webml-community/Qwen3.5-WebGPU.
	* Loads Qwen3.5-VL 0.8B ONNX from mshz88/FADA-Mobile-ONNX.
	*/

	const MODEL_ID = "mshz88/FADA-Mobile-ONNX";
	const MODEL_REVISION = "2936611f4ad147e0cbe03e3884de8a42c5cc42b9"; // Pin to specific commit, bypass CDN cache
	const TRANSFORMERS_CDN = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

	let processor = null;
	let model = null;
	let tokenizer = null;
	let loadingPromise = null;
	let activeDevice = null;

	export function getLoadingStatus() {
	if (model && processor && tokenizer) return "ready";
	if (loadingPromise) return "loading";
	return "idle";
	}

	export function getActiveDevice() {
	return activeDevice;
	}

	/**
	* Load model + processor + tokenizer. Shows progress via callback.
	* @param {function} onProgress - Progress callback
	* @param {string} device - "webgpu" or "wasm"
	*/
	export async function loadModel(onProgress, device = "webgpu") {
	if (model && processor && tokenizer) return;
	if (loadingPromise) return loadingPromise;

	loadingPromise = (async () => {
	const {
	Qwen3_5ForConditionalGeneration,
	AutoProcessor,
	AutoTokenizer,
	env,
	} = await import(TRANSFORMERS_CDN);

	// Force high-performance GPU (discrete NVIDIA over integrated Intel)
	if (device === "webgpu" && navigator.gpu) {
	try {
	const adapter = await navigator.gpu.requestAdapter({
	powerPreference: "high-performance",
	});
	if (adapter) {
	const info = await adapter.requestAdapterInfo();
	console.log("[FADA] GPU adapter selected:", info.vendor, info.architecture, info.device, info.description);
	onProgress?.({ status: "info", note: `GPU: ${info.description \|\| info.vendor \|\| 'discrete GPU'}` });
	// Store globally for diagnostics
	window.__FADA_GPU_INFO = info;
	} else {
	console.warn("[FADA] No high-performance GPU adapter found");
	}
	} catch (e) {
	console.warn("[FADA] Could not query GPU adapter:", e);
	}

	// Try to set power preference at the env/backend level for transformers.js
	try {
	if (env?.backends?.onnx?.webgpu) {
	env.backends.onnx.webgpu.powerPreference = "high-performance";
	console.log("[FADA] Set env.backends.onnx.webgpu.powerPreference = high-performance");
	}
	// Also try the top-level webgpu settings
	if (env?.webgpu) {
	env.webgpu.powerPreference = "high-performance";
	}
	} catch (e) {
	console.warn("[FADA] Could not set env power preference:", e);
	}
	}

	onProgress?.({ status: "loading", file: "processor" });
	processor = await AutoProcessor.from_pretrained(MODEL_ID, {
	revision: MODEL_REVISION,
	});
	console.log("[FADA] Processor loaded");

	onProgress?.({ status: "loading", file: "tokenizer" });
	tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID, {
	revision: MODEL_REVISION,
	});
	console.log("[FADA] Tokenizer loaded");

	onProgress?.({ status: "loading", file: "model" });
	console.log(`[FADA] Loading model on device=${device}...`);
	activeDevice = device;

	// Vision encoder dtype: fp16 on WebGPU, fp32 on WASM
	const visionDtype = device === "wasm" ? "fp32" : "fp16";

	const buildModelConfig = (vDtype) => ({
	revision: MODEL_REVISION,
	dtype: {
	embed_tokens: "q4",
	vision_encoder: vDtype,
	decoder_model_merged: "q4",
	},
	device: device,
	progress_callback: (info) => {
	if (info.status === "progress") {
	onProgress?.({
	status: "downloading",
	file: info.file \|\| "",
	progress: info.progress \|\| 0,
	loaded: info.loaded \|\| 0,
	total: info.total \|\| 0,
	});
	}
	},
	});

	try {
	onProgress?.({
	status: "compiling",
	file: "model",
	note: "Creating GPU session & compiling shaders (2-5 min)...",
	});
	model = await Qwen3_5ForConditionalGeneration.from_pretrained(
	MODEL_ID,
	buildModelConfig(visionDtype)
	);
	} catch (err) {
	// Fallback: if fp16 not supported, retry with fp32 vision encoder
	const isFp16Error =
	visionDtype === "fp16" &&
	(err?.message?.toLowerCase().includes("does not support fp16") \|\|
	err?.message?.toLowerCase().includes("shader"));

	if (isFp16Error) {
	console.warn("[FADA] FP16 not supported, retrying with FP32 vision encoder...");
	onProgress?.({
	status: "loading",
	file: "model",
	note: "FP16 not supported, loading vision encoder in FP32 mode...",
	});
	model = await Qwen3_5ForConditionalGeneration.from_pretrained(
	MODEL_ID,
	buildModelConfig("fp32")
	);
	} else {
	throw err;
	}
	}

	console.log("[FADA] Model loaded successfully on", device);
	onProgress?.({ status: "ready" });
	})();

	try {
	await loadingPromise;
	} catch (e) {
	loadingPromise = null;
	model = null;
	processor = null;
	tokenizer = null;
	activeDevice = null;
	throw e;
	}
	}

	/**
	* Run inference on a single image with a text prompt.
	* Uses the community pattern: processor.apply_chat_template + model.generate + tokenizer.decode
	* No manual KV cache logic — transformers.js handles it internally.
	*
	* @param {HTMLImageElement\|HTMLCanvasElement\|RawImage} image
	* @param {string} prompt
	* @param {object} opts - { maxNewTokens, temperature, onToken }
	* @returns {string} Generated text
	*/
	export async function runInference(image, prompt, opts = {}) {
	if (!model \|\| !processor \|\| !tokenizer) throw new Error("Model not loaded");

	const { RawImage } = await import(TRANSFORMERS_CDN);

	// Convert browser image to RawImage using the same approach as webml-community demo:
	// Export to data URL then use RawImage.read() for proper internal state.
	let rawImage = image;
	if (image instanceof HTMLImageElement \|\| image instanceof HTMLCanvasElement) {
	const origW = image.naturalWidth \|\| image.width;
	const origH = image.naturalHeight \|\| image.height;

	// Resize to limit GPU memory (max 672px longest side for WebGPU)
	const MAX_DIM = 672;
	let processWidth = origW;
	let processHeight = origH;
	if (processWidth > MAX_DIM \|\| processHeight > MAX_DIM) {
	const scale = MAX_DIM / Math.max(processWidth, processHeight);
	processWidth = Math.round(processWidth * scale);
	processHeight = Math.round(processHeight * scale);
	}

	// Create resized canvas and export as data URL for RawImage.read()
	const resizedCanvas = document.createElement("canvas");
	resizedCanvas.width = processWidth;
	resizedCanvas.height = processHeight;
	const resizedCtx = resizedCanvas.getContext("2d");
	resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
	const dataURL = resizedCanvas.toDataURL("image/png");

	// Use RawImage.read() — the proven way (matches webml-community demo)
	rawImage = await RawImage.read(dataURL);
	console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
	}

	const maxNewTokens = opts.maxNewTokens \|\| 1024;
	const temperature = opts.temperature ?? 0.1;

	// Build messages array matching training format:
	// The model was trained WITHOUT a system message, just user + assistant.
	// The Qwen3.5 chat template auto-adds <think>\n\n</think>\n\n when
	// enable_thinking is false/undefined (pre-closed thinking mode).
	const messages = [
	{
	role: "user",
	content: [
	{ type: "image" },
	{ type: "text", text: prompt },
	],
	},
	];

	// Apply chat template with enable_thinking explicitly disabled.
	// The model was fine-tuned with thinking PRE-CLOSED: the template adds
	// <think>\n\n</think>\n\n before the model's actual output.
	// We must ensure this is present in the generation prompt.
	console.log("[FADA] Building inputs...");
	let text;
	const templateOpts = {
	add_generation_prompt: true,
	enable_thinking: false,
	};

	if (processor.apply_chat_template) {
	text = processor.apply_chat_template(messages, templateOpts);
	} else if (tokenizer.apply_chat_template) {
	text = tokenizer.apply_chat_template(messages, {
	...templateOpts,
	tokenize: false,
	});
	} else {
	// Manual fallback: must include the pre-closed <think> block
	text = `<\|im_start\|>user\n<\|vision_start\|><\|image_pad\|><\|vision_end\|>${prompt}<\|im_end\|>\n<\|im_start\|>assistant\n<think>\n\n</think>\n\n`;
	}

	// Verify the template output includes the required <think>...</think> block.
	// The model was trained expecting <think>\n\n</think>\n\n before its output.
	// Handle three failure cases:
	// 1. Template produced open <think>\n (enable_thinking=true path) -> close it
	// 2. Template didn't include <think> at all -> add the block
	// 3. Template produced correct <think>\n\n</think>\n\n -> leave as-is
	if (text.includes("<\|im_start\|>assistant")) {
	const assistantIdx = text.lastIndexOf("<\|im_start\|>assistant");
	const afterAssistant = text.slice(assistantIdx);
	if (!afterAssistant.includes("<think>")) {
	// No <think> at all - append the pre-closed block
	console.warn("[FADA] Template missing <think> block - appending pre-closed thinking");
	text = text.trimEnd() + "\n<think>\n\n</think>\n\n";
	} else if (!afterAssistant.includes("</think>")) {
	// Has <think> but no </think> - close it
	console.warn("[FADA] Template produced open <think> - patching to pre-closed format");
	text = text.replace(/<think>\s*$/, "<think>\n\n</think>\n\n");
	}
	}

	// Debug: log the template output (first and last parts)
	console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
	console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));

	// Call processor with SINGLE image (not array!) — matches webml-community demo pattern
	const inputs = await processor(text, rawImage);
	const inputLen = inputs.input_ids.dims[1];
	console.log(
	`[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
	);

	// === VISION PIPELINE DIAGNOSTIC ===
	console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
	console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));
	if (inputs.pixel_values) {
	console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
	console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
	const pv = inputs.pixel_values.data;
	const sample = Array.from(pv.slice(0, 10));
	console.log("[FADA-VISION] pixel_values first 10:", sample);
	console.log("[FADA-VISION] pixel_values has NaN:", sample.some(v => isNaN(v)));
	console.log("[FADA-VISION] pixel_values all zero:", sample.every(v => v === 0));
	const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
	console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
	} else {
	console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
	}
	if (inputs.image_grid_thw) {
	console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
	console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
	} else {
	console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
	}
	// Check if input_ids contain image tokens (token id 151655 for <\|image_pad\|>)
	if (inputs.input_ids) {
	const ids = Array.from(inputs.input_ids.data).map(t => typeof t === 'bigint' ? Number(t) : t);
	// Qwen3.5-VL image pad token is 151655
	const imageTokenCount = ids.filter(t => t === 151655).length;
	const imageTokenCount2 = ids.filter(t => t === 248056).length;
	console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
	console.log("[FADA-VISION] input_ids length:", ids.length);
	console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));
	// Find image pad tokens
	const firstImageIdx = ids.indexOf(151655);
	const firstImageIdx2 = ids.indexOf(248056);
	const imgIdx = firstImageIdx >= 0 ? firstImageIdx : firstImageIdx2;
	if (imgIdx >= 0) {
	console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
	console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx-3), imgIdx+10));
	} else {
	console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
	}
	}
	console.log("[FADA-VISION] === End Vision Diagnostic ===");

	// === DEEP DEBUG: Pre-generation diagnostics ===
	console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
	console.log("[FADA-DEBUG] Prompt:", prompt);
	console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
	console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
	console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
	console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) \|\| []).map(Number));
	console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) \|\| []).map(Number));
	console.log("[FADA-DEBUG] Pixel values shape:", inputs.pixel_values?.dims);
	console.log("[FADA-DEBUG] Image grid thw:", inputs.image_grid_thw?.dims, inputs.image_grid_thw?.data ? Array.from(inputs.image_grid_thw.data).map(Number) : 'N/A');
	console.log("[FADA-DEBUG] Attention mask shape:", inputs.attention_mask?.dims);
	// Decode last 20 input tokens to verify template ending
	try {
	const lastInputTokens = Array.from(inputs.input_ids.data.slice(-20)).map(Number);
	const decodedEnd = tokenizer.decode(lastInputTokens, { skip_special_tokens: false });
	console.log("[FADA-DEBUG] Last 20 input tokens decoded:", JSON.stringify(decodedEnd));
	} catch (e) { console.warn("[FADA-DEBUG] Could not decode last input tokens:", e); }
	console.log("[FADA-DEBUG] === Starting Generation ===");

	// Generate — transformers.js handles KV cache internally
	const t0 = performance.now();
	let output;
	try {
	output = await model.generate({
	...inputs,
	max_new_tokens: maxNewTokens,
	temperature: temperature > 0 ? temperature : undefined,
	do_sample: temperature > 0,
	top_p: 0.95,
	});
	} catch (err) {
	console.error("[FADA-DEBUG] Generation error:", err);
	if (err?.message?.includes("Device") && err?.message?.includes("lost")) {
	throw new Error("GPU ran out of memory. Try a smaller image or switch to CPU (WASM) mode.");
	}
	if (err?.message?.includes("mapAsync")) {
	throw new Error("GPU ran out of memory. Try a smaller image or switch to CPU (WASM) mode.");
	}
	throw err;
	}
	const genTime = ((performance.now() - t0) / 1000).toFixed(1);
	console.log("[FADA-DEBUG] Generation completed in", genTime, "seconds");

	// Decode only generated tokens (skip input)
	const outputDims = output.dims \|\| [];
	const outputTotalLen = outputDims[1] \|\| (outputDims[0] === 1 ? output.data?.length : 0);

	// Convert output Tensor to a plain JS array
	let outputTokens;
	try {
	// For 2D tensor [1, seq_len], try .tolist() which returns nested array
	if (output.tolist) {
	const listed = output.tolist();
	// Could be [[tok1, tok2, ...]] (2D) or [tok1, tok2, ...] (1D)
	const flat = Array.isArray(listed[0]) ? listed[0] : listed;
	outputTokens = flat.map(t => (typeof t === "bigint" ? Number(t) : t));
	} else {
	const firstBatch = output[0];
	const allTokens = firstBatch?.tolist
	? firstBatch.tolist()
	: Array.from(firstBatch?.data \|\| firstBatch \|\| output.data \|\| []);
	outputTokens = allTokens.map(t => (typeof t === "bigint" ? Number(t) : t));
	}
	} catch (e) {
	console.warn("[FADA] Tensor conversion fallback:", e);
	// Fallback: try direct Array.from on output data
	const raw = output.data \|\| output;
	outputTokens = Array.from(raw).map(t => (typeof t === "bigint" ? Number(t) : t));
	}

	// === DEEP DEBUG: Post-generation analysis ===
	console.log("[FADA-DEBUG] === Post-Generation Analysis ===");
	console.log(`[FADA-DEBUG] Output raw dims: ${JSON.stringify(outputDims)}`);
	console.log(`[FADA-DEBUG] Output type: ${typeof output}, constructor: ${output?.constructor?.name}`);
	console.log(`[FADA-DEBUG] outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);
	console.log(`[FADA] Output tensor dims: ${JSON.stringify(outputDims)}, outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);

	// Slice to only the newly generated tokens
	const newTokens = outputTokens.slice(inputLen);
	const numGenerated = newTokens.length;

	// Debug: log first few generated tokens
	if (numGenerated > 0) {
	console.log(`[FADA] First 20 new token IDs: [${newTokens.slice(0, 20).join(", ")}]`);
	const rawDecoded = tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: false });
	console.log(`[FADA] First 30 tokens decoded (with special): ${JSON.stringify(rawDecoded)}`);
	console.log(`[FADA-DEBUG] Decoded (skip_special=true): ${JSON.stringify(tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: true }))}`);
	} else {
	console.warn("[FADA-DEBUG] * CRITICAL: 0 new tokens generated! *");
	console.log(`[FADA-DEBUG] Output length === Input length? ${outputTokens.length === inputLen}`);
	console.log(`[FADA-DEBUG] Output length: ${outputTokens.length} vs Input length: ${inputLen}`);
	console.log(`[FADA-DEBUG] Last 10 output token IDs: [${outputTokens.slice(-10).join(", ")}]`);
	// Try decoding last few tokens to understand what happened
	if (outputTokens.length > 0) {
	const lastFew = tokenizer.decode(outputTokens.slice(-10), { skip_special_tokens: false });
	console.log(`[FADA-DEBUG] Last 10 tokens decoded (no skip): ${JSON.stringify(lastFew)}`);
	}
	// Check if ALL output tokens are the same as input (nothing was generated)
	if (outputTokens.length > inputLen) {
	// Actually there ARE new tokens but our inputLen might be wrong
	console.log(`[FADA-DEBUG] WAIT: outputTokens(${outputTokens.length}) > inputLen(${inputLen}) - recalculating...`);
	const extraTokens = outputTokens.slice(inputLen);
	console.log(`[FADA-DEBUG] Extra tokens: [${extraTokens.slice(0, 20).join(", ")}]`);
	console.log(`[FADA-DEBUG] Extra decoded: ${JSON.stringify(tokenizer.decode(extraTokens, { skip_special_tokens: false }))}`);
	} else if (outputTokens.length < inputLen) {
	console.log(`[FADA-DEBUG] STRANGE: output shorter than input! Model may have truncated.`);
	} else {
	console.log(`[FADA-DEBUG] Output exactly equals input - model generated EOS immediately or nothing at all.`);
	// Check what the last token IS
	const lastToken = outputTokens[outputTokens.length - 1];
	console.log(`[FADA-DEBUG] Last token ID: ${lastToken}, decoded: ${JSON.stringify(tokenizer.decode([lastToken], { skip_special_tokens: false }))}`);
	}
	}

	const decoded = tokenizer.decode(newTokens, {
	skip_special_tokens: true,
	});

	console.log(
	`[FADA] Generated ${numGenerated} tokens in ${genTime}s (${(numGenerated / (genTime \|\| 1)).toFixed(1)} tok/s)`
	);

	// Clean output: remove <think></think> tags and stray role prefixes
	// Ensure decoded is always a string (safeguard against unexpected return types)
	let textOutput = typeof decoded === "string" ? decoded : String(decoded \|\| "");
	textOutput = textOutput.replace(/<think>[\s\S]*?<\/think>/g, "").trim();
	const roleIdx = textOutput.indexOf("\nassistant\n");
	if (roleIdx !== -1) textOutput = textOutput.slice(0, roleIdx);
	if (textOutput.startsWith("assistant\n")) textOutput = textOutput.slice("assistant\n".length);

	if (numGenerated > 0 && textOutput.length === 0) {
	console.warn("[FADA] Generated tokens but cleaned output is empty. Raw decoded:", JSON.stringify(decoded?.slice(0, 200)));
	}

	return textOutput.trim();
	}

	/**
	* Text-only diagnostic test - runs inference WITHOUT any image.
	* If this produces output, the decoder works and the issue is in the vision pipeline.
	* If this also produces 0 tokens, the issue is in the decoder/generation loop.
	*/
	export async function testTextOnly() {
	if (!model \|\| !tokenizer) throw new Error("Model not loaded");

	console.log("[FADA-TEST] === TEXT-ONLY DIAGNOSTIC ===");

	// Simple text-only message (no image)
	const messages = [
	{ role: "system", content: "You are a helpful assistant." },
	{ role: "user", content: "Say hello and tell me what you can do." }
	];

	let text;
	try {
	text = tokenizer.apply_chat_template(messages, {
	add_generation_prompt: true,
	tokenize: false
	});
	} catch(e) {
	text = `<\|im_start\|>system\nYou are a helpful assistant.<\|im_end\|>\n<\|im_start\|>user\nSay hello and tell me what you can do.<\|im_end\|>\n<\|im_start\|>assistant\n`;
	}

	// Add pre-closed thinking block if model expects it
	if (!text.includes("<think>")) {
	text += "<think>\n\n</think>\n\n";
	}

	console.log("[FADA-TEST] Template:", text);

	const inputs = tokenizer(text, { return_tensors: "pt" });
	console.log("[FADA-TEST] Input IDs shape:", inputs.input_ids.dims);
	console.log("[FADA-TEST] Input length:", inputs.input_ids.dims[1]);

	const t0 = performance.now();
	const output = await model.generate({
	...inputs,
	max_new_tokens: 50,
	temperature: 0.7,
	do_sample: true,
	top_p: 0.9,
	});
	const elapsed = ((performance.now() - t0) / 1000).toFixed(1);

	// Decode output
	let outputTokens;
	try {
	if (output.tolist) {
	const list = output.tolist();
	outputTokens = (Array.isArray(list[0]) ? list[0] : list).map(t => typeof t === "bigint" ? Number(t) : t);
	} else {
	const firstBatch = output[0];
	const allTokens = firstBatch.tolist ? firstBatch.tolist() : Array.from(firstBatch.data \|\| firstBatch);
	outputTokens = allTokens.map(t => typeof t === "bigint" ? Number(t) : t);
	}
	} catch(e) {
	console.error("[FADA-TEST] Output conversion error:", e);
	outputTokens = [];
	}

	const inputLen = inputs.input_ids.dims[1];
	const newTokens = outputTokens.slice(inputLen);

	console.log("[FADA-TEST] Output total tokens:", outputTokens.length);
	console.log("[FADA-TEST] New tokens:", newTokens.length);
	console.log("[FADA-TEST] Time:", elapsed, "s");

	if (newTokens.length > 0) {
	console.log("[FADA-TEST] First 20 tokens:", newTokens.slice(0, 20));
	const decoded = tokenizer.decode(newTokens, { skip_special_tokens: true });
	console.log("[FADA-TEST] \u2713 DECODED:", decoded);
	return `TEXT-ONLY TEST PASSED (${newTokens.length} tokens in ${elapsed}s): ${decoded}`;
	} else {
	console.log("[FADA-TEST] \u2717 0 tokens generated - DECODER ITSELF IS BROKEN ON WEBGPU");
	console.log("[FADA-TEST] Last 5 tokens of output:", outputTokens.slice(-5));
	const lastDecoded = tokenizer.decode(outputTokens.slice(-5), { skip_special_tokens: false });
	console.log("[FADA-TEST] Last tokens decoded:", lastDecoded);
	return `TEXT-ONLY TEST FAILED: 0 tokens in ${elapsed}s. Decoder broken on WebGPU.`;
	}
	}