/**
 * Model loader and inference using transformers.js v4.2.0 + WebGPU.
 * Follows the proven pattern from webml-community/Qwen3.5-WebGPU.
 * Loads Qwen3.5-VL 0.8B ONNX from mshz88/FADA-Mobile-ONNX.
 */

const MODEL_ID = "mshz88/FADA-Mobile-ONNX";
const MODEL_REVISION = "2936611f4ad147e0cbe03e3884de8a42c5cc42b9"; // Pin to specific commit, bypass CDN cache
const TRANSFORMERS_CDN = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

// Longest image side (in pixels) sent to the processor; larger images are
// downscaled to limit GPU memory.
const MAX_IMAGE_DIM = 672;

// Thinking markers. The model was fine-tuned with thinking PRE-CLOSED, i.e.
// the generation prompt must end with "<think>\n\n</think>\n\n".
const THINK_OPEN = "<think>";
const THINK_CLOSE = "</think>";
const PRE_CLOSED_THINK = `${THINK_OPEN}\n\n${THINK_CLOSE}\n\n`;
const ASSISTANT_HEADER = "<|im_start|>assistant";

// Token ids probed by the vision diagnostics. 151655 is <|image_pad|>;
// NOTE(review): the meaning of 248056 is not documented here - confirm.
const IMAGE_PAD_TOKEN_ID = 151655;
const ALT_IMAGE_TOKEN_ID = 248056;

const GPU_OOM_MESSAGE = "GPU ran out of memory. Try a smaller image or switch to CPU (WASM) mode.";

let processor = null;
let model = null;
let tokenizer = null;
let loadingPromise = null;
let activeDevice = null;

/**
 * Report the loader state.
 * @returns {"ready"|"loading"|"idle"} "ready" once model, processor and
 *   tokenizer are all set; "loading" while a load is in flight; else "idle".
 */
export function getLoadingStatus() {
  if (model && processor && tokenizer) return "ready";
  if (loadingPromise) return "loading";
  return "idle";
}

/**
 * @returns {string|null} The device passed to the current/last load
 *   ("webgpu" or "wasm"), or null when nothing is loaded.
 */
export function getActiveDevice() {
  return activeDevice;
}

/** Clear all module state so that a later loadModel() starts from scratch. */
function resetLoaderState() {
  loadingPromise = null;
  model = null;
  processor = null;
  tokenizer = null;
  activeDevice = null;
}

/**
 * Best-effort: ask for the high-performance GPU (discrete over integrated)
 * and report which adapter was selected. Failures are logged, never thrown.
 */
async function preferHighPerformanceGpu(env, onProgress) {
  try {
    const adapter = await navigator.gpu.requestAdapter({
      powerPreference: "high-performance",
    });
    if (adapter) {
      // GPUAdapter.info replaces the deprecated requestAdapterInfo(); keep the
      // old call as a fallback for browsers that only ship the method.
      const info = adapter.info ?? (await adapter.requestAdapterInfo?.()) ?? {};
      console.log("[FADA] GPU adapter selected:", info.vendor, info.architecture, info.device, info.description);
      onProgress?.({ status: "info", note: `GPU: ${info.description || info.vendor || 'discrete GPU'}` });
      // Store globally for diagnostics
      window.__FADA_GPU_INFO = info;
    } else {
      console.warn("[FADA] No high-performance GPU adapter found");
    }
  } catch (e) {
    console.warn("[FADA] Could not query GPU adapter:", e);
  }

  // Try to set power preference at the env/backend level for transformers.js
  try {
    if (env?.backends?.onnx?.webgpu) {
      env.backends.onnx.webgpu.powerPreference = "high-performance";
      console.log("[FADA] Set env.backends.onnx.webgpu.powerPreference = high-performance");
    }
    // Also try the top-level webgpu settings
    if (env?.webgpu) {
      env.webgpu.powerPreference = "high-performance";
    }
  } catch (e) {
    console.warn("[FADA] Could not set env power preference:", e);
  }
}

/**
 * Load model + processor + tokenizer. Shows progress via callback.
 * Concurrent calls share one in-flight load; a failed load resets all state
 * so the next call retries from scratch.
 * @param {function} onProgress - Progress callback
 * @param {string} device - "webgpu" or "wasm"
 * @returns {Promise<void>} Resolves when everything is loaded; rejects with
 *   the underlying load error.
 */
export async function loadModel(onProgress, device = "webgpu") {
  if (model && processor && tokenizer) return;
  if (loadingPromise) return loadingPromise;

  loadingPromise = (async () => {
    const {
      Qwen3_5ForConditionalGeneration,
      AutoProcessor,
      AutoTokenizer,
      env,
    } = await import(TRANSFORMERS_CDN);

    // Force high-performance GPU (discrete NVIDIA over integrated Intel)
    if (device === "webgpu" && navigator.gpu) {
      await preferHighPerformanceGpu(env, onProgress);
    }

    onProgress?.({ status: "loading", file: "processor" });
    processor = await AutoProcessor.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Processor loaded");

    onProgress?.({ status: "loading", file: "tokenizer" });
    tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Tokenizer loaded");

    onProgress?.({ status: "loading", file: "model" });
    console.log(`[FADA] Loading model on device=${device}...`);
    activeDevice = device;

    // Vision encoder dtype: fp16 on WebGPU, fp32 on WASM
    const visionDtype = device === "wasm" ? "fp32" : "fp16";

    const buildModelConfig = (vDtype) => ({
      revision: MODEL_REVISION,
      dtype: {
        embed_tokens: "q4",
        vision_encoder: vDtype,
        decoder_model_merged: "q4",
      },
      device: device,
      progress_callback: (info) => {
        if (info.status === "progress") {
          onProgress?.({
            status: "downloading",
            file: info.file || "",
            progress: info.progress || 0,
            loaded: info.loaded || 0,
            total: info.total || 0,
          });
        }
      },
    });

    try {
      onProgress?.({
        status: "compiling",
        file: "model",
        note: "Creating GPU session & compiling shaders (2-5 min)...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig(visionDtype)
      );
    } catch (err) {
      // Fallback: if fp16 not supported, retry with fp32 vision encoder
      const message = err?.message?.toLowerCase() ?? "";
      const isFp16Error =
        visionDtype === "fp16" &&
        (message.includes("does not support fp16") || message.includes("shader"));
      if (!isFp16Error) throw err;

      console.warn("[FADA] FP16 not supported, retrying with FP32 vision encoder...");
      onProgress?.({
        status: "loading",
        file: "model",
        note: "FP16 not supported, loading vision encoder in FP32 mode...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig("fp32")
      );
    }

    console.log("[FADA] Model loaded successfully on", device);
    onProgress?.({ status: "ready" });
  })();

  try {
    await loadingPromise;
  } catch (e) {
    resetLoaderState();
    throw e;
  }
}

/** Convert a token id that may be a BigInt (int64 tensors) to a Number. */
function toNumber(tokenId) {
  return typeof tokenId === "bigint" ? Number(tokenId) : tokenId;
}

/**
 * Scale (width, height) down so the longest side is at most maxDim, keeping
 * the aspect ratio. Sizes already within the limit are returned unchanged.
 * Each side is clamped to at least 1 px so extreme aspect ratios cannot
 * produce an empty canvas.
 */
function fitWithin(width, height, maxDim) {
  if (width <= maxDim && height <= maxDim) return { width, height };
  const scale = maxDim / Math.max(width, height);
  return {
    width: Math.max(1, Math.round(width * scale)),
    height: Math.max(1, Math.round(height * scale)),
  };
}

/**
 * Make sure the generation prompt ends with the pre-closed thinking block
 * ("<think>\n\n</think>\n\n") the model was fine-tuned with.
 * Only the text after the last assistant header is inspected:
 *   1. no <think> at all          -> append the pre-closed block
 *   2. open <think> at the end    -> close it
 *   3. <think> and </think> found -> leave as-is
 * Text without an assistant header is returned unchanged.
 */
function ensurePreClosedThinking(text) {
  const assistantIdx = text.lastIndexOf(ASSISTANT_HEADER);
  if (assistantIdx === -1) return text;

  const afterAssistant = text.slice(assistantIdx);
  if (!afterAssistant.includes(THINK_OPEN)) {
    console.warn("[FADA] Template missing <think> block - appending pre-closed thinking");
    return `${text.trimEnd()}\n${PRE_CLOSED_THINK}`;
  }
  if (!afterAssistant.includes(THINK_CLOSE)) {
    console.warn("[FADA] Template produced open <think> - patching to pre-closed format");
    return text.replace(/<think>\s*$/, PRE_CLOSED_THINK);
  }
  return text;
}

/**
 * Flatten the tensor returned by model.generate() into a plain array of
 * numeric token ids (first batch row only).
 */
function tensorToTokenIds(output) {
  try {
    // For 2D tensor [1, seq_len], .tolist() returns a nested array
    if (output.tolist) {
      const listed = output.tolist();
      // Could be [[tok1, tok2, ...]] (2D) or [tok1, tok2, ...] (1D)
      const row = Array.isArray(listed[0]) ? listed[0] : listed;
      return row.map(toNumber);
    }
    const firstBatch = output[0];
    const tokens = firstBatch?.tolist
      ? firstBatch.tolist()
      : Array.from(firstBatch?.data || firstBatch || output.data || []);
    return tokens.map(toNumber);
  } catch (e) {
    console.warn("[FADA] Tensor conversion fallback:", e);
    // Fallback: try direct Array.from on output data
    return Array.from(output.data || output).map(toNumber);
  }
}

/**
 * Clean decoded model text: drop thinking content and stray role prefixes.
 * The regex removes everything up to and including each closing </think>
 * marker. NOTE(review): that also drops text preceding an unpaired
 * </think> - confirm this is intended.
 */
function cleanGeneratedText(decoded) {
  // Ensure decoded is always a string (safeguard against unexpected return types)
  let textOutput = typeof decoded === "string" ? decoded : String(decoded || "");
  textOutput = textOutput.replace(/[\s\S]*?<\/think>/g, "").trim();

  const roleIdx = textOutput.indexOf("\nassistant\n");
  if (roleIdx !== -1) textOutput = textOutput.slice(0, roleIdx);
  if (textOutput.startsWith("assistant\n")) textOutput = textOutput.slice("assistant\n".length);
  return textOutput.trim();
}

/**
 * Convert a browser image (<img> or <canvas>) to a RawImage, downscaled to
 * MAX_IMAGE_DIM. Anything else (e.g. an existing RawImage) is returned as-is.
 */
async function toRawImage(image) {
  // typeof guards keep this usable where DOM classes do not exist (workers).
  const isDomImage =
    (typeof HTMLImageElement !== "undefined" && image instanceof HTMLImageElement) ||
    (typeof HTMLCanvasElement !== "undefined" && image instanceof HTMLCanvasElement);
  if (!isDomImage) return image;

  const { RawImage } = await import(TRANSFORMERS_CDN);

  const origW = image.naturalWidth || image.width;
  const origH = image.naturalHeight || image.height;
  if (!origW || !origH) {
    throw new Error("Image has no dimensions (is it fully loaded?)");
  }

  // Resize to limit GPU memory (max 672px longest side for WebGPU)
  const { width: processWidth, height: processHeight } = fitWithin(origW, origH, MAX_IMAGE_DIM);

  // Create resized canvas and export as data URL for RawImage.read()
  const resizedCanvas = document.createElement("canvas");
  resizedCanvas.width = processWidth;
  resizedCanvas.height = processHeight;
  const resizedCtx = resizedCanvas.getContext("2d");
  resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
  const dataURL = resizedCanvas.toDataURL("image/png");

  // Use RawImage.read() - the proven way (matches webml-community demo)
  const rawImage = await RawImage.read(dataURL);
  console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
  return rawImage;
}

/**
 * Build the generation prompt for one image + one user text message.
 * The model was trained WITHOUT a system message, just user + assistant, and
 * with thinking pre-closed, so the result always ends with the
 * "<think>\n\n</think>\n\n" block.
 */
function buildPromptText(prompt) {
  const messages = [
    {
      role: "user",
      content: [
        { type: "image" },
        { type: "text", text: prompt },
      ],
    },
  ];
  // enable_thinking is explicitly disabled so the template emits the
  // pre-closed thinking block.
  const templateOpts = {
    add_generation_prompt: true,
    enable_thinking: false,
  };

  let text;
  if (processor.apply_chat_template) {
    text = processor.apply_chat_template(messages, templateOpts);
  } else if (tokenizer.apply_chat_template) {
    text = tokenizer.apply_chat_template(messages, {
      ...templateOpts,
      tokenize: false,
    });
  } else {
    // Manual fallback: must include the pre-closed <think> block
    text = `<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>${prompt}<|im_end|>\n${ASSISTANT_HEADER}\n${PRE_CLOSED_THINK}`;
  }

  // Verify (and if needed repair) the thinking block in the template output.
  return ensurePreClosedThinking(text);
}

/** Log what the processor produced for the image (pixel values, grid, image tokens). */
function logVisionDiagnostics(inputs) {
  console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
  console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));

  if (inputs.pixel_values) {
    console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
    console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
    const pv = inputs.pixel_values.data;
    const sample = Array.from(pv.slice(0, 10));
    console.log("[FADA-VISION] pixel_values first 10:", sample);
    console.log("[FADA-VISION] pixel_values has NaN:", sample.some((v) => Number.isNaN(v)));
    console.log("[FADA-VISION] pixel_values all zero:", sample.every((v) => v === 0));
    // Only the first 1000 values are scanned to keep the spread call small.
    const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
    console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
  } else {
    console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
  }

  if (inputs.image_grid_thw) {
    console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
    console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
  } else {
    console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
  }

  // Check if input_ids contain image tokens (token id 151655 for <|image_pad|>)
  if (inputs.input_ids) {
    const ids = Array.from(inputs.input_ids.data).map(toNumber);
    const imageTokenCount = ids.filter((t) => t === IMAGE_PAD_TOKEN_ID).length;
    const imageTokenCount2 = ids.filter((t) => t === ALT_IMAGE_TOKEN_ID).length;
    console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
    console.log("[FADA-VISION] input_ids length:", ids.length);
    console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));

    // Find image pad tokens
    const firstImageIdx = ids.indexOf(IMAGE_PAD_TOKEN_ID);
    const firstImageIdx2 = ids.indexOf(ALT_IMAGE_TOKEN_ID);
    const imgIdx = firstImageIdx >= 0 ? firstImageIdx : firstImageIdx2;
    if (imgIdx >= 0) {
      console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
      console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx - 3), imgIdx + 10));
    } else {
      console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
    }
  }
  console.log("[FADA-VISION] === End Vision Diagnostic ===");
}

/** Log the prompt, template text and tensor shapes right before generation. */
function logPreGenerationDiagnostics(prompt, text, inputs) {
  console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
  console.log("[FADA-DEBUG] Prompt:", prompt);
  console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
  console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
  console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
  console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
  console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));
  console.log("[FADA-DEBUG] Pixel values shape:", inputs.pixel_values?.dims);
  console.log("[FADA-DEBUG] Image grid thw:", inputs.image_grid_thw?.dims, inputs.image_grid_thw?.data ? Array.from(inputs.image_grid_thw.data).map(Number) : 'N/A');
  console.log("[FADA-DEBUG] Attention mask shape:", inputs.attention_mask?.dims);

  // Decode last 20 input tokens to verify template ending
  try {
    const lastInputTokens = Array.from(inputs.input_ids.data.slice(-20)).map(Number);
    const decodedEnd = tokenizer.decode(lastInputTokens, { skip_special_tokens: false });
    console.log("[FADA-DEBUG] Last 20 input tokens decoded:", JSON.stringify(decodedEnd));
  } catch (e) {
    console.warn("[FADA-DEBUG] Could not decode last input tokens:", e);
  }
  console.log("[FADA-DEBUG] === Starting Generation ===");
}

/**
 * Log details for the "0 new tokens" case. It is only called when slicing
 * the output at inputLen yields nothing, i.e. the output is never longer
 * than the input.
 */
function logEmptyGeneration(outputTokens, inputLen) {
  console.warn("[FADA-DEBUG] *** CRITICAL: 0 new tokens generated! ***");
  console.log(`[FADA-DEBUG] Output length === Input length? ${outputTokens.length === inputLen}`);
  console.log(`[FADA-DEBUG] Output length: ${outputTokens.length} vs Input length: ${inputLen}`);
  console.log(`[FADA-DEBUG] Last 10 output token IDs: [${outputTokens.slice(-10).join(", ")}]`);

  // Try decoding last few tokens to understand what happened
  if (outputTokens.length > 0) {
    const lastFew = tokenizer.decode(outputTokens.slice(-10), { skip_special_tokens: false });
    console.log(`[FADA-DEBUG] Last 10 tokens decoded (no skip): ${JSON.stringify(lastFew)}`);
  }

  if (outputTokens.length < inputLen) {
    console.log(`[FADA-DEBUG] STRANGE: output shorter than input! Model may have truncated.`);
    return;
  }

  console.log(`[FADA-DEBUG] Output exactly equals input - model generated EOS immediately or nothing at all.`);
  if (outputTokens.length > 0) {
    // Check what the last token IS
    const lastToken = outputTokens[outputTokens.length - 1];
    console.log(`[FADA-DEBUG] Last token ID: ${lastToken}, decoded: ${JSON.stringify(tokenizer.decode([lastToken], { skip_special_tokens: false }))}`);
  }
}

/**
 * Run inference on a single image with a text prompt.
 * Uses the community pattern: processor.apply_chat_template + model.generate + tokenizer.decode
 * No manual KV cache logic - transformers.js handles it internally.
 *
 * @param {HTMLImageElement|HTMLCanvasElement|RawImage} image
 * @param {string} prompt
 * @param {object} opts - { maxNewTokens, temperature, onToken }.
 *   maxNewTokens defaults to 1024 and temperature to 0.1; a temperature of 0
 *   (or less) disables sampling. onToken is accepted but not used here.
 * @returns {Promise<string>} Generated text with thinking content and stray
 *   role prefixes removed.
 * @throws {Error} "Model not loaded" before loadModel() has completed; a
 *   "GPU ran out of memory" error when the device is lost during generation.
 */
export async function runInference(image, prompt, opts = {}) {
  if (!model || !processor || !tokenizer) throw new Error("Model not loaded");

  const rawImage = await toRawImage(image);

  const maxNewTokens = opts.maxNewTokens || 1024;
  const temperature = opts.temperature ?? 0.1;

  console.log("[FADA] Building inputs...");
  const text = buildPromptText(prompt);

  // Debug: log the template output (first and last parts)
  console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
  console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));

  // Call processor with SINGLE image (not array!) - matches webml-community demo pattern
  const inputs = await processor(text, rawImage);
  const inputLen = inputs.input_ids.dims[1];
  console.log(
    `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
  );

  logVisionDiagnostics(inputs);
  logPreGenerationDiagnostics(prompt, text, inputs);

  // Generate - transformers.js handles KV cache internally
  const t0 = performance.now();
  let output;
  try {
    output = await model.generate({
      ...inputs,
      max_new_tokens: maxNewTokens,
      temperature: temperature > 0 ? temperature : undefined,
      do_sample: temperature > 0,
      top_p: 0.95,
    });
  } catch (err) {
    console.error("[FADA-DEBUG] Generation error:", err);
    const message = err?.message ?? "";
    const deviceLost = message.includes("Device") && message.includes("lost");
    if (deviceLost || message.includes("mapAsync")) {
      throw new Error(GPU_OOM_MESSAGE, { cause: err });
    }
    throw err;
  }
  // Keep the numeric duration for the rate; genTime is display-only.
  const elapsedSec = (performance.now() - t0) / 1000;
  const genTime = elapsedSec.toFixed(1);
  console.log("[FADA-DEBUG] Generation completed in", genTime, "seconds");

  const outputDims = output.dims || [];
  const outputTokens = tensorToTokenIds(output);

  // === DEEP DEBUG: Post-generation analysis ===
  console.log("[FADA-DEBUG] === Post-Generation Analysis ===");
  console.log(`[FADA-DEBUG] Output raw dims: ${JSON.stringify(outputDims)}`);
  console.log(`[FADA-DEBUG] Output type: ${typeof output}, constructor: ${output?.constructor?.name}`);
  console.log(`[FADA-DEBUG] outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);
  console.log(`[FADA] Output tensor dims: ${JSON.stringify(outputDims)}, outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);

  // Decode only generated tokens (skip input)
  const newTokens = outputTokens.slice(inputLen);
  const numGenerated = newTokens.length;

  if (numGenerated > 0) {
    // Debug: log first few generated tokens
    console.log(`[FADA] First 20 new token IDs: [${newTokens.slice(0, 20).join(", ")}]`);
    const rawDecoded = tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: false });
    console.log(`[FADA] First 30 tokens decoded (with special): ${JSON.stringify(rawDecoded)}`);
    console.log(`[FADA-DEBUG] Decoded (skip_special=true): ${JSON.stringify(tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: true }))}`);
  } else {
    logEmptyGeneration(outputTokens, inputLen);
  }

  const decoded = tokenizer.decode(newTokens, {
    skip_special_tokens: true,
  });

  const tokensPerSec = numGenerated / (elapsedSec > 0 ? elapsedSec : 1);
  console.log(
    `[FADA] Generated ${numGenerated} tokens in ${genTime}s (${tokensPerSec.toFixed(1)} tok/s)`
  );

  const textOutput = cleanGeneratedText(decoded);
  if (numGenerated > 0 && textOutput.length === 0) {
    console.warn("[FADA] Generated tokens but cleaned output is empty. Raw decoded:", JSON.stringify(decoded?.slice(0, 200)));
  }
  return textOutput;
}

/**
 * Text-only diagnostic test - runs inference WITHOUT any image.
 * If this produces output, the decoder works and the issue is in the vision pipeline.
 * If this also produces 0 tokens, the issue is in the decoder/generation loop.
 *
 * @returns {Promise<string>} A "TEXT-ONLY TEST PASSED ..." or
 *   "TEXT-ONLY TEST FAILED ..." summary line.
 * @throws {Error} "Model not loaded" when the model or tokenizer is missing.
 */
export async function testTextOnly() {
  if (!model || !tokenizer) throw new Error("Model not loaded");

  console.log("[FADA-TEST] === TEXT-ONLY DIAGNOSTIC ===");

  // Simple text-only message (no image)
  const messages = [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Say hello and tell me what you can do." },
  ];

  let text;
  try {
    text = tokenizer.apply_chat_template(messages, {
      add_generation_prompt: true,
      tokenize: false,
    });
  } catch (e) {
    // Template unavailable: fall back to a hand-written equivalent prompt.
    text = `<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nSay hello and tell me what you can do.<|im_end|>\n<|im_start|>assistant\n`;
  }

  // Add pre-closed thinking block if model expects it (same rule as runInference)
  text = ensurePreClosedThinking(text);

  console.log("[FADA-TEST] Template:", text);

  const inputs = tokenizer(text, { return_tensors: "pt" });
  console.log("[FADA-TEST] Input IDs shape:", inputs.input_ids.dims);
  console.log("[FADA-TEST] Input length:", inputs.input_ids.dims[1]);

  const t0 = performance.now();
  const output = await model.generate({
    ...inputs,
    max_new_tokens: 50,
    temperature: 0.7,
    do_sample: true,
    top_p: 0.9,
  });
  const elapsed = ((performance.now() - t0) / 1000).toFixed(1);

  // Decode output; a conversion failure is reported as "0 tokens" rather than thrown
  let outputTokens;
  try {
    outputTokens = tensorToTokenIds(output);
  } catch (e) {
    console.error("[FADA-TEST] Output conversion error:", e);
    outputTokens = [];
  }

  const inputLen = inputs.input_ids.dims[1];
  const newTokens = outputTokens.slice(inputLen);

  console.log("[FADA-TEST] Output total tokens:", outputTokens.length);
  console.log("[FADA-TEST] New tokens:", newTokens.length);
  console.log("[FADA-TEST] Time:", elapsed, "s");

  if (newTokens.length > 0) {
    console.log("[FADA-TEST] First 20 tokens:", newTokens.slice(0, 20));
    const decoded = tokenizer.decode(newTokens, { skip_special_tokens: true });
    console.log("[FADA-TEST] \u2713 DECODED:", decoded);
    return `TEXT-ONLY TEST PASSED (${newTokens.length} tokens in ${elapsed}s): ${decoded}`;
  }

  console.log("[FADA-TEST] \u2717 0 tokens generated - DECODER ITSELF IS BROKEN ON WEBGPU");
  console.log("[FADA-TEST] Last 5 tokens of output:", outputTokens.slice(-5));
  const lastDecoded = tokenizer.decode(outputTokens.slice(-5), { skip_special_tokens: false });
  console.log("[FADA-TEST] Last tokens decoded:", lastDecoded);
  return `TEXT-ONLY TEST FAILED: 0 tokens in ${elapsed}s. Decoder broken on WebGPU.`;
}