Spaces:
Running
Running
/**
 * Model loader and inference using transformers.js v4.2.0 + WebGPU.
 * Follows the proven pattern from webml-community/Qwen3.5-WebGPU.
 * Loads Qwen3.5-VL 0.8B ONNX from mshz88/FADA-Mobile-ONNX.
 */
// ---------------------------------------------------------------------------
// Where the artifacts come from
// ---------------------------------------------------------------------------

/** Hugging Face Hub repository id passed to every from_pretrained() call. */
const MODEL_ID = "mshz88/FADA-Mobile-ONNX";

/** Pin to specific commit, bypass CDN cache. */
const MODEL_REVISION = "2936611f4ad147e0cbe03e3884de8a42c5cc42b9";

/** URL of the transformers.js build that is loaded with a dynamic import(). */
const TRANSFORMERS_CDN = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

// ---------------------------------------------------------------------------
// Module-level state shared by the exported functions of this file.
// Everything starts out as null and is filled in by loadModel().
// ---------------------------------------------------------------------------

/** Loaded processor instance, or null. */
let processor = null;

/** Loaded model instance, or null. */
let model = null;

/** Loaded tokenizer instance, or null. */
let tokenizer = null;

/** Promise of the load that was started, or null when none was started. */
let loadingPromise = null;

/** Device string the model load was requested on, or null. */
let activeDevice = null;
/**
 * Report the current stage of the model load.
 *
 * @returns {"ready"|"loading"|"idle"} "ready" once model, processor and
 *   tokenizer are all set; "loading" while a load promise exists; "idle"
 *   otherwise.
 */
export function getLoadingStatus() {
  const everythingLoaded = Boolean(model && processor && tokenizer);
  if (everythingLoaded) {
    return "ready";
  }
  return loadingPromise ? "loading" : "idle";
}
/**
 * Expose the device string recorded in the module-level `activeDevice`.
 *
 * @returns {string|null} The stored device, or null when none is recorded.
 */
export function getActiveDevice() {
  const currentDevice = activeDevice;
  return currentDevice;
}
/**
 * Best-effort GPU selection: request the high-performance adapter, report
 * which one was picked, and set the power preference on the transformers.js
 * `env` object where those settings exist. Failures are logged and never
 * thrown, so they cannot abort the model load.
 *
 * @param {object} env - The `env` export of the transformers.js module.
 * @param {function} [onProgress] - Progress callback; receives one
 *   `{ status: "info", note }` event when adapter details are available.
 */
async function preferHighPerformanceGpu(env, onProgress) {
  try {
    const adapter = await navigator.gpu.requestAdapter({
      powerPreference: "high-performance",
    });
    if (adapter) {
      // `adapter.info` is the standard attribute. `requestAdapterInfo()` is the
      // older method form; it is only tried when the attribute is missing.
      const info = adapter.info ?? (await adapter.requestAdapterInfo?.());
      if (info) {
        console.log("[FADA] GPU adapter selected:", info.vendor, info.architecture, info.device, info.description);
        onProgress?.({ status: "info", note: `GPU: ${info.description || info.vendor || 'discrete GPU'}` });
        // Store globally for diagnostics (globalThis also exists outside a window context).
        globalThis.__FADA_GPU_INFO = info;
      } else {
        console.warn("[FADA] GPU adapter found but it exposes no adapter info");
      }
    } else {
      console.warn("[FADA] No high-performance GPU adapter found");
    }
  } catch (e) {
    console.warn("[FADA] Could not query GPU adapter:", e);
  }

  // Try to set power preference at the env/backend level for transformers.js
  try {
    if (env?.backends?.onnx?.webgpu) {
      env.backends.onnx.webgpu.powerPreference = "high-performance";
      console.log("[FADA] Set env.backends.onnx.webgpu.powerPreference = high-performance");
    }
    // Also try the top-level webgpu settings
    if (env?.webgpu) {
      env.webgpu.powerPreference = "high-performance";
    }
  } catch (e) {
    console.warn("[FADA] Could not set env power preference:", e);
  }
}

/**
 * Load model + processor + tokenizer. Shows progress via callback.
 *
 * Returns immediately when everything is already loaded. A call made while a
 * load is in flight returns the promise of that load. If loading fails, all
 * module-level state is reset and the error is rethrown.
 *
 * The vision encoder is requested as fp16 unless `device` is "wasm". If the
 * fp16 load fails with an error message mentioning missing fp16 support or
 * shaders, the model is loaded again with an fp32 vision encoder.
 *
 * @param {function} [onProgress] - Progress callback. Receives objects with a
 *   `status` of "info", "loading", "downloading", "compiling" or "ready".
 * @param {string} [device="webgpu"] - "webgpu" or "wasm"
 * @returns {Promise<void>}
 * @throws Rethrows any error raised while importing the library or loading
 *   the processor, tokenizer or model.
 */
export async function loadModel(onProgress, device = "webgpu") {
  if (model && processor && tokenizer) return;
  if (loadingPromise) return loadingPromise;

  loadingPromise = (async () => {
    const {
      Qwen3_5ForConditionalGeneration,
      AutoProcessor,
      AutoTokenizer,
      env,
    } = await import(TRANSFORMERS_CDN);

    // Force high-performance GPU (discrete NVIDIA over integrated Intel).
    // `typeof` keeps this safe where no `navigator` global exists.
    const hasWebGpu = typeof navigator !== "undefined" && Boolean(navigator.gpu);
    if (device === "webgpu" && hasWebGpu) {
      await preferHighPerformanceGpu(env, onProgress);
    }

    onProgress?.({ status: "loading", file: "processor" });
    processor = await AutoProcessor.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Processor loaded");

    onProgress?.({ status: "loading", file: "tokenizer" });
    tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Tokenizer loaded");

    onProgress?.({ status: "loading", file: "model" });
    console.log(`[FADA] Loading model on device=${device}...`);
    activeDevice = device;

    // Vision encoder dtype: fp16 on WebGPU, fp32 on WASM
    const visionDtype = device === "wasm" ? "fp32" : "fp16";

    const buildModelConfig = (vDtype) => ({
      revision: MODEL_REVISION,
      dtype: {
        embed_tokens: "q4",
        vision_encoder: vDtype,
        decoder_model_merged: "q4",
      },
      device,
      // Only "progress" events are forwarded, re-labelled as "downloading".
      progress_callback: (info) => {
        if (info.status === "progress") {
          onProgress?.({
            status: "downloading",
            file: info.file || "",
            progress: info.progress || 0,
            loaded: info.loaded || 0,
            total: info.total || 0,
          });
        }
      },
    });

    try {
      onProgress?.({
        status: "compiling",
        file: "model",
        note: "Creating GPU session & compiling shaders (2-5 min)...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig(visionDtype)
      );
    } catch (err) {
      // Fallback: if fp16 not supported, retry with fp32 vision encoder
      const message = err?.message?.toLowerCase() ?? "";
      const isFp16Error =
        visionDtype === "fp16" &&
        (message.includes("does not support fp16") || message.includes("shader"));
      if (!isFp16Error) {
        throw err;
      }
      console.warn("[FADA] FP16 not supported, retrying with FP32 vision encoder...");
      onProgress?.({
        status: "loading",
        file: "model",
        note: "FP16 not supported, loading vision encoder in FP32 mode...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig("fp32")
      );
    }

    console.log("[FADA] Model loaded successfully on", device);
    onProgress?.({ status: "ready" });
  })();

  try {
    await loadingPromise;
  } catch (e) {
    // Leave no half-loaded state behind so a later call can start over.
    loadingPromise = null;
    model = null;
    processor = null;
    tokenizer = null;
    activeDevice = null;
    throw e;
  }
}
/**
 * Tell whether `image` is a DOM <img> or <canvas> element. The `typeof`
 * guards keep the check from throwing where those classes are not defined.
 */
function isDomImageSource(image) {
  const isImageElement = typeof HTMLImageElement !== "undefined" && image instanceof HTMLImageElement;
  const isCanvasElement = typeof HTMLCanvasElement !== "undefined" && image instanceof HTMLCanvasElement;
  return isImageElement || isCanvasElement;
}

/**
 * Convert a DOM image/canvas into a RawImage, downscaled so its longest side
 * is at most 672px. Any other value is returned unchanged.
 */
async function toRawImage(image, RawImage) {
  if (!isDomImageSource(image)) return image;

  const origW = image.naturalWidth || image.width;
  const origH = image.naturalHeight || image.height;
  if (!origW || !origH) {
    throw new Error("Image has no pixels (0x0). Make sure it has finished loading before running inference.");
  }

  // Resize to limit GPU memory (max 672px longest side for WebGPU)
  const MAX_DIM = 672;
  let processWidth = origW;
  let processHeight = origH;
  if (processWidth > MAX_DIM || processHeight > MAX_DIM) {
    const scale = MAX_DIM / Math.max(processWidth, processHeight);
    processWidth = Math.round(processWidth * scale);
    processHeight = Math.round(processHeight * scale);
  }

  // Create resized canvas and export as data URL for RawImage.read()
  const resizedCanvas = document.createElement("canvas");
  resizedCanvas.width = processWidth;
  resizedCanvas.height = processHeight;
  const resizedCtx = resizedCanvas.getContext("2d");
  resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
  const dataURL = resizedCanvas.toDataURL("image/png");

  // Use RawImage.read() — the proven way (matches webml-community demo)
  const rawImage = await RawImage.read(dataURL);
  console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
  return rawImage;
}

/**
 * Make sure the generation prompt ends with the pre-closed thinking block
 * `<think>\n\n</think>\n\n` after the last assistant header:
 *   1. no <think> after the assistant header -> append the closed block
 *   2. <think> without </think>              -> close it
 *   3. already closed                        -> returned unchanged
 * Text without an assistant header is returned unchanged.
 */
function ensurePreClosedThinking(text) {
  const assistantIdx = text.lastIndexOf("<|im_start|>assistant");
  if (assistantIdx === -1) return text;

  const afterAssistant = text.slice(assistantIdx);
  if (!afterAssistant.includes("<think>")) {
    console.warn("[FADA] Template missing <think> block - appending pre-closed thinking");
    return text.trimEnd() + "\n<think>\n\n</think>\n\n";
  }
  if (!afterAssistant.includes("</think>")) {
    console.warn("[FADA] Template produced open <think> - patching to pre-closed format");
    return text.replace(/<think>\s*$/, "<think>\n\n</think>\n\n");
  }
  return text;
}

/** Log what the processor produced for the image (pixel values, grid, image tokens). */
function logVisionDiagnostics(inputs) {
  console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
  console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));

  if (inputs.pixel_values) {
    console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
    console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
    const pv = inputs.pixel_values.data;
    const sample = Array.from(pv.slice(0, 10));
    console.log("[FADA-VISION] pixel_values first 10:", sample);
    console.log("[FADA-VISION] pixel_values has NaN:", sample.some((v) => Number.isNaN(v)));
    console.log("[FADA-VISION] pixel_values all zero:", sample.every((v) => v === 0));
    const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
    console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
  } else {
    console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
  }

  if (inputs.image_grid_thw) {
    console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
    console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
  } else {
    console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
  }

  if (inputs.input_ids) {
    // Two candidate ids are checked for the <|image_pad|> token.
    const IMAGE_PAD_ID = 151655;
    const IMAGE_PAD_ID_ALT = 248056;
    const ids = Array.from(inputs.input_ids.data).map((t) => (typeof t === "bigint" ? Number(t) : t));
    const imageTokenCount = ids.filter((t) => t === IMAGE_PAD_ID).length;
    const imageTokenCount2 = ids.filter((t) => t === IMAGE_PAD_ID_ALT).length;
    console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
    console.log("[FADA-VISION] input_ids length:", ids.length);
    console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));

    const firstImageIdx = ids.indexOf(IMAGE_PAD_ID);
    const imgIdx = firstImageIdx >= 0 ? firstImageIdx : ids.indexOf(IMAGE_PAD_ID_ALT);
    if (imgIdx >= 0) {
      console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
      console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx - 3), imgIdx + 10));
    } else {
      console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
    }
  }
  console.log("[FADA-VISION] === End Vision Diagnostic ===");
}

/** Log the prompt, the template text and the tensor shapes right before generation. */
function logPreGenerationDiagnostics(prompt, text, inputs) {
  console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
  console.log("[FADA-DEBUG] Prompt:", prompt);
  console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
  console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
  console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
  console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
  console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));
  console.log("[FADA-DEBUG] Pixel values shape:", inputs.pixel_values?.dims);
  console.log("[FADA-DEBUG] Image grid thw:", inputs.image_grid_thw?.dims, inputs.image_grid_thw?.data ? Array.from(inputs.image_grid_thw.data).map(Number) : 'N/A');
  console.log("[FADA-DEBUG] Attention mask shape:", inputs.attention_mask?.dims);
  // Decode last 20 input tokens to verify template ending
  try {
    const lastInputTokens = Array.from(inputs.input_ids.data.slice(-20)).map(Number);
    const decodedEnd = tokenizer.decode(lastInputTokens, { skip_special_tokens: false });
    console.log("[FADA-DEBUG] Last 20 input tokens decoded:", JSON.stringify(decodedEnd));
  } catch (e) {
    console.warn("[FADA-DEBUG] Could not decode last input tokens:", e);
  }
  console.log("[FADA-DEBUG] === Starting Generation ===");
}

/**
 * Flatten the value returned by model.generate() into a plain array of
 * numeric token ids for the first sequence.
 */
function toTokenIds(output) {
  const asNumber = (t) => (typeof t === "bigint" ? Number(t) : t);
  try {
    if (output.tolist) {
      const listed = output.tolist();
      // Could be [[tok1, tok2, ...]] (2D) or [tok1, tok2, ...] (1D)
      const flat = Array.isArray(listed[0]) ? listed[0] : listed;
      return flat.map(asNumber);
    }
    const firstBatch = output[0];
    const allTokens = firstBatch?.tolist
      ? firstBatch.tolist()
      : Array.from(firstBatch?.data || firstBatch || output.data || []);
    return allTokens.map(asNumber);
  } catch (e) {
    console.warn("[FADA] Tensor conversion fallback:", e);
    // Fallback: try direct Array.from on output data
    return Array.from(output.data || output).map(asNumber);
  }
}

/** Log how many tokens came back and, when none did, why that might be. */
function logPostGenerationDiagnostics(output, outputTokens, newTokens, inputLen) {
  const outputDims = output.dims || [];
  console.log("[FADA-DEBUG] === Post-Generation Analysis ===");
  console.log(`[FADA-DEBUG] Output raw dims: ${JSON.stringify(outputDims)}`);
  console.log(`[FADA-DEBUG] Output type: ${typeof output}, constructor: ${output?.constructor?.name}`);
  console.log(`[FADA-DEBUG] outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);
  console.log(`[FADA] Output tensor dims: ${JSON.stringify(outputDims)}, outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);

  if (newTokens.length > 0) {
    console.log(`[FADA] First 20 new token IDs: [${newTokens.slice(0, 20).join(", ")}]`);
    const rawDecoded = tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: false });
    console.log(`[FADA] First 30 tokens decoded (with special): ${JSON.stringify(rawDecoded)}`);
    console.log(`[FADA-DEBUG] Decoded (skip_special=true): ${JSON.stringify(tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: true }))}`);
    return;
  }

  // No new tokens means the output is not longer than the input.
  console.warn("[FADA-DEBUG] *** CRITICAL: 0 new tokens generated! ***");
  console.log(`[FADA-DEBUG] Output length === Input length? ${outputTokens.length === inputLen}`);
  console.log(`[FADA-DEBUG] Output length: ${outputTokens.length} vs Input length: ${inputLen}`);
  console.log(`[FADA-DEBUG] Last 10 output token IDs: [${outputTokens.slice(-10).join(", ")}]`);
  if (outputTokens.length > 0) {
    const lastFew = tokenizer.decode(outputTokens.slice(-10), { skip_special_tokens: false });
    console.log(`[FADA-DEBUG] Last 10 tokens decoded (no skip): ${JSON.stringify(lastFew)}`);
  }
  if (outputTokens.length < inputLen) {
    console.log(`[FADA-DEBUG] STRANGE: output shorter than input! Model may have truncated.`);
  } else {
    console.log(`[FADA-DEBUG] Output exactly equals input - model generated EOS immediately or nothing at all.`);
    const lastToken = outputTokens[outputTokens.length - 1];
    console.log(`[FADA-DEBUG] Last token ID: ${lastToken}, decoded: ${JSON.stringify(tokenizer.decode([lastToken], { skip_special_tokens: false }))}`);
  }
}

/** Strip <think>...</think> blocks and stray "assistant" role prefixes from decoded text. */
function cleanGeneratedText(decoded) {
  // Ensure we always work on a string (safeguard against unexpected return types)
  let textOutput = typeof decoded === "string" ? decoded : String(decoded || "");
  textOutput = textOutput.replace(/<think>[\s\S]*?<\/think>/g, "").trim();
  const roleIdx = textOutput.indexOf("\nassistant\n");
  if (roleIdx !== -1) textOutput = textOutput.slice(0, roleIdx);
  if (textOutput.startsWith("assistant\n")) textOutput = textOutput.slice("assistant\n".length);
  return textOutput;
}

/**
 * Run inference on a single image with a text prompt.
 * Uses the community pattern: processor.apply_chat_template + model.generate + tokenizer.decode
 * No manual KV cache logic — transformers.js handles it internally.
 *
 * The prompt is sent as a single user message (no system message) and the
 * generation prompt is forced to end with a pre-closed `<think>` block.
 *
 * @param {HTMLImageElement|HTMLCanvasElement|RawImage} image - DOM sources are
 *   downscaled to at most 672px on the longest side; other values are passed
 *   to the processor unchanged.
 * @param {string} prompt
 * @param {object} [opts]
 * @param {number} [opts.maxNewTokens=1024]
 * @param {number} [opts.temperature=0.1] - Sampling is enabled only when > 0.
 * @param {function(string): void} [opts.onToken] - When given (and the library
 *   exports TextStreamer), called with each decoded text chunk as it is
 *   produced. Chunks are not cleaned the way the return value is.
 * @returns {Promise<string>} Generated text with <think> blocks and role prefixes removed.
 * @throws {Error} "Model not loaded" when loadModel() has not completed; a
 *   GPU out-of-memory error (original error in `cause`) when generation fails
 *   with a lost device or mapAsync failure.
 */
export async function runInference(image, prompt, opts = {}) {
  if (!model || !processor || !tokenizer) throw new Error("Model not loaded");

  const { RawImage, TextStreamer } = await import(TRANSFORMERS_CDN);
  const rawImage = await toRawImage(image, RawImage);

  const maxNewTokens = opts.maxNewTokens || 1024;
  const temperature = opts.temperature ?? 0.1;

  // The model was trained WITHOUT a system message, just user + assistant.
  const messages = [
    {
      role: "user",
      content: [
        { type: "image" },
        { type: "text", text: prompt },
      ],
    },
  ];

  // The model was fine-tuned with thinking PRE-CLOSED, so thinking is
  // explicitly disabled in the template.
  console.log("[FADA] Building inputs...");
  const templateOpts = {
    add_generation_prompt: true,
    enable_thinking: false,
  };
  let text;
  if (processor.apply_chat_template) {
    text = processor.apply_chat_template(messages, templateOpts);
  } else if (tokenizer.apply_chat_template) {
    text = tokenizer.apply_chat_template(messages, {
      ...templateOpts,
      tokenize: false,
    });
  } else {
    // Manual fallback: must include the pre-closed <think> block
    text = `<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>${prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n`;
  }
  text = ensurePreClosedThinking(text);

  console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
  console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));

  // Call processor with SINGLE image (not array!) — matches webml-community demo pattern
  const inputs = await processor(text, rawImage);
  const inputLen = inputs.input_ids.dims[1];
  console.log(
    `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
  );

  logVisionDiagnostics(inputs);
  logPreGenerationDiagnostics(prompt, text, inputs);

  // Stream text chunks to the caller only when a callback was supplied, so
  // the generate() arguments are otherwise unchanged.
  let streamer;
  if (typeof opts.onToken === "function" && TextStreamer) {
    streamer = new TextStreamer(tokenizer, {
      skip_prompt: true,
      skip_special_tokens: true,
      callback_function: (chunk) => opts.onToken(chunk),
    });
  }

  // Generate — transformers.js handles KV cache internally
  const t0 = performance.now();
  let output;
  try {
    output = await model.generate({
      ...inputs,
      max_new_tokens: maxNewTokens,
      temperature: temperature > 0 ? temperature : undefined,
      do_sample: temperature > 0,
      top_p: 0.95,
      ...(streamer ? { streamer } : {}),
    });
  } catch (err) {
    console.error("[FADA-DEBUG] Generation error:", err);
    const message = err?.message ?? "";
    const isDeviceLost = message.includes("Device") && message.includes("lost");
    if (isDeviceLost || message.includes("mapAsync")) {
      throw new Error("GPU ran out of memory. Try a smaller image or switch to CPU (WASM) mode.", { cause: err });
    }
    throw err;
  }
  const elapsedSeconds = (performance.now() - t0) / 1000;
  const genTime = elapsedSeconds.toFixed(1);
  console.log("[FADA-DEBUG] Generation completed in", genTime, "seconds");

  // Decode only generated tokens (skip input)
  const outputTokens = toTokenIds(output);
  const newTokens = outputTokens.slice(inputLen);
  const numGenerated = newTokens.length;
  logPostGenerationDiagnostics(output, outputTokens, newTokens, inputLen);

  const decoded = tokenizer.decode(newTokens, {
    skip_special_tokens: true,
  });

  // Rate is computed from the numeric duration; a zero duration reports 0.0
  // instead of dividing by zero.
  const tokensPerSecond = elapsedSeconds > 0 ? numGenerated / elapsedSeconds : 0;
  console.log(
    `[FADA] Generated ${numGenerated} tokens in ${genTime}s (${tokensPerSecond.toFixed(1)} tok/s)`
  );

  const textOutput = cleanGeneratedText(decoded);
  if (numGenerated > 0 && textOutput.length === 0) {
    console.warn("[FADA] Generated tokens but cleaned output is empty. Raw decoded:", JSON.stringify(String(decoded ?? "").slice(0, 200)));
  }
  return textOutput.trim();
}
/**
 * Text-only diagnostic test - runs inference WITHOUT any image.
 * If this produces output, the decoder works and the issue is in the vision pipeline.
 * If this also produces 0 tokens, the issue is in the decoder/generation loop.
 *
 * Generates up to 50 sampled tokens for a fixed system + user prompt.
 *
 * @returns {Promise<string>} A line starting with "TEXT-ONLY TEST PASSED"
 *   (followed by the decoded text) or "TEXT-ONLY TEST FAILED".
 * @throws {Error} "Model not loaded" when the model or tokenizer is missing;
 *   any error raised by model.generate() propagates unchanged.
 */
export async function testTextOnly() {
  if (!model || !tokenizer) throw new Error("Model not loaded");
  console.log("[FADA-TEST] === TEXT-ONLY DIAGNOSTIC ===");

  // Simple text-only message (no image)
  const messages = [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Say hello and tell me what you can do." },
  ];

  let text;
  try {
    // Thinking is disabled explicitly, matching what runInference() passes.
    text = tokenizer.apply_chat_template(messages, {
      add_generation_prompt: true,
      enable_thinking: false,
      tokenize: false,
    });
  } catch (e) {
    console.warn("[FADA-TEST] apply_chat_template failed, using manual prompt:", e);
    text = `<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nSay hello and tell me what you can do.<|im_end|>\n<|im_start|>assistant\n`;
  }

  // Make sure the prompt ends with a pre-closed thinking block.
  if (!text.includes("<think>")) {
    text += "<think>\n\n</think>\n\n";
  } else if (!text.includes("</think>")) {
    // Template left <think> open: close it so the run matches runInference().
    text = text.replace(/<think>\s*$/, "<think>\n\n</think>\n\n");
  }
  console.log("[FADA-TEST] Template:", text);

  // transformers.js spells the option `return_tensor` (boolean).
  const inputs = tokenizer(text, { return_tensor: true });
  const inputLen = inputs.input_ids.dims[1];
  console.log("[FADA-TEST] Input IDs shape:", inputs.input_ids.dims);
  console.log("[FADA-TEST] Input length:", inputLen);

  const t0 = performance.now();
  const output = await model.generate({
    ...inputs,
    max_new_tokens: 50,
    temperature: 0.7,
    do_sample: true,
    top_p: 0.9,
  });
  const elapsed = ((performance.now() - t0) / 1000).toFixed(1);

  // Flatten the generate() result into numeric token ids of the first sequence.
  const asNumber = (t) => (typeof t === "bigint" ? Number(t) : t);
  let outputTokens;
  try {
    if (output.tolist) {
      const list = output.tolist();
      outputTokens = (Array.isArray(list[0]) ? list[0] : list).map(asNumber);
    } else {
      const firstBatch = output[0];
      const allTokens = firstBatch.tolist ? firstBatch.tolist() : Array.from(firstBatch.data || firstBatch);
      outputTokens = allTokens.map(asNumber);
    }
  } catch (e) {
    console.error("[FADA-TEST] Output conversion error:", e);
    outputTokens = [];
  }

  const newTokens = outputTokens.slice(inputLen);
  console.log("[FADA-TEST] Output total tokens:", outputTokens.length);
  console.log("[FADA-TEST] New tokens:", newTokens.length);
  console.log("[FADA-TEST] Time:", elapsed, "s");

  if (newTokens.length > 0) {
    console.log("[FADA-TEST] First 20 tokens:", newTokens.slice(0, 20));
    const decoded = tokenizer.decode(newTokens, { skip_special_tokens: true });
    console.log("[FADA-TEST] \u2713 DECODED:", decoded);
    return `TEXT-ONLY TEST PASSED (${newTokens.length} tokens in ${elapsed}s): ${decoded}`;
  }

  // Name the device the model actually runs on instead of always blaming WebGPU.
  const deviceLabel = activeDevice === "wasm" ? "WASM" : "WebGPU";
  console.log(`[FADA-TEST] \u2717 0 tokens generated - DECODER ITSELF IS BROKEN ON ${deviceLabel.toUpperCase()}`);
  console.log("[FADA-TEST] Last 5 tokens of output:", outputTokens.slice(-5));
  const lastDecoded = tokenizer.decode(outputTokens.slice(-5), { skip_special_tokens: false });
  console.log("[FADA-TEST] Last tokens decoded:", lastDecoded);
  return `TEXT-ONLY TEST FAILED: 0 tokens in ${elapsed}s. Decoder broken on ${deviceLabel}.`;
}