FADA-Mobile / js /model.js
mshz88's picture
Upload js/model.js with huggingface_hub
e4e4d61 verified
/**
* Model loader and inference using transformers.js v4.2.0 + WebGPU.
* Follows the proven pattern from webml-community/Qwen3.5-WebGPU.
* Loads Qwen3.5-VL 0.8B ONNX from mshz88/FADA-Mobile-ONNX.
*/
// --- Remote resources -------------------------------------------------------

// ES module URL that transformers.js is dynamically imported from.
const TRANSFORMERS_CDN = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@4.2.0";

// Hugging Face repository that hosts the ONNX export.
const MODEL_ID = "mshz88/FADA-Mobile-ONNX";

// Pin to specific commit, bypass CDN cache
const MODEL_REVISION = "2936611f4ad147e0cbe03e3884de8a42c5cc42b9";

// --- Module-level loader state (null until loadModel() fills it in) ----------

// Loaded model, processor and tokenizer instances.
let model = null;
let processor = null;
let tokenizer = null;

// Promise of the load currently in flight; lets concurrent callers share it.
let loadingPromise = null;

// Device string the model was (or is being) loaded on.
let activeDevice = null;
/**
 * Report the loader state derived from the module-level variables.
 *
 * @returns {string} "ready" when model, processor and tokenizer are all set,
 *   "loading" when a load promise exists but not everything is set yet,
 *   otherwise "idle".
 */
export function getLoadingStatus() {
  const everythingLoaded = Boolean(model && processor && tokenizer);
  if (everythingLoaded) {
    return "ready";
  }
  return loadingPromise ? "loading" : "idle";
}
/**
 * Accessor for the module-level `activeDevice` value.
 *
 * @returns {string|null} Whatever `activeDevice` currently holds (it starts
 *   out as null).
 */
export function getActiveDevice() {
  const currentDevice = activeDevice;
  return currentDevice;
}
/**
 * Load model + processor + tokenizer and cache them in module state.
 *
 * - Resolves immediately when everything is already loaded.
 * - Concurrent callers share the promise of the load that is in flight.
 * - On failure all cached state is cleared before the error is re-thrown, so
 *   a later call starts again from scratch.
 *
 * @param {function} [onProgress] - Optional progress callback. Receives plain
 *   objects with a `status` of "info", "loading", "compiling", "downloading"
 *   or "ready" (plus `file`, `note`, `progress`, `loaded`, `total` depending
 *   on the status).
 * @param {string} [device="webgpu"] - "webgpu" or "wasm"
 * @returns {Promise<void>} Settles when loading has finished.
 * @throws Re-throws the error that made loading fail.
 */
export async function loadModel(onProgress, device = "webgpu") {
  if (model && processor && tokenizer) return;
  if (loadingPromise) return loadingPromise;

  loadingPromise = (async () => {
    const {
      Qwen3_5ForConditionalGeneration,
      AutoProcessor,
      AutoTokenizer,
      env,
    } = await import(TRANSFORMERS_CDN);

    // Force high-performance GPU (discrete NVIDIA over integrated Intel).
    // `typeof` guard: `navigator` is not defined in every JS environment.
    if (device === "webgpu" && typeof navigator !== "undefined" && navigator.gpu) {
      try {
        const adapter = await navigator.gpu.requestAdapter({
          powerPreference: "high-performance",
        });
        if (adapter) {
          // Prefer the `info` attribute; `requestAdapterInfo()` is the older,
          // deprecated accessor and is only used when `info` is unavailable.
          const info =
            adapter.info ?? (await adapter.requestAdapterInfo?.()) ?? {};
          console.log("[FADA] GPU adapter selected:", info.vendor, info.architecture, info.device, info.description);
          onProgress?.({ status: "info", note: `GPU: ${info.description || info.vendor || 'discrete GPU'}` });
          // Store globally for diagnostics (globalThis is `window` on the
          // main thread and also exists inside workers).
          globalThis.__FADA_GPU_INFO = info;
        } else {
          console.warn("[FADA] No high-performance GPU adapter found");
        }
      } catch (e) {
        // Best effort only: adapter diagnostics must never block loading.
        console.warn("[FADA] Could not query GPU adapter:", e);
      }

      // Try to set power preference at the env/backend level for transformers.js
      try {
        if (env?.backends?.onnx?.webgpu) {
          env.backends.onnx.webgpu.powerPreference = "high-performance";
          console.log("[FADA] Set env.backends.onnx.webgpu.powerPreference = high-performance");
        }
        // Also try the top-level webgpu settings
        if (env?.webgpu) {
          env.webgpu.powerPreference = "high-performance";
        }
      } catch (e) {
        console.warn("[FADA] Could not set env power preference:", e);
      }
    }

    onProgress?.({ status: "loading", file: "processor" });
    processor = await AutoProcessor.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Processor loaded");

    onProgress?.({ status: "loading", file: "tokenizer" });
    tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID, {
      revision: MODEL_REVISION,
    });
    console.log("[FADA] Tokenizer loaded");

    onProgress?.({ status: "loading", file: "model" });
    console.log(`[FADA] Loading model on device=${device}...`);
    activeDevice = device;

    // Vision encoder dtype: fp16 on WebGPU, fp32 on WASM
    const visionDtype = device === "wasm" ? "fp32" : "fp16";

    // Options for from_pretrained(); only the vision encoder dtype varies
    // between the first attempt and the fp32 retry.
    const buildModelConfig = (vDtype) => ({
      revision: MODEL_REVISION,
      dtype: {
        embed_tokens: "q4",
        vision_encoder: vDtype,
        decoder_model_merged: "q4",
      },
      device,
      progress_callback: (info) => {
        // Only "progress" events are forwarded to the caller.
        if (info.status !== "progress") return;
        onProgress?.({
          status: "downloading",
          file: info.file || "",
          progress: info.progress || 0,
          loaded: info.loaded || 0,
          total: info.total || 0,
        });
      },
    });

    try {
      onProgress?.({
        status: "compiling",
        file: "model",
        note: "Creating GPU session & compiling shaders (2-5 min)...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig(visionDtype)
      );
    } catch (err) {
      // Fallback: if fp16 not supported, retry with fp32 vision encoder
      const message = String(err?.message ?? "").toLowerCase();
      const isFp16Error =
        visionDtype === "fp16" &&
        (message.includes("does not support fp16") || message.includes("shader"));
      if (!isFp16Error) {
        throw err;
      }
      console.warn("[FADA] FP16 not supported, retrying with FP32 vision encoder...");
      onProgress?.({
        status: "loading",
        file: "model",
        note: "FP16 not supported, loading vision encoder in FP32 mode...",
      });
      model = await Qwen3_5ForConditionalGeneration.from_pretrained(
        MODEL_ID,
        buildModelConfig("fp32")
      );
    }

    console.log("[FADA] Model loaded successfully on", device);
    onProgress?.({ status: "ready" });
  })();

  try {
    await loadingPromise;
  } catch (e) {
    // Drop every partially-initialised piece so the next call can retry.
    loadingPromise = null;
    model = null;
    processor = null;
    tokenizer = null;
    activeDevice = null;
    throw e;
  }
}
/**
 * Scale a width/height pair so that its longest side is at most `maxDim`.
 * Sizes already within the limit are returned unchanged. A scaled side is
 * clamped to at least 1px so extreme aspect ratios cannot round down to 0.
 */
function fitWithinMaxDim(width, height, maxDim) {
  if (width <= maxDim && height <= maxDim) return { width, height };
  const scale = maxDim / Math.max(width, height);
  return {
    width: Math.max(1, Math.round(width * scale)),
    height: Math.max(1, Math.round(height * scale)),
  };
}

/**
 * Turn a browser <img>/<canvas> into a RawImage; any other value is returned
 * untouched (it is expected to already be a RawImage).
 */
async function toRawImage(image, RawImage) {
  // `typeof` guards: the DOM element classes do not exist in Web Workers,
  // where a bare `instanceof HTMLImageElement` would throw a ReferenceError.
  const isImageElement =
    typeof HTMLImageElement !== "undefined" && image instanceof HTMLImageElement;
  const isCanvasElement =
    typeof HTMLCanvasElement !== "undefined" && image instanceof HTMLCanvasElement;
  if (!isImageElement && !isCanvasElement) return image;

  const origW = image.naturalWidth || image.width;
  const origH = image.naturalHeight || image.height;
  if (!(origW > 0 && origH > 0)) {
    throw new Error(
      `Image has no pixel data (${origW}x${origH}); make sure it has finished loading before running inference.`
    );
  }

  // Resize to limit GPU memory (max 672px longest side for WebGPU)
  const MAX_DIM = 672;
  const { width: processWidth, height: processHeight } = fitWithinMaxDim(origW, origH, MAX_DIM);

  // Create resized canvas and export as data URL for RawImage.read()
  const resizedCanvas = document.createElement("canvas");
  resizedCanvas.width = processWidth;
  resizedCanvas.height = processHeight;
  const resizedCtx = resizedCanvas.getContext("2d");
  if (!resizedCtx) {
    throw new Error("Could not obtain a 2D canvas context to resize the image.");
  }
  resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
  const dataURL = resizedCanvas.toDataURL("image/png");

  // Use RawImage.read() — the proven way (matches webml-community demo)
  const rawImage = await RawImage.read(dataURL);
  console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
  return rawImage;
}

/**
 * Build the generation prompt for one image + `prompt` and make sure the
 * assistant turn starts with the pre-closed <think>\n\n</think>\n\n block.
 */
function buildPromptText(prompt) {
  // Build messages array matching training format:
  // The model was trained WITHOUT a system message, just user + assistant.
  const messages = [
    {
      role: "user",
      content: [
        { type: "image" },
        { type: "text", text: prompt },
      ],
    },
  ];
  // Thinking is explicitly disabled: the model was fine-tuned with thinking
  // PRE-CLOSED, i.e. <think>\n\n</think>\n\n precedes its actual output.
  const templateOpts = {
    add_generation_prompt: true,
    enable_thinking: false,
  };

  let text;
  if (processor.apply_chat_template) {
    text = processor.apply_chat_template(messages, templateOpts);
  } else if (tokenizer.apply_chat_template) {
    text = tokenizer.apply_chat_template(messages, {
      ...templateOpts,
      tokenize: false,
    });
  } else {
    // Manual fallback: must include the pre-closed <think> block
    text = `<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>${prompt}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n`;
  }

  // Verify the text after the last assistant header:
  //   1. open <think> without </think>  -> close it
  //   2. no <think> at all              -> append the pre-closed block
  //   3. already <think>...</think>     -> leave as-is
  const assistantIdx = text.lastIndexOf("<|im_start|>assistant");
  if (assistantIdx !== -1) {
    const afterAssistant = text.slice(assistantIdx);
    if (!afterAssistant.includes("<think>")) {
      console.warn("[FADA] Template missing <think> block - appending pre-closed thinking");
      text = text.trimEnd() + "\n<think>\n\n</think>\n\n";
    } else if (!afterAssistant.includes("</think>")) {
      console.warn("[FADA] Template produced open <think> - patching to pre-closed format");
      text = text.replace(/<think>\s*$/, "<think>\n\n</think>\n\n");
    }
  }
  return text;
}

/** Log what the processor produced for the image (diagnostics only). */
function logVisionDiagnostics(inputs) {
  console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
  console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));

  if (inputs.pixel_values) {
    console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
    console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
    const pv = inputs.pixel_values.data;
    // Only a prefix of the data is inspected to keep the diagnostics cheap.
    const sample = Array.from(pv.slice(0, 10));
    console.log("[FADA-VISION] pixel_values first 10:", sample);
    console.log("[FADA-VISION] pixel_values has NaN:", sample.some((v) => Number.isNaN(v)));
    console.log("[FADA-VISION] pixel_values all zero:", sample.every((v) => v === 0));
    const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
    console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
  } else {
    console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
  }

  if (inputs.image_grid_thw) {
    console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
    console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
  } else {
    console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
  }

  // Check if input_ids contain image tokens. 151655 is the id the original
  // author gives for <|image_pad|>; 248056 is checked as an alternative id
  // (NOTE(review): its meaning is not documented here — confirm against the
  // tokenizer vocabulary).
  if (inputs.input_ids) {
    const ids = Array.from(inputs.input_ids.data).map((t) => (typeof t === "bigint" ? Number(t) : t));
    const imageTokenCount = ids.filter((t) => t === 151655).length;
    const imageTokenCount2 = ids.filter((t) => t === 248056).length;
    console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
    console.log("[FADA-VISION] input_ids length:", ids.length);
    console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));

    const firstImageIdx = ids.indexOf(151655);
    const firstImageIdx2 = ids.indexOf(248056);
    const imgIdx = firstImageIdx >= 0 ? firstImageIdx : firstImageIdx2;
    if (imgIdx >= 0) {
      console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
      console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx - 3), imgIdx + 10));
    } else {
      console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
    }
  }
  console.log("[FADA-VISION] === End Vision Diagnostic ===");
}

/** Log prompt, template text and tensor shapes right before generation. */
function logPreGenerationDiagnostics(prompt, text, inputs) {
  console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
  console.log("[FADA-DEBUG] Prompt:", prompt);
  console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
  console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
  console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
  console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
  console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));
  console.log("[FADA-DEBUG] Pixel values shape:", inputs.pixel_values?.dims);
  console.log("[FADA-DEBUG] Image grid thw:", inputs.image_grid_thw?.dims, inputs.image_grid_thw?.data ? Array.from(inputs.image_grid_thw.data).map(Number) : 'N/A');
  console.log("[FADA-DEBUG] Attention mask shape:", inputs.attention_mask?.dims);
  // Decode last 20 input tokens to verify template ending
  try {
    const lastInputTokens = Array.from(inputs.input_ids.data.slice(-20)).map(Number);
    const decodedEnd = tokenizer.decode(lastInputTokens, { skip_special_tokens: false });
    console.log("[FADA-DEBUG] Last 20 input tokens decoded:", JSON.stringify(decodedEnd));
  } catch (e) {
    console.warn("[FADA-DEBUG] Could not decode last input tokens:", e);
  }
  console.log("[FADA-DEBUG] === Starting Generation ===");
}

/**
 * Convert the value returned by model.generate() into a flat array of token
 * ids for the first sequence, turning BigInt ids into plain numbers.
 */
function tensorToTokenIds(output) {
  const toNumber = (t) => (typeof t === "bigint" ? Number(t) : t);
  try {
    if (output.tolist) {
      // Could be [[tok1, tok2, ...]] (2D) or [tok1, tok2, ...] (1D)
      const listed = output.tolist();
      const flat = Array.isArray(listed[0]) ? listed[0] : listed;
      return flat.map((t) => toNumber(t));
    }
    const firstBatch = output[0];
    const allTokens = firstBatch?.tolist
      ? firstBatch.tolist()
      : Array.from(firstBatch?.data || firstBatch || output.data || []);
    return allTokens.map((t) => toNumber(t));
  } catch (e) {
    console.warn("[FADA] Tensor conversion fallback:", e);
    // Fallback: try direct Array.from on output data
    const raw = output.data || output;
    return Array.from(raw).map((t) => toNumber(t));
  }
}

/**
 * Strip <think>...</think> sections and stray "assistant" role prefixes from
 * decoded text. Non-string input is coerced to a string first.
 */
function cleanGeneratedText(decoded) {
  let textOutput = typeof decoded === "string" ? decoded : String(decoded || "");
  textOutput = textOutput.replace(/<think>[\s\S]*?<\/think>/g, "").trim();
  const roleIdx = textOutput.indexOf("\nassistant\n");
  if (roleIdx !== -1) textOutput = textOutput.slice(0, roleIdx);
  if (textOutput.startsWith("assistant\n")) textOutput = textOutput.slice("assistant\n".length);
  return textOutput;
}

/**
 * Run inference on a single image with a text prompt.
 * Uses the community pattern: processor.apply_chat_template + model.generate + tokenizer.decode
 * No manual KV cache logic — transformers.js handles it internally.
 *
 * @param {HTMLImageElement|HTMLCanvasElement|RawImage} image - Browser image
 *   elements are resized (longest side at most 672px) and converted; any other
 *   value is handed to the processor as-is.
 * @param {string} prompt
 * @param {object} [opts] - `{ maxNewTokens, temperature }`. `maxNewTokens`
 *   defaults to 1024 and `temperature` to 0.1; a temperature of 0 or less
 *   turns sampling off. NOTE: an `onToken` option is NOT implemented — this
 *   function never reads it.
 * @returns {Promise<string>} Generated text with <think> blocks and role
 *   prefixes removed.
 * @throws {Error} "Model not loaded" when loadModel() has not completed; a
 *   "GPU ran out of memory..." error when generation fails with a device-lost
 *   or mapAsync error; any other generation error is re-thrown unchanged.
 */
export async function runInference(image, prompt, opts = {}) {
  if (!model || !processor || !tokenizer) throw new Error("Model not loaded");
  const { RawImage } = await import(TRANSFORMERS_CDN);

  const rawImage = await toRawImage(image, RawImage);
  const maxNewTokens = opts.maxNewTokens || 1024;
  const temperature = opts.temperature ?? 0.1;

  console.log("[FADA] Building inputs...");
  const text = buildPromptText(prompt);
  // Debug: log the template output (first and last parts)
  console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
  console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));

  // Call processor with SINGLE image (not array!) — matches webml-community demo pattern
  const inputs = await processor(text, rawImage);
  const inputLen = inputs.input_ids.dims[1];
  console.log(
    `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
  );
  logVisionDiagnostics(inputs);
  logPreGenerationDiagnostics(prompt, text, inputs);

  // Generate — transformers.js handles KV cache internally
  const t0 = performance.now();
  let output;
  try {
    output = await model.generate({
      ...inputs,
      max_new_tokens: maxNewTokens,
      temperature: temperature > 0 ? temperature : undefined,
      do_sample: temperature > 0,
      top_p: 0.95,
    });
  } catch (err) {
    console.error("[FADA-DEBUG] Generation error:", err);
    const message = err?.message;
    const looksLikeGpuMemoryFailure =
      (message?.includes("Device") && message?.includes("lost")) ||
      message?.includes("mapAsync");
    if (looksLikeGpuMemoryFailure) {
      throw new Error(
        "GPU ran out of memory. Try a smaller image or switch to CPU (WASM) mode.",
        { cause: err }
      );
    }
    throw err;
  }
  // Keep the numeric duration for the tok/s maths; the string is for logs only.
  const genSeconds = (performance.now() - t0) / 1000;
  const genTime = genSeconds.toFixed(1);
  console.log("[FADA-DEBUG] Generation completed in", genTime, "seconds");

  const outputDims = output.dims || [];
  const outputTokens = tensorToTokenIds(output);

  // === DEEP DEBUG: Post-generation analysis ===
  console.log("[FADA-DEBUG] === Post-Generation Analysis ===");
  console.log(`[FADA-DEBUG] Output raw dims: ${JSON.stringify(outputDims)}`);
  console.log(`[FADA-DEBUG] Output type: ${typeof output}, constructor: ${output?.constructor?.name}`);
  console.log(`[FADA-DEBUG] outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);
  console.log(`[FADA] Output tensor dims: ${JSON.stringify(outputDims)}, outputTokens.length: ${outputTokens.length}, inputLen: ${inputLen}`);

  // Decode only generated tokens (skip input)
  const newTokens = outputTokens.slice(inputLen);
  const numGenerated = newTokens.length;

  if (numGenerated > 0) {
    console.log(`[FADA] First 20 new token IDs: [${newTokens.slice(0, 20).join(", ")}]`);
    const rawDecoded = tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: false });
    console.log(`[FADA] First 30 tokens decoded (with special): ${JSON.stringify(rawDecoded)}`);
    console.log(`[FADA-DEBUG] Decoded (skip_special=true): ${JSON.stringify(tokenizer.decode(newTokens.slice(0, 30), { skip_special_tokens: true }))}`);
  } else {
    // No new tokens means outputTokens.length <= inputLen, so only the
    // "shorter" and "equal" cases can occur here.
    console.warn("[FADA-DEBUG] *** CRITICAL: 0 new tokens generated! ***");
    console.log(`[FADA-DEBUG] Output length === Input length? ${outputTokens.length === inputLen}`);
    console.log(`[FADA-DEBUG] Output length: ${outputTokens.length} vs Input length: ${inputLen}`);
    console.log(`[FADA-DEBUG] Last 10 output token IDs: [${outputTokens.slice(-10).join(", ")}]`);
    if (outputTokens.length > 0) {
      const lastFew = tokenizer.decode(outputTokens.slice(-10), { skip_special_tokens: false });
      console.log(`[FADA-DEBUG] Last 10 tokens decoded (no skip): ${JSON.stringify(lastFew)}`);
    }
    if (outputTokens.length < inputLen) {
      console.log(`[FADA-DEBUG] STRANGE: output shorter than input! Model may have truncated.`);
    } else {
      console.log(`[FADA-DEBUG] Output exactly equals input - model generated EOS immediately or nothing at all.`);
      const lastToken = outputTokens[outputTokens.length - 1];
      console.log(`[FADA-DEBUG] Last token ID: ${lastToken}, decoded: ${JSON.stringify(tokenizer.decode([lastToken], { skip_special_tokens: false }))}`);
    }
  }

  const decoded = tokenizer.decode(newTokens, {
    skip_special_tokens: true,
  });
  const tokensPerSecond = genSeconds > 0 ? (numGenerated / genSeconds).toFixed(1) : "n/a";
  console.log(
    `[FADA] Generated ${numGenerated} tokens in ${genTime}s (${tokensPerSecond} tok/s)`
  );

  // Clean output: remove <think></think> tags and stray role prefixes
  const textOutput = cleanGeneratedText(decoded);
  if (numGenerated > 0 && textOutput.length === 0) {
    console.warn("[FADA] Generated tokens but cleaned output is empty. Raw decoded:", JSON.stringify(decoded?.slice(0, 200)));
  }
  return textOutput.trim();
}
/**
 * Text-only diagnostic test - runs inference WITHOUT any image.
 * If this produces output, the decoder works and the issue is in the vision pipeline.
 * If this also produces 0 tokens, the issue is in the decoder/generation loop.
 *
 * @returns {Promise<string>} A human-readable verdict that starts with
 *   "TEXT-ONLY TEST PASSED" or "TEXT-ONLY TEST FAILED".
 * @throws {Error} "Model not loaded" when the model or tokenizer is missing;
 *   errors raised by model.generate() are not caught here.
 */
export async function testTextOnly() {
  if (!model || !tokenizer) throw new Error("Model not loaded");
  console.log("[FADA-TEST] === TEXT-ONLY DIAGNOSTIC ===");

  // Name the device the model actually runs on in the failure verdict instead
  // of always blaming WebGPU (the model can also be loaded on "wasm").
  const deviceLabel =
    !activeDevice || activeDevice === "webgpu"
      ? "WebGPU"
      : String(activeDevice).toUpperCase();

  // Simple text-only message (no image)
  const messages = [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: "Say hello and tell me what you can do." },
  ];

  let text;
  try {
    text = tokenizer.apply_chat_template(messages, {
      add_generation_prompt: true,
      // Same thinking mode as runInference() so both paths build the same
      // kind of prompt.
      enable_thinking: false,
      tokenize: false,
    });
  } catch (e) {
    // Template unavailable: fall back to a hand-written prompt.
    text = `<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nSay hello and tell me what you can do.<|im_end|>\n<|im_start|>assistant\n`;
  }
  // Add pre-closed thinking block if model expects it
  if (!text.includes("<think>")) {
    text += "<think>\n\n</think>\n\n";
  }
  console.log("[FADA-TEST] Template:", text);

  // NOTE(review): `return_tensors` is kept exactly as the original call had
  // it; confirm against the transformers.js tokenizer docs whether this option
  // is honoured.
  const inputs = tokenizer(text, { return_tensors: "pt" });
  const inputLen = inputs.input_ids.dims[1];
  console.log("[FADA-TEST] Input IDs shape:", inputs.input_ids.dims);
  console.log("[FADA-TEST] Input length:", inputLen);

  const t0 = performance.now();
  const output = await model.generate({
    ...inputs,
    max_new_tokens: 50,
    temperature: 0.7,
    do_sample: true,
    top_p: 0.9,
  });
  const elapsed = ((performance.now() - t0) / 1000).toFixed(1);

  // Flatten the generate() result into plain numeric token ids.
  const toNumber = (t) => (typeof t === "bigint" ? Number(t) : t);
  let outputTokens;
  try {
    if (output.tolist) {
      const list = output.tolist();
      outputTokens = (Array.isArray(list[0]) ? list[0] : list).map((t) => toNumber(t));
    } else {
      const firstBatch = output[0];
      const allTokens = firstBatch.tolist
        ? firstBatch.tolist()
        : Array.from(firstBatch.data || firstBatch);
      outputTokens = allTokens.map((t) => toNumber(t));
    }
  } catch (e) {
    console.error("[FADA-TEST] Output conversion error:", e);
    outputTokens = [];
  }

  // Everything after the prompt is newly generated.
  const newTokens = outputTokens.slice(inputLen);
  console.log("[FADA-TEST] Output total tokens:", outputTokens.length);
  console.log("[FADA-TEST] New tokens:", newTokens.length);
  console.log("[FADA-TEST] Time:", elapsed, "s");

  if (newTokens.length > 0) {
    console.log("[FADA-TEST] First 20 tokens:", newTokens.slice(0, 20));
    const decoded = tokenizer.decode(newTokens, { skip_special_tokens: true });
    console.log("[FADA-TEST] \u2713 DECODED:", decoded);
    return `TEXT-ONLY TEST PASSED (${newTokens.length} tokens in ${elapsed}s): ${decoded}`;
  }

  console.log(`[FADA-TEST] \u2717 0 tokens generated - DECODER ITSELF IS BROKEN ON ${deviceLabel.toUpperCase()}`);
  console.log("[FADA-TEST] Last 5 tokens of output:", outputTokens.slice(-5));
  const lastDecoded = tokenizer.decode(outputTokens.slice(-5), { skip_special_tokens: false });
  console.log("[FADA-TEST] Last tokens decoded:", lastDecoded);
  return `TEXT-ONLY TEST FAILED: 0 tokens in ${elapsed}s. Decoder broken on ${deviceLabel}.`;
}