Spaces:

mshz88
/

FADA-Mobile

Running

App Files Files Community

mshz88 committed 27 days ago

Commit

e4e4d61

verified ·

1 Parent(s): c7fdf67

Upload js/model.js with huggingface_hub

Browse files

Files changed (1) hide show

js/model.js +60 -18

js/model.js CHANGED Viewed

@@ -179,35 +179,34 @@ export async function runInference(image, prompt, opts = {}) {
   const { RawImage } = await import(TRANSFORMERS_CDN);
-  // Convert browser image to RawImage if needed, with resize to limit GPU memory
   let rawImage = image;
   if (image instanceof HTMLImageElement || image instanceof HTMLCanvasElement) {
-    const canvas = document.createElement("canvas");
-    canvas.width = image.naturalWidth || image.width;
-    canvas.height = image.naturalHeight || image.height;
-    const ctx = canvas.getContext("2d");
-    ctx.drawImage(image, 0, 0);
-    // Resize image to limit GPU memory usage (max 672px longest side for WebGPU)
-    // Match closer to training resolution for better output quality
     const MAX_DIM = 672;
-    let processWidth = canvas.width;
-    let processHeight = canvas.height;
     if (processWidth > MAX_DIM || processHeight > MAX_DIM) {
       const scale = MAX_DIM / Math.max(processWidth, processHeight);
       processWidth = Math.round(processWidth * scale);
       processHeight = Math.round(processHeight * scale);
     }
-    // Create resized canvas
     const resizedCanvas = document.createElement("canvas");
     resizedCanvas.width = processWidth;
     resizedCanvas.height = processHeight;
     const resizedCtx = resizedCanvas.getContext("2d");
     resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
-    const imgData = resizedCtx.getImageData(0, 0, processWidth, processHeight);
-    rawImage = new RawImage(imgData.data, processWidth, processHeight, 4);
-    console.log(`[FADA] Image resized: ${canvas.width}x${canvas.height} -> ${processWidth}x${processHeight}`);
   }
   const maxNewTokens = opts.maxNewTokens || 1024;
@@ -274,19 +273,62 @@ export async function runInference(image, prompt, opts = {}) {
   console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
   console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));
-  const inputs = await processor(text, [rawImage]);
   const inputLen = inputs.input_ids.dims[1];
   console.log(
     `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
   );
   // === DEEP DEBUG: Pre-generation diagnostics ===
   console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
   console.log("[FADA-DEBUG] Prompt:", prompt);
-  console.log("[FADA-DEBUG] Messages:", JSON.stringify(messages));
   console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
   console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
-  console.log("[FADA-DEBUG] Template total length:", text?.length);
   console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
   console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
   console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));

   const { RawImage } = await import(TRANSFORMERS_CDN);
+  // Convert browser image to RawImage using the same approach as webml-community demo:
+  // Export to data URL then use RawImage.read() for proper internal state.
   let rawImage = image;
   if (image instanceof HTMLImageElement || image instanceof HTMLCanvasElement) {
+    const origW = image.naturalWidth || image.width;
+    const origH = image.naturalHeight || image.height;
+    // Resize to limit GPU memory (max 672px longest side for WebGPU)
     const MAX_DIM = 672;
+    let processWidth = origW;
+    let processHeight = origH;
     if (processWidth > MAX_DIM || processHeight > MAX_DIM) {
       const scale = MAX_DIM / Math.max(processWidth, processHeight);
       processWidth = Math.round(processWidth * scale);
       processHeight = Math.round(processHeight * scale);
     }
+    // Create resized canvas and export as data URL for RawImage.read()
     const resizedCanvas = document.createElement("canvas");
     resizedCanvas.width = processWidth;
     resizedCanvas.height = processHeight;
     const resizedCtx = resizedCanvas.getContext("2d");
     resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
+    const dataURL = resizedCanvas.toDataURL("image/png");
+    // Use RawImage.read() — the proven way (matches webml-community demo)
+    rawImage = await RawImage.read(dataURL);
+    console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
   }
   const maxNewTokens = opts.maxNewTokens || 1024;
   console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
   console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));
+  // Call processor with SINGLE image (not array!) — matches webml-community demo pattern
+  const inputs = await processor(text, rawImage);
   const inputLen = inputs.input_ids.dims[1];
   console.log(
     `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
   );
+  // === VISION PIPELINE DIAGNOSTIC ===
+  console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
+  console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));
+  if (inputs.pixel_values) {
+    console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
+    console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
+    const pv = inputs.pixel_values.data;
+    const sample = Array.from(pv.slice(0, 10));
+    console.log("[FADA-VISION] pixel_values first 10:", sample);
+    console.log("[FADA-VISION] pixel_values has NaN:", sample.some(v => isNaN(v)));
+    console.log("[FADA-VISION] pixel_values all zero:", sample.every(v => v === 0));
+    const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
+    console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
+  } else {
+    console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
+  }
+  if (inputs.image_grid_thw) {
+    console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
+    console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
+  } else {
+    console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
+  }
+  // Check if input_ids contain image tokens (token id 151655 for <|image_pad|>)
+  if (inputs.input_ids) {
+    const ids = Array.from(inputs.input_ids.data).map(t => typeof t === 'bigint' ? Number(t) : t);
+    // Qwen3.5-VL image pad token is 151655
+    const imageTokenCount = ids.filter(t => t === 151655).length;
+    const imageTokenCount2 = ids.filter(t => t === 248056).length;
+    console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
+    console.log("[FADA-VISION] input_ids length:", ids.length);
+    console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));
+    // Find image pad tokens
+    const firstImageIdx = ids.indexOf(151655);
+    const firstImageIdx2 = ids.indexOf(248056);
+    const imgIdx = firstImageIdx >= 0 ? firstImageIdx : firstImageIdx2;
+    if (imgIdx >= 0) {
+      console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
+      console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx-3), imgIdx+10));
+    } else {
+      console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
+    }
+  }
+  console.log("[FADA-VISION] === End Vision Diagnostic ===");
   // === DEEP DEBUG: Pre-generation diagnostics ===
   console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
   console.log("[FADA-DEBUG] Prompt:", prompt);
   console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
   console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
   console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
   console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
   console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));