mshz88 committed on
Commit
e4e4d61
·
verified ·
1 Parent(s): c7fdf67

Upload js/model.js with huggingface_hub

Browse files
Files changed (1) hide show
  1. js/model.js +60 -18
js/model.js CHANGED
@@ -179,35 +179,34 @@ export async function runInference(image, prompt, opts = {}) {
179
 
180
  const { RawImage } = await import(TRANSFORMERS_CDN);
181
 
182
- // Convert browser image to RawImage if needed, with resize to limit GPU memory
 
183
  let rawImage = image;
184
  if (image instanceof HTMLImageElement || image instanceof HTMLCanvasElement) {
185
- const canvas = document.createElement("canvas");
186
- canvas.width = image.naturalWidth || image.width;
187
- canvas.height = image.naturalHeight || image.height;
188
- const ctx = canvas.getContext("2d");
189
- ctx.drawImage(image, 0, 0);
190
-
191
- // Resize image to limit GPU memory usage (max 672px longest side for WebGPU)
192
- // Match closer to training resolution for better output quality
193
  const MAX_DIM = 672;
194
- let processWidth = canvas.width;
195
- let processHeight = canvas.height;
196
  if (processWidth > MAX_DIM || processHeight > MAX_DIM) {
197
  const scale = MAX_DIM / Math.max(processWidth, processHeight);
198
  processWidth = Math.round(processWidth * scale);
199
  processHeight = Math.round(processHeight * scale);
200
  }
201
 
202
- // Create resized canvas
203
  const resizedCanvas = document.createElement("canvas");
204
  resizedCanvas.width = processWidth;
205
  resizedCanvas.height = processHeight;
206
  const resizedCtx = resizedCanvas.getContext("2d");
207
  resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
208
- const imgData = resizedCtx.getImageData(0, 0, processWidth, processHeight);
209
- rawImage = new RawImage(imgData.data, processWidth, processHeight, 4);
210
- console.log(`[FADA] Image resized: ${canvas.width}x${canvas.height} -> ${processWidth}x${processHeight}`);
 
 
211
  }
212
 
213
  const maxNewTokens = opts.maxNewTokens || 1024;
@@ -274,19 +273,62 @@ export async function runInference(image, prompt, opts = {}) {
274
  console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
275
  console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));
276
 
277
- const inputs = await processor(text, [rawImage]);
 
278
  const inputLen = inputs.input_ids.dims[1];
279
  console.log(
280
  `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
281
  );
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  // === DEEP DEBUG: Pre-generation diagnostics ===
284
  console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
285
  console.log("[FADA-DEBUG] Prompt:", prompt);
286
- console.log("[FADA-DEBUG] Messages:", JSON.stringify(messages));
287
  console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
288
  console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
289
- console.log("[FADA-DEBUG] Template total length:", text?.length);
290
  console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
291
  console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
292
  console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));
 
179
 
180
  const { RawImage } = await import(TRANSFORMERS_CDN);
181
 
182
+ // Convert browser image to RawImage using the same approach as webml-community demo:
183
+ // Export to data URL then use RawImage.read() for proper internal state.
184
  let rawImage = image;
185
  if (image instanceof HTMLImageElement || image instanceof HTMLCanvasElement) {
186
+ const origW = image.naturalWidth || image.width;
187
+ const origH = image.naturalHeight || image.height;
188
+
189
+ // Resize to limit GPU memory (max 672px longest side for WebGPU)
 
 
 
 
190
  const MAX_DIM = 672;
191
+ let processWidth = origW;
192
+ let processHeight = origH;
193
  if (processWidth > MAX_DIM || processHeight > MAX_DIM) {
194
  const scale = MAX_DIM / Math.max(processWidth, processHeight);
195
  processWidth = Math.round(processWidth * scale);
196
  processHeight = Math.round(processHeight * scale);
197
  }
198
 
199
+ // Create resized canvas and export as data URL for RawImage.read()
200
  const resizedCanvas = document.createElement("canvas");
201
  resizedCanvas.width = processWidth;
202
  resizedCanvas.height = processHeight;
203
  const resizedCtx = resizedCanvas.getContext("2d");
204
  resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
205
+ const dataURL = resizedCanvas.toDataURL("image/png");
206
+
207
+ // Use RawImage.read() the proven way (matches webml-community demo)
208
+ rawImage = await RawImage.read(dataURL);
209
+ console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
210
  }
211
 
212
  const maxNewTokens = opts.maxNewTokens || 1024;
 
273
  console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
274
  console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));
275
 
276
+ // Call processor with SINGLE image (not array!) — matches webml-community demo pattern
277
+ const inputs = await processor(text, rawImage);
278
  const inputLen = inputs.input_ids.dims[1];
279
  console.log(
280
  `[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
281
  );
282
 
283
+ // === VISION PIPELINE DIAGNOSTIC ===
284
+ console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
285
+ console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));
286
+ if (inputs.pixel_values) {
287
+ console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
288
+ console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
289
+ const pv = inputs.pixel_values.data;
290
+ const sample = Array.from(pv.slice(0, 10));
291
+ console.log("[FADA-VISION] pixel_values first 10:", sample);
292
+ console.log("[FADA-VISION] pixel_values has NaN:", sample.some(v => isNaN(v)));
293
+ console.log("[FADA-VISION] pixel_values all zero:", sample.every(v => v === 0));
294
+ const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
295
+ console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
296
+ } else {
297
+ console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
298
+ }
299
+ if (inputs.image_grid_thw) {
300
+ console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
301
+ console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
302
+ } else {
303
+ console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
304
+ }
305
+ // Check if input_ids contain image tokens (token id 151655 for <|image_pad|>)
306
+ if (inputs.input_ids) {
307
+ const ids = Array.from(inputs.input_ids.data).map(t => typeof t === 'bigint' ? Number(t) : t);
308
+ // Qwen3.5-VL image pad token is 151655
309
+ const imageTokenCount = ids.filter(t => t === 151655).length;
310
+ const imageTokenCount2 = ids.filter(t => t === 248056).length;
311
+ console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
312
+ console.log("[FADA-VISION] input_ids length:", ids.length);
313
+ console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));
314
+ // Find image pad tokens
315
+ const firstImageIdx = ids.indexOf(151655);
316
+ const firstImageIdx2 = ids.indexOf(248056);
317
+ const imgIdx = firstImageIdx >= 0 ? firstImageIdx : firstImageIdx2;
318
+ if (imgIdx >= 0) {
319
+ console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
320
+ console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx-3), imgIdx+10));
321
+ } else {
322
+ console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
323
+ }
324
+ }
325
+ console.log("[FADA-VISION] === End Vision Diagnostic ===");
326
+
327
  // === DEEP DEBUG: Pre-generation diagnostics ===
328
  console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
329
  console.log("[FADA-DEBUG] Prompt:", prompt);
 
330
  console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
331
  console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
 
332
  console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
333
  console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
334
  console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));