Spaces:
Running
Running
Upload js/model.js with huggingface_hub
Browse files
- js/model.js +60 -18
js/model.js
CHANGED
|
@@ -179,35 +179,34 @@ export async function runInference(image, prompt, opts = {}) {
|
|
| 179 |
|
| 180 |
const { RawImage } = await import(TRANSFORMERS_CDN);
|
| 181 |
|
| 182 |
-
// Convert browser image to RawImage
|
|
|
|
| 183 |
let rawImage = image;
|
| 184 |
if (image instanceof HTMLImageElement || image instanceof HTMLCanvasElement) {
|
| 185 |
-
const
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
ctx.drawImage(image, 0, 0);
|
| 190 |
-
|
| 191 |
-
// Resize image to limit GPU memory usage (max 672px longest side for WebGPU)
|
| 192 |
-
// Match closer to training resolution for better output quality
|
| 193 |
const MAX_DIM = 672;
|
| 194 |
-
let processWidth =
|
| 195 |
-
let processHeight =
|
| 196 |
if (processWidth > MAX_DIM || processHeight > MAX_DIM) {
|
| 197 |
const scale = MAX_DIM / Math.max(processWidth, processHeight);
|
| 198 |
processWidth = Math.round(processWidth * scale);
|
| 199 |
processHeight = Math.round(processHeight * scale);
|
| 200 |
}
|
| 201 |
|
| 202 |
-
// Create resized canvas
|
| 203 |
const resizedCanvas = document.createElement("canvas");
|
| 204 |
resizedCanvas.width = processWidth;
|
| 205 |
resizedCanvas.height = processHeight;
|
| 206 |
const resizedCtx = resizedCanvas.getContext("2d");
|
| 207 |
resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
|
| 208 |
-
const
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
| 211 |
}
|
| 212 |
|
| 213 |
const maxNewTokens = opts.maxNewTokens || 1024;
|
|
@@ -274,19 +273,62 @@ export async function runInference(image, prompt, opts = {}) {
|
|
| 274 |
console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
|
| 275 |
console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));
|
| 276 |
|
| 277 |
-
|
|
|
|
| 278 |
const inputLen = inputs.input_ids.dims[1];
|
| 279 |
console.log(
|
| 280 |
`[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
|
| 281 |
);
|
| 282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
// === DEEP DEBUG: Pre-generation diagnostics ===
|
| 284 |
console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
|
| 285 |
console.log("[FADA-DEBUG] Prompt:", prompt);
|
| 286 |
-
console.log("[FADA-DEBUG] Messages:", JSON.stringify(messages));
|
| 287 |
console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
|
| 288 |
console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
|
| 289 |
-
console.log("[FADA-DEBUG] Template total length:", text?.length);
|
| 290 |
console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
|
| 291 |
console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
|
| 292 |
console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));
|
|
|
|
| 179 |
|
| 180 |
const { RawImage } = await import(TRANSFORMERS_CDN);
|
| 181 |
|
| 182 |
+
// Convert browser image to RawImage using the same approach as webml-community demo:
|
| 183 |
+
// Export to data URL then use RawImage.read() for proper internal state.
|
| 184 |
let rawImage = image;
|
| 185 |
if (image instanceof HTMLImageElement || image instanceof HTMLCanvasElement) {
|
| 186 |
+
const origW = image.naturalWidth || image.width;
|
| 187 |
+
const origH = image.naturalHeight || image.height;
|
| 188 |
+
|
| 189 |
+
// Resize to limit GPU memory (max 672px longest side for WebGPU)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
const MAX_DIM = 672;
|
| 191 |
+
let processWidth = origW;
|
| 192 |
+
let processHeight = origH;
|
| 193 |
if (processWidth > MAX_DIM || processHeight > MAX_DIM) {
|
| 194 |
const scale = MAX_DIM / Math.max(processWidth, processHeight);
|
| 195 |
processWidth = Math.round(processWidth * scale);
|
| 196 |
processHeight = Math.round(processHeight * scale);
|
| 197 |
}
|
| 198 |
|
| 199 |
+
// Create resized canvas and export as data URL for RawImage.read()
|
| 200 |
const resizedCanvas = document.createElement("canvas");
|
| 201 |
resizedCanvas.width = processWidth;
|
| 202 |
resizedCanvas.height = processHeight;
|
| 203 |
const resizedCtx = resizedCanvas.getContext("2d");
|
| 204 |
resizedCtx.drawImage(image, 0, 0, processWidth, processHeight);
|
| 205 |
+
const dataURL = resizedCanvas.toDataURL("image/png");
|
| 206 |
+
|
| 207 |
+
// Use RawImage.read() — the proven way (matches webml-community demo)
|
| 208 |
+
rawImage = await RawImage.read(dataURL);
|
| 209 |
+
console.log(`[FADA] Image loaded via RawImage.read(): ${origW}x${origH} -> ${processWidth}x${processHeight}, channels=${rawImage.channels}`);
|
| 210 |
}
|
| 211 |
|
| 212 |
const maxNewTokens = opts.maxNewTokens || 1024;
|
|
|
|
| 273 |
console.log("[FADA] Template text (first 120):", JSON.stringify(text.slice(0, 120)));
|
| 274 |
console.log("[FADA] Template text (last 80):", JSON.stringify(text.slice(-80)));
|
| 275 |
|
| 276 |
+
// Call processor with SINGLE image (not array!) — matches webml-community demo pattern
|
| 277 |
+
const inputs = await processor(text, rawImage);
|
| 278 |
const inputLen = inputs.input_ids.dims[1];
|
| 279 |
console.log(
|
| 280 |
`[FADA] Input tokens: ${inputLen}, generating (max_new_tokens=${maxNewTokens}, temp=${temperature}, device=${activeDevice})...`
|
| 281 |
);
|
| 282 |
|
| 283 |
+
// === VISION PIPELINE DIAGNOSTIC ===
|
| 284 |
+
console.log("[FADA-VISION] === Vision Pipeline Diagnostic ===");
|
| 285 |
+
console.log("[FADA-VISION] Inputs keys:", Object.keys(inputs));
|
| 286 |
+
if (inputs.pixel_values) {
|
| 287 |
+
console.log("[FADA-VISION] pixel_values shape:", inputs.pixel_values.dims);
|
| 288 |
+
console.log("[FADA-VISION] pixel_values dtype:", inputs.pixel_values.type);
|
| 289 |
+
const pv = inputs.pixel_values.data;
|
| 290 |
+
const sample = Array.from(pv.slice(0, 10));
|
| 291 |
+
console.log("[FADA-VISION] pixel_values first 10:", sample);
|
| 292 |
+
console.log("[FADA-VISION] pixel_values has NaN:", sample.some(v => isNaN(v)));
|
| 293 |
+
console.log("[FADA-VISION] pixel_values all zero:", sample.every(v => v === 0));
|
| 294 |
+
const sampleLarge = Array.from(pv.slice(0, Math.min(1000, pv.length)));
|
| 295 |
+
console.log("[FADA-VISION] pixel_values min/max:", Math.min(...sampleLarge), Math.max(...sampleLarge));
|
| 296 |
+
} else {
|
| 297 |
+
console.log("[FADA-VISION] \u2717 NO pixel_values in inputs! Image was NOT processed!");
|
| 298 |
+
}
|
| 299 |
+
if (inputs.image_grid_thw) {
|
| 300 |
+
console.log("[FADA-VISION] image_grid_thw shape:", inputs.image_grid_thw.dims);
|
| 301 |
+
console.log("[FADA-VISION] image_grid_thw data:", Array.from(inputs.image_grid_thw.data).map(Number));
|
| 302 |
+
} else {
|
| 303 |
+
console.log("[FADA-VISION] \u2717 NO image_grid_thw! Vision metadata missing!");
|
| 304 |
+
}
|
| 305 |
+
// Check if input_ids contain image tokens (token id 151655 for <|image_pad|>)
|
| 306 |
+
if (inputs.input_ids) {
|
| 307 |
+
const ids = Array.from(inputs.input_ids.data).map(t => typeof t === 'bigint' ? Number(t) : t);
|
| 308 |
+
// Qwen3.5-VL image pad token is 151655
|
| 309 |
+
const imageTokenCount = ids.filter(t => t === 151655).length;
|
| 310 |
+
const imageTokenCount2 = ids.filter(t => t === 248056).length;
|
| 311 |
+
console.log("[FADA-VISION] input_ids contain", imageTokenCount, "image tokens (151655) and", imageTokenCount2, "tokens (248056)");
|
| 312 |
+
console.log("[FADA-VISION] input_ids length:", ids.length);
|
| 313 |
+
console.log("[FADA-VISION] First 30 input tokens:", ids.slice(0, 30));
|
| 314 |
+
// Find image pad tokens
|
| 315 |
+
const firstImageIdx = ids.indexOf(151655);
|
| 316 |
+
const firstImageIdx2 = ids.indexOf(248056);
|
| 317 |
+
const imgIdx = firstImageIdx >= 0 ? firstImageIdx : firstImageIdx2;
|
| 318 |
+
if (imgIdx >= 0) {
|
| 319 |
+
console.log("[FADA-VISION] Image tokens start at index:", imgIdx);
|
| 320 |
+
console.log("[FADA-VISION] Tokens around image:", ids.slice(Math.max(0, imgIdx-3), imgIdx+10));
|
| 321 |
+
} else {
|
| 322 |
+
console.log("[FADA-VISION] \u2717 NO image pad tokens found in input_ids! Template may be wrong.");
|
| 323 |
+
}
|
| 324 |
+
}
|
| 325 |
+
console.log("[FADA-VISION] === End Vision Diagnostic ===");
|
| 326 |
+
|
| 327 |
// === DEEP DEBUG: Pre-generation diagnostics ===
|
| 328 |
console.log("[FADA-DEBUG] === Pre-Generation Diagnostics ===");
|
| 329 |
console.log("[FADA-DEBUG] Prompt:", prompt);
|
|
|
|
| 330 |
console.log("[FADA-DEBUG] Template text (first 500 chars):", typeof text, text?.substring?.(0, 500));
|
| 331 |
console.log("[FADA-DEBUG] Template text (last 200 chars):", text?.slice?.(-200));
|
|
|
|
| 332 |
console.log("[FADA-DEBUG] Input IDs shape:", inputs.input_ids?.dims);
|
| 333 |
console.log("[FADA-DEBUG] Input IDs first 10:", Array.from(inputs.input_ids?.data?.slice(0, 10) || []).map(Number));
|
| 334 |
console.log("[FADA-DEBUG] Input IDs last 10:", Array.from(inputs.input_ids?.data?.slice(-10) || []).map(Number));
|