SiddhJagani commited on
Commit
8158e83
·
verified ·
1 Parent(s): 9589b05

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +585 -125
index.html CHANGED
@@ -16,36 +16,218 @@
16
  const HEAD_DIM = 128;
17
  const MAX_NEW_TOKENS = 512;
18
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  let tokenizer = null;
20
  let embedTokens = null;
21
  let embedImages = null;
22
  let decoder = null;
23
  let isLoaded = false;
24
  let isGenerating = false;
25
-
26
- // Conversation history stored as array of {role, content} where content may have image
27
  let chatHistory = [];
28
 
29
  const $ = id => document.getElementById(id);
30
- const statusEl = $('status');
31
- const progressEl = $('progress');
32
- const progressBar = $('progress-bar');
33
- const progressText = $('progress-text');
34
- const chatContainer = $('chat-container');
35
- const inputEl = $('user-input');
36
- const sendBtn = $('send-btn');
37
- const loadBtn = $('load-btn');
38
- const storageInfo = $('storage-info');
39
- const cacheIndicator = $('cache-indicator');
40
- const imageBtn = $('image-btn');
41
- const imageInput = $('image-input');
42
- const imagePreview = $('image-preview');
 
43
  const imagePreviewImg = $('image-preview-img');
44
- const removeImageBtn = $('remove-image-btn');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- // Current attached image state
47
- let currentImageData = null; // base64 data URL
48
- let currentImagePixels = null; // processed for model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  // ─── CACHE CHECK ─────────────────────────────────────────────────────────
51
  async function checkCache() {
@@ -54,7 +236,7 @@
54
  const est = await navigator.storage.estimate();
55
  const usedMB = ((est.usage || 0) / 1024 / 1024).toFixed(0);
56
  const quotaGB = ((est.quota || 0) / 1024 / 1024 / 1024).toFixed(1);
57
- storageInfo.textContent = `Browser storage: ${usedMB}MB used / ${quotaGB}GB available`;
58
  }
59
  } catch(e) {}
60
  }
@@ -64,81 +246,81 @@
64
  loadBtn.disabled = true;
65
  loadBtn.textContent = 'Loading...';
66
  progressEl.style.display = 'flex';
 
67
  $('welcome').style.display = 'none';
 
68
 
69
  try {
70
- // Configure ONNX runtime
71
  ort.env.wasm.numThreads = 1;
72
 
73
- // Step 1: Load tokenizer
74
- statusEl.textContent = 'Loading tokenizer...';
75
- progressBar.style.width = '5%';
76
- progressText.textContent = '5%';
77
  tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- // Step 2: Load embed_tokens session
80
- statusEl.textContent = 'Loading token embedder (~30MB)...';
81
- progressBar.style.width = '20%';
82
- progressText.textContent = '20%';
83
- embedTokens = await loadOrtSession('embed_tokens_fp16', 1);
84
-
85
- // Step 3: Load embed_images session
86
- statusEl.textContent = 'Loading vision encoder (~400MB)...';
87
- progressBar.style.width = '40%';
88
- progressText.textContent = '40%';
89
- embedImages = await loadOrtSession('embed_images_fp16', 1);
90
-
91
- // Step 4: Load decoder (largest, q4 ~1.1GB)
92
- statusEl.textContent = 'Loading language decoder (~1.1GB)...';
93
- progressBar.style.width = '60%';
94
- progressText.textContent = '60%';
95
- decoder = await loadOrtSession('decoder_q4', 1);
96
-
97
- progressBar.style.width = '100%';
98
- progressText.textContent = '100%';
99
- statusEl.textContent = 'Model ready — running fully on your device (WebGPU)';
100
  progressEl.style.display = 'none';
101
 
102
  isLoaded = true;
103
  inputEl.disabled = false;
104
  sendBtn.disabled = false;
105
  imageBtn.disabled = false;
106
- inputEl.placeholder = 'Ask anything... (optionally attach an image)';
107
  loadBtn.style.display = 'none';
 
 
 
108
 
109
  checkCache();
110
- addSystemMessage('✓ LFM2.5-VL-1.6B loaded & running on-device via WebGPU. You can attach images!');
111
 
112
  } catch(err) {
113
  console.error(err);
114
- statusEl.textContent = `Error: ${err.message}`;
115
  progressEl.style.display = 'none';
116
  loadBtn.disabled = false;
117
  loadBtn.textContent = 'Retry Load';
118
- if (err.message && err.message.includes('WebGPU')) {
119
- addSystemMessage('⚠️ WebGPU not supported. Use Chrome 113+ or Edge 113+ with WebGPU enabled.');
 
120
  } else {
121
- addSystemMessage(`Error loading model: ${err.message}`);
122
  }
123
  }
124
  }
125
 
126
- async function loadOrtSession(name, dataFiles = 1) {
127
- const onnxUrl = `${MODEL_BASE}/onnx/${name}.onnx`;
128
- const externalData = [];
129
- for (let i = 0; i < dataFiles; i++) {
130
- const suffix = i === 0 ? '' : `_${i}`;
131
- externalData.push({
132
- path: `${name}.onnx_data${suffix}`,
133
- data: `${MODEL_BASE}/onnx/${name}.onnx_data${suffix}`
134
- });
135
- }
136
- return ort.InferenceSession.create(onnxUrl, {
137
- executionProviders: ['webgpu'],
138
- externalData,
139
- });
140
- }
141
-
142
  // ─── IMAGE HANDLING ───────────────────────────────────────────────────────
143
  imageBtn.addEventListener('click', () => imageInput.click());
144
 
@@ -162,55 +344,99 @@
162
  imagePreviewImg.src = '';
163
  });
164
 
165
- // Process image to pixel_values for the model
 
 
 
 
 
 
 
 
 
 
166
  async function processImage(dataUrl) {
167
  return new Promise((resolve, reject) => {
168
  const img = new Image();
169
  img.onload = () => {
170
- // Resize to max 512x512 preserving aspect ratio
171
  let w = img.width, h = img.height;
172
- const maxDim = 512;
173
- if (w > maxDim || h > maxDim) {
174
- if (w > h) { h = Math.round(h * maxDim / w); w = maxDim; }
175
- else { w = Math.round(w * maxDim / h); h = maxDim; }
176
  }
177
- // Round to multiple of 14 (patch size for SigLIP2)
178
- w = Math.max(14, Math.round(w / 14) * 14);
179
- h = Math.max(14, Math.round(h / 14) * 14);
180
 
181
  const canvas = document.createElement('canvas');
182
  canvas.width = w; canvas.height = h;
183
  const ctx = canvas.getContext('2d');
184
  ctx.drawImage(img, 0, 0, w, h);
185
- const imageData = ctx.getImageData(0, 0, w, h);
186
- const { data } = imageData;
187
-
188
- // Normalize with SigLIP2 mean/std
189
- const mean = [0.5, 0.5, 0.5];
190
- const std = [0.5, 0.5, 0.5];
191
-
192
- const nPixels = w * h;
193
- const floats = new Float32Array(3 * nPixels);
194
- for (let i = 0; i < nPixels; i++) {
195
- floats[0 * nPixels + i] = (data[i * 4 + 0] / 255 - mean[0]) / std[0];
196
- floats[1 * nPixels + i] = (data[i * 4 + 1] / 255 - mean[1]) / std[1];
197
- floats[2 * nPixels + i] = (data[i * 4 + 2] / 255 - mean[2]) / std[2];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  }
199
 
200
- const pixelValues = new ort.Tensor('float32', floats, [1, 3, h, w]);
201
- const pixelAttentionMask = new ort.Tensor('int64',
202
- new BigInt64Array(h * w).fill(1n), [1, h, w]);
203
- const numPatchH = h / 14, numPatchW = w / 14;
204
- const spatialShapes = new ort.Tensor('int64',
205
- new BigInt64Array([BigInt(numPatchH), BigInt(numPatchW)]), [1, 2]);
 
 
 
 
 
206
 
207
- resolve({ pixelValues, pixelAttentionMask, spatialShapes, width: w, height: h });
208
  };
209
  img.onerror = reject;
210
  img.src = dataUrl;
211
  });
212
  }
213
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  // ─── HELPERS ─────────────────────────────────────────────────────────────
215
  async function getTextEmbeddings(ids) {
216
  const tensor = new ort.Tensor('int64',
@@ -353,46 +579,74 @@
353
  const imgData = await processImage(attachedImage);
354
 
355
  // Get image embeddings from vision encoder
 
356
  const imgOut = await embedImages.run({
357
- pixel_values: imgData.pixelValues,
358
  pixel_attention_mask: imgData.pixelAttentionMask,
359
- spatial_shapes: imgData.spatialShapes,
360
  });
361
- const imageEmbeds = imgOut.image_features || imgOut.outputs || Object.values(imgOut)[0];
362
-
363
- // Find <image> token positions and replace with image embeddings
364
- const imageTokenId = tokenizer.convert_tokens_to_ids('<image>');
 
 
 
 
 
 
 
365
  const ids = Array.from(inputIds);
 
 
366
  const imagePositions = ids.reduce((acc, id, i) => {
367
  if (id === imageTokenId) acc.push(i);
368
  return acc;
369
  }, []);
 
370
 
371
- // Build merged embeddings: replace image token positions with image embeddings
 
 
 
372
  const embedDim = inputsEmbeds.dims[2];
373
- const totalLen = inputsEmbeds.dims[1] - imagePositions.length + imageEmbeds.dims[0];
374
- const mergedData = new Float32Array(totalLen * embedDim);
375
-
376
- let srcIdx = 0, dstIdx = 0, imgIdx = 0;
377
- for (let i = 0; i < ids.length; i++) {
378
- if (ids[i] === imageTokenId && imgIdx < imageEmbeds.dims[0]) {
379
- // Copy image embedding
380
- const embedData = imageEmbeds.data;
381
- for (let d = 0; d < embedDim; d++) {
382
- mergedData[dstIdx * embedDim + d] = embedData[imgIdx * embedDim + d];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  }
384
- imgIdx++;
385
- dstIdx++;
386
- } else {
387
- // Copy text embedding
388
- for (let d = 0; d < embedDim; d++) {
389
- mergedData[dstIdx * embedDim + d] = inputsEmbeds.data[i * embedDim + d];
390
- }
391
- dstIdx++;
392
  }
 
393
  }
394
-
395
- inputsEmbeds = new ort.Tensor('float32', mergedData, [1, totalLen, embedDim]);
396
  statusEl.textContent = 'Generating response...';
397
  } else {
398
  statusEl.textContent = 'Generating response...';
@@ -403,7 +657,7 @@
403
  // Generation loop
404
  const cache = initCache();
405
  const eosId = tokenizer.eos_token_id;
406
- const imEndId = tokenizer.convert_tokens_to_ids('<|im_end|>');
407
  const generatedTokens = [];
408
  let curLen = inputsEmbeds.dims[1];
409
  let embeds = inputsEmbeds;
@@ -996,6 +1250,158 @@
996
  }
997
 
998
  .spec-val.green { color: #5a8a00; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
  </style>
1000
  </head>
1001
  <body>
@@ -1016,6 +1422,7 @@
1016
  <div id="progress">
1017
  <div class="progress-track"><div id="progress-bar"></div></div>
1018
  <span id="progress-text"></span>
 
1019
  </div>
1020
  <div class="model-tag">LFM2.5-VL-1.6B · ONNX · WebGPU</div>
1021
  <button id="load-btn">Load Model</button>
@@ -1097,5 +1504,58 @@
1097
  </div>
1098
  </div>
1099
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1100
  </body>
1101
  </html>
 
16
  const HEAD_DIM = 128;
17
  const MAX_NEW_TOKENS = 512;
18
 
19
+ // Approximate file sizes in bytes for progress weighting
20
+ const FILE_SIZES = {
21
+ 'embed_tokens_fp16.onnx_data': 30 * 1024 * 1024,
22
+ 'embed_images_fp16.onnx_data': 400 * 1024 * 1024,
23
+ 'decoder_q4.onnx_data': 1100 * 1024 * 1024,
24
+ };
25
+ const TOTAL_BYTES = Object.values(FILE_SIZES).reduce((a, b) => a + b, 0);
26
+
27
+ // Per-file downloaded bytes tracker
28
+ const downloadedBytes = {};
29
+ let compilingPhase = false;
30
+
31
  let tokenizer = null;
32
  let embedTokens = null;
33
  let embedImages = null;
34
  let decoder = null;
35
  let isLoaded = false;
36
  let isGenerating = false;
 
 
37
  let chatHistory = [];
38
 
39
  const $ = id => document.getElementById(id);
40
+ const statusEl = $('status');
41
+ const progressEl = $('progress');
42
+ const progressBar = $('progress-bar');
43
+ const progressText = $('progress-text');
44
+ const progressDetail = $('progress-detail');
45
+ const chatContainer = $('chat-container');
46
+ const inputEl = $('user-input');
47
+ const sendBtn = $('send-btn');
48
+ const loadBtn = $('load-btn');
49
+ const storageInfo = $('storage-info');
50
+ const cacheIndicator = $('cache-indicator');
51
+ const imageBtn = $('image-btn');
52
+ const imageInput = $('image-input');
53
+ const imagePreview = $('image-preview');
54
  const imagePreviewImg = $('image-preview-img');
55
+ const removeImageBtn = $('remove-image-btn');
56
+ const loadingOverlay = $('loading-overlay');
57
+ const loadingStep = $('loading-step');
58
+ const loadingFile = $('loading-file');
59
+ const loadingBytes = $('loading-bytes');
60
+ const loadingEta = $('loading-eta');
61
+ const loadingBarFill = $('loading-bar-fill');
62
+ const loadingPct = $('loading-pct');
63
+
64
+ let currentImageData = null;
65
+
66
+ // ─── SPEED / ETA TRACKING ────────────────────────────────────────────────
67
+ let downloadStart = null;
68
+ let lastSpeedBytes = 0;
69
+ let lastSpeedTime = 0;
70
+ let speedSamples = [];
71
+
72
+ function formatBytes(bytes) {
73
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
74
+ return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
75
+ }
76
+
77
+ function formatSpeed(bps) {
78
+ if (bps < 1024 * 1024) return `${(bps / 1024).toFixed(0)} KB/s`;
79
+ return `${(bps / 1024 / 1024).toFixed(1)} MB/s`;
80
+ }
81
+
82
+ function formatEta(seconds) {
83
+ if (!isFinite(seconds) || seconds < 0) return '—';
84
+ if (seconds < 60) return `~${Math.ceil(seconds)}s`;
85
+ return `~${Math.ceil(seconds / 60)}min ${Math.ceil(seconds % 60)}s`;
86
+ }
87
+
88
+ function updateOverallProgress(currentFile) {
89
+ const total = Object.values(downloadedBytes).reduce((a, b) => a + b, 0);
90
+ const pct = Math.min(99, Math.round((total / TOTAL_BYTES) * 100));
91
+
92
+ // Speed calculation (rolling average over last 5 samples)
93
+ const now = Date.now();
94
+ if (lastSpeedTime) {
95
+ const dt = (now - lastSpeedTime) / 1000;
96
+ const db = total - lastSpeedBytes;
97
+ if (dt > 0.3) {
98
+ const sample = db / dt;
99
+ speedSamples.push(sample);
100
+ if (speedSamples.length > 8) speedSamples.shift();
101
+ lastSpeedBytes = total;
102
+ lastSpeedTime = now;
103
+ }
104
+ } else {
105
+ downloadStart = now;
106
+ lastSpeedTime = now;
107
+ lastSpeedBytes = 0;
108
+ }
109
+
110
+ const avgSpeed = speedSamples.length
111
+ ? speedSamples.reduce((a, b) => a + b, 0) / speedSamples.length
112
+ : 0;
113
+ const remaining = avgSpeed > 0 ? (TOTAL_BYTES - total) / avgSpeed : Infinity;
114
+
115
+ // Update big overlay
116
+ loadingBarFill.style.width = `${pct}%`;
117
+ loadingPct.textContent = `${pct}%`;
118
+ loadingBytes.textContent = `${formatBytes(total)} / ${formatBytes(TOTAL_BYTES)}`;
119
+ if (avgSpeed > 0) {
120
+ loadingEta.textContent = `${formatSpeed(avgSpeed)} · ETA ${formatEta(remaining)}`;
121
+ }
122
+
123
+ // Update header mini bar
124
+ progressBar.style.width = `${pct}%`;
125
+ progressText.textContent = `${pct}%`;
126
+
127
+ // Current file label
128
+ if (currentFile) {
129
+ const fileBytes = downloadedBytes[currentFile] || 0;
130
+ const fileTotal = FILE_SIZES[currentFile] || 0;
131
+ const filePct = fileTotal ? Math.min(100, Math.round(fileBytes / fileTotal * 100)) : 0;
132
+ loadingFile.textContent = currentFile;
133
+ if (progressDetail) progressDetail.textContent = `${currentFile} — ${filePct}%`;
134
+ }
135
+ }
136
+
137
+ // ─── FETCH WITH PROGRESS ─────────────────────────────────────────────────
138
+ async function fetchWithProgress(url, label) {
139
+ const key = label;
140
+ downloadedBytes[key] = 0;
141
+
142
+ const resp = await fetch(url);
143
+ if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
144
+
145
+ const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
146
+ const knownSize = FILE_SIZES[label] || contentLength || 1;
147
+
148
+ const reader = resp.body.getReader();
149
+ const chunks = [];
150
+ let received = 0;
151
+
152
+ while (true) {
153
+ const { done, value } = await reader.read();
154
+ if (done) break;
155
+ chunks.push(value);
156
+ received += value.length;
157
+ downloadedBytes[key] = received;
158
+ updateOverallProgress(label);
159
+ }
160
+
161
+ // Merge chunks
162
+ const total = chunks.reduce((s, c) => s + c.length, 0);
163
+ const merged = new Uint8Array(total);
164
+ let offset = 0;
165
+ for (const c of chunks) { merged.set(c, offset); offset += c.length; }
166
+ downloadedBytes[key] = merged.length;
167
+ return merged;
168
+ }
169
+
170
+ // ─── LOAD ORT SESSION WITH PROGRESS ──────────────────────────────────────
171
+ async function loadOrtSessionWithProgress(name, stepLabel, stepNum) {
172
+ const onnxUrl = `${MODEL_BASE}/onnx/${name}.onnx`;
173
+ const dataLabel = `${name}.onnx_data`;
174
+ const dataUrl = `${MODEL_BASE}/onnx/${dataLabel}`;
175
+
176
+ setStep(stepLabel, 'Fetching model header...', stepNum);
177
+
178
+ // Fetch the small .onnx file (just the graph, no weights)
179
+ const onnxResp = await fetch(onnxUrl);
180
+ if (!onnxResp.ok) throw new Error(`Failed to fetch ${name}.onnx`);
181
+ const onnxBuffer = await onnxResp.arrayBuffer();
182
+
183
+ // Fetch the large external data with progress
184
+ setStep(stepLabel, `Downloading ${dataLabel}...`);
185
+ const dataBuffer = await fetchWithProgress(dataUrl, dataLabel);
186
+
187
+ // Compiling phase
188
+ setStep(stepLabel, 'Compiling WebGPU shaders...');
189
+ loadingEta.textContent = 'Compiling shaders — this can take 30–60s, please wait...';
190
+ compilingPhase = true;
191
+
192
+ const session = await ort.InferenceSession.create(onnxBuffer, {
193
+ executionProviders: ['webgpu'],
194
+ externalData: [{ path: dataLabel, data: dataBuffer.buffer }],
195
+ });
196
+
197
+ compilingPhase = false;
198
+ return session;
199
+ }
200
 
201
+ // Step index mapping
202
+ const STEP_MAP = {
203
+ 1: 'Step 1 / 4',
204
+ 2: 'Step 2 / 4',
205
+ 3: 'Step 3 / 4',
206
+ 4: 'Step 4 / 4',
207
+ };
208
+
209
+ function setStep(step, file, stepNum) {
210
+ loadingStep.textContent = step;
211
+ loadingFile.textContent = file;
212
+ statusEl.textContent = `${step} — ${file}`;
213
+
214
+ // Update step dots
215
+ for (let i = 1; i <= 4; i++) {
216
+ const dot = document.getElementById(`step-dot-${i}`);
217
+ const lbl = document.getElementById(`step-lbl-${i}`);
218
+ if (!dot) continue;
219
+ if (stepNum && i < stepNum) {
220
+ dot.className = 'lo-step-dot done';
221
+ lbl.className = 'lo-step-label done';
222
+ } else if (stepNum && i === stepNum) {
223
+ dot.className = 'lo-step-dot active';
224
+ lbl.className = 'lo-step-label active';
225
+ } else {
226
+ dot.className = 'lo-step-dot';
227
+ lbl.className = 'lo-step-label';
228
+ }
229
+ }
230
+ }
231
 
232
  // ─── CACHE CHECK ─────────────────────────────────────────────────────────
233
  async function checkCache() {
 
236
  const est = await navigator.storage.estimate();
237
  const usedMB = ((est.usage || 0) / 1024 / 1024).toFixed(0);
238
  const quotaGB = ((est.quota || 0) / 1024 / 1024 / 1024).toFixed(1);
239
+ storageInfo.textContent = `${usedMB}MB used / ${quotaGB}GB available`;
240
  }
241
  } catch(e) {}
242
  }
 
246
  loadBtn.disabled = true;
247
  loadBtn.textContent = 'Loading...';
248
  progressEl.style.display = 'flex';
249
+ loadingOverlay.style.display = 'flex';
250
  $('welcome').style.display = 'none';
251
+ downloadStart = Date.now();
252
 
253
  try {
 
254
  ort.env.wasm.numThreads = 1;
255
 
256
+ // Step 1: Tokenizer
257
+ setStep('Step 1 / 4 — Tokenizer', 'Downloading config files...', 1);
 
 
258
  tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
259
+ setStep('Step 1 / 4 — Tokenizer', 'Done ✓', 1);
260
+
261
+ // Step 2: Token embedder (~30MB)
262
+ embedTokens = await loadOrtSessionWithProgress(
263
+ 'embed_tokens_fp16',
264
+ 'Step 2 / 4 — Token Embedder (~30 MB)',
265
+ 2
266
+ );
267
+
268
+ // Step 3: Vision encoder (~400MB)
269
+ embedImages = await loadOrtSessionWithProgress(
270
+ 'embed_images_fp16',
271
+ 'Step 3 / 4 — Vision Encoder (~400 MB)',
272
+ 3
273
+ );
274
+
275
+ // Step 4: Decoder (~1.1GB)
276
+ decoder = await loadOrtSessionWithProgress(
277
+ 'decoder_q4',
278
+ 'Step 4 / 4 — Language Decoder (~1.1 GB)',
279
+ 4
280
+ );
281
+
282
+ // Done! Mark all steps done
283
+ for (let i = 1; i <= 4; i++) {
284
+ const dot = document.getElementById(`step-dot-${i}`);
285
+ const lbl = document.getElementById(`step-lbl-${i}`);
286
+ if (dot) { dot.className = 'lo-step-dot done'; lbl.className = 'lo-step-label done'; }
287
+ }
288
+ loadingBarFill.style.width = '100%';
289
+ loadingPct.textContent = '100%';
290
+ loadingEta.textContent = `Completed in ${((Date.now() - downloadStart) / 1000).toFixed(0)}s`;
291
 
292
+ await new Promise(r => setTimeout(r, 600));
293
+ loadingOverlay.style.display = 'none';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  progressEl.style.display = 'none';
295
 
296
  isLoaded = true;
297
  inputEl.disabled = false;
298
  sendBtn.disabled = false;
299
  imageBtn.disabled = false;
300
+ inputEl.placeholder = 'Ask anything... (optionally attach an image 🖼)';
301
  loadBtn.style.display = 'none';
302
+ statusEl.textContent = 'Model ready — running fully on your device';
303
+ cacheIndicator.innerHTML = `<span class="dot cached"></span> Model running on-device`;
304
+ cacheIndicator.classList.add('has-cache');
305
 
306
  checkCache();
307
+ addSystemMessage('✓ LFM2.5-VL-1.6B loaded. Runs 100% in-browser via WebGPU. Attach an image or just chat!');
308
 
309
  } catch(err) {
310
  console.error(err);
311
+ loadingOverlay.style.display = 'none';
312
  progressEl.style.display = 'none';
313
  loadBtn.disabled = false;
314
  loadBtn.textContent = 'Retry Load';
315
+ statusEl.textContent = `Error: ${err.message.slice(0, 80)}`;
316
+ if (err.message.includes('WebGPU') || err.message.includes('gpu')) {
317
+ addSystemMessage('⚠️ WebGPU not supported. Please use Chrome 113+ or Edge 113+ and check chrome://flags/#enable-unsafe-webgpu');
318
  } else {
319
+ addSystemMessage(`Error loading model: ${err.message}\n\nTry refreshing and loading again — large file downloads sometimes fail.`);
320
  }
321
  }
322
  }
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  // ─── IMAGE HANDLING ───────────────────────────────────────────────────────
325
  imageBtn.addEventListener('click', () => imageInput.click());
326
 
 
344
  imagePreviewImg.src = '';
345
  });
346
 
347
+ // ── IMAGE PROCESSING ──────────────────────────────────────────────────────
348
+ // SigLIP2 NaFlex expects patches of 16x16 pixels from tiles of up to 512x512.
349
+ // embed_images_fp16 input shape:
350
+ // pixel_values: [total_patches, 3, 16, 16] (rank 4 — one entry per patch)
351
+ // pixel_attention_mask:[total_patches, seq_per_patch] where seq_per_patch = (512/16)^2 = 1024...
352
+ // spatial_shapes: [num_tiles, 2] each row = [nPatchH, nPatchW] for that tile
353
+ // We use a single tile resized to ≤512x512, snapped to multiples of 16.
354
+
355
+ const PATCH_SIZE = 16; // pixel patch size
356
+ const MAX_TILE = 512; // max tile dimension
357
+
358
  async function processImage(dataUrl) {
359
  return new Promise((resolve, reject) => {
360
  const img = new Image();
361
  img.onload = () => {
362
+ // Resize to fit inside MAX_TILE x MAX_TILE preserving aspect ratio
363
  let w = img.width, h = img.height;
364
+ if (w > MAX_TILE || h > MAX_TILE) {
365
+ if (w >= h) { h = Math.round(h * MAX_TILE / w); w = MAX_TILE; }
366
+ else { w = Math.round(w * MAX_TILE / h); h = MAX_TILE; }
 
367
  }
368
+ // Snap to nearest multiple of PATCH_SIZE
369
+ w = Math.max(PATCH_SIZE, Math.round(w / PATCH_SIZE) * PATCH_SIZE);
370
+ h = Math.max(PATCH_SIZE, Math.round(h / PATCH_SIZE) * PATCH_SIZE);
371
 
372
  const canvas = document.createElement('canvas');
373
  canvas.width = w; canvas.height = h;
374
  const ctx = canvas.getContext('2d');
375
  ctx.drawImage(img, 0, 0, w, h);
376
+ const rgba = ctx.getImageData(0, 0, w, h).data;
377
+
378
+ // Number of patches in each dimension for this single tile
379
+ const nPatchH = h / PATCH_SIZE; // rows of patches
380
+ const nPatchW = w / PATCH_SIZE; // cols of patches
381
+ const totalPatches = nPatchH * nPatchW;
382
+
383
+ // Build pixel_values: [totalPatches, 3, PATCH_SIZE, PATCH_SIZE]
384
+ // Normalise: (x/255 - 0.5) / 0.5 (SigLIP2 mean=0.5, std=0.5)
385
+ const patchElems = 3 * PATCH_SIZE * PATCH_SIZE;
386
+ const pvData = new Float32Array(totalPatches * patchElems);
387
+
388
+ for (let pr = 0; pr < nPatchH; pr++) {
389
+ for (let pc = 0; pc < nPatchW; pc++) {
390
+ const patchIdx = pr * nPatchW + pc;
391
+ for (let py = 0; py < PATCH_SIZE; py++) {
392
+ for (let px = 0; px < PATCH_SIZE; px++) {
393
+ const imgY = pr * PATCH_SIZE + py;
394
+ const imgX = pc * PATCH_SIZE + px;
395
+ const pixOff = (imgY * w + imgX) * 4; // RGBA offset in imageData
396
+ const base = patchIdx * patchElems;
397
+ // channel-first: [3, PATCH_SIZE, PATCH_SIZE]
398
+ pvData[base + 0 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 0] / 255 - 0.5) / 0.5;
399
+ pvData[base + 1 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 1] / 255 - 0.5) / 0.5;
400
+ pvData[base + 2 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 2] / 255 - 0.5) / 0.5;
401
+ }
402
+ }
403
+ }
404
  }
405
 
406
+ // pixel_attention_mask: [totalPatches, nPatchH * nPatchW] all ones (all patches valid)
407
+ // Each patch attends to all other patches within the same tile
408
+ const seqPerPatch = nPatchH * nPatchW;
409
+ const pamData = new BigInt64Array(totalPatches * seqPerPatch).fill(1n);
410
+
411
+ // spatial_shapes: [1, 2] — one tile with shape [nPatchH, nPatchW]
412
+ const ssData = new BigInt64Array([BigInt(nPatchH), BigInt(nPatchW)]);
413
+
414
+ const pixelValues = new ort.Tensor('float32', pvData, [totalPatches, 3, PATCH_SIZE, PATCH_SIZE]);
415
+ const pixelAttentionMask = new ort.Tensor('int64', pamData, [totalPatches, seqPerPatch]);
416
+ const spatialShapes = new ort.Tensor('int64', ssData, [1, 2]);
417
 
418
+ resolve({ pixelValues, pixelAttentionMask, spatialShapes, nPatchH, nPatchW, totalPatches });
419
  };
420
  img.onerror = reject;
421
  img.src = dataUrl;
422
  });
423
  }
424
 
425
+ // Look up the integer token ID for a special token string by scanning the vocab
426
+ function findTokenId(tokStr) {
427
+ // Transformers.js tokenizer exposes vocab via .vocab or ._tokenizer.model.vocab
428
+ try {
429
+ const vocab = tokenizer.vocab || tokenizer._tokenizer?.model?.vocab;
430
+ if (vocab && vocab[tokStr] !== undefined) return vocab[tokStr];
431
+ } catch(e) {}
432
+ // Fallback: encode the bare string and take the first token
433
+ try {
434
+ const ids = tokenizer.encode(tokStr, { add_special_tokens: false });
435
+ if (ids && ids.length > 0) return ids[0];
436
+ } catch(e) {}
437
+ return null;
438
+ }
439
+
440
  // ─── HELPERS ─────────────────────────────────────────────────────────────
441
  async function getTextEmbeddings(ids) {
442
  const tensor = new ort.Tensor('int64',
 
579
  const imgData = await processImage(attachedImage);
580
 
581
  // Get image embeddings from vision encoder
582
+ // Output is image_features: [total_image_tokens, hidden_size]
583
  const imgOut = await embedImages.run({
584
+ pixel_values: imgData.pixelValues,
585
  pixel_attention_mask: imgData.pixelAttentionMask,
586
+ spatial_shapes: imgData.spatialShapes,
587
  });
588
+ // The output key may vary grab the first output tensor
589
+ const imageEmbeds = imgOut.image_features
590
+ || imgOut.outputs
591
+ || imgOut[Object.keys(imgOut)[0]];
592
+ console.log('Image embed output keys:', Object.keys(imgOut));
593
+ console.log('Image embed shape:', imageEmbeds.dims);
594
+
595
+ // Find <image> token positions in the token sequence.
596
+ // In Transformers.js, use findTokenId() helper — no .convert_tokens_to_ids()
597
+ const imageTokenId = findTokenId('<image>');
598
+ console.log('Image token ID:', imageTokenId);
599
  const ids = Array.from(inputIds);
600
+
601
+ // Count how many image positions we have
602
  const imagePositions = ids.reduce((acc, id, i) => {
603
  if (id === imageTokenId) acc.push(i);
604
  return acc;
605
  }, []);
606
+ console.log('Image token positions:', imagePositions.length, 'image embeds:', imageEmbeds.dims[0]);
607
 
608
+ // The vision encoder returns one embedding vector per image token slot.
609
+ // We replace each <image> token embedding with the corresponding image embed.
610
+ // If there are more image embed vectors than <image> tokens, we expand:
611
+ // the single <image> token placeholder is replaced by ALL image embed vectors.
612
  const embedDim = inputsEmbeds.dims[2];
613
+ const numImgVecs = imageEmbeds.dims[0]; // actual number of image feature vectors
614
+
615
+ if (imagePositions.length === 0) {
616
+ // No <image> placeholder in tokenised text just prepend image embeds
617
+ const totalLen = numImgVecs + inputsEmbeds.dims[1];
618
+ const mergedData = new Float32Array(totalLen * embedDim);
619
+ mergedData.set(new Float32Array(imageEmbeds.data.buffer, imageEmbeds.data.byteOffset, numImgVecs * embedDim), 0);
620
+ mergedData.set(new Float32Array(inputsEmbeds.data.buffer, inputsEmbeds.data.byteOffset, inputsEmbeds.dims[1] * embedDim), numImgVecs * embedDim);
621
+ inputsEmbeds = new ort.Tensor('float32', mergedData, [1, totalLen, embedDim]);
622
+ } else {
623
+ // Replace <image> token(s) with image embed vectors (expanding 1→N if needed)
624
+ const numReplace = imagePositions.length; // usually 1
625
+ const expandPer = Math.ceil(numImgVecs / numReplace);
626
+ const totalLen = inputsEmbeds.dims[1] - numReplace + numImgVecs;
627
+ const mergedData = new Float32Array(totalLen * embedDim);
628
+ const imgEmbeds32 = new Float32Array(imageEmbeds.data.buffer ?? imageEmbeds.data, 0, numImgVecs * embedDim);
629
+ const txtData32 = new Float32Array(inputsEmbeds.data.buffer ?? inputsEmbeds.data, 0, inputsEmbeds.dims[1] * embedDim);
630
+
631
+ let dst = 0;
632
+ let imgCursor = 0;
633
+ const imgPosSet = new Set(imagePositions);
634
+ for (let i = 0; i < ids.length; i++) {
635
+ if (imgPosSet.has(i)) {
636
+ // Insert all remaining image embed vectors at first <image> token, skip rest
637
+ if (imgCursor < numImgVecs) {
638
+ const toCopy = (i === imagePositions[0]) ? numImgVecs : 0;
639
+ mergedData.set(imgEmbeds32.subarray(0, toCopy * embedDim), dst * embedDim);
640
+ dst += toCopy;
641
+ imgCursor = numImgVecs;
642
+ }
643
+ } else {
644
+ mergedData.set(txtData32.subarray(i * embedDim, (i + 1) * embedDim), dst * embedDim);
645
+ dst++;
646
  }
 
 
 
 
 
 
 
 
647
  }
648
+ inputsEmbeds = new ort.Tensor('float32', mergedData, [1, dst, embedDim]);
649
  }
 
 
650
  statusEl.textContent = 'Generating response...';
651
  } else {
652
  statusEl.textContent = 'Generating response...';
 
657
  // Generation loop
658
  const cache = initCache();
659
  const eosId = tokenizer.eos_token_id;
660
+ const imEndId = findTokenId('<|im_end|>');
661
  const generatedTokens = [];
662
  let curLen = inputsEmbeds.dims[1];
663
  let embeds = inputsEmbeds;
 
1250
  }
1251
 
1252
.spec-val.green { color: #5a8a00; }

/* ── LOADING OVERLAY ──────────────────────────────────────────────
   Full-screen modal shown while model weights download.
   Hidden by default; presumably the loader JS switches it to
   display:flex when the download starts — verify against the JS. */
#loading-overlay {
  display: none;
  position: fixed;
  inset: 0;
  z-index: 100;
  background: rgba(8,8,8,0.97);
  flex-direction: column;
  align-items: center;
  justify-content: center;
  backdrop-filter: blur(4px);
}
.lo-inner {
  width: min(540px, 90vw);
  display: flex;
  flex-direction: column;
  gap: 28px;
}
.lo-title {
  font-family: 'Bebas Neue', sans-serif;
  font-size: 11px;
  letter-spacing: 4px;
  color: var(--accent);
  opacity: 0.6;
  margin-bottom: 4px;
}
#loading-step {
  font-family: 'DM Mono', monospace;
  font-size: 13px;
  color: var(--text);
}
.lo-bar-wrap {
  height: 6px;
  background: #1a1a1a;
  border-radius: 3px;
  overflow: visible; /* lets the glowing head dot (::after below) poke outside the track */
  position: relative;
}
#loading-bar-fill {
  height: 100%;
  background: var(--accent);
  border-radius: 3px;
  width: 0%; /* width is driven externally as the download progresses */
  transition: width 0.5s ease;
  box-shadow: 0 0 12px var(--accent);
  position: relative;
}
/* Glowing dot riding the leading edge of the progress bar. */
#loading-bar-fill::after {
  content: '';
  position: absolute;
  right: -1px; top: -3px;
  width: 12px; height: 12px;
  background: var(--accent);
  border-radius: 50%;
  box-shadow: 0 0 10px var(--accent), 0 0 20px var(--accent);
}
.lo-stats {
  display: flex;
  justify-content: space-between;
  align-items: baseline;
}
#loading-pct {
  font-family: 'Bebas Neue', sans-serif;
  font-size: 52px;
  letter-spacing: 2px;
  color: var(--accent);
  line-height: 1;
}
.lo-right {
  display: flex;
  flex-direction: column;
  align-items: flex-end;
  gap: 5px;
}
#loading-bytes {
  font-family: 'DM Mono', monospace;
  font-size: 11px;
  color: #555;
}
#loading-eta {
  font-family: 'DM Mono', monospace;
  font-size: 10px;
  color: var(--muted);
  text-align: right;
  max-width: 300px;
}
/* Checklist of the download stages (tokenizer / embedder / vision / decoder). */
.lo-steps {
  display: flex;
  flex-direction: column;
  border: 1px solid var(--border);
}
.lo-step-row {
  display: flex;
  align-items: center;
  gap: 12px;
  padding: 9px 14px;
  border-bottom: 1px solid var(--border);
}
.lo-step-row:last-child { border-bottom: none; }
.lo-step-dot {
  width: 6px; height: 6px;
  border-radius: 50%;
  background: #1e1e1e;
  flex-shrink: 0;
  transition: background 0.3s;
}
.lo-step-dot.active {
  background: var(--accent);
  box-shadow: 0 0 8px var(--accent);
  /* NOTE(review): 'thinkBounce' keyframes are not visible in this chunk —
     presumably defined earlier in this stylesheet; confirm. */
  animation: thinkBounce 1s infinite;
}
.lo-step-dot.done { background: #3a6a00; animation: none; }
.lo-step-label {
  font-family: 'DM Mono', monospace;
  font-size: 10px;
  color: #2a2a2a;
  transition: color 0.3s;
  flex: 1;
}
.lo-step-label.active { color: var(--text); }
.lo-step-label.done { color: #3a6a00; }
.lo-step-size {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #1e1e1e;
}
#loading-file {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #2a2a2a;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  max-width: 100%;
  margin-top: 4px;
}
.lo-note {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #1e1e1e;
  line-height: 1.8;
}
#progress-detail {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #444;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  max-width: 200px;
}
</style>
</head>
<body>
 
1422
<!-- Inline download progress strip (distinct from the #loading-overlay modal). -->
<div id="progress">
  <div class="progress-track"><div id="progress-bar"></div></div>
  <span id="progress-text"></span>
  <span id="progress-detail"></span>
</div>
<div class="model-tag">LFM2.5-VL-1.6B · ONNX · WebGPU</div>
<button id="load-btn">Load Model</button>
 
</div>
</div>
 
1507
<!-- Loading overlay: full-screen progress UI for the ~1.5 GB model download.
     Hidden via CSS (display:none) until loading begins; the dots/labels below
     presumably get .active/.done classes from the loader JS — confirm. -->
<div id="loading-overlay">
  <div class="lo-inner">
    <div>
      <div class="lo-title">DOWNLOADING MODEL</div>
      <div id="loading-step">Initializing...</div>
      <div id="loading-file"></div>
    </div>

    <div>
      <div class="lo-stats">
        <div id="loading-pct">0%</div>
        <div class="lo-right">
          <div id="loading-bytes">0 MB / ~1.5 GB</div>
          <div id="loading-eta">Calculating speed...</div>
        </div>
      </div>
      <div class="lo-bar-wrap" style="margin-top:12px">
        <div id="loading-bar-fill"></div>
      </div>
    </div>

    <!-- The four download stages, in order. -->
    <div class="lo-steps">
      <div class="lo-step-row" id="step-row-1">
        <div class="lo-step-dot" id="step-dot-1"></div>
        <div class="lo-step-label" id="step-lbl-1">Tokenizer</div>
        <div class="lo-step-size">~5 MB</div>
      </div>
      <div class="lo-step-row" id="step-row-2">
        <div class="lo-step-dot" id="step-dot-2"></div>
        <div class="lo-step-label" id="step-lbl-2">Token Embedder</div>
        <div class="lo-step-size">~30 MB</div>
      </div>
      <div class="lo-step-row" id="step-row-3">
        <div class="lo-step-dot" id="step-dot-3"></div>
        <div class="lo-step-label" id="step-lbl-3">Vision Encoder (SigLIP2)</div>
        <div class="lo-step-size">~400 MB</div>
      </div>
      <div class="lo-step-row" id="step-row-4">
        <div class="lo-step-dot" id="step-dot-4"></div>
        <div class="lo-step-label" id="step-lbl-4">Language Decoder (Q4)</div>
        <div class="lo-step-size">~1.1 GB</div>
      </div>
    </div>

    <div class="lo-note">
      ⚡ First load downloads ~1.5 GB from Hugging Face.<br>
      🔒 Everything runs 100% in-browser — zero data leaves your device.<br>
      🛜 Keep this tab open. Do not refresh during download.
    </div>
  </div>
</div>
</body>
</html>