SiddhJagani commited on
Commit
8158e83
·
verified ·
1 Parent(s): 9589b05

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +585 -125
index.html CHANGED
@@ -16,36 +16,218 @@
16
  const HEAD_DIM = 128;
17
  const MAX_NEW_TOKENS = 512;
18
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  let tokenizer = null;
20
  let embedTokens = null;
21
  let embedImages = null;
22
  let decoder = null;
23
  let isLoaded = false;
24
  let isGenerating = false;
25
-
26
- // Conversation history stored as array of {role, content} where content may have image
27
  let chatHistory = [];
28
 
29
  const $ = id => document.getElementById(id);
30
- const statusEl = $('status');
31
- const progressEl = $('progress');
32
- const progressBar = $('progress-bar');
33
- const progressText = $('progress-text');
34
- const chatContainer = $('chat-container');
35
- const inputEl = $('user-input');
36
- const sendBtn = $('send-btn');
37
- const loadBtn = $('load-btn');
38
- const storageInfo = $('storage-info');
39
- const cacheIndicator = $('cache-indicator');
40
- const imageBtn = $('image-btn');
41
- const imageInput = $('image-input');
42
- const imagePreview = $('image-preview');
 
43
  const imagePreviewImg = $('image-preview-img');
44
- const removeImageBtn = $('remove-image-btn');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- // Current attached image state
47
- let currentImageData = null; // base64 data URL
48
- let currentImagePixels = null; // processed for model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  // ─── CACHE CHECK ─────────────────────────────────────────────────────────
51
  async function checkCache() {
@@ -54,7 +236,7 @@
54
  const est = await navigator.storage.estimate();
55
  const usedMB = ((est.usage || 0) / 1024 / 1024).toFixed(0);
56
  const quotaGB = ((est.quota || 0) / 1024 / 1024 / 1024).toFixed(1);
57
- storageInfo.textContent = `Browser storage: ${usedMB}MB used / ${quotaGB}GB available`;
58
  }
59
  } catch(e) {}
60
  }
@@ -64,81 +246,81 @@
64
  loadBtn.disabled = true;
65
  loadBtn.textContent = 'Loading...';
66
  progressEl.style.display = 'flex';
 
67
  $('welcome').style.display = 'none';
 
68
 
69
  try {
70
- // Configure ONNX runtime
71
  ort.env.wasm.numThreads = 1;
72
 
73
- // Step 1: Load tokenizer
74
- statusEl.textContent = 'Loading tokenizer...';
75
- progressBar.style.width = '5%';
76
- progressText.textContent = '5%';
77
  tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- // Step 2: Load embed_tokens session
80
- statusEl.textContent = 'Loading token embedder (~30MB)...';
81
- progressBar.style.width = '20%';
82
- progressText.textContent = '20%';
83
- embedTokens = await loadOrtSession('embed_tokens_fp16', 1);
84
-
85
- // Step 3: Load embed_images session
86
- statusEl.textContent = 'Loading vision encoder (~400MB)...';
87
- progressBar.style.width = '40%';
88
- progressText.textContent = '40%';
89
- embedImages = await loadOrtSession('embed_images_fp16', 1);
90
-
91
- // Step 4: Load decoder (largest, q4 ~1.1GB)
92
- statusEl.textContent = 'Loading language decoder (~1.1GB)...';
93
- progressBar.style.width = '60%';
94
- progressText.textContent = '60%';
95
- decoder = await loadOrtSession('decoder_q4', 1);
96
-
97
- progressBar.style.width = '100%';
98
- progressText.textContent = '100%';
99
- statusEl.textContent = 'Model ready — running fully on your device (WebGPU)';
100
  progressEl.style.display = 'none';
101
 
102
  isLoaded = true;
103
  inputEl.disabled = false;
104
  sendBtn.disabled = false;
105
  imageBtn.disabled = false;
106
- inputEl.placeholder = 'Ask anything... (optionally attach an image)';
107
  loadBtn.style.display = 'none';
 
 
 
108
 
109
  checkCache();
110
- addSystemMessage('✓ LFM2.5-VL-1.6B loaded & running on-device via WebGPU. You can attach images!');
111
 
112
  } catch(err) {
113
  console.error(err);
114
- statusEl.textContent = `Error: ${err.message}`;
115
  progressEl.style.display = 'none';
116
  loadBtn.disabled = false;
117
  loadBtn.textContent = 'Retry Load';
118
- if (err.message && err.message.includes('WebGPU')) {
119
- addSystemMessage('⚠️ WebGPU not supported. Use Chrome 113+ or Edge 113+ with WebGPU enabled.');
 
120
  } else {
121
- addSystemMessage(`Error loading model: ${err.message}`);
122
  }
123
  }
124
  }
125
 
126
- async function loadOrtSession(name, dataFiles = 1) {
127
- const onnxUrl = `${MODEL_BASE}/onnx/${name}.onnx`;
128
- const externalData = [];
129
- for (let i = 0; i < dataFiles; i++) {
130
- const suffix = i === 0 ? '' : `_${i}`;
131
- externalData.push({
132
- path: `${name}.onnx_data${suffix}`,
133
- data: `${MODEL_BASE}/onnx/${name}.onnx_data${suffix}`
134
- });
135
- }
136
- return ort.InferenceSession.create(onnxUrl, {
137
- executionProviders: ['webgpu'],
138
- externalData,
139
- });
140
- }
141
-
142
  // ─── IMAGE HANDLING ───────────────────────────────────────────────────────
143
  imageBtn.addEventListener('click', () => imageInput.click());
144
 
@@ -162,55 +344,99 @@
162
  imagePreviewImg.src = '';
163
  });
164
 
165
- // Process image to pixel_values for the model
 
 
 
 
 
 
 
 
 
 
166
  async function processImage(dataUrl) {
167
  return new Promise((resolve, reject) => {
168
  const img = new Image();
169
  img.onload = () => {
170
- // Resize to max 512x512 preserving aspect ratio
171
  let w = img.width, h = img.height;
172
- const maxDim = 512;
173
- if (w > maxDim || h > maxDim) {
174
- if (w > h) { h = Math.round(h * maxDim / w); w = maxDim; }
175
- else { w = Math.round(w * maxDim / h); h = maxDim; }
176
  }
177
- // Round to multiple of 14 (patch size for SigLIP2)
178
- w = Math.max(14, Math.round(w / 14) * 14);
179
- h = Math.max(14, Math.round(h / 14) * 14);
180
 
181
  const canvas = document.createElement('canvas');
182
  canvas.width = w; canvas.height = h;
183
  const ctx = canvas.getContext('2d');
184
  ctx.drawImage(img, 0, 0, w, h);
185
- const imageData = ctx.getImageData(0, 0, w, h);
186
- const { data } = imageData;
187
-
188
- // Normalize with SigLIP2 mean/std
189
- const mean = [0.5, 0.5, 0.5];
190
- const std = [0.5, 0.5, 0.5];
191
-
192
- const nPixels = w * h;
193
- const floats = new Float32Array(3 * nPixels);
194
- for (let i = 0; i < nPixels; i++) {
195
- floats[0 * nPixels + i] = (data[i * 4 + 0] / 255 - mean[0]) / std[0];
196
- floats[1 * nPixels + i] = (data[i * 4 + 1] / 255 - mean[1]) / std[1];
197
- floats[2 * nPixels + i] = (data[i * 4 + 2] / 255 - mean[2]) / std[2];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  }
199
 
200
- const pixelValues = new ort.Tensor('float32', floats, [1, 3, h, w]);
201
- const pixelAttentionMask = new ort.Tensor('int64',
202
- new BigInt64Array(h * w).fill(1n), [1, h, w]);
203
- const numPatchH = h / 14, numPatchW = w / 14;
204
- const spatialShapes = new ort.Tensor('int64',
205
- new BigInt64Array([BigInt(numPatchH), BigInt(numPatchW)]), [1, 2]);
 
 
 
 
 
206
 
207
- resolve({ pixelValues, pixelAttentionMask, spatialShapes, width: w, height: h });
208
  };
209
  img.onerror = reject;
210
  img.src = dataUrl;
211
  });
212
  }
213
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  // ─── HELPERS ─────────────────────────────────────────────────────────────
215
  async function getTextEmbeddings(ids) {
216
  const tensor = new ort.Tensor('int64',
@@ -353,46 +579,74 @@
353
  const imgData = await processImage(attachedImage);
354
 
355
  // Get image embeddings from vision encoder
 
356
  const imgOut = await embedImages.run({
357
- pixel_values: imgData.pixelValues,
358
  pixel_attention_mask: imgData.pixelAttentionMask,
359
- spatial_shapes: imgData.spatialShapes,
360
  });
361
- const imageEmbeds = imgOut.image_features || imgOut.outputs || Object.values(imgOut)[0];
362
-
363
- // Find <image> token positions and replace with image embeddings
364
- const imageTokenId = tokenizer.convert_tokens_to_ids('<image>');
 
 
 
 
 
 
 
365
  const ids = Array.from(inputIds);
 
 
366
  const imagePositions = ids.reduce((acc, id, i) => {
367
  if (id === imageTokenId) acc.push(i);
368
  return acc;
369
  }, []);
 
370
 
371
- // Build merged embeddings: replace image token positions with image embeddings
 
 
 
372
  const embedDim = inputsEmbeds.dims[2];
373
- const totalLen = inputsEmbeds.dims[1] - imagePositions.length + imageEmbeds.dims[0];
374
- const mergedData = new Float32Array(totalLen * embedDim);
375
-
376
- let srcIdx = 0, dstIdx = 0, imgIdx = 0;
377
- for (let i = 0; i < ids.length; i++) {
378
- if (ids[i] === imageTokenId && imgIdx < imageEmbeds.dims[0]) {
379
- // Copy image embedding
380
- const embedData = imageEmbeds.data;
381
- for (let d = 0; d < embedDim; d++) {
382
- mergedData[dstIdx * embedDim + d] = embedData[imgIdx * embedDim + d];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  }
384
- imgIdx++;
385
- dstIdx++;
386
- } else {
387
- // Copy text embedding
388
- for (let d = 0; d < embedDim; d++) {
389
- mergedData[dstIdx * embedDim + d] = inputsEmbeds.data[i * embedDim + d];
390
- }
391
- dstIdx++;
392
  }
 
393
  }
394
-
395
- inputsEmbeds = new ort.Tensor('float32', mergedData, [1, totalLen, embedDim]);
396
  statusEl.textContent = 'Generating response...';
397
  } else {
398
  statusEl.textContent = 'Generating response...';
@@ -403,7 +657,7 @@
403
  // Generation loop
404
  const cache = initCache();
405
  const eosId = tokenizer.eos_token_id;
406
- const imEndId = tokenizer.convert_tokens_to_ids('<|im_end|>');
407
  const generatedTokens = [];
408
  let curLen = inputsEmbeds.dims[1];
409
  let embeds = inputsEmbeds;
@@ -996,6 +1250,158 @@
996
  }
997
 
998
  .spec-val.green { color: #5a8a00; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
  </style>
1000
  </head>
1001
  <body>
@@ -1016,6 +1422,7 @@
1016
  <div id="progress">
1017
  <div class="progress-track"><div id="progress-bar"></div></div>
1018
  <span id="progress-text"></span>
 
1019
  </div>
1020
  <div class="model-tag">LFM2.5-VL-1.6B · ONNX · WebGPU</div>
1021
  <button id="load-btn">Load Model</button>
@@ -1097,5 +1504,58 @@
1097
  </div>
1098
  </div>
1099
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1100
  </body>
1101
  </html>
 
16
  const HEAD_DIM = 128;
17
  const MAX_NEW_TOKENS = 512;
18
 
19
+ // Approximate file sizes in bytes for progress weighting
20
+ const FILE_SIZES = {
21
+ 'embed_tokens_fp16.onnx_data': 30 * 1024 * 1024,
22
+ 'embed_images_fp16.onnx_data': 400 * 1024 * 1024,
23
+ 'decoder_q4.onnx_data': 1100 * 1024 * 1024,
24
+ };
25
+ const TOTAL_BYTES = Object.values(FILE_SIZES).reduce((a, b) => a + b, 0);
26
+
27
+ // Per-file downloaded bytes tracker
28
+ const downloadedBytes = {};
29
+ let compilingPhase = false;
30
+
31
  let tokenizer = null;
32
  let embedTokens = null;
33
  let embedImages = null;
34
  let decoder = null;
35
  let isLoaded = false;
36
  let isGenerating = false;
 
 
37
  let chatHistory = [];
38
 
39
  const $ = id => document.getElementById(id);
40
+ const statusEl = $('status');
41
+ const progressEl = $('progress');
42
+ const progressBar = $('progress-bar');
43
+ const progressText = $('progress-text');
44
+ const progressDetail = $('progress-detail');
45
+ const chatContainer = $('chat-container');
46
+ const inputEl = $('user-input');
47
+ const sendBtn = $('send-btn');
48
+ const loadBtn = $('load-btn');
49
+ const storageInfo = $('storage-info');
50
+ const cacheIndicator = $('cache-indicator');
51
+ const imageBtn = $('image-btn');
52
+ const imageInput = $('image-input');
53
+ const imagePreview = $('image-preview');
54
  const imagePreviewImg = $('image-preview-img');
55
+ const removeImageBtn = $('remove-image-btn');
56
+ const loadingOverlay = $('loading-overlay');
57
+ const loadingStep = $('loading-step');
58
+ const loadingFile = $('loading-file');
59
+ const loadingBytes = $('loading-bytes');
60
+ const loadingEta = $('loading-eta');
61
+ const loadingBarFill = $('loading-bar-fill');
62
+ const loadingPct = $('loading-pct');
63
+
64
+ let currentImageData = null;
65
+
66
+ // ─── SPEED / ETA TRACKING ────────────────────────────────────────────────
67
+ let downloadStart = null;
68
+ let lastSpeedBytes = 0;
69
+ let lastSpeedTime = 0;
70
+ let speedSamples = [];
71
+
72
+ function formatBytes(bytes) {
73
+ if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
74
+ return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
75
+ }
76
+
77
+ function formatSpeed(bps) {
78
+ if (bps < 1024 * 1024) return `${(bps / 1024).toFixed(0)} KB/s`;
79
+ return `${(bps / 1024 / 1024).toFixed(1)} MB/s`;
80
+ }
81
+
82
+ function formatEta(seconds) {
83
+ if (!isFinite(seconds) || seconds < 0) return '—';
84
+ if (seconds < 60) return `~${Math.ceil(seconds)}s`;
85
+ return `~${Math.ceil(seconds / 60)}min ${Math.ceil(seconds % 60)}s`;
86
+ }
87
+
88
+ function updateOverallProgress(currentFile) {
89
+ const total = Object.values(downloadedBytes).reduce((a, b) => a + b, 0);
90
+ const pct = Math.min(99, Math.round((total / TOTAL_BYTES) * 100));
91
+
92
+ // Speed calculation (rolling average over last 5 samples)
93
+ const now = Date.now();
94
+ if (lastSpeedTime) {
95
+ const dt = (now - lastSpeedTime) / 1000;
96
+ const db = total - lastSpeedBytes;
97
+ if (dt > 0.3) {
98
+ const sample = db / dt;
99
+ speedSamples.push(sample);
100
+ if (speedSamples.length > 8) speedSamples.shift();
101
+ lastSpeedBytes = total;
102
+ lastSpeedTime = now;
103
+ }
104
+ } else {
105
+ downloadStart = now;
106
+ lastSpeedTime = now;
107
+ lastSpeedBytes = 0;
108
+ }
109
+
110
+ const avgSpeed = speedSamples.length
111
+ ? speedSamples.reduce((a, b) => a + b, 0) / speedSamples.length
112
+ : 0;
113
+ const remaining = avgSpeed > 0 ? (TOTAL_BYTES - total) / avgSpeed : Infinity;
114
+
115
+ // Update big overlay
116
+ loadingBarFill.style.width = `${pct}%`;
117
+ loadingPct.textContent = `${pct}%`;
118
+ loadingBytes.textContent = `${formatBytes(total)} / ${formatBytes(TOTAL_BYTES)}`;
119
+ if (avgSpeed > 0) {
120
+ loadingEta.textContent = `${formatSpeed(avgSpeed)} · ETA ${formatEta(remaining)}`;
121
+ }
122
+
123
+ // Update header mini bar
124
+ progressBar.style.width = `${pct}%`;
125
+ progressText.textContent = `${pct}%`;
126
+
127
+ // Current file label
128
+ if (currentFile) {
129
+ const fileBytes = downloadedBytes[currentFile] || 0;
130
+ const fileTotal = FILE_SIZES[currentFile] || 0;
131
+ const filePct = fileTotal ? Math.min(100, Math.round(fileBytes / fileTotal * 100)) : 0;
132
+ loadingFile.textContent = currentFile;
133
+ if (progressDetail) progressDetail.textContent = `${currentFile} — ${filePct}%`;
134
+ }
135
+ }
136
+
137
+ // ─── FETCH WITH PROGRESS ─────────────────────────────────────────────────
138
+ async function fetchWithProgress(url, label) {
139
+ const key = label;
140
+ downloadedBytes[key] = 0;
141
+
142
+ const resp = await fetch(url);
143
+ if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
144
+
145
+ const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
146
+ const knownSize = FILE_SIZES[label] || contentLength || 1;
147
+
148
+ const reader = resp.body.getReader();
149
+ const chunks = [];
150
+ let received = 0;
151
+
152
+ while (true) {
153
+ const { done, value } = await reader.read();
154
+ if (done) break;
155
+ chunks.push(value);
156
+ received += value.length;
157
+ downloadedBytes[key] = received;
158
+ updateOverallProgress(label);
159
+ }
160
+
161
+ // Merge chunks
162
+ const total = chunks.reduce((s, c) => s + c.length, 0);
163
+ const merged = new Uint8Array(total);
164
+ let offset = 0;
165
+ for (const c of chunks) { merged.set(c, offset); offset += c.length; }
166
+ downloadedBytes[key] = merged.length;
167
+ return merged;
168
+ }
169
+
170
+ // ─── LOAD ORT SESSION WITH PROGRESS ──────────────────────────────────────
171
+ async function loadOrtSessionWithProgress(name, stepLabel, stepNum) {
172
+ const onnxUrl = `${MODEL_BASE}/onnx/${name}.onnx`;
173
+ const dataLabel = `${name}.onnx_data`;
174
+ const dataUrl = `${MODEL_BASE}/onnx/${dataLabel}`;
175
+
176
+ setStep(stepLabel, 'Fetching model header...', stepNum);
177
+
178
+ // Fetch the small .onnx file (just the graph, no weights)
179
+ const onnxResp = await fetch(onnxUrl);
180
+ if (!onnxResp.ok) throw new Error(`Failed to fetch ${name}.onnx`);
181
+ const onnxBuffer = await onnxResp.arrayBuffer();
182
+
183
+ // Fetch the large external data with progress
184
+ setStep(stepLabel, `Downloading ${dataLabel}...`);
185
+ const dataBuffer = await fetchWithProgress(dataUrl, dataLabel);
186
+
187
+ // Compiling phase
188
+ setStep(stepLabel, 'Compiling WebGPU shaders...');
189
+ loadingEta.textContent = 'Compiling shaders — this can take 30–60s, please wait...';
190
+ compilingPhase = true;
191
+
192
+ const session = await ort.InferenceSession.create(onnxBuffer, {
193
+ executionProviders: ['webgpu'],
194
+ externalData: [{ path: dataLabel, data: dataBuffer.buffer }],
195
+ });
196
+
197
+ compilingPhase = false;
198
+ return session;
199
+ }
200
 
201
+ // Step index mapping
202
+ const STEP_MAP = {
203
+ 1: 'Step 1 / 4',
204
+ 2: 'Step 2 / 4',
205
+ 3: 'Step 3 / 4',
206
+ 4: 'Step 4 / 4',
207
+ };
208
+
209
+ function setStep(step, file, stepNum) {
210
+ loadingStep.textContent = step;
211
+ loadingFile.textContent = file;
212
+ statusEl.textContent = `${step} — ${file}`;
213
+
214
+ // Update step dots
215
+ for (let i = 1; i <= 4; i++) {
216
+ const dot = document.getElementById(`step-dot-${i}`);
217
+ const lbl = document.getElementById(`step-lbl-${i}`);
218
+ if (!dot) continue;
219
+ if (stepNum && i < stepNum) {
220
+ dot.className = 'lo-step-dot done';
221
+ lbl.className = 'lo-step-label done';
222
+ } else if (stepNum && i === stepNum) {
223
+ dot.className = 'lo-step-dot active';
224
+ lbl.className = 'lo-step-label active';
225
+ } else {
226
+ dot.className = 'lo-step-dot';
227
+ lbl.className = 'lo-step-label';
228
+ }
229
+ }
230
+ }
231
 
232
  // ─── CACHE CHECK ─────────────────────────────────────────────────────────
233
  async function checkCache() {
 
236
  const est = await navigator.storage.estimate();
237
  const usedMB = ((est.usage || 0) / 1024 / 1024).toFixed(0);
238
  const quotaGB = ((est.quota || 0) / 1024 / 1024 / 1024).toFixed(1);
239
+ storageInfo.textContent = `${usedMB}MB used / ${quotaGB}GB available`;
240
  }
241
  } catch(e) {}
242
  }
 
246
  loadBtn.disabled = true;
247
  loadBtn.textContent = 'Loading...';
248
  progressEl.style.display = 'flex';
249
+ loadingOverlay.style.display = 'flex';
250
  $('welcome').style.display = 'none';
251
+ downloadStart = Date.now();
252
 
253
  try {
 
254
  ort.env.wasm.numThreads = 1;
255
 
256
+ // Step 1: Tokenizer
257
+ setStep('Step 1 / 4 — Tokenizer', 'Downloading config files...', 1);
 
 
258
  tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
259
+ setStep('Step 1 / 4 — Tokenizer', 'Done ✓', 1);
260
+
261
+ // Step 2: Token embedder (~30MB)
262
+ embedTokens = await loadOrtSessionWithProgress(
263
+ 'embed_tokens_fp16',
264
+ 'Step 2 / 4 — Token Embedder (~30 MB)',
265
+ 2
266
+ );
267
+
268
+ // Step 3: Vision encoder (~400MB)
269
+ embedImages = await loadOrtSessionWithProgress(
270
+ 'embed_images_fp16',
271
+ 'Step 3 / 4 — Vision Encoder (~400 MB)',
272
+ 3
273
+ );
274
+
275
+ // Step 4: Decoder (~1.1GB)
276
+ decoder = await loadOrtSessionWithProgress(
277
+ 'decoder_q4',
278
+ 'Step 4 / 4 — Language Decoder (~1.1 GB)',
279
+ 4
280
+ );
281
+
282
+ // Done! Mark all steps done
283
+ for (let i = 1; i <= 4; i++) {
284
+ const dot = document.getElementById(`step-dot-${i}`);
285
+ const lbl = document.getElementById(`step-lbl-${i}`);
286
+ if (dot) { dot.className = 'lo-step-dot done'; lbl.className = 'lo-step-label done'; }
287
+ }
288
+ loadingBarFill.style.width = '100%';
289
+ loadingPct.textContent = '100%';
290
+ loadingEta.textContent = `Completed in ${((Date.now() - downloadStart) / 1000).toFixed(0)}s`;
291
 
292
+ await new Promise(r => setTimeout(r, 600));
293
+ loadingOverlay.style.display = 'none';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  progressEl.style.display = 'none';
295
 
296
  isLoaded = true;
297
  inputEl.disabled = false;
298
  sendBtn.disabled = false;
299
  imageBtn.disabled = false;
300
+ inputEl.placeholder = 'Ask anything... (optionally attach an image 🖼)';
301
  loadBtn.style.display = 'none';
302
+ statusEl.textContent = 'Model ready — running fully on your device';
303
+ cacheIndicator.innerHTML = `<span class="dot cached"></span> Model running on-device`;
304
+ cacheIndicator.classList.add('has-cache');
305
 
306
  checkCache();
307
+ addSystemMessage('✓ LFM2.5-VL-1.6B loaded. Runs 100% in-browser via WebGPU. Attach an image or just chat!');
308
 
309
  } catch(err) {
310
  console.error(err);
311
+ loadingOverlay.style.display = 'none';
312
  progressEl.style.display = 'none';
313
  loadBtn.disabled = false;
314
  loadBtn.textContent = 'Retry Load';
315
+ statusEl.textContent = `Error: ${err.message.slice(0, 80)}`;
316
+ if (err.message.includes('WebGPU') || err.message.includes('gpu')) {
317
+ addSystemMessage('⚠️ WebGPU not supported. Please use Chrome 113+ or Edge 113+ and check chrome://flags/#enable-unsafe-webgpu');
318
  } else {
319
+ addSystemMessage(`Error loading model: ${err.message}\n\nTry refreshing and loading again — large file downloads sometimes fail.`);
320
  }
321
  }
322
  }
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  // ─── IMAGE HANDLING ───────────────────────────────────────────────────────
325
  imageBtn.addEventListener('click', () => imageInput.click());
326
 
 
344
  imagePreviewImg.src = '';
345
  });
346
 
347
+ // ── IMAGE PROCESSING ──────────────────────────────────────────────────────
348
+ // SigLIP2 NaFlex expects patches of 16x16 pixels from tiles of up to 512x512.
349
+ // embed_images_fp16 input shape:
350
+ // pixel_values: [total_patches, 3, 16, 16] (rank 4 — one entry per patch)
351
+ // pixel_attention_mask:[total_patches, seq_per_patch] where seq_per_patch = (512/16)^2 = 1024...
352
+ // spatial_shapes: [num_tiles, 2] each row = [nPatchH, nPatchW] for that tile
353
+ // We use a single tile resized to ≤512x512, snapped to multiples of 16.
354
+
355
+ const PATCH_SIZE = 16; // pixel patch size
356
+ const MAX_TILE = 512; // max tile dimension
357
+
358
  async function processImage(dataUrl) {
359
  return new Promise((resolve, reject) => {
360
  const img = new Image();
361
  img.onload = () => {
362
+ // Resize to fit inside MAX_TILE x MAX_TILE preserving aspect ratio
363
  let w = img.width, h = img.height;
364
+ if (w > MAX_TILE || h > MAX_TILE) {
365
+ if (w >= h) { h = Math.round(h * MAX_TILE / w); w = MAX_TILE; }
366
+ else { w = Math.round(w * MAX_TILE / h); h = MAX_TILE; }
 
367
  }
368
+ // Snap to nearest multiple of PATCH_SIZE
369
+ w = Math.max(PATCH_SIZE, Math.round(w / PATCH_SIZE) * PATCH_SIZE);
370
+ h = Math.max(PATCH_SIZE, Math.round(h / PATCH_SIZE) * PATCH_SIZE);
371
 
372
  const canvas = document.createElement('canvas');
373
  canvas.width = w; canvas.height = h;
374
  const ctx = canvas.getContext('2d');
375
  ctx.drawImage(img, 0, 0, w, h);
376
+ const rgba = ctx.getImageData(0, 0, w, h).data;
377
+
378
+ // Number of patches in each dimension for this single tile
379
+ const nPatchH = h / PATCH_SIZE; // rows of patches
380
+ const nPatchW = w / PATCH_SIZE; // cols of patches
381
+ const totalPatches = nPatchH * nPatchW;
382
+
383
+ // Build pixel_values: [totalPatches, 3, PATCH_SIZE, PATCH_SIZE]
384
+ // Normalise: (x/255 - 0.5) / 0.5 (SigLIP2 mean=0.5, std=0.5)
385
+ const patchElems = 3 * PATCH_SIZE * PATCH_SIZE;
386
+ const pvData = new Float32Array(totalPatches * patchElems);
387
+
388
+ for (let pr = 0; pr < nPatchH; pr++) {
389
+ for (let pc = 0; pc < nPatchW; pc++) {
390
+ const patchIdx = pr * nPatchW + pc;
391
+ for (let py = 0; py < PATCH_SIZE; py++) {
392
+ for (let px = 0; px < PATCH_SIZE; px++) {
393
+ const imgY = pr * PATCH_SIZE + py;
394
+ const imgX = pc * PATCH_SIZE + px;
395
+ const pixOff = (imgY * w + imgX) * 4; // RGBA offset in imageData
396
+ const base = patchIdx * patchElems;
397
+ // channel-first: [3, PATCH_SIZE, PATCH_SIZE]
398
+ pvData[base + 0 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 0] / 255 - 0.5) / 0.5;
399
+ pvData[base + 1 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 1] / 255 - 0.5) / 0.5;
400
+ pvData[base + 2 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 2] / 255 - 0.5) / 0.5;
401
+ }
402
+ }
403
+ }
404
  }
405
 
406
+ // pixel_attention_mask: [totalPatches, nPatchH * nPatchW] all ones (all patches valid)
407
+ // Each patch attends to all other patches within the same tile
408
+ const seqPerPatch = nPatchH * nPatchW;
409
+ const pamData = new BigInt64Array(totalPatches * seqPerPatch).fill(1n);
410
+
411
+ // spatial_shapes: [1, 2] — one tile with shape [nPatchH, nPatchW]
412
+ const ssData = new BigInt64Array([BigInt(nPatchH), BigInt(nPatchW)]);
413
+
414
+ const pixelValues = new ort.Tensor('float32', pvData, [totalPatches, 3, PATCH_SIZE, PATCH_SIZE]);
415
+ const pixelAttentionMask = new ort.Tensor('int64', pamData, [totalPatches, seqPerPatch]);
416
+ const spatialShapes = new ort.Tensor('int64', ssData, [1, 2]);
417
 
418
+ resolve({ pixelValues, pixelAttentionMask, spatialShapes, nPatchH, nPatchW, totalPatches });
419
  };
420
  img.onerror = reject;
421
  img.src = dataUrl;
422
  });
423
  }
424
 
425
+ // Look up the integer token ID for a special token string by scanning the vocab
426
+ function findTokenId(tokStr) {
427
+ // Transformers.js tokenizer exposes vocab via .vocab or ._tokenizer.model.vocab
428
+ try {
429
+ const vocab = tokenizer.vocab || tokenizer._tokenizer?.model?.vocab;
430
+ if (vocab && vocab[tokStr] !== undefined) return vocab[tokStr];
431
+ } catch(e) {}
432
+ // Fallback: encode the bare string and take the first token
433
+ try {
434
+ const ids = tokenizer.encode(tokStr, { add_special_tokens: false });
435
+ if (ids && ids.length > 0) return ids[0];
436
+ } catch(e) {}
437
+ return null;
438
+ }
439
+
440
  // ─── HELPERS ─────────────────────────────────────────────────────────────
441
  async function getTextEmbeddings(ids) {
442
  const tensor = new ort.Tensor('int64',
 
579
  const imgData = await processImage(attachedImage);
580
 
581
  // Get image embeddings from vision encoder
582
+ // Output is image_features: [total_image_tokens, hidden_size]
583
  const imgOut = await embedImages.run({
584
+ pixel_values: imgData.pixelValues,
585
  pixel_attention_mask: imgData.pixelAttentionMask,
586
+ spatial_shapes: imgData.spatialShapes,
587
  });
588
+ // The output key may vary grab the first output tensor
589
+ const imageEmbeds = imgOut.image_features
590
+ || imgOut.outputs
591
+ || imgOut[Object.keys(imgOut)[0]];
592
+ console.log('Image embed output keys:', Object.keys(imgOut));
593
+ console.log('Image embed shape:', imageEmbeds.dims);
594
+
595
+ // Find <image> token positions in the token sequence.
596
+ // In Transformers.js, use findTokenId() helper — no .convert_tokens_to_ids()
597
+ const imageTokenId = findTokenId('<image>');
598
+ console.log('Image token ID:', imageTokenId);
599
  const ids = Array.from(inputIds);
600
+
601
+ // Count how many image positions we have
602
  const imagePositions = ids.reduce((acc, id, i) => {
603
  if (id === imageTokenId) acc.push(i);
604
  return acc;
605
  }, []);
606
+ console.log('Image token positions:', imagePositions.length, 'image embeds:', imageEmbeds.dims[0]);
607
 
608
+ // The vision encoder returns one embedding vector per image token slot.
609
+ // We replace each <image> token embedding with the corresponding image embed.
610
+ // If there are more image embed vectors than <image> tokens, we expand:
611
+ // the single <image> token placeholder is replaced by ALL image embed vectors.
612
  const embedDim = inputsEmbeds.dims[2];
613
+ const numImgVecs = imageEmbeds.dims[0]; // actual number of image feature vectors
614
+
615
+ if (imagePositions.length === 0) {
616
+ // No <image> placeholder in tokenised text just prepend image embeds
617
+ const totalLen = numImgVecs + inputsEmbeds.dims[1];
618
+ const mergedData = new Float32Array(totalLen * embedDim);
619
+ mergedData.set(new Float32Array(imageEmbeds.data.buffer, imageEmbeds.data.byteOffset, numImgVecs * embedDim), 0);
620
+ mergedData.set(new Float32Array(inputsEmbeds.data.buffer, inputsEmbeds.data.byteOffset, inputsEmbeds.dims[1] * embedDim), numImgVecs * embedDim);
621
+ inputsEmbeds = new ort.Tensor('float32', mergedData, [1, totalLen, embedDim]);
622
+ } else {
623
+ // Replace <image> token(s) with image embed vectors (expanding 1→N if needed)
624
+ const numReplace = imagePositions.length; // usually 1
625
+ const expandPer = Math.ceil(numImgVecs / numReplace);
626
+ const totalLen = inputsEmbeds.dims[1] - numReplace + numImgVecs;
627
+ const mergedData = new Float32Array(totalLen * embedDim);
628
+ const imgEmbeds32 = new Float32Array(imageEmbeds.data.buffer ?? imageEmbeds.data, 0, numImgVecs * embedDim);
629
+ const txtData32 = new Float32Array(inputsEmbeds.data.buffer ?? inputsEmbeds.data, 0, inputsEmbeds.dims[1] * embedDim);
630
+
631
+ let dst = 0;
632
+ let imgCursor = 0;
633
+ const imgPosSet = new Set(imagePositions);
634
+ for (let i = 0; i < ids.length; i++) {
635
+ if (imgPosSet.has(i)) {
636
+ // Insert all remaining image embed vectors at first <image> token, skip rest
637
+ if (imgCursor < numImgVecs) {
638
+ const toCopy = (i === imagePositions[0]) ? numImgVecs : 0;
639
+ mergedData.set(imgEmbeds32.subarray(0, toCopy * embedDim), dst * embedDim);
640
+ dst += toCopy;
641
+ imgCursor = numImgVecs;
642
+ }
643
+ } else {
644
+ mergedData.set(txtData32.subarray(i * embedDim, (i + 1) * embedDim), dst * embedDim);
645
+ dst++;
646
  }
 
 
 
 
 
 
 
 
647
  }
648
+ inputsEmbeds = new ort.Tensor('float32', mergedData, [1, dst, embedDim]);
649
  }
 
 
650
  statusEl.textContent = 'Generating response...';
651
  } else {
652
  statusEl.textContent = 'Generating response...';
 
657
  // Generation loop
658
  const cache = initCache();
659
  const eosId = tokenizer.eos_token_id;
660
+ const imEndId = findTokenId('<|im_end|>');
661
  const generatedTokens = [];
662
  let curLen = inputsEmbeds.dims[1];
663
  let embeds = inputsEmbeds;
 
1250
  }
1251
 
1252
.spec-val.green { color: #5a8a00; }

/* ── LOADING OVERLAY ──────────────────────────────────────────────
   Full-screen modal shown while model weights download.
   Hidden by default; presumably the loader JS switches it to
   display:flex when the download starts — verify against the JS. */
#loading-overlay {
  display: none;
  position: fixed;
  inset: 0;
  z-index: 100;
  background: rgba(8,8,8,0.97);
  flex-direction: column;
  align-items: center;
  justify-content: center;
  backdrop-filter: blur(4px);
}
.lo-inner {
  width: min(540px, 90vw);
  display: flex;
  flex-direction: column;
  gap: 28px;
}
.lo-title {
  font-family: 'Bebas Neue', sans-serif;
  font-size: 11px;
  letter-spacing: 4px;
  color: var(--accent);
  opacity: 0.6;
  margin-bottom: 4px;
}
#loading-step {
  font-family: 'DM Mono', monospace;
  font-size: 13px;
  color: var(--text);
}
.lo-bar-wrap {
  height: 6px;
  background: #1a1a1a;
  border-radius: 3px;
  overflow: visible; /* lets the glowing head dot (::after below) poke outside the track */
  position: relative;
}
#loading-bar-fill {
  height: 100%;
  background: var(--accent);
  border-radius: 3px;
  width: 0%; /* width is driven externally as the download progresses */
  transition: width 0.5s ease;
  box-shadow: 0 0 12px var(--accent);
  position: relative;
}
/* Glowing dot riding the leading edge of the progress bar. */
#loading-bar-fill::after {
  content: '';
  position: absolute;
  right: -1px; top: -3px;
  width: 12px; height: 12px;
  background: var(--accent);
  border-radius: 50%;
  box-shadow: 0 0 10px var(--accent), 0 0 20px var(--accent);
}
.lo-stats {
  display: flex;
  justify-content: space-between;
  align-items: baseline;
}
#loading-pct {
  font-family: 'Bebas Neue', sans-serif;
  font-size: 52px;
  letter-spacing: 2px;
  color: var(--accent);
  line-height: 1;
}
.lo-right {
  display: flex;
  flex-direction: column;
  align-items: flex-end;
  gap: 5px;
}
#loading-bytes {
  font-family: 'DM Mono', monospace;
  font-size: 11px;
  color: #555;
}
#loading-eta {
  font-family: 'DM Mono', monospace;
  font-size: 10px;
  color: var(--muted);
  text-align: right;
  max-width: 300px;
}
/* Checklist of the download stages (tokenizer / embedder / vision / decoder). */
.lo-steps {
  display: flex;
  flex-direction: column;
  border: 1px solid var(--border);
}
.lo-step-row {
  display: flex;
  align-items: center;
  gap: 12px;
  padding: 9px 14px;
  border-bottom: 1px solid var(--border);
}
.lo-step-row:last-child { border-bottom: none; }
.lo-step-dot {
  width: 6px; height: 6px;
  border-radius: 50%;
  background: #1e1e1e;
  flex-shrink: 0;
  transition: background 0.3s;
}
.lo-step-dot.active {
  background: var(--accent);
  box-shadow: 0 0 8px var(--accent);
  /* NOTE(review): 'thinkBounce' keyframes are not visible in this chunk —
     presumably defined earlier in this stylesheet; confirm. */
  animation: thinkBounce 1s infinite;
}
.lo-step-dot.done { background: #3a6a00; animation: none; }
.lo-step-label {
  font-family: 'DM Mono', monospace;
  font-size: 10px;
  color: #2a2a2a;
  transition: color 0.3s;
  flex: 1;
}
.lo-step-label.active { color: var(--text); }
.lo-step-label.done { color: #3a6a00; }
.lo-step-size {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #1e1e1e;
}
#loading-file {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #2a2a2a;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  max-width: 100%;
  margin-top: 4px;
}
.lo-note {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #1e1e1e;
  line-height: 1.8;
}
#progress-detail {
  font-family: 'DM Mono', monospace;
  font-size: 9px;
  color: #444;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  max-width: 200px;
}
</style>
</head>
<body>
 
1422
<!-- Inline download progress strip (distinct from the #loading-overlay modal). -->
<div id="progress">
  <div class="progress-track"><div id="progress-bar"></div></div>
  <span id="progress-text"></span>
  <span id="progress-detail"></span>
</div>
<div class="model-tag">LFM2.5-VL-1.6B · ONNX · WebGPU</div>
<button id="load-btn">Load Model</button>
 
</div>
</div>
 
1507
<!-- Loading overlay: full-screen progress UI for the ~1.5 GB model download.
     Hidden via CSS (display:none) until loading begins; the dots/labels below
     presumably get .active/.done classes from the loader JS — confirm. -->
<div id="loading-overlay">
  <div class="lo-inner">
    <div>
      <div class="lo-title">DOWNLOADING MODEL</div>
      <div id="loading-step">Initializing...</div>
      <div id="loading-file"></div>
    </div>

    <div>
      <div class="lo-stats">
        <div id="loading-pct">0%</div>
        <div class="lo-right">
          <div id="loading-bytes">0 MB / ~1.5 GB</div>
          <div id="loading-eta">Calculating speed...</div>
        </div>
      </div>
      <div class="lo-bar-wrap" style="margin-top:12px">
        <div id="loading-bar-fill"></div>
      </div>
    </div>

    <!-- The four download stages, in order. -->
    <div class="lo-steps">
      <div class="lo-step-row" id="step-row-1">
        <div class="lo-step-dot" id="step-dot-1"></div>
        <div class="lo-step-label" id="step-lbl-1">Tokenizer</div>
        <div class="lo-step-size">~5 MB</div>
      </div>
      <div class="lo-step-row" id="step-row-2">
        <div class="lo-step-dot" id="step-dot-2"></div>
        <div class="lo-step-label" id="step-lbl-2">Token Embedder</div>
        <div class="lo-step-size">~30 MB</div>
      </div>
      <div class="lo-step-row" id="step-row-3">
        <div class="lo-step-dot" id="step-dot-3"></div>
        <div class="lo-step-label" id="step-lbl-3">Vision Encoder (SigLIP2)</div>
        <div class="lo-step-size">~400 MB</div>
      </div>
      <div class="lo-step-row" id="step-row-4">
        <div class="lo-step-dot" id="step-dot-4"></div>
        <div class="lo-step-label" id="step-lbl-4">Language Decoder (Q4)</div>
        <div class="lo-step-size">~1.1 GB</div>
      </div>
    </div>

    <div class="lo-note">
      ⚡ First load downloads ~1.5 GB from Hugging Face.<br>
      🔒 Everything runs 100% in-browser — zero data leaves your device.<br>
      🛜 Keep this tab open. Do not refresh during download.
    </div>
  </div>
</div>
</body>
</html>