Update index.html
Browse files · index.html: +585 -125
index.html (CHANGED)
Removed lines (old file numbering; the collapsed viewer truncated several of them):

@@ -16,36 +16,218 @@
-  26  // Conversation history stored as array of {role, content} where content may have image
-  30  const statusEl
-  31  const progressEl
-  32  const progressBar
-  33  const progressText
-  44  const removeImageBtn
@@ -54,7 +236,7 @@
-  57  storageInfo.textContent = `
@@ -64,81 +246,81 @@
-  70  // Configure ONNX runtime
-  73  // Step 1:
-  75  progressBar.style.width = '5%';
-  76  progressText.textContent = '5%';
-  81  progressBar.style.width = '20%';
-  82  progressText.textContent = '20%';
-  83  embedTokens = await loadOrtSession('embed_tokens_fp16', 1);
-  85  // Step 3: Load embed_images session
-  86  statusEl.textContent = 'Loading vision encoder (~400MB)...';
-  87  progressBar.style.width = '40%';
-  88  progressText.textContent = '40%';
-  89  embedImages = await loadOrtSession('embed_images_fp16', 1);
-  91  // Step 4: Load decoder (largest, q4 ~1.1GB)
-  92  statusEl.textContent = 'Loading language decoder (~1.1GB)...';
-  93  progressBar.style.width = '60%';
-  94  progressText.textContent = '60%';
-  95  decoder = await loadOrtSession('decoder_q4', 1);
-  97  progressBar.style.width = '100%';
-  98  progressText.textContent = '100%';
-  99  statusEl.textContent = 'Model ready — running fully on your device (WebGPU)';
- 106  inputEl.placeholder = 'Ask anything... (optionally attach an image)';
- 110  addSystemMessage('✓ LFM2.5-VL-1.6B loaded
- 121  addSystemMessage(`Error loading model: ${err.message}`);
- 126  async function loadOrtSession(name, dataFiles = 1) {
- 127    const onnxUrl = `${MODEL_BASE}/onnx/${name}.onnx`;
- 128    const externalData = [];
- 129    for (let i = 0; i < dataFiles; i++) {
- 130      const suffix = i === 0 ? '' : `_${i}`;
- 131      externalData.push({
- 132        path: `${name}.onnx_data${suffix}`,
- 133        data: `${MODEL_BASE}/onnx/${name}.onnx_data${suffix}`
- 134      });
- 135    }
- 136    return ort.InferenceSession.create(onnxUrl, {
- 137      executionProviders: ['webgpu'],
- 138      externalData,
- 139    });
- 140  }
@@ -162,55 +344,99 @@
- 170  // Resize to
- 175  else { w = Math.round(w * maxDim / h); h = maxDim; }
- 178  w = Math.max(
- 179  h = Math.max(
- 207  resolve({ pixelValues, pixelAttentionMask, spatialShapes,
@@ -353,46 +579,74 @@
- 357  pixel_values:
- 359  spatial_shapes:
- 384  imgIdx++;
- 385  dstIdx++;
- 386  } else {
- 387  // Copy text embedding
- 388  for (let d = 0; d < embedDim; d++) {
- 389    mergedData[dstIdx * embedDim + d] = inputsEmbeds.data[i * embedDim + d];
- 390  }
- 391  dstIdx++;
- 395  inputsEmbeds = new ort.Tensor('float32', mergedData, [1, totalLen, embedDim]);
@@ -403,7 +657,7 @@
- 406  const imEndId =
@@ -996,6 +1250,158 @@
@@ -1016,6 +1422,7 @@
@@ -1097,5 +1504,58 @@
Updated file content (new numbering; lines added by this commit are marked +):

  16    const HEAD_DIM = 128;
  17    const MAX_NEW_TOKENS = 512;
  18
  19 +  // Approximate file sizes in bytes for progress weighting
  20 +  const FILE_SIZES = {
  21 +    'embed_tokens_fp16.onnx_data': 30 * 1024 * 1024,
  22 +    'embed_images_fp16.onnx_data': 400 * 1024 * 1024,
  23 +    'decoder_q4.onnx_data': 1100 * 1024 * 1024,
  24 +  };
  25 +  const TOTAL_BYTES = Object.values(FILE_SIZES).reduce((a, b) => a + b, 0);
  26 +
  27 +  // Per-file downloaded bytes tracker
  28 +  const downloadedBytes = {};
  29 +  let compilingPhase = false;
  30 +
  31    let tokenizer = null;
  32    let embedTokens = null;
  33    let embedImages = null;
  34    let decoder = null;
  35    let isLoaded = false;
  36    let isGenerating = false;
  37    let chatHistory = [];
  38
  39    const $ = id => document.getElementById(id);
  40 +  const statusEl = $('status');
  41 +  const progressEl = $('progress');
  42 +  const progressBar = $('progress-bar');
  43 +  const progressText = $('progress-text');
  44 +  const progressDetail = $('progress-detail');
  45 +  const chatContainer = $('chat-container');
  46 +  const inputEl = $('user-input');
  47 +  const sendBtn = $('send-btn');
  48 +  const loadBtn = $('load-btn');
  49 +  const storageInfo = $('storage-info');
  50 +  const cacheIndicator = $('cache-indicator');
  51 +  const imageBtn = $('image-btn');
  52 +  const imageInput = $('image-input');
  53 +  const imagePreview = $('image-preview');
  54    const imagePreviewImg = $('image-preview-img');
  55 +  const removeImageBtn = $('remove-image-btn');
  56 +  const loadingOverlay = $('loading-overlay');
  57 +  const loadingStep = $('loading-step');
  58 +  const loadingFile = $('loading-file');
  59 +  const loadingBytes = $('loading-bytes');
  60 +  const loadingEta = $('loading-eta');
  61 +  const loadingBarFill = $('loading-bar-fill');
  62 +  const loadingPct = $('loading-pct');
  63 +
  64 +  let currentImageData = null;
  65 +
  66 +  // ─── SPEED / ETA TRACKING ────────────────────────────────────────────────
  67 +  let downloadStart = null;
  68 +  let lastSpeedBytes = 0;
  69 +  let lastSpeedTime = 0;
  70 +  let speedSamples = [];
  71 +
  72 +  function formatBytes(bytes) {
  73 +    if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`;
  74 +    return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
  75 +  }
  76 +
  77 +  function formatSpeed(bps) {
  78 +    if (bps < 1024 * 1024) return `${(bps / 1024).toFixed(0)} KB/s`;
  79 +    return `${(bps / 1024 / 1024).toFixed(1)} MB/s`;
  80 +  }
  81 +
  82 +  function formatEta(seconds) {
  83 +    if (!isFinite(seconds) || seconds < 0) return '—';
  84 +    if (seconds < 60) return `~${Math.ceil(seconds)}s`;
  85 +    return `~${Math.ceil(seconds / 60)}min ${Math.ceil(seconds % 60)}s`;
  86 +  }
  87 +
  88 +  function updateOverallProgress(currentFile) {
  89 +    const total = Object.values(downloadedBytes).reduce((a, b) => a + b, 0);
  90 +    const pct = Math.min(99, Math.round((total / TOTAL_BYTES) * 100));
  91 +
  92 +    // Speed calculation (rolling average over last 5 samples)
  93 +    const now = Date.now();
  94 +    if (lastSpeedTime) {
  95 +      const dt = (now - lastSpeedTime) / 1000;
  96 +      const db = total - lastSpeedBytes;
  97 +      if (dt > 0.3) {
  98 +        const sample = db / dt;
  99 +        speedSamples.push(sample);
 100 +        if (speedSamples.length > 8) speedSamples.shift();
 101 +        lastSpeedBytes = total;
 102 +        lastSpeedTime = now;
 103 +      }
 104 +    } else {
 105 +      downloadStart = now;
 106 +      lastSpeedTime = now;
 107 +      lastSpeedBytes = 0;
 108 +    }
 109 +
 110 +    const avgSpeed = speedSamples.length
 111 +      ? speedSamples.reduce((a, b) => a + b, 0) / speedSamples.length
 112 +      : 0;
 113 +    const remaining = avgSpeed > 0 ? (TOTAL_BYTES - total) / avgSpeed : Infinity;
 114 +
 115 +    // Update big overlay
 116 +    loadingBarFill.style.width = `${pct}%`;
 117 +    loadingPct.textContent = `${pct}%`;
 118 +    loadingBytes.textContent = `${formatBytes(total)} / ${formatBytes(TOTAL_BYTES)}`;
 119 +    if (avgSpeed > 0) {
 120 +      loadingEta.textContent = `${formatSpeed(avgSpeed)} · ETA ${formatEta(remaining)}`;
 121 +    }
 122 +
 123 +    // Update header mini bar
 124 +    progressBar.style.width = `${pct}%`;
 125 +    progressText.textContent = `${pct}%`;
 126 +
 127 +    // Current file label
 128 +    if (currentFile) {
 129 +      const fileBytes = downloadedBytes[currentFile] || 0;
 130 +      const fileTotal = FILE_SIZES[currentFile] || 0;
 131 +      const filePct = fileTotal ? Math.min(100, Math.round(fileBytes / fileTotal * 100)) : 0;
 132 +      loadingFile.textContent = currentFile;
 133 +      if (progressDetail) progressDetail.textContent = `${currentFile} — ${filePct}%`;
 134 +    }
 135 +  }
 136 +
 137 +  // ─── FETCH WITH PROGRESS ─────────────────────────────────────────────────
 138 +  async function fetchWithProgress(url, label) {
 139 +    const key = label;
 140 +    downloadedBytes[key] = 0;
 141 +
 142 +    const resp = await fetch(url);
 143 +    if (!resp.ok) throw new Error(`HTTP ${resp.status} for ${url}`);
 144 +
 145 +    const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
 146 +    const knownSize = FILE_SIZES[label] || contentLength || 1;
 147 +
 148 +    const reader = resp.body.getReader();
 149 +    const chunks = [];
 150 +    let received = 0;
 151 +
 152 +    while (true) {
 153 +      const { done, value } = await reader.read();
 154 +      if (done) break;
 155 +      chunks.push(value);
 156 +      received += value.length;
 157 +      downloadedBytes[key] = received;
 158 +      updateOverallProgress(label);
 159 +    }
 160 +
 161 +    // Merge chunks
 162 +    const total = chunks.reduce((s, c) => s + c.length, 0);
 163 +    const merged = new Uint8Array(total);
 164 +    let offset = 0;
 165 +    for (const c of chunks) { merged.set(c, offset); offset += c.length; }
 166 +    downloadedBytes[key] = merged.length;
 167 +    return merged;
 168 +  }
 169 +
 170 +  // ─── LOAD ORT SESSION WITH PROGRESS ──────────────────────────────────────
 171 +  async function loadOrtSessionWithProgress(name, stepLabel, stepNum) {
 172 +    const onnxUrl = `${MODEL_BASE}/onnx/${name}.onnx`;
 173 +    const dataLabel = `${name}.onnx_data`;
 174 +    const dataUrl = `${MODEL_BASE}/onnx/${dataLabel}`;
 175 +
 176 +    setStep(stepLabel, 'Fetching model header...', stepNum);
 177 +
 178 +    // Fetch the small .onnx file (just the graph, no weights)
 179 +    const onnxResp = await fetch(onnxUrl);
 180 +    if (!onnxResp.ok) throw new Error(`Failed to fetch ${name}.onnx`);
 181 +    const onnxBuffer = await onnxResp.arrayBuffer();
 182 +
 183 +    // Fetch the large external data with progress
 184 +    setStep(stepLabel, `Downloading ${dataLabel}...`);
 185 +    const dataBuffer = await fetchWithProgress(dataUrl, dataLabel);
 186 +
 187 +    // Compiling phase
 188 +    setStep(stepLabel, 'Compiling WebGPU shaders...');
 189 +    loadingEta.textContent = 'Compiling shaders — this can take 30–60s, please wait...';
 190 +    compilingPhase = true;
 191 +
 192 +    const session = await ort.InferenceSession.create(onnxBuffer, {
 193 +      executionProviders: ['webgpu'],
 194 +      externalData: [{ path: dataLabel, data: dataBuffer.buffer }],
 195 +    });
 196 +
 197 +    compilingPhase = false;
 198 +    return session;
 199 +  }
 200
 201 +  // Step index mapping
 202 +  const STEP_MAP = {
 203 +    1: 'Step 1 / 4',
 204 +    2: 'Step 2 / 4',
 205 +    3: 'Step 3 / 4',
 206 +    4: 'Step 4 / 4',
 207 +  };
 208 +
 209 +  function setStep(step, file, stepNum) {
 210 +    loadingStep.textContent = step;
 211 +    loadingFile.textContent = file;
 212 +    statusEl.textContent = `${step} — ${file}`;
 213 +
 214 +    // Update step dots
 215 +    for (let i = 1; i <= 4; i++) {
 216 +      const dot = document.getElementById(`step-dot-${i}`);
 217 +      const lbl = document.getElementById(`step-lbl-${i}`);
 218 +      if (!dot) continue;
 219 +      if (stepNum && i < stepNum) {
 220 +        dot.className = 'lo-step-dot done';
 221 +        lbl.className = 'lo-step-label done';
 222 +      } else if (stepNum && i === stepNum) {
 223 +        dot.className = 'lo-step-dot active';
 224 +        lbl.className = 'lo-step-label active';
 225 +      } else {
 226 +        dot.className = 'lo-step-dot';
 227 +        lbl.className = 'lo-step-label';
 228 +      }
 229 +    }
 230 +  }
 231
 232    // ─── CACHE CHECK ─────────────────────────────────────────────────────────
 233    async function checkCache() {

 236      const est = await navigator.storage.estimate();
 237      const usedMB = ((est.usage || 0) / 1024 / 1024).toFixed(0);
 238      const quotaGB = ((est.quota || 0) / 1024 / 1024 / 1024).toFixed(1);
 239 +    storageInfo.textContent = `${usedMB}MB used / ${quotaGB}GB available`;
 240      }
 241    } catch(e) {}
 242    }

 246    loadBtn.disabled = true;
 247    loadBtn.textContent = 'Loading...';
 248    progressEl.style.display = 'flex';
 249 +  loadingOverlay.style.display = 'flex';
 250    $('welcome').style.display = 'none';
 251 +  downloadStart = Date.now();
 252
 253    try {
 254      ort.env.wasm.numThreads = 1;
 255
 256 +    // Step 1: Tokenizer
 257 +    setStep('Step 1 / 4 — Tokenizer', 'Downloading config files...', 1);
 258      tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
 259 +    setStep('Step 1 / 4 — Tokenizer', 'Done ✓', 1);
 260 +
 261 +    // Step 2: Token embedder (~30MB)
 262 +    embedTokens = await loadOrtSessionWithProgress(
 263 +      'embed_tokens_fp16',
 264 +      'Step 2 / 4 — Token Embedder (~30 MB)',
 265 +      2
 266 +    );
 267 +
 268 +    // Step 3: Vision encoder (~400MB)
 269 +    embedImages = await loadOrtSessionWithProgress(
 270 +      'embed_images_fp16',
 271 +      'Step 3 / 4 — Vision Encoder (~400 MB)',
 272 +      3
 273 +    );
 274 +
 275 +    // Step 4: Decoder (~1.1GB)
 276 +    decoder = await loadOrtSessionWithProgress(
 277 +      'decoder_q4',
 278 +      'Step 4 / 4 — Language Decoder (~1.1 GB)',
 279 +      4
 280 +    );
 281 +
 282 +    // Done! Mark all steps done
 283 +    for (let i = 1; i <= 4; i++) {
 284 +      const dot = document.getElementById(`step-dot-${i}`);
 285 +      const lbl = document.getElementById(`step-lbl-${i}`);
 286 +      if (dot) { dot.className = 'lo-step-dot done'; lbl.className = 'lo-step-label done'; }
 287 +    }
 288 +    loadingBarFill.style.width = '100%';
 289 +    loadingPct.textContent = '100%';
 290 +    loadingEta.textContent = `Completed in ${((Date.now() - downloadStart) / 1000).toFixed(0)}s`;
 291
 292 +    await new Promise(r => setTimeout(r, 600));
 293 +    loadingOverlay.style.display = 'none';
 294      progressEl.style.display = 'none';
 295
 296      isLoaded = true;
 297      inputEl.disabled = false;
 298      sendBtn.disabled = false;
 299      imageBtn.disabled = false;
 300 +    inputEl.placeholder = 'Ask anything... (optionally attach an image 🖼)';
 301      loadBtn.style.display = 'none';
 302 +    statusEl.textContent = 'Model ready — running fully on your device';
 303 +    cacheIndicator.innerHTML = `<span class="dot cached"></span> Model running on-device`;
 304 +    cacheIndicator.classList.add('has-cache');
 305
 306      checkCache();
 307 +    addSystemMessage('✓ LFM2.5-VL-1.6B loaded. Runs 100% in-browser via WebGPU. Attach an image or just chat!');
 308
 309    } catch(err) {
 310      console.error(err);
 311 +    loadingOverlay.style.display = 'none';
 312      progressEl.style.display = 'none';
 313      loadBtn.disabled = false;
 314      loadBtn.textContent = 'Retry Load';
 315 +    statusEl.textContent = `Error: ${err.message.slice(0, 80)}`;
 316 +    if (err.message.includes('WebGPU') || err.message.includes('gpu')) {
 317 +      addSystemMessage('⚠️ WebGPU not supported. Please use Chrome 113+ or Edge 113+ and check chrome://flags/#enable-unsafe-webgpu');
 318      } else {
 319 +      addSystemMessage(`❌ Error loading model: ${err.message}\n\nTry refreshing and loading again — large file downloads sometimes fail.`);
 320      }
 321    }
 322    }
 323

 324    // ─── IMAGE HANDLING ───────────────────────────────────────────────────────
 325    imageBtn.addEventListener('click', () => imageInput.click());
 326

 344      imagePreviewImg.src = '';
 345    });
 346
 347 +  // ── IMAGE PROCESSING ──────────────────────────────────────────────────────
 348 +  // SigLIP2 NaFlex expects patches of 16x16 pixels from tiles of up to 512x512.
 349 +  // embed_images_fp16 input shape:
 350 +  //   pixel_values:         [total_patches, 3, 16, 16]  (rank 4 — one entry per patch)
 351 +  //   pixel_attention_mask: [total_patches, seq_per_patch] where seq_per_patch = (512/16)^2 = 1024...
 352 +  //   spatial_shapes:       [num_tiles, 2] each row = [nPatchH, nPatchW] for that tile
 353 +  // We use a single tile resized to ≤512x512, snapped to multiples of 16.
 354 +
 355 +  const PATCH_SIZE = 16;   // pixel patch size
 356 +  const MAX_TILE = 512;    // max tile dimension
 357 +
 358    async function processImage(dataUrl) {
 359      return new Promise((resolve, reject) => {
 360        const img = new Image();
 361        img.onload = () => {
 362 +        // Resize to fit inside MAX_TILE x MAX_TILE preserving aspect ratio
 363          let w = img.width, h = img.height;
 364 +        if (w > MAX_TILE || h > MAX_TILE) {
 365 +          if (w >= h) { h = Math.round(h * MAX_TILE / w); w = MAX_TILE; }
 366 +          else { w = Math.round(w * MAX_TILE / h); h = MAX_TILE; }
 367          }
 368 +        // Snap to nearest multiple of PATCH_SIZE
 369 +        w = Math.max(PATCH_SIZE, Math.round(w / PATCH_SIZE) * PATCH_SIZE);
 370 +        h = Math.max(PATCH_SIZE, Math.round(h / PATCH_SIZE) * PATCH_SIZE);
 371
 372          const canvas = document.createElement('canvas');
 373          canvas.width = w; canvas.height = h;
 374          const ctx = canvas.getContext('2d');
 375          ctx.drawImage(img, 0, 0, w, h);
 376 +        const rgba = ctx.getImageData(0, 0, w, h).data;
 377 +
 378 +        // Number of patches in each dimension for this single tile
 379 +        const nPatchH = h / PATCH_SIZE;   // rows of patches
 380 +        const nPatchW = w / PATCH_SIZE;   // cols of patches
 381 +        const totalPatches = nPatchH * nPatchW;
 382 +
 383 +        // Build pixel_values: [totalPatches, 3, PATCH_SIZE, PATCH_SIZE]
 384 +        // Normalise: (x/255 - 0.5) / 0.5  (SigLIP2 mean=0.5, std=0.5)
 385 +        const patchElems = 3 * PATCH_SIZE * PATCH_SIZE;
 386 +        const pvData = new Float32Array(totalPatches * patchElems);
 387 +
 388 +        for (let pr = 0; pr < nPatchH; pr++) {
 389 +          for (let pc = 0; pc < nPatchW; pc++) {
 390 +            const patchIdx = pr * nPatchW + pc;
 391 +            for (let py = 0; py < PATCH_SIZE; py++) {
 392 +              for (let px = 0; px < PATCH_SIZE; px++) {
 393 +                const imgY = pr * PATCH_SIZE + py;
 394 +                const imgX = pc * PATCH_SIZE + px;
 395 +                const pixOff = (imgY * w + imgX) * 4;   // RGBA offset in imageData
 396 +                const base = patchIdx * patchElems;
 397 +                // channel-first: [3, PATCH_SIZE, PATCH_SIZE]
 398 +                pvData[base + 0 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 0] / 255 - 0.5) / 0.5;
 399 +                pvData[base + 1 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 1] / 255 - 0.5) / 0.5;
 400 +                pvData[base + 2 * PATCH_SIZE * PATCH_SIZE + py * PATCH_SIZE + px] = (rgba[pixOff + 2] / 255 - 0.5) / 0.5;
 401 +              }
 402 +            }
 403 +          }
 404        }
 405
 406 +        // pixel_attention_mask: [totalPatches, nPatchH * nPatchW] — all ones (all patches valid)
 407 +        // Each patch attends to all other patches within the same tile
 408 +        const seqPerPatch = nPatchH * nPatchW;
 409 +        const pamData = new BigInt64Array(totalPatches * seqPerPatch).fill(1n);
 410 +
 411 +        // spatial_shapes: [1, 2] — one tile with shape [nPatchH, nPatchW]
 412 +        const ssData = new BigInt64Array([BigInt(nPatchH), BigInt(nPatchW)]);
 413 +
 414 +        const pixelValues = new ort.Tensor('float32', pvData, [totalPatches, 3, PATCH_SIZE, PATCH_SIZE]);
 415 +        const pixelAttentionMask = new ort.Tensor('int64', pamData, [totalPatches, seqPerPatch]);
 416 +        const spatialShapes = new ort.Tensor('int64', ssData, [1, 2]);
 417
 418 +        resolve({ pixelValues, pixelAttentionMask, spatialShapes, nPatchH, nPatchW, totalPatches });
 419        };
 420        img.onerror = reject;
 421        img.src = dataUrl;
 422      });
 423    }
 424
 425 +  // Look up the integer token ID for a special token string by scanning the vocab
 426 +  function findTokenId(tokStr) {
 427 +    // Transformers.js tokenizer exposes vocab via .vocab or ._tokenizer.model.vocab
 428 +    try {
 429 +      const vocab = tokenizer.vocab || tokenizer._tokenizer?.model?.vocab;
 430 +      if (vocab && vocab[tokStr] !== undefined) return vocab[tokStr];
 431 +    } catch(e) {}
 432 +    // Fallback: encode the bare string and take the first token
 433 +    try {
 434 +      const ids = tokenizer.encode(tokStr, { add_special_tokens: false });
 435 +      if (ids && ids.length > 0) return ids[0];
 436 +    } catch(e) {}
 437 +    return null;
 438 +  }
 439 +
 440    // ─── HELPERS ─────────────────────────────────────────────────────────────
 441    async function getTextEmbeddings(ids) {
 442      const tensor = new ort.Tensor('int64',

 579      const imgData = await processImage(attachedImage);
 580
 581      // Get image embeddings from vision encoder
 582 +    // Output is image_features: [total_image_tokens, hidden_size]
 583      const imgOut = await embedImages.run({
 584 +      pixel_values: imgData.pixelValues,
 585        pixel_attention_mask: imgData.pixelAttentionMask,
 586 +      spatial_shapes: imgData.spatialShapes,
 587      });
 588 +    // The output key may vary — grab the first output tensor
 589 +    const imageEmbeds = imgOut.image_features
 590 +      || imgOut.outputs
 591 +      || imgOut[Object.keys(imgOut)[0]];
 592 +    console.log('Image embed output keys:', Object.keys(imgOut));
 593 +    console.log('Image embed shape:', imageEmbeds.dims);
 594 +
 595 +    // Find <image> token positions in the token sequence.
 596 +    // In Transformers.js, use findTokenId() helper — no .convert_tokens_to_ids()
 597 +    const imageTokenId = findTokenId('<image>');
 598 +    console.log('Image token ID:', imageTokenId);
 599      const ids = Array.from(inputIds);
 600 +
 601 +    // Count how many image positions we have
 602      const imagePositions = ids.reduce((acc, id, i) => {
 603        if (id === imageTokenId) acc.push(i);
 604        return acc;
 605      }, []);
 606 +    console.log('Image token positions:', imagePositions.length, 'image embeds:', imageEmbeds.dims[0]);
 607
 608 +    // The vision encoder returns one embedding vector per image token slot.
 609 +    // We replace each <image> token embedding with the corresponding image embed.
 610 +    // If there are more image embed vectors than <image> tokens, we expand:
 611 +    // the single <image> token placeholder is replaced by ALL image embed vectors.
 612      const embedDim = inputsEmbeds.dims[2];
 613 +    const numImgVecs = imageEmbeds.dims[0];   // actual number of image feature vectors
 614 +
 615 +    if (imagePositions.length === 0) {
 616 +      // No <image> placeholder in tokenised text — just prepend image embeds
 617 +      const totalLen = numImgVecs + inputsEmbeds.dims[1];
 618 +      const mergedData = new Float32Array(totalLen * embedDim);
 619 +      mergedData.set(new Float32Array(imageEmbeds.data.buffer, imageEmbeds.data.byteOffset, numImgVecs * embedDim), 0);
 620 +      mergedData.set(new Float32Array(inputsEmbeds.data.buffer, inputsEmbeds.data.byteOffset, inputsEmbeds.dims[1] * embedDim), numImgVecs * embedDim);
 621 +      inputsEmbeds = new ort.Tensor('float32', mergedData, [1, totalLen, embedDim]);
 622 +    } else {
 623 +      // Replace <image> token(s) with image embed vectors (expanding 1→N if needed)
 624 +      const numReplace = imagePositions.length;   // usually 1
 625 +      const expandPer = Math.ceil(numImgVecs / numReplace);
 626 +      const totalLen = inputsEmbeds.dims[1] - numReplace + numImgVecs;
 627 +      const mergedData = new Float32Array(totalLen * embedDim);
 628 +      const imgEmbeds32 = new Float32Array(imageEmbeds.data.buffer ?? imageEmbeds.data, 0, numImgVecs * embedDim);
 629 +      const txtData32 = new Float32Array(inputsEmbeds.data.buffer ?? inputsEmbeds.data, 0, inputsEmbeds.dims[1] * embedDim);
 630 +
 631 +      let dst = 0;
 632 +      let imgCursor = 0;
 633 +      const imgPosSet = new Set(imagePositions);
 634 +      for (let i = 0; i < ids.length; i++) {
 635 +        if (imgPosSet.has(i)) {
 636 +          // Insert all remaining image embed vectors at first <image> token, skip rest
 637 +          if (imgCursor < numImgVecs) {
 638 +            const toCopy = (i === imagePositions[0]) ? numImgVecs : 0;
 639 +            mergedData.set(imgEmbeds32.subarray(0, toCopy * embedDim), dst * embedDim);
 640 +            dst += toCopy;
 641 +            imgCursor = numImgVecs;
 642 +          }
 643 +        } else {
 644 +          mergedData.set(txtData32.subarray(i * embedDim, (i + 1) * embedDim), dst * embedDim);
 645 +          dst++;
 646        }
 647      }
 648 +      inputsEmbeds = new ort.Tensor('float32', mergedData, [1, dst, embedDim]);
 649      }
 650      statusEl.textContent = 'Generating response...';
 651    } else {
 652      statusEl.textContent = 'Generating response...';

 657    // Generation loop
 658    const cache = initCache();
 659    const eosId = tokenizer.eos_token_id;
 660 +  const imEndId = findTokenId('<|im_end|>');
 661    const generatedTokens = [];
 662    let curLen = inputsEmbeds.dims[1];
 663    let embeds = inputsEmbeds;

1250    }
1251
1252    .spec-val.green { color: #5a8a00; }
1253 +
1254 +  /* LOADING OVERLAY */
1255 +  #loading-overlay {
1256 +    display: none;
1257 +    position: fixed;
1258 +    inset: 0;
1259 +    z-index: 100;
1260 +    background: rgba(8,8,8,0.97);
1261 +    flex-direction: column;
1262 +    align-items: center;
1263 +    justify-content: center;
1264 +    backdrop-filter: blur(4px);
1265 +  }
1266 +  .lo-inner {
1267 +    width: min(540px, 90vw);
1268 +    display: flex;
1269 +    flex-direction: column;
1270 +    gap: 28px;
1271 +  }
1272 +  .lo-title {
1273 +    font-family: 'Bebas Neue', sans-serif;
1274 +    font-size: 11px;
1275 +    letter-spacing: 4px;
1276 +    color: var(--accent);
1277 +    opacity: 0.6;
1278 +    margin-bottom: 4px;
1279 +  }
1280 +  #loading-step {
1281 +    font-family: 'DM Mono', monospace;
1282 +    font-size: 13px;
1283 +    color: var(--text);
1284 +  }
1285 +  .lo-bar-wrap {
1286 +    height: 6px;
1287 +    background: #1a1a1a;
1288 +    border-radius: 3px;
1289 +    overflow: visible;
1290 +    position: relative;
1291 +  }
1292 +  #loading-bar-fill {
1293 +    height: 100%;
1294 +    background: var(--accent);
1295 +    border-radius: 3px;
1296 +    width: 0%;
1297 +    transition: width 0.5s ease;
1298 +    box-shadow: 0 0 12px var(--accent);
1299 +    position: relative;
1300 +  }
1301 +  #loading-bar-fill::after {
1302 +    content: '';
1303 +    position: absolute;
1304 +    right: -1px; top: -3px;
1305 +    width: 12px; height: 12px;
1306 +    background: var(--accent);
1307 +    border-radius: 50%;
1308 +    box-shadow: 0 0 10px var(--accent), 0 0 20px var(--accent);
1309 +  }
1310 +  .lo-stats {
1311 +    display: flex;
1312 +    justify-content: space-between;
1313 +    align-items: baseline;
1314 +  }
1315 +  #loading-pct {
1316 +    font-family: 'Bebas Neue', sans-serif;
1317 +    font-size: 52px;
1318 +    letter-spacing: 2px;
1319 +    color: var(--accent);
1320 +    line-height: 1;
1321 +  }
1322 +  .lo-right {
1323 +    display: flex;
1324 +    flex-direction: column;
1325 +    align-items: flex-end;
1326 +    gap: 5px;
1327 +  }
1328 +  #loading-bytes {
1329 +    font-family: 'DM Mono', monospace;
1330 +    font-size: 11px;
1331 +    color: #555;
1332 +  }
1333 +  #loading-eta {
1334 +    font-family: 'DM Mono', monospace;
1335 +    font-size: 10px;
1336 +    color: var(--muted);
1337 +    text-align: right;
1338 +    max-width: 300px;
1339 +  }
1340 +  .lo-steps {
1341 +    display: flex;
1342 +    flex-direction: column;
1343 +    border: 1px solid var(--border);
1344 +  }
1345 +  .lo-step-row {
1346 +    display: flex;
1347 +    align-items: center;
1348 +    gap: 12px;
1349 +    padding: 9px 14px;
1350 +    border-bottom: 1px solid var(--border);
1351 +  }
1352 +  .lo-step-row:last-child { border-bottom: none; }
1353 +  .lo-step-dot {
1354 +    width: 6px; height: 6px;
1355 +    border-radius: 50%;
1356 +    background: #1e1e1e;
1357 +    flex-shrink: 0;
1358 +    transition: background 0.3s;
1359 +  }
1360 +  .lo-step-dot.active {
1361 +    background: var(--accent);
1362 +    box-shadow: 0 0 8px var(--accent);
1363 +    animation: thinkBounce 1s infinite;
1364 +  }
1365 +  .lo-step-dot.done { background: #3a6a00; animation: none; }
1366 +  .lo-step-label {
1367 +    font-family: 'DM Mono', monospace;
1368 +    font-size: 10px;
1369 +    color: #2a2a2a;
1370 +    transition: color 0.3s;
1371 +    flex: 1;
1372 +  }
1373 +  .lo-step-label.active { color: var(--text); }
1374 +  .lo-step-label.done { color: #3a6a00; }
1375 +  .lo-step-size {
1376 +    font-family: 'DM Mono', monospace;
1377 +    font-size: 9px;
1378 +    color: #1e1e1e;
1379 +  }
1380 +  #loading-file {
1381 +    font-family: 'DM Mono', monospace;
1382 +    font-size: 9px;
1383 +    color: #2a2a2a;
1384 +    white-space: nowrap;
1385 +    overflow: hidden;
1386 +    text-overflow: ellipsis;
1387 +    max-width: 100%;
1388 +    margin-top: 4px;
1389 +  }
1390 +  .lo-note {
1391 +    font-family: 'DM Mono', monospace;
1392 +    font-size: 9px;
1393 +    color: #1e1e1e;
1394 +    line-height: 1.8;
1395 +  }
1396 +  #progress-detail {
1397 +    font-family: 'DM Mono', monospace;
1398 +    font-size: 9px;
1399 +    color: #444;
1400 +    white-space: nowrap;
1401 +    overflow: hidden;
1402 +    text-overflow: ellipsis;
1403 +    max-width: 200px;
1404 +  }
1405    </style>
1406    </head>
1407    <body>

1422    <div id="progress">
1423      <div class="progress-track"><div id="progress-bar"></div></div>
1424      <span id="progress-text"></span>
1425 +    <span id="progress-detail"></span>
1426    </div>
1427    <div class="model-tag">LFM2.5-VL-1.6B · ONNX · WebGPU</div>
1428    <button id="load-btn">Load Model</button>

1504    </div>
1505    </div>
1506
1507 +  <!-- Loading overlay -->
1508 +  <div id="loading-overlay">
1509 +    <div class="lo-inner">
1510 +      <div>
1511 +        <div class="lo-title">DOWNLOADING MODEL</div>
1512 +        <div id="loading-step">Initializing...</div>
1513 +        <div id="loading-file"></div>
1514 +      </div>
1515 +
1516 +      <div>
1517 +        <div class="lo-stats">
1518 +          <div id="loading-pct">0%</div>
1519 +          <div class="lo-right">
1520 +            <div id="loading-bytes">0 MB / ~1.5 GB</div>
1521 +            <div id="loading-eta">Calculating speed...</div>
1522 +          </div>
1523 +        </div>
1524 +        <div class="lo-bar-wrap" style="margin-top:12px">
1525 +          <div id="loading-bar-fill"></div>
1526 +        </div>
1527 +      </div>
1528 +
1529 +      <div class="lo-steps">
1530 +        <div class="lo-step-row" id="step-row-1">
1531 +          <div class="lo-step-dot" id="step-dot-1"></div>
1532 +          <div class="lo-step-label" id="step-lbl-1">Tokenizer</div>
1533 +          <div class="lo-step-size">~5 MB</div>
1534 +        </div>
1535 +        <div class="lo-step-row" id="step-row-2">
1536 +          <div class="lo-step-dot" id="step-dot-2"></div>
1537 +          <div class="lo-step-label" id="step-lbl-2">Token Embedder</div>
1538 +          <div class="lo-step-size">~30 MB</div>
1539 +        </div>
1540 +        <div class="lo-step-row" id="step-row-3">
1541 +          <div class="lo-step-dot" id="step-dot-3"></div>
1542 +          <div class="lo-step-label" id="step-lbl-3">Vision Encoder (SigLIP2)</div>
1543 +          <div class="lo-step-size">~400 MB</div>
1544 +        </div>
1545 +        <div class="lo-step-row" id="step-row-4">
1546 +          <div class="lo-step-dot" id="step-dot-4"></div>
1547 +          <div class="lo-step-label" id="step-lbl-4">Language Decoder (Q4)</div>
1548 +          <div class="lo-step-size">~1.1 GB</div>
1549 +        </div>
1550 +      </div>
1551 +
1552 +      <div class="lo-note">
1553 +        ⚡ First load downloads ~1.5 GB from Hugging Face.<br>
1554 +        🔒 Everything runs 100% in-browser — zero data leaves your device.<br>
1555 +        🛜 Keep this tab open. Do not refresh during download.
1556 +      </div>
1557 +    </div>
1558 +  </div>
1559 +
1560    </body>
1561    </html>