ykhrustalev committed
Commit cda2f30 · verified · 1 Parent(s): 133cd61

Upload README.md with huggingface_hub

Files changed (1):
  1. README.md +130 -0
README.md CHANGED
@@ -22,6 +22,7 @@ tags:
  - reasoning
  - onnx
  - onnxruntime
+ - webgpu
  base_model:
  - LiquidAI/LFM2.5-1.2B-Thinking
  ---
@@ -153,6 +154,135 @@ else:
  print(full_response)
  ```

+ ## WebGPU (Browser)
+
+ ### Installation
+
+ ```bash
+ npm install onnxruntime-web @huggingface/transformers
+ ```
+
+ ### Enable WebGPU
+
+ WebGPU is required for browser inference. To enable it:
+
+ 1. **Chrome/Edge**: Navigate to `chrome://flags/#enable-unsafe-webgpu`, enable the flag, and restart the browser
+ 2. **Verify**: Check `chrome://gpu` for the "WebGPU" status
+ 3. **Test**: Run `navigator.gpu.requestAdapter()` in the DevTools console, as shown in the snippet below
+
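+ A minimal availability check to paste into the DevTools console (a sketch using only the standard WebGPU API):
+
+ ```javascript
+ // requestAdapter() resolves to null when no suitable GPU adapter is available
+ const adapter = await navigator.gpu?.requestAdapter();
+ console.log(adapter ? "WebGPU is available" : "WebGPU is not available");
+ ```
+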
+ ### Inference
+
+ ```javascript
+ import * as ort from "onnxruntime-web/webgpu";
+ import { AutoTokenizer } from "@huggingface/transformers";
+
+ // Check WebGPU availability
+ if (!navigator.gpu) {
+   throw new Error("WebGPU not available. Enable at chrome://flags/#enable-unsafe-webgpu");
+ }
+ const adapter = await navigator.gpu.requestAdapter();
+ if (!adapter) {
+   throw new Error("WebGPU adapter not found. Check chrome://gpu for status.");
+ }
+
+ ort.env.wasm.numThreads = 1;
+
+ const modelId = "LiquidAI/LFM2.5-1.2B-Thinking-ONNX";
+ const modelBase = `https://huggingface.co/${modelId}/resolve/main`;
+
+ // Load tokenizer
+ const tokenizer = await AutoTokenizer.from_pretrained(modelId);
+
+ // Load ONNX session with external data (weights live in a separate .onnx_data file)
+ const onnxPath = `${modelBase}/onnx/model_q4.onnx`;
+ const dataPath = `${modelBase}/onnx/model_q4.onnx_data`;
+ const session = await ort.InferenceSession.create(onnxPath, {
+   executionProviders: ["webgpu"],
+   externalData: [{ path: "model_q4.onnx_data", data: dataPath }],
+ });
+
+ // Model config (from config.json)
+ const hiddenSize = 2048;
+ const numKVHeads = 8;
+ const headDim = 256;
+
+ // Initialize the cache: conv states start zero-filled, attention KV caches start empty
+ function initCache() {
+   const cache = {};
+   for (const name of session.inputNames) {
+     if (name.startsWith("past_conv")) {
+       cache[name] = new ort.Tensor("float32", new Float32Array(hiddenSize * 3), [1, hiddenSize, 3]);
+     } else if (name.startsWith("past_key_values")) {
+       cache[name] = new ort.Tensor("float32", new Float32Array(0), [1, numKVHeads, 0, headDim]);
+     }
+   }
+   return cache;
+ }
+
+ // Feed each step's present_* outputs back in as the next step's past_* inputs
+ function updateCache(cache, outputs) {
+   for (const [name, tensor] of Object.entries(outputs)) {
+     if (name.startsWith("present_conv")) {
+       cache[name.replace("present_conv", "past_conv")] = tensor;
+     } else if (name.startsWith("present.")) {
+       cache[name.replace("present.", "past_key_values.")] = tensor;
+     }
+   }
+ }
+
+ // Build prompt and tokenize
+ const messages = [{ role: "user", content: "What is 25 * 37?" }];
+ const prompt = tokenizer.apply_chat_template(messages, { add_generation_prompt: true, tokenize: false });
+ const inputIds = tokenizer.encode(prompt);
+
+ // Generation loop: prefill with the full prompt, then decode one token per step
+ const cache = initCache();
+ const eosTokenId = tokenizer.eos_token_id;
+ const generatedTokens = [];
+ let curLen = inputIds.length;
+ let ids = inputIds;
+
+ for (let step = 0; step < 512; step++) {
+   const inputIdsTensor = new ort.Tensor("int64", new BigInt64Array(ids.map(BigInt)), [1, ids.length]);
+   // The attention mask covers all cached positions plus the current input
+   const attentionMask = new ort.Tensor("int64", new BigInt64Array(curLen).fill(1n), [1, curLen]);
+
+   const outputs = await session.run({ input_ids: inputIdsTensor, attention_mask: attentionMask, ...cache });
+
+   // Greedy decode: argmax of the last token's logits. A plain loop avoids
+   // Math.max(...arr) hitting the argument-count limit on large vocabularies.
+   const logits = outputs.logits;
+   const vocabSize = logits.dims[2];
+   const lastLogits = logits.data.slice((logits.dims[1] - 1) * vocabSize);
+   let nextToken = 0;
+   for (let i = 1; i < lastLogits.length; i++) {
+     if (lastLogits[i] > lastLogits[nextToken]) nextToken = i;
+   }
+
+   generatedTokens.push(nextToken);
+   if (nextToken === eosTokenId) break;
+
+   updateCache(cache, outputs);
+   ids = [nextToken];
+   curLen++;
+ }
+
+ // Parse thinking and response
+ const fullResponse = tokenizer.decode(generatedTokens, { skip_special_tokens: true });
+ const thinkMatch = fullResponse.match(/<think>([\s\S]*?)<\/think>/);
+ if (thinkMatch) {
+   const thinking = thinkMatch[1].trim();
+   const answer = fullResponse.slice(thinkMatch.index + thinkMatch[0].length).trim();
+   console.log("Thinking:", thinking);
+   console.log("Answer:", answer);
+ } else {
+   console.log(fullResponse);
+ }
+ ```
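+
+ For interactive use, you may want to surface partial output while the loop runs. A minimal sketch, re-decoding all generated tokens each step; the `#output` element id is hypothetical:
+
+ ```javascript
+ // Hypothetical streaming helper: call inside the loop after generatedTokens.push(nextToken)
+ function renderPartial(tokens) {
+   const text = tokenizer.decode(tokens, { skip_special_tokens: true });
+   document.querySelector("#output").textContent = text; // assumes <div id="output"> on the page
+ }
+ ```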
+
+ ### WebGPU Notes
+
+ - Recommended: `model_q4.onnx` for the best performance/quality balance
+ - For higher quality: `model_fp16.onnx` (see the sketch below)
+ - Models use external data files (`.onnx_data`), which are loaded automatically
+ - int64 tensors require `BigInt64Array`
+ - Reasoning models may generate longer outputs; raise the 512-step limit in the loop above as needed
+
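+ Switching to the fp16 variant only changes the file names passed to `InferenceSession.create`. A sketch, assuming the fp16 export follows the same external-data naming pattern as the q4 one:
+
+ ```javascript
+ // Hypothetical fp16 setup: same session options, different model files
+ const fp16Session = await ort.InferenceSession.create(`${modelBase}/onnx/model_fp16.onnx`, {
+   executionProviders: ["webgpu"],
+   externalData: [{ path: "model_fp16.onnx_data", data: `${modelBase}/onnx/model_fp16.onnx_data` }],
+ });
+ ```
+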
  ## Output Format

  The model produces output in two parts: