Upload README.md with huggingface_hub
README.md CHANGED
```diff
@@ -22,6 +22,7 @@ tags:
 - reasoning
 - onnx
 - onnxruntime
+- webgpu
 base_model:
 - LiquidAI/LFM2.5-1.2B-Thinking
 ---
```
@@ -153,6 +154,135 @@ else:

This hunk inserts a new "WebGPU (Browser)" section after the existing Python example (which ends with `print(full_response)`) and before the "## Output Format" section:
|
## WebGPU (Browser)

### Installation

```bash
npm install onnxruntime-web @huggingface/transformers
```

### Enable WebGPU

WebGPU is required for browser inference. To enable it:

1. **Chrome/Edge**: Navigate to `chrome://flags/#enable-unsafe-webgpu`, enable the flag, and restart the browser
2. **Verify**: Check `chrome://gpu` for the "WebGPU" status
3. **Test**: Run `navigator.gpu.requestAdapter()` in the DevTools console (a quick standalone check is sketched below)
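
As a quick sanity check, the snippet below (a minimal sketch using only the standard WebGPU API) requests an adapter and prints a couple of its reported limits, which matter for large models; if `requestAdapter()` resolves to `null`, WebGPU is not usable on the page:

```javascript
// Paste into the DevTools console on a page served over HTTPS or localhost.
const adapter = await navigator.gpu?.requestAdapter();
if (!adapter) {
  console.log("WebGPU unavailable: check chrome://gpu and the browser flag.");
} else {
  // adapter.limits reports device capabilities relevant to model-sized buffers.
  console.log("maxBufferSize:", adapter.limits.maxBufferSize);
  console.log("maxStorageBufferBindingSize:", adapter.limits.maxStorageBufferBindingSize);
}
```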

### Inference

```javascript
import * as ort from "onnxruntime-web/webgpu";
import { AutoTokenizer } from "@huggingface/transformers";

// Check WebGPU availability
if (!navigator.gpu) {
  throw new Error("WebGPU not available. Enable at chrome://flags/#enable-unsafe-webgpu");
}
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
  throw new Error("WebGPU adapter not found. Check chrome://gpu for status.");
}

ort.env.wasm.numThreads = 1;

const modelId = "LiquidAI/LFM2.5-1.2B-Thinking-ONNX";
const modelBase = `https://huggingface.co/${modelId}/resolve/main`;

// Load tokenizer
const tokenizer = await AutoTokenizer.from_pretrained(modelId);

// Load ONNX session with external data (the weights live in a separate .onnx_data file)
const onnxPath = `${modelBase}/onnx/model_q4.onnx`;
const dataPath = `${modelBase}/onnx/model_q4.onnx_data`;
const session = await ort.InferenceSession.create(onnxPath, {
  executionProviders: ["webgpu"],
  externalData: [{ path: "model_q4.onnx_data", data: dataPath }],
});

// Model config (from config.json)
const hiddenSize = 2048;
const numKVHeads = 8;
const headDim = 256;

// Initialize the cache: conv states for the convolution layers,
// empty key/value tensors for the attention layers
function initCache() {
  const cache = {};
  for (const name of session.inputNames) {
    if (name.startsWith("past_conv")) {
      cache[name] = new ort.Tensor("float32", new Float32Array(hiddenSize * 3), [1, hiddenSize, 3]);
    } else if (name.startsWith("past_key_values")) {
      cache[name] = new ort.Tensor("float32", new Float32Array(0), [1, numKVHeads, 0, headDim]);
    }
  }
  return cache;
}

// Feed each step's "present" outputs back in as the next step's "past" inputs
function updateCache(cache, outputs) {
  for (const [name, tensor] of Object.entries(outputs)) {
    if (name.startsWith("present_conv")) {
      cache[name.replace("present_conv", "past_conv")] = tensor;
    } else if (name.startsWith("present.")) {
      cache[name.replace("present.", "past_key_values.")] = tensor;
    }
  }
}

// Build prompt and tokenize
const messages = [{ role: "user", content: "What is 25 * 37?" }];
const prompt = tokenizer.apply_chat_template(messages, { add_generation_prompt: true, tokenize: false });
const inputIds = tokenizer.encode(prompt);

// Generation loop
const cache = initCache();
const eosTokenId = tokenizer.eos_token_id;
const generatedTokens = [];
let curLen = inputIds.length;
let ids = inputIds;

for (let step = 0; step < 512; step++) {
  const inputIdsTensor = new ort.Tensor("int64", new BigInt64Array(ids.map(BigInt)), [1, ids.length]);
  const attentionMask = new ort.Tensor("int64", new BigInt64Array(curLen).fill(1n), [1, curLen]);

  const outputs = await session.run({ input_ids: inputIdsTensor, attention_mask: attentionMask, ...cache });

  // Greedy decode: argmax over the last position's logits
  // (a plain loop avoids spreading a vocab-sized array into Math.max,
  // which can overflow the call stack)
  const logits = outputs.logits;
  const vocabSize = logits.dims[2];
  const lastLogits = logits.data.slice((logits.dims[1] - 1) * vocabSize);
  let nextToken = 0;
  for (let i = 1; i < vocabSize; i++) {
    if (lastLogits[i] > lastLogits[nextToken]) nextToken = i;
  }

  generatedTokens.push(nextToken);
  if (nextToken === eosTokenId) break;

  updateCache(cache, outputs);
  ids = [nextToken];
  curLen++;
}

// Parse thinking and response
const fullResponse = tokenizer.decode(generatedTokens, { skip_special_tokens: true });
const thinkMatch = fullResponse.match(/<think>([\s\S]*?)<\/think>/);
if (thinkMatch) {
  const thinking = thinkMatch[1].trim();
  const answer = fullResponse.slice(thinkMatch.index + thinkMatch[0].length).trim();
  console.log("Thinking:", thinking);
  console.log("Answer:", answer);
} else {
  console.log(fullResponse);
}
```
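
The loop above decodes greedily, which suits deterministic math and reasoning prompts. If you want sampled output instead, a temperature-sampling replacement for the argmax step might look like the following sketch (a drop-in for the `nextToken` computation; the `sampleToken` helper and the `temperature` value are illustrative, not part of this repo):

```javascript
// Hypothetical replacement for the greedy argmax inside the generation loop.
function sampleToken(lastLogits, temperature = 0.7) {
  // Softmax with temperature, computed stably by subtracting the max logit.
  let max = -Infinity;
  for (let i = 0; i < lastLogits.length; i++) if (lastLogits[i] > max) max = lastLogits[i];
  const probs = new Float32Array(lastLogits.length);
  let sum = 0;
  for (let i = 0; i < lastLogits.length; i++) {
    probs[i] = Math.exp((lastLogits[i] - max) / temperature);
    sum += probs[i];
  }
  // Inverse-CDF sampling over the unnormalized weights.
  let r = Math.random() * sum;
  for (let i = 0; i < probs.length; i++) {
    r -= probs[i];
    if (r <= 0) return i;
  }
  return probs.length - 1;
}
```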

### WebGPU Notes

- Recommended: `model_q4.onnx` for the best performance/quality balance
- For higher quality: `model_fp16.onnx` (see the sketch after this list)
- Models use external data files (`.onnx_data`) that are loaded automatically
- int64 tensors require `BigInt64Array`
- Reasoning models may generate long outputs; adjust the max-token limit as needed
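
To switch variants, only the file names passed to the session change. A minimal sketch, assuming the fp16 weights follow the same `onnx/` layout as the q4 files and also ship with an external `model_fp16.onnx_data` file:

```javascript
// Higher-quality fp16 variant; needs more GPU memory than model_q4.onnx.
const session = await ort.InferenceSession.create(`${modelBase}/onnx/model_fp16.onnx`, {
  executionProviders: ["webgpu"],
  externalData: [{ path: "model_fp16.onnx_data", data: `${modelBase}/onnx/model_fp16.onnx_data` }],
});
```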

## Output Format

The model produces output in two parts: