Spaces:
Running
Running
| // core/engine.js β the inference ENGINE adapter (the only module that touches the wasm | |
| // tokenizer + the WebGPU `gpu` object). DOM-free. Wraps a model that core/loader.js has | |
| // already loaded onto the GPU and exposes a clean, UI-agnostic API: | |
| // | |
| // const engine = await createEngine(modelEntry, { gpu, info, imageKappa }); | |
| // const { text, outIds } = await engine.generate(ids, { onToken, signal }); | |
| // const rec = await engine.buildReceipt({ ... }); // PROV-O, re-derivable (Law L5) | |
| // | |
| // The token loop, framing, memo and receipt logic are lifted byte-for-byte from the | |
| // original index.html think()/run()/sealReceipt β only the DOM writes are replaced by an | |
| // onToken callback and the running/handedOff flags by an AbortSignal, so output (and the | |
| // receipt ΞΊ) is identical to the original app. | |
| import { qvac_tokenize, qvac_continue, kappa } from "../pkg/holospaces_web.js"; | |
| import { clean, didHolo, kappaTokens, sealReceipt, verifyIntegrity, idBytes, kappaBytes } from "./kappa.js"; | |
| const _perf = () => (typeof performance !== "undefined" ? performance.now() : 0); | |
| const _sleep = (ms) => new Promise((r) => setTimeout(r, ms)); | |
| // The engine is itself a content-addressed object β hash the wasm once (lazy). | |
| let _engineK = null; | |
| export async function engineKappa() { | |
| if (_engineK) return _engineK; | |
| try { const b = new Uint8Array(await (await fetch(new URL("../pkg/holospaces_web_bg.wasm", import.meta.url))).arrayBuffer()); _engineK = await kappaBytes(b); } | |
| catch { _engineK = "did:holo:sha256:(engine unavailable)"; } | |
| return _engineK; | |
| } | |
| export async function createEngine(modelEntry, loaded) { | |
| const { gpu, info, imageKappa } = loaded; | |
| const m = modelEntry; | |
| const engineReady = engineKappa(); | |
| // model ΞΊ: the ΞΊ-disk's VERIFIED image_kappa when present (a real content address of the | |
| // weights, every sector re-derived); else the model's declared identity. | |
| const modelKappa = imageKappa | |
| ? "did:holo:sha256:" + String(imageKappa).replace(/^(did:holo:)?sha256:/, "") | |
| : await didHolo({ "@type": "schema:SoftwareSourceCode", name: m.name, size: m.size, fmt: m.fmt || "", family: m.fam || "" }); | |
| const memo = new Map(); | |
| let _drafter = null; // learned speculative drafter (fn(seq,max)=>ids); null β standard decode. Set via setDrafter(). | |
| const tokenize = (text) => { try { return JSON.parse(qvac_tokenize(text)).ids || []; } catch { return []; } }; | |
| const detokenize = (ids) => { try { return clean(JSON.parse(qvac_continue(JSON.stringify(ids), 0, 0, 0, ids.length)).text || ""); } catch { return ""; } }; | |
| const fingerprint = (ids) => kappa(idBytes(ids)); // live mind ΞΊ (blake3, from wasm) | |
| // Frame one user turn. Qwen2/3 use ChatML (its <|im_*|> markers are atomic BPE tokens); | |
| // other instruction models use a plain Q/A frame. (Verbatim from the original run().) | |
| function frameTurn(prompt, hasHistory) { | |
| if (m.qwen) { | |
| const noThink = m.qwen3 ? "<think>\n\n</think>\n\n" : ""; // Qwen3: skip the thinking block for fast direct answers | |
| return (hasHistory ? "<|im_end|>\n" : "") + `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n` + noThink; | |
| } | |
| if (m.llama3) // LLaMA-3 header template (BitNet b1.58 etc.) | |
| return (hasHistory ? "<|eot_id|>" : "") + `<|start_header_id|>user<|end_header_id|>\n\n${prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`; | |
| if (m.olmo) // OLMo/OLMoE: <|user|>/<|assistant|> role tags (each turn self-delimited; leading bos via m.bos) | |
| return `<|user|>\n${prompt}\n<|assistant|>\n`; | |
| if (m.userWord) // word-frame (Falcon-E: its ChatML template stalls EMPIRICALLY; "User:/Falcon:" answers β see q-falcon-templates sweep) | |
| return (hasHistory ? "\n" : "") + "User: " + prompt + "\nFalcon:"; | |
| return "Question: " + prompt + "\nAnswer:"; | |
| } | |
| function params() { | |
| const temp = m.temp || 0; | |
| return { decode: temp > 0 ? "sampled@t=" + temp : "greedy-argmax", maxTokens: m.cap, repetitionPenalty: m.rep ?? 1.05, template: m.qwen ? "chatml" : m.llama3 ? "llama3" : "qa", thinking: !m.qwen3 }; | |
| } | |
| // The streaming token loop. `ids` is the whole running conversation (the mind); generation | |
| // appends to it. onToken({ text, ids, outIds, stats }) fires per step; signal aborts. | |
| async function generate(ids, { onToken, signal, repPenalty, maxNew } = {}) { | |
| const rep = repPenalty ?? m.rep ?? 1.3; | |
| const newCap = maxNew ?? m.cap ?? 80; // max NEW tokens this call | |
| const kvCap = (m.ctx || m.cap || 80) + 8; // the engine's KV allocation (loader kvOf) | |
| const promptLen = ids.length; | |
| const tStart = _perf(); | |
| let first = true, decodeStart = 0, decodeTok = 0, ttft = 0, tokps = 0, msExec = 0, err = null, outText = ""; | |
| if (promptLen >= kvCap - 1) err = new Error(`context full: ${promptLen} tokens β₯ ${kvCap} KV positions`); | |
| // SPECULATIVE PATH (opt-in): when a learned drafter is registered and the engine has the batched-verify | |
| // head, the drafter proposes and the target batch-verifies β output is BYTE-IDENTICAL to greedy decode | |
| // (greedy verify), streamed via onCommit. Any incompatibility/throw falls straight through to the standard | |
| // loop below, so default Q (no drafter) and unsupported models are completely unaffected. | |
| if (!err && (_drafter || globalThis.__spec) && gpu.specDecode && gpu.setDrafter) { | |
| try { | |
| gpu.setDrafter(_drafter); | |
| const out = []; const t0 = _perf(); let ttft2 = 0, tokps2 = 0; | |
| const seq = await gpu.specDecode(ids.slice(), newCap, rep, (tk) => { | |
| if (signal && signal.aborted) return false; | |
| out.push(tk); | |
| if (!ttft2) ttft2 = _perf() - tStart; | |
| const dt = _perf() - t0; if (dt > 0) tokps2 = out.length / (dt / 1000); | |
| if (onToken) onToken({ text: detokenize(out), ids: ids.slice(0, promptLen).concat(out), outIds: out.slice(), stats: { ttft: ttft2, tokps: tokps2, msExec: gpu.timing ? gpu.timing.exec : 0, gpuBytes: gpu.gpuBytes, spec: gpu.specStats ? gpu.specStats() : null } }); | |
| return out.length < newCap; | |
| }); | |
| const outIds = seq.slice(promptLen); | |
| let text = detokenize(outIds); | |
| if (m.stopText) { const ix = text.indexOf(m.stopText); if (ix >= 0) text = text.slice(0, ix); } | |
| return { text, outIds, ids: seq, stats: { ttft: ttft2, tokps: tokps2, msExec: gpu.timing ? gpu.timing.exec : 0, spec: gpu.specStats ? gpu.specStats() : null }, error: null }; | |
| } catch (e) { try { gpu.setDrafter(null); } catch {} /* fall through to the standard decode loop */ } | |
| } | |
| while (!err && !(signal && signal.aborted) && ids.length - promptLen < newCap && ids.length < kvCap - 1) { | |
| const prevLen = ids.length; | |
| try { ids = await (gpu.decode || gpu.generate)(ids, first ? 1 : 6, rep); } // batched GPU decode head (4 B/token readback) when the engine has it | |
| catch (e) { err = e; break; } | |
| const dn = ids.length - prevLen; | |
| if (dn > 0) { | |
| msExec = gpu.timing ? gpu.timing.exec : msExec; | |
| if (first) { ttft = _perf() - tStart; decodeStart = _perf(); first = false; } // TTFT = prefill + first token | |
| else { decodeTok += dn; const dt = _perf() - decodeStart; if (dt > 0) tokps = decodeTok / (dt / 1000); } // steady decode rate | |
| } | |
| const di = ids.slice(promptLen); | |
| // incremental detokenize: decode only a bounded window (the dn new tokens + a small left context) | |
| // and append the delta β O(1) per step, not re-detokenizing the whole growing output (was O(nΒ²)). | |
| { const a = Math.max(promptLen, ids.length - (dn + 8)); const wf = detokenize(ids.slice(a)); const wp = a >= ids.length - dn ? "" : detokenize(ids.slice(a, ids.length - dn)); outText += wf.slice(wp.length); } | |
| let text = outText, hitStop = false; | |
| if (m.stopText) { const ix = text.indexOf(m.stopText); if (ix >= 0) { text = text.slice(0, ix); hitStop = true; } } // word-framed models stop on the next "User:" turn | |
| if (onToken) onToken({ text, ids: ids.slice(), outIds: di.slice(), stats: { ttft, tokps, msExec, gpuBytes: gpu.gpuBytes } }); | |
| if (hitStop) break; | |
| if (ids.length <= prevLen) break; // EOS / no progress | |
| // degeneration guard: a long run of one repeated character (the repetition collapse of | |
| // small/experimental quants) will never recover β stop instead of burning the budget. | |
| if (text.length > 80 && /(.)\1{63}$/.test(text)) { err = new Error("degenerate repetition β stopped"); break; } | |
| await _sleep(0); // yield to the event loop so the UI can paint β no artificial throttle (gpu.decode already awaits the GPU) | |
| } | |
| const outIds = ids.slice(promptLen); | |
| let text = detokenize(outIds); | |
| if (m.stopText) { const ix = text.indexOf(m.stopText); if (ix >= 0) text = text.slice(0, ix); } | |
| return { text, outIds, ids, stats: { ttft, tokps, msExec }, error: err }; | |
| } | |
| // DIFFUSION decode (Dream-class): iterative bidirectional unmasking over a fixed `steps` budget, | |
| // wall-clock fixed by steps not output length. Greedy β deterministic β ΞΊ-re-derivable (Law L5). | |
| // `ids` is the framed prompt; we diffuse `genLen` masked positions after it. Returns the same shape | |
| // as generate() so callers (and the brain seam) are agnostic. onToken fires ONCE with the final fill | |
| // (diffusion has no left-to-right token stream β the whole block resolves together). | |
| // Two modes: APPEND (genLen masks at the suffix β generation) or FILL (ids ALREADY contain mask ids | |
| // anywhere β infill/surgical edit, conditioning on BOTH sides; diffusion's structural edge over AR). | |
| // `causal` flips the parity gate (causal block=1 must equal the sequential engine β validates the pass). | |
| async function diffuse(ids, { genLen, steps, fill, causal, signal, onToken } = {}) { | |
| if (!gpu || !gpu.diffuse) throw new Error("this model has no diffusion engine (load a diffusion ΞΊ-object)"); | |
| const gl = fill ? 0 : (genLen ?? Math.min(m.cap || 64, (m.ctx || 192) - ids.length - 1)); | |
| const S = steps ?? m.steps ?? 12; | |
| const tStart = _perf(); | |
| const seq = await gpu.diffuse(ids, gl, { steps: S, fill: !!fill, causal: !!causal, signal }); | |
| // append β output is the generated suffix; fill β the whole sequence is the answer (a span edited in place) | |
| const outIds = fill ? seq.slice() : seq.slice(ids.length); | |
| let text = detokenize(outIds); | |
| if (m.stopText && !fill) { const ix = text.indexOf(m.stopText); if (ix >= 0) text = text.slice(0, ix); } | |
| const stats = { ttft: _perf() - tStart, tokps: 0, msExec: gpu.timing ? gpu.timing.exec : 0, steps: S, fill: !!fill, diff: gpu.diffStats ? gpu.diffStats() : null }; | |
| if (onToken) onToken({ text, ids: seq.slice(), outIds: outIds.slice(), stats }); | |
| return { text, outIds, ids: seq, stats, error: null }; | |
| } | |
| // ΞΊ-memo: identical (context β prompt β model β params) β replay in O(1), no decode. | |
| const memoKey = async (ctxIds, turnIds, p) => didHolo({ ctx: await kappaTokens(ctxIds.concat(turnIds)), model: modelKappa, params: p || params() }); | |
| async function buildReceipt({ promptText, ctxIds, turnIds, outIds, fromMemo, evaluateText, paramsPatch, extraUsed }) { | |
| return sealReceipt({ | |
| promptText, ctxIds, turnIds, outIds, text: detokenize(outIds), params: { ...params(), ...(paramsPatch || {}) }, fromMemo, | |
| modelKappa, engineKappa: await engineReady, evaluateText, extraUsed, | |
| }); | |
| } | |
| // Re-derivation (greedy only): re-run the exact inference and reproduce ΞΊ(output) byte-for-byte. | |
| async function reDerive(rec) { | |
| if (!gpu) return { ok: false, reason: "load the model to re-derive" }; | |
| if (/sampled/.test(rec.params.decode)) return { ok: false, reason: "sampled decode β only the ΞΊ-binding is verifiable, not re-derivation" }; | |
| try { | |
| let seq = rec.ctxIds.concat(rec.turnIds); const start = seq.length; | |
| gpu.reset(); | |
| seq = await (gpu.decode || gpu.generate)(seq, rec.outIds.length, rec.params.repetitionPenalty); // same head as the live path β replay must match byte-for-byte | |
| const got = await kappaTokens(seq.slice(start)), want = rec.body["prov:generated"]["holo:outputTokens"]; | |
| return { ok: got === want, got, want }; | |
| } finally { try { gpu.reset(); } catch {} } | |
| } | |
| return { | |
| model: m, dims: gpu.dims, modelKappa, bosId: info?.bos ?? null, get gpuBytes() { return gpu.gpuBytes; }, | |
| tokenize, detokenize, fingerprint, frameTurn, params, | |
| generate, | |
| // register/clear the learned speculative drafter (fn(seq,max)=>ids). Off by default; safe fallback. | |
| setDrafter: (fn) => { _drafter = fn || null; try { gpu.setDrafter && gpu.setDrafter(_drafter); } catch (e) {} }, | |
| specAvailable: !!(gpu.specDecode && gpu.setDrafter), | |
| memoKey, memoGet: (k) => memo.get(k), memoHas: (k) => memo.has(k), memoSet: (k, v) => memo.set(k, v), | |
| buildReceipt, verify: verifyIntegrity, reDerive, | |
| stats: () => gpu.timing, reset: () => { try { gpu.reset(); } catch {} }, destroy: () => { try { gpu.destroy(); } catch {} }, | |
| _gpu: gpu, // raw engine handle (diagnostics: per-pass GPU trace via the step() path + window.__profile) | |
| }; | |
| } | |