// holo-load2bit.mjs — the LOAD-DIRECT consumer (the "load" half of the 7B infra). Given a pre-compiled
// 2-bit κ-object (manifest.json + content-addressed b/<κ>.gz blocks, produced by compile2bit.mjs), it builds
// the engine manifest + a fetchTensor that streams blocks, verifies each by re-deriving its κ (Law L5),
// gunzips, and hands the engine the weights ALREADY 2-bit — no re-quant at load. The engine reads
// manifest.preQuantized=true (parts() returns the blocks verbatim) and incoherent=false (LDLQ ⇒ no FWHT).
// Hosting = serve the κ-object dir from anywhere; the κ-verify makes any mirror untrusted-safe.
//
// A family FINETUNE is stored as `base-κ + delta` (format "holo-delta/1"); loadKappaObject detects that and
// transparently delegates to holo-load-delta.mjs, which reconstructs the finetune's blocks and returns the
// SAME { manifest, fetchTensor } shape — so the engine, KV-cache, and Q's brain loader need no changes.
import { f16ToF32 } from "./qvac-ingest.mjs";
import { blake3hex } from "./holo-blake3.mjs"; // BLAKE3 σ-axis κ verify (opt-in via manifest.hash==="blake3") — the GPU-parallel tree hash
import { gpuBlake3Hex, gpuBlake3Available } from "./gpu-blake3.mjs"; // the SAME BLAKE3, run entirely on the GPU (2.74 GB/s)

/**
 * Inflate a gzip-compressed byte array with the platform DecompressionStream.
 *
 * The write, the close and the read are awaited together so that a corrupt stream rejects ONCE, through
 * the returned promise. Dropping the writer promises (as a bare `w.write(u8); w.close();` does) leaves
 * their rejections unhandled whenever the stream errors.
 *
 * @param {Uint8Array} u8 gzip-compressed bytes
 * @returns {Promise<Uint8Array>} the decompressed bytes
 * @throws if the input is not a valid gzip stream
 */
async function gunzip(u8) {
  const ds = new DecompressionStream("gzip");
  const writer = ds.writable.getWriter();
  // Start draining the readable side first, then feed the input; Promise.all observes all three promises.
  const [inflated] = await Promise.all([
    new Response(ds.readable).arrayBuffer(),
    writer.write(u8),
    writer.close(),
  ]);
  return new Uint8Array(inflated);
}

/**
 * Lower-case hex encoding of a byte buffer (two digits per byte).
 *
 * @param {ArrayBuffer|ArrayLike<number>} buf bytes to encode (e.g. a crypto.subtle digest)
 * @returns {string} hex string
 */
const hex = (buf) => Array.from(new Uint8Array(buf), (b) => b.toString(16).padStart(2, "0")).join("");

// ── PERSISTENT κ-CACHE (the returning-user path): a κ-object's manifest + blocks are content-addressed and
// IMMUTABLE, so once fetched they can live in the browser's Cache API FOREVER, keyed by their own URL. A
// returning user then loads Q from local disk — ~0 network, no server (serverless by construction).
// It is untrusted-safe: every cached block is still L5 re-derived below, so a poisoned cache entry is rejected
// exactly like a poisoned network body. First visit warms the cache; every visit after is disk-speed. ──
const KCACHE = "holo-kappa-v1";
let _persistAsked = false;

/**
 * Ask the browser (at most once per session) to keep this origin's storage under storage pressure.
 * Best-effort: any error, or a missing navigator.storage API, is ignored.
 *
 * @returns {Promise<void>}
 */
async function _askPersist() {
  if (_persistAsked) return;
  _persistAsked = true; // once per session: ask the browser NOT to evict the model under storage pressure
  try {
    if (navigator.storage && navigator.storage.persist && !(await navigator.storage.persisted())) await navigator.storage.persist();
  } catch (e) {
    // persistence is an optimisation only — never block a load on it
  }
}

// URL → in-flight fetch promise, so a parallel prefetch + the engine's read of the SAME block share ONE
// network fetch (never double-download).
const _inflight = new Map();

/**
 * Fetch a URL as bytes, disk-cached by URL in the Cache API bucket KCACHE.
 *
 * Cache HIT → no network. MISS → fetch once (no-store bounds the in-flight RAM to one body), then store to
 * disk. Any Cache API failure degrades to a plain fetch. An HTTP error status (response.ok === false)
 * throws and is NOT cached: an error page stored under a block's URL would otherwise be replayed from
 * disk on every later visit.
 *
 * @param {string} url absolute or relative URL of the resource
 * @returns {Promise<Uint8Array>} the response body (concurrent callers of one URL share the same array)
 * @throws {Error} when the network fetch fails or the server answers with a non-2xx status
 */
async function cachedBytes(url) {
  let cache = null;
  try {
    cache = await caches.open(KCACHE);
    const hit = await cache.match(url);
    if (hit) return new Uint8Array(await hit.arrayBuffer());
  } catch (e) {
    cache = null; // Cache API missing or failing → behave like a plain fetch
  }
  const pending = _inflight.get(url);
  if (pending) return pending;
  const p = (async () => {
    const res = await fetch(url, { cache: "no-store" });
    if (!res.ok) throw new Error("fetch failed: HTTP " + res.status + " for " + url);
    const buf = await res.arrayBuffer();
    if (cache) {
      try {
        await cache.put(url, new Response(buf.slice(0), { headers: { "Content-Type": "application/octet-stream" } }));
        _askPersist();
      } catch (e) {
        // quota / storage error: the bytes are still returned, just not persisted
      }
    }
    return new Uint8Array(buf);
  })();
  _inflight.set(url, p);
  try {
    return await p;
  } finally {
    _inflight.delete(url);
  }
}

// FAST FIRST LOAD: warm the whole block cache with bounded concurrency, so the engine's sequential per-tensor
// reads hit the cache instead of paying one HF round-trip at a time (~1.5 blocks/s → tens of blocks/s over HTTP/2).
// Fire-and-forget; the engine's getBlock shares any in-flight fetch (no double-download). Cross-origin-CDN safe.
/**
 * Fire-and-forget warm-up of the block cache: fetches every distinct block through cachedBytes with
 * bounded concurrency. Never throws; individual fetch failures are ignored here.
 *
 * @param {string} baseUrl κ-object directory URL (blocks live under baseUrl + "/b/")
 * @param {Array<string|undefined>} kappas block κ's; falsy entries and duplicates are dropped
 * @param {number} [conc] number of parallel lanes (default: globalThis.__prefetchConc, else 24)
 */
function prefetchBlocks(baseUrl, kappas, conc) {
  try {
    // Without the Cache API cachedBytes cannot persist anything, so a prefetched body would be thrown away
    // and fetched again by the engine — skip instead of downloading the model twice.
    if (typeof caches === "undefined") return;
    // HF serves over HTTP/2: many small blocks parallelize well, so fan out wide to saturate bandwidth (the one
    // big embed block holds a single lane, the rest fill the others). Tunable via globalThis.__prefetchConc for
    // constrained links/mobile. This is the whole first-load story for a resident κ-object — bandwidth, not RTT.
    const lanes = conc || (typeof globalThis !== "undefined" && globalThis.__prefetchConc) || 24;
    const urls = [...new Set(kappas.filter(Boolean))].map((k) => baseUrl + "/b/" + String(k).replace(":", "_") + ".gz");
    let next = 0; // shared cursor: each lane pulls the next unclaimed URL
    const worker = async () => {
      while (next < urls.length) {
        const u = urls[next++];
        try {
          await cachedBytes(u);
        } catch (e) {
          // best-effort warm-up: the caller's own read of this block reports any real failure
        }
      }
    };
    Promise.all(Array.from({ length: Math.min(lanes, urls.length) }, worker)).catch(() => {});
  } catch (e) {
    // prefetch is an optimisation only — never let it break a load
  }
}

/**
 * Best-effort removal of one URL from the persistent κ-cache (bucket KCACHE). Used after a κ mismatch so a
 * stale or corrupted entry is not replayed from disk on every later load. Never throws.
 *
 * @param {string} url cache key to drop
 * @returns {Promise<void>}
 */
async function _evictCached(url) {
  try {
    const cache = await caches.open(KCACHE);
    await cache.delete(url);
  } catch (e) {
    // no Cache API / storage error: nothing to evict
  }
}

/**
 * Reshape a raw (gunzipped) block to what the engine reads. PURE — shared with the delta loader so both
 * paths apply identical per-fmt handling. 2bit+fp16 → 2bit+f32 scales; everything else verbatim.
 *
 * The fp16 scales are read through a DataView (little-endian), so `raw` may be a view at ANY byte offset
 * of its backing buffer; a typed-array view would throw on an odd offset.
 *
 * @param {{fmt: string, fp16?: boolean, N: number, K: number}} rec tensor record from the manifest
 * @param {Uint8Array} raw decompressed block bytes
 * @returns {Uint8Array} `raw` itself, or a new buffer laid out as [2-bit packed][f32 scales]
 * @throws {RangeError} if a 2bit+fp16 block is shorter than its declared N×K layout
 */
export function reshapeTensor(rec, raw) {
  if (!(rec.fmt === "2bit" && rec.fp16)) return raw; // 2bit+f32 (incoherence), q8 (embed), f32 (norms) — verbatim
  // [2-bit packed][fp16 scales] → [2-bit][f32 scales]; one scale per 32 weights
  const q2 = (rec.N * rec.K) / 4;
  const nsc = rec.N * (rec.K / 32);
  if (raw.byteLength < q2 + nsc * 2) {
    throw new RangeError("2bit+fp16 block too short: " + raw.byteLength + " < " + (q2 + nsc * 2) + " bytes");
  }
  const scales = new DataView(raw.buffer, raw.byteOffset + q2, nsc * 2);
  const out = new Uint8Array(q2 + nsc * 4);
  out.set(raw.subarray(0, q2), 0);
  const f32 = new Float32Array(out.buffer, q2, nsc);
  for (let i = 0; i < nsc; i++) f32[i] = f16ToF32(scales.getUint16(i * 2, true));
  return out;
}

/**
 * Build the engine manifest from model meta + normalized tensor records {name→{N,K,fmt,s?}}. PURE — shared.
 *
 * @param {object} man κ-object manifest (model meta: d, n_heads, mode, bits, …)
 * @param {Object<string, {N: number, K: number, fmt: string, s?: number}>} normRecs tensor records by name
 *   (a missing/undefined map is treated as empty)
 * @param {Float32Array} [e8lutData] E₈ codebook, attached only when provided
 * @returns {object} engine manifest; non-native modes carry twoBit/incoherent/preQuantized flags and bits=8
 */
export function buildEngineManifest(man, normRecs, e8lutData) {
  const tensors = Object.entries(normRecs || {}).map(([name, rec]) => {
    const entry = { name, N: rec.N, K: rec.K, blk: rec.fmt !== "f32", fmt: rec.fmt };
    if (rec.s !== undefined) entry.s = rec.s;
    return entry;
  });
  const native = man.mode === "q4" || man.mode === "q3" || man.mode === "e8" || man.mode === "bitnet"; // native-bits κ-object
  return {
    d: man.d,
    n_heads: man.n_heads,
    n_kv_heads: man.n_kv_heads,
    ff: man.ff,
    vocab: man.vocab,
    n_layers: man.n_layers,
    hd: man.hd,
    bits: native ? man.bits : 8,
    layout: man.layout,
    rope_base: man.rope_base,
    ...(man.maskId !== undefined ? { maskId: man.maskId, diffusion: true } : {}),
    attn_bias: man.attn_bias,
    qk_norm: man.qk_norm,
    qk_norm_dim: man.qk_norm_dim,
    tied: man.tied,
    ...(man.sub_norm ? { sub_norm: true } : {}),
    ...(man.bitlinear ? { bitlinear: true } : {}),
    ...(man.ffn_act ? { ffn_act: man.ffn_act } : {}),
    ...(man.moe ? { moe: man.moe } : {}),
    ...(native ? {} : { twoBit: true, incoherent: man.incoherent === true, preQuantized: true }),
    tensors,
    ...(e8lutData ? { e8lutData } : {}),
  };
}

/**
 * Load a pre-compiled κ-object: verify its manifest against an external pin, then return the engine
 * manifest plus a fetchTensor that streams, κ-verifies and gunzips blocks on demand.
 *
 * @param {string} baseUrl URL of the κ-object directory (manifest.json + b/<κ>.gz)
 * @param {object} [opts]
 * @param {string} [opts.expectKappa] pinned manifest κ ("sha256:…", optionally prefixed "did:holo:")
 * @param {boolean} [opts.allowUnpinned] dev only: accept the manifest without a pin
 * @param {boolean} [opts.prefetch] pass false to disable the parallel block prefetch
 * @returns {Promise<{manifest: object, fetchTensor: (name: string) => Promise<Uint8Array>, info: object}>}
 *   for a "holo-delta/1" manifest, whatever holo-load-delta.mjs's loadDeltaObject returns instead
 * @throws {Error} when the manifest is unpinned (and not allowed), or its κ does not match the pin
 */
export async function loadKappaObject(baseUrl, opts = {}) {
  // Law L5: the manifest is the ROOT that names every block's κ. Verify the manifest's OWN bytes
  // re-derive to a pinned κ BEFORE trusting man.tensors[*].kappa — otherwise a tampered manifest can
  // re-point every block to a forged-but-self-consistent κ and each per-block check passes against the
  // forgery. The pin is an EXTERNAL anchor (catalog/lock), never the manifest's own self-asserted root.
  const pin = opts.expectKappa ? String(opts.expectKappa).replace(/^did:holo:/, "") : null;
  if (!pin && !opts.allowUnpinned) {
    // fail before any network I/O: nothing fetched could be trusted anyway
    throw new Error("manifest unpinned (Law L5): pass opts.expectKappa (catalog pin) or opts.allowUnpinned for dev");
  }
  const sha256Kappa = async (bytes) => "sha256:" + hex(await crypto.subtle.digest("SHA-256", bytes));
  const manUrl = baseUrl + "/manifest.json";
  let manRaw = await cachedBytes(manUrl); // disk-cached (pinned + verified below), so a returning user's load is 0-network end to end
  if (pin) {
    let manKappa = await sha256Kappa(manRaw);
    if (manKappa !== pin) {
      // manifest.json lives at a MUTABLE url (unlike the content-addressed blocks): after a pin rotation the
      // disk cache still holds the old bytes. Drop them and refetch ONCE before declaring a mismatch.
      await _evictCached(manUrl);
      manRaw = await cachedBytes(manUrl);
      manKappa = await sha256Kappa(manRaw);
    }
    if (manKappa !== pin) {
      await _evictCached(manUrl); // never keep bytes that failed verification
      throw new Error("manifest κ MISMATCH (Law L5): " + manKappa.slice(0, 24) + "… ≠ pinned " + pin.slice(0, 24) + "…");
    }
  }
  const man = JSON.parse(new TextDecoder().decode(manRaw));

  // FAMILY FINETUNE: a `base-κ + delta` object — reconstruct via the delta loader (same return shape).
  if (man.format === "holo-delta/1") return (await import("./holo-load-delta.mjs")).loadDeltaObject(baseUrl, { ...opts, manifest: man });

  const tensorRecs = man.tensors || {};

  // FAST FIRST LOAD: prefetch every block into the cache in parallel while the engine builds (turns a
  // latency-bound sequential stream into a bandwidth-bound one — critical when serving off a remote CDN like HF).
  if (opts.prefetch !== false) {
    try {
      prefetchBlocks(baseUrl, Object.values(tensorRecs).map((r) => r.kappa));
    } catch (e) {
      // prefetch is an optimisation only
    }
  }

  // Law L5: re-derive a block's κ. BLAKE3 (σ-axis, GPU-parallel tree hash) when the manifest declares it — the
  // same hash the on-GPU verifier reproduces; else SHA-256 (SRI axis, default). Untrusted-CDN-safe either way.
  const deriveKappa = async (gz, kappa) => {
    if (man.hash !== "blake3") return "sha256:" + hex(await crypto.subtle.digest("SHA-256", gz));
    // DEFAULT: re-derive on the SHARED GPU device (2.74 GB/s). SAFE fallback: on any GPU error OR if the GPU
    // digest disagrees with the pin, re-derive on the CPU — so a GPU quirk degrades to CPU speed, never fails
    // the load. Only a CPU mismatch (real corruption) is reported. Set globalThis.__gpuVerify=false to force CPU.
    const useGpu = gpuBlake3Available() && (typeof globalThis === "undefined" || globalThis.__gpuVerify !== false);
    if (useGpu) {
      const t0 = performance.now();
      let h = null;
      try {
        h = await gpuBlake3Hex(gz);
      } catch (e) {
        // GPU errored → fall through to the CPU re-derive
      }
      if (h && "blake3:" + h === kappa) {
        // verified ON THE GPU, matches the pin — record throughput stats (best-effort)
        try {
          const v = globalThis.__vs || (globalThis.__vs = { n: 0, bytes: 0, ms: 0 });
          v.n++;
          v.bytes += gz.length;
          v.ms += performance.now() - t0;
        } catch (e) {}
        return kappa;
      }
    }
    return "blake3:" + blake3hex(gz); // GPU unavailable/errored/disagreed → CPU re-derive
  };

  // RAM-bounded, DISK-cached: the engine fetches each tensor once, so decoded blocks are handed over and
  // released — in-flight RAM stays ~one block (a 7B κ-object decompresses to >2.6 GB). The gzipped block
  // bytes are persisted to the Cache API by their content-addressed URL (cachedBytes), so a returning user
  // reads them from disk with no network. Every block — cached or fresh — is L5 re-derived (κ must match),
  // so the cache is untrusted-safe. A block that fails verification is evicted and refetched ONCE, so a
  // corrupted cache entry heals itself instead of failing every later load.
  const getBlock = async (kappa) => {
    const url = baseUrl + "/b/" + kappa.replace(":", "_") + ".gz";
    let gz = await cachedBytes(url);
    if ((await deriveKappa(gz, kappa)) !== kappa) {
      await _evictCached(url);
      gz = await cachedBytes(url);
      if ((await deriveKappa(gz, kappa)) !== kappa) {
        await _evictCached(url);
        throw new Error("κ MISMATCH " + kappa.slice(0, 24));
      }
    }
    return await gunzip(gz);
  };

  // Unknown tensor names yield an empty array rather than throwing.
  const fetchTensor = async (name) => {
    const rec = tensorRecs[name];
    if (!rec) return new Uint8Array(0);
    return reshapeTensor(rec, await getBlock(rec.kappa));
  };

  // E₈ codebook (mode e8): the 256×8 LUT is its own content-addressed block — fetch + κ-verify (Law L5)
  let e8lutData;
  if (man.e8lut) {
    const b = await getBlock(man.e8lut.replace(/^did:holo:/, ""));
    e8lutData = new Float32Array(b.buffer, b.byteOffset, 2048);
  }
  const manifest = buildEngineManifest(man, tensorRecs, e8lutData);

  // bundled tokenizer (SERVERLESS load): the header (tokenizer + arch) should load same-origin/on-device, no
  // external host. A RELATIVE `source` already resolves against the κ-object's own dir. When the manifest
  // declares a REMOTE (http) source but the κ-object ALSO ships a local tokenizer.gguf, PREFER the bundle —
  // one cheap HEAD probe, and the manifest bytes (hence its Law-L5 pin) stay untouched. No bundle served →
  // fall back to the declared source. This makes a κ-object with a bundled header 100% serverless to load.
  const base = baseUrl.replace(/\/+$/, "");
  if (man.source) {
    if (/^https?:\/\//.test(man.source)) {
      try {
        const h = await fetch(base + "/tokenizer.gguf", { method: "HEAD" });
        if (h && h.ok) man.source = base + "/tokenizer.gguf";
      } catch (e) {
        // probe failed → keep the declared remote source
      }
    } else {
      man.source = base + "/" + man.source;
    }
  }
  return { manifest, fetchTensor, info: man };
}