q / holo-load2bit.mjs
Humuhumu33's picture
Upload holo-load2bit.mjs with huggingface_hub
f789de8 verified
Raw
History Blame Contribute Delete
12.2 kB
// holo-load2bit.mjs — the LOAD-DIRECT consumer (the "load" half of the 7B infra). Given a pre-compiled
// 2-bit κ-object (manifest.json + content-addressed b/<κ>.gz blocks, produced by compile2bit.mjs), it builds
// the engine manifest + a fetchTensor that streams blocks, verifies each by re-deriving its κ (Law L5),
// gunzips, and hands the engine the weights ALREADY 2-bit — no re-quant at load. The engine reads
// manifest.preQuantized=true (parts() returns the blocks verbatim) and incoherent=false (LDLQ ⇒ no FWHT).
// Hosting = serve the κ-object dir from anywhere; the κ-verify makes any mirror untrusted-safe.
//
// A family FINETUNE is stored as `base-κ + delta` (format "holo-delta/1"); loadKappaObject detects that and
// transparently delegates to holo-load-delta.mjs, which reconstructs the finetune's blocks and returns the
// SAME { manifest, fetchTensor } shape — so the engine, KV-cache, and Q's brain loader need no changes.
import { f16ToF32 } from "./qvac-ingest.mjs";
import { blake3hex } from "./holo-blake3.mjs"; // BLAKE3 σ-axis κ verify (opt-in via manifest.hash==="blake3") — the GPU-parallel tree hash
import { gpuBlake3Hex, gpuBlake3Available } from "./gpu-blake3.mjs"; // the SAME BLAKE3, run entirely on the GPU (2.74 GB/s)
// Inflate a gzip-compressed byte array using the platform DecompressionStream.
//   u8      — the gzip bytes (written to the stream as a single chunk).
//   returns — a Promise of a new Uint8Array holding the decompressed bytes.
//   rejects — when the stream cannot decode the input; the error arrives via the readable side.
async function gunzip(u8) {
  const ds = new DecompressionStream("gzip");
  const w = ds.writable.getWriter();
  // write()/close() return promises that are deliberately not awaited (the readable side is drained below).
  // When the input is corrupt they can reject too; attach no-op handlers so that failure is reported exactly
  // once — through the promise returned here — instead of also surfacing as an unhandled rejection.
  w.write(u8).catch(() => {});
  w.close().catch(() => {});
  return new Uint8Array(await new Response(ds.readable).arrayBuffer());
}
// Lower-case hex encoding of a byte buffer (ArrayBuffer or typed array): two digits per byte, no separators.
const hex = (buf) => {
  let digits = "";
  for (const byte of new Uint8Array(buf)) {
    // values below 0x10 print as a single digit, so left-pad them with a zero
    digits += (byte < 16 ? "0" : "") + byte.toString(16);
  }
  return digits;
};
// ── PERSISTENT κ-CACHE (the returning-user path): a κ-object's manifest + blocks are content-addressed and
// IMMUTABLE, so once fetched they can live in the browser's Cache API FOREVER, keyed by their own URL. A
// returning user then loads Q from local disk — ~0 network, no server (serverless by construction). It is
// untrusted-safe: every cached block is still L5 re-derived below, so a poisoned cache entry is rejected
// exactly like a poisoned network body. First visit warms the cache; every visit after is disk-speed. ──
// Name of the Cache API bucket that holds the fetched κ-object bytes.
const KCACHE = "holo-kappa-v1";
// Latch: the persistence request is issued at most once per module instance.
let _persistAsked = false;
// Ask the browser (once) to mark this origin's storage as persistent, so the cached model is not evicted
// under storage pressure. Strictly best-effort: a missing API or any thrown error is ignored.
async function _askPersist() {
  if (_persistAsked) return;
  _persistAsked = true;
  try {
    const storage = navigator.storage;
    if (!storage || !storage.persist) return; // StorageManager not available here
    const alreadyPersistent = await storage.persisted();
    if (!alreadyPersistent) await storage.persist();
  } catch (e) {}
}
// URL → in-flight fetch promise, so a parallel prefetch and the engine's read of the SAME block share ONE
// network fetch (never double-download).
const _inflight = new Map();
// Fetch a URL as bytes, disk-cached by URL.
//   url     — absolute or relative URL of the resource.
//   returns — a Promise of a Uint8Array with the body bytes.
//   rejects — when the network fetch fails or the server answers with a non-2xx status.
// Cache HIT → no network. MISS → fetch once (cache: "no-store" keeps the HTTP cache out of the way), then
// store the body in the Cache API bucket. Any Cache API failure degrades to a plain, uncached fetch.
async function cachedBytes(url) {
  let cache = null;
  try {
    cache = await caches.open(KCACHE);
    const hit = await cache.match(url);
    if (hit) return new Uint8Array(await hit.arrayBuffer());
  } catch (e) {
    cache = null; // Cache API missing or broken: continue without persistence
  }
  const pending = _inflight.get(url);
  if (pending) return pending;
  const p = (async () => {
    const res = await fetch(url, { cache: "no-store" });
    // An HTTP error page is not the resource. Returning it would hand garbage to the caller and — worse —
    // write it to the persistent cache under this URL, so every later load would keep hitting the bad entry.
    if (!res.ok) throw new Error("fetch failed: HTTP " + res.status + " for " + url);
    const buf = await res.arrayBuffer();
    if (cache) {
      try {
        await cache.put(url, new Response(buf.slice(0), { headers: { "Content-Type": "application/octet-stream" } }));
        _askPersist(); // fire-and-forget; it handles its own errors
      } catch (e) {} // storing is best-effort (quota, private mode, …) — the bytes are still returned
    }
    return new Uint8Array(buf);
  })();
  _inflight.set(url, p);
  try { return await p; } finally { _inflight.delete(url); }
}
// FAST FIRST LOAD: warm the block cache with a bounded pool of concurrent fetches, so later per-tensor reads
// find the bytes already cached (or share the in-flight fetch) instead of paying one round-trip at a time.
//   baseUrl — directory of the κ-object; blocks live under <baseUrl>/b/<κ with ":" → "_">.gz
//   kappas  — array of block κ strings; falsy entries are skipped and duplicates are fetched once
//   conc    — max parallel fetches; falls back to globalThis.__prefetchConc, then 24
// Fire-and-forget: returns nothing and never throws; every per-block failure is swallowed.
function prefetchBlocks(baseUrl, kappas, conc) {
  try {
    const lanes = conc || (typeof globalThis !== "undefined" && globalThis.__prefetchConc) || 24;
    const wanted = new Set(kappas.filter((k) => Boolean(k)));
    const urls = Array.from(wanted, (k) => baseUrl + "/b/" + String(k).replace(":", "_") + ".gz");
    // shared cursor: each lane claims the next unclaimed URL until the list is exhausted
    let cursor = 0;
    const drain = async () => {
      while (cursor < urls.length) {
        const target = urls[cursor++];
        try {
          await cachedBytes(target);
        } catch (e) {} // a failed prefetch is harmless: the real read will fetch (and report) again
      }
    };
    const pool = Array.from({ length: Math.min(lanes, urls.length) }, () => drain());
    Promise.all(pool).catch(() => {});
  } catch (e) {}
}
// Reshape a raw (gunzipped) block to what the engine reads. PURE — shared with the delta loader so both
// paths apply identical per-fmt handling.
//   rec     — tensor record; reads rec.fmt, rec.fp16, rec.N, rec.K
//   raw     — Uint8Array with the decompressed block bytes (may be a view into a larger buffer)
//   returns — for fmt "2bit" with fp16 scales: a NEW Uint8Array laid out as [2-bit packed][f32 scales];
//             for every other record: `raw` itself, untouched
//   throws  — RangeError when a 2bit+fp16 block is shorter than its packed + scale sections
export function reshapeTensor(rec, raw) {
  if (rec.fmt === "2bit" && rec.fp16) { // [2-bit packed][fp16 scales] → [2-bit][f32 scales]
    // q2 = packed bytes (4 weights per byte); nsc = number of scales (one per 32 weights)
    const Kp = rec.K, q2 = (rec.N * Kp) / 4, nsc = rec.N * (Kp / 32);
    const need = q2 + nsc * 2;
    // Validate against the VIEW, not its backing buffer, so a short view can never read a neighbour's bytes.
    if (raw.length < need) throw new RangeError("2bit+fp16 block too short: " + raw.length + " < " + need + " bytes");
    // A Uint16Array view needs a 2-byte-aligned start. When `raw` is a view at an odd offset, copy just the
    // scale bytes into a fresh (aligned) buffer instead of throwing; the aligned case stays zero-copy.
    const scaleStart = raw.byteOffset + q2;
    const f16 = scaleStart % 2 === 0
      ? new Uint16Array(raw.buffer, scaleStart, nsc)
      : new Uint16Array(raw.slice(q2, need).buffer);
    const out = new Uint8Array(q2 + nsc * 4);
    out.set(raw.subarray(0, q2), 0);
    const f32 = new Float32Array(out.buffer, q2, nsc);
    for (let i = 0; i < nsc; i++) f32[i] = f16ToF32(f16[i]);
    return out;
  }
  return raw; // 2bit+f32 (incoherence), q8 (embed), f32 (norms) — verbatim
}
// Build the engine manifest from the model metadata plus normalized tensor records. PURE — shared.
//   man       — parsed κ-object manifest (architecture fields, mode, optional feature flags)
//   normRecs  — { tensorName → { N, K, fmt, s? } }
//   e8lutData — optional codebook payload; attached only when truthy
//   returns   — a fresh object: architecture fields, `bits`, optional flags, and the `tensors` list.
// Modes q4 / q3 / e8 / bitnet are "native-bits" objects: they keep man.bits and get no 2-bit flags. Every
// other mode reports bits = 8 and is tagged twoBit + preQuantized (+ incoherent only when man.incoherent === true).
export function buildEngineManifest(man, normRecs, e8lutData) {
  const tensors = [];
  for (const name of Object.keys(normRecs)) {
    const rec = normRecs[name];
    const entry = { name, N: rec.N, K: rec.K, blk: rec.fmt !== "f32", fmt: rec.fmt };
    if (rec.s !== undefined) entry.s = rec.s;
    tensors.push(entry);
  }

  const native = ["q4", "q3", "e8", "bitnet"].includes(man.mode); // native-bits κ-object

  // Properties are added in a fixed order so the resulting key order matches the documented layout.
  const out = {
    d: man.d,
    n_heads: man.n_heads,
    n_kv_heads: man.n_kv_heads,
    ff: man.ff,
    vocab: man.vocab,
    n_layers: man.n_layers,
    hd: man.hd,
    bits: native ? man.bits : 8,
    layout: man.layout,
    rope_base: man.rope_base,
  };
  if (man.maskId !== undefined) {
    out.maskId = man.maskId;
    out.diffusion = true;
  }
  out.attn_bias = man.attn_bias;
  out.qk_norm = man.qk_norm;
  out.qk_norm_dim = man.qk_norm_dim;
  out.tied = man.tied;
  if (man.sub_norm) out.sub_norm = true;
  if (man.bitlinear) out.bitlinear = true;
  if (man.ffn_act) out.ffn_act = man.ffn_act;
  if (man.moe) out.moe = man.moe;
  if (!native) {
    out.twoBit = true;
    out.incoherent = man.incoherent === true;
    out.preQuantized = true;
  }
  out.tensors = tensors;
  if (e8lutData) out.e8lutData = e8lutData;
  return out;
}
// Best-effort removal of one URL from the persistent κ-cache, so the next cachedBytes(url) goes back to the
// network. Never throws (a missing or failing Cache API is ignored).
async function _evictCached(url) {
  try { const cache = await caches.open(KCACHE); await cache.delete(url); } catch (e) {}
}
// Load a pre-compiled κ-object from `baseUrl` and return what the engine needs.
//   baseUrl — directory that serves manifest.json and the b/<κ>.gz blocks
//   opts.expectKappa   — pinned manifest κ ("sha256:…", optionally prefixed "did:holo:"); REQUIRED unless
//   opts.allowUnpinned — dev escape hatch: accept an unpinned manifest
//   opts.prefetch      — pass false to skip the parallel block prefetch
//   returns — { manifest, fetchTensor(name) → Promise<Uint8Array>, info } where `info` is the parsed manifest
//             (its `source` field may be rewritten to an absolute / bundled URL, see the end of the function)
//   throws  — Error when the manifest is unpinned or its κ does not match the pin; fetchTensor rejects with
//             "κ MISMATCH …" when a block's bytes do not re-derive to its κ
// STALE/CORRUPT CACHE RECOVERY: bytes come from a permanent URL-keyed cache. manifest.json lives at a fixed
// URL, so a republished object (new pin) or a corrupted cached entry would otherwise fail verification on
// EVERY later load. On a κ mismatch the cached entry is evicted and the URL is fetched ONE more time; the
// retry is verified exactly like the first attempt, so this never weakens Law L5.
export async function loadKappaObject(baseUrl, opts = {}) {
  const sha256Kappa = async (bytes) => "sha256:" + hex(await crypto.subtle.digest("SHA-256", bytes));
  // Law L5: the manifest is the ROOT that names every block's κ. Verify the manifest's OWN bytes
  // re-derive to a pinned κ BEFORE trusting man.tensors[*].kappa — otherwise a tampered manifest can
  // re-point every block to a forged-but-self-consistent κ and each per-block check passes against the
  // forgery. The pin is an EXTERNAL anchor (catalog/lock), never the manifest's own self-asserted root.
  const manUrl = baseUrl + "/manifest.json";
  let manRaw = await cachedBytes(manUrl);
  let manKappa = await sha256Kappa(manRaw);
  const pin = opts.expectKappa ? String(opts.expectKappa).replace(/^did:holo:/, "") : null;
  if (pin) {
    if (manKappa !== pin) { // possibly a stale or corrupt cache entry: drop it and ask the network once
      await _evictCached(manUrl);
      manRaw = await cachedBytes(manUrl);
      manKappa = await sha256Kappa(manRaw);
    }
    if (manKappa !== pin) {
      await _evictCached(manUrl); // do not leave unverifiable bytes behind
      throw new Error("manifest κ MISMATCH (Law L5): " + manKappa.slice(0, 24) + "… ≠ pinned " + pin.slice(0, 24) + "…");
    }
  } else if (!opts.allowUnpinned) throw new Error("manifest unpinned (Law L5): pass opts.expectKappa (catalog pin) or opts.allowUnpinned for dev");
  const man = JSON.parse(new TextDecoder().decode(manRaw));
  // FAMILY FINETUNE: a `base-κ + delta` object — reconstruct via the delta loader (same return shape).
  if (man.format === "holo-delta/1") return (await import("./holo-load-delta.mjs")).loadDeltaObject(baseUrl, { ...opts, manifest: man });
  // FAST FIRST LOAD: prefetch every block into the cache in parallel while the engine builds.
  if (opts.prefetch !== false) { try { prefetchBlocks(baseUrl, Object.values(man.tensors || {}).map((r) => r.kappa)); } catch (e) {} }
  // Law L5: re-derive a block's κ from its gzipped bytes. BLAKE3 when the manifest declares it, else SHA-256.
  const deriveKappa = async (gz, kappa) => {
    if (man.hash !== "blake3") return sha256Kappa(gz);
    // GPU first (unless globalThis.__gpuVerify === false). On any GPU error OR a GPU digest that disagrees
    // with the expected κ, fall through to the CPU hash — a GPU quirk degrades to CPU speed, never decides.
    if (gpuBlake3Available() && (typeof globalThis === "undefined" || globalThis.__gpuVerify !== false)) {
      const t0 = performance.now();
      let h = null; try { h = await gpuBlake3Hex(gz); } catch (e) {}
      if (h && "blake3:" + h === kappa) { // verified ON THE GPU
        // running verify statistics, kept on globalThis.__vs (count, bytes, milliseconds)
        try { const v = globalThis.__vs || (globalThis.__vs = { n: 0, bytes: 0, ms: 0 }); v.n++; v.bytes += gz.length; v.ms += performance.now() - t0; } catch (e) {}
        return kappa;
      }
    }
    return "blake3:" + blake3hex(gz);
  };
  // Fetch one block by κ, verify it, gunzip it. Every block — cached or fresh — is re-derived, so neither
  // the cache nor the mirror has to be trusted. A mismatch evicts the cached entry and retries once.
  const getBlock = async (kappa) => {
    const url = baseUrl + "/b/" + kappa.replace(":", "_") + ".gz";
    let gz = await cachedBytes(url);
    if ((await deriveKappa(gz, kappa)) !== kappa) {
      await _evictCached(url);
      gz = await cachedBytes(url);
      if ((await deriveKappa(gz, kappa)) !== kappa) {
        await _evictCached(url);
        throw new Error("κ MISMATCH " + kappa.slice(0, 24));
      }
    }
    return await gunzip(gz);
  };
  // Unknown tensor names resolve to an empty array rather than throwing.
  const fetchTensor = async (name) => {
    const rec = man.tensors[name]; if (!rec) return new Uint8Array(0);
    return reshapeTensor(rec, await getBlock(rec.kappa));
  };
  // E₈ codebook (mode e8): the LUT is its own content-addressed block — fetched + κ-verified like any other;
  // the first 2048 floats of the block are exposed.
  let e8lutData;
  if (man.e8lut) { const b = await getBlock(man.e8lut.replace(/^did:holo:/, "")); e8lutData = new Float32Array(b.buffer, b.byteOffset, 2048); }
  const manifest = buildEngineManifest(man, man.tensors, e8lutData);
  // Tokenizer/header source: a RELATIVE `source` is resolved against the κ-object's own directory. When the
  // manifest declares a REMOTE (http/https) source, probe <base>/tokenizer.gguf with one HEAD request and
  // prefer that bundled copy if it is served; otherwise keep the declared source. Only the in-memory `man`
  // is changed — the manifest bytes that were pinned above are not touched.
  const base = baseUrl.replace(/\/+$/, "");
  if (man.source && !/^https?:\/\//.test(man.source)) man.source = base + "/" + man.source;
  else if (man.source && /^https?:\/\//.test(man.source)) {
    try { const h = await fetch(base + "/tokenizer.gguf", { method: "HEAD" }); if (h && h.ok) man.source = base + "/tokenizer.gguf"; } catch (e) {}
  }
  return { manifest, fetchTensor, info: man };
}