Spaces:

HOLOGRAMTECH
/

q

Running

App Files Files Community

q / holo-load2bit.mjs

Humuhumu33's picture

Upload holo-load2bit.mjs with huggingface_hub

f789de8 verified about 6 hours ago

History Blame Contribute Delete

12.2 kB

	// holo-load2bit.mjs — the LOAD-DIRECT consumer (the "load" half of the 7B infra). Given a pre-compiled
	// 2-bit κ-object (manifest.json + content-addressed b/<κ>.gz blocks, produced by compile2bit.mjs), it builds
	// the engine manifest + a fetchTensor that streams blocks, verifies each by re-deriving its κ (Law L5),
	// gunzips, and hands the engine the weights ALREADY 2-bit — no re-quant at load. The engine reads
	// manifest.preQuantized=true (parts() returns the blocks verbatim) and incoherent=false (LDLQ ⇒ no FWHT).
	// Hosting = serve the κ-object dir from anywhere; the κ-verify makes any mirror untrusted-safe.
	//
	// A family FINETUNE is stored as `base-κ + delta` (format "holo-delta/1"); loadKappaObject detects that and
	// transparently delegates to holo-load-delta.mjs, which reconstructs the finetune's blocks and returns the
	// SAME { manifest, fetchTensor } shape — so the engine, KV-cache, and Q's brain loader need no changes.
	import { f16ToF32 } from "./qvac-ingest.mjs";
	import { blake3hex } from "./holo-blake3.mjs"; // BLAKE3 σ-axis κ verify (opt-in via manifest.hash==="blake3") — the GPU-parallel tree hash
	import { gpuBlake3Hex, gpuBlake3Available } from "./gpu-blake3.mjs"; // the SAME BLAKE3, run entirely on the GPU (2.74 GB/s)

	// Inflate a gzip-compressed byte array with the platform DecompressionStream.
	//   u8      — Uint8Array holding the gzip bytes.
	//   returns — Promise<Uint8Array> of the decompressed bytes; rejects if the stream cannot be inflated.
	async function gunzip(u8) {
		const ds = new DecompressionStream("gzip");
		const writer = ds.writable.getWriter();
		// The write/close are deliberately NOT awaited: the readable side is drained below, and waiting for the
		// write to finish before reading starts could stall on backpressure. Their promises still get a rejection
		// handler so a bad body does not ALSO raise unhandled rejections — the caller sees the failure through
		// the read below.
		const ignore = () => {};
		writer.write(u8).catch(ignore);
		writer.close().catch(ignore);
		const inflated = await new Response(ds.readable).arrayBuffer();
		return new Uint8Array(inflated);
	}
	// Lower-case hex spelling of a byte buffer (two digits per byte) — used to turn digests into κ strings.
	const hex = (buf) => Array.from(new Uint8Array(buf), (byte) => byte.toString(16).padStart(2, "0")).join("");

	// ── PERSISTENT κ-CACHE (the returning-user path): a κ-object's manifest + blocks are content-addressed and
	// IMMUTABLE, so once fetched they can live in the browser's Cache API FOREVER, keyed by their own URL. A
	// returning user then loads Q from local disk — ~0 network, no server (serverless by construction). It is
	// untrusted-safe: every cached block is still L5 re-derived below, so a poisoned cache entry is rejected
	// exactly like a poisoned network body. First visit warms the cache; every visit after is disk-speed. ──
	const KCACHE = "holo-kappa-v1"; // Cache API bucket that holds the manifest + block bodies
	let _persistAsked = false; // flips to true on the first _askPersist() call, whatever its outcome
	// Ask the browser (at most once per session) to mark this origin's storage as persistent, so the cached
	// model is not evicted under storage pressure. Strictly best-effort: a missing StorageManager or any
	// thrown error is ignored, and the function always resolves to undefined.
	async function _askPersist() {
		if (_persistAsked) return;
		_persistAsked = true;
		try {
			const store = navigator.storage;
			if (!store || !store.persist) return; // no StorageManager.persist on this platform
			const alreadyPersistent = await store.persisted();
			if (!alreadyPersistent) await store.persist();
		} catch (e) {}
	}
	// Fetch a URL as bytes through the persistent κ-cache, keyed by the URL itself.
	//   cache HIT  → the stored body is returned, no network.
	//   cache MISS → one fetch (cache:"no-store" keeps the HTTP cache out of it), then the body is stored.
	// Cache API failures (unavailable, quota, put errors) degrade to a plain fetch and never block a load.
	// A non-OK HTTP status, however, REJECTS: the body of a 404/429/5xx is an error page, and storing it would
	// pin that error page under the block's URL, where every later visit would hit it and fail κ-verification.
	//   url     — absolute or relative URL string.
	//   returns — Promise<Uint8Array> of the body bytes; rejects on network failure or a non-OK status.
	const _inflight = new Map(); // URL → in-flight fetch promise, so a parallel prefetch and the engine's read of
	// the SAME block share ONE network fetch (never double-download).
	async function cachedBytes(url) {
		let cache = null;
		try {
			cache = await caches.open(KCACHE);
			const hit = await cache.match(url);
			if (hit) return new Uint8Array(await hit.arrayBuffer());
		} catch (e) {
			cache = null; // Cache API unusable → behave as an uncached fetch
		}
		const pending = _inflight.get(url);
		if (pending) return pending;
		const p = (async () => {
			const res = await fetch(url, { cache: "no-store" });
			if (!res.ok) throw new Error("fetch failed: HTTP " + res.status + " for " + url);
			const buf = await res.arrayBuffer();
			if (cache) {
				// Storing is best-effort; a failed put only costs a re-download next time.
				try {
					await cache.put(url, new Response(buf.slice(0), { headers: { "Content-Type": "application/octet-stream" } }));
					_askPersist();
				} catch (e) {}
			}
			return new Uint8Array(buf);
		})();
		_inflight.set(url, p);
		try { return await p; } finally { _inflight.delete(url); }
	}
	// FAST FIRST LOAD: warm the block cache with bounded concurrency, so the engine's sequential per-tensor
	// reads find their block cached (or share an in-flight fetch via cachedBytes) instead of paying one
	// round-trip at a time. Best-effort throughout: a block that fails to prefetch is skipped here and simply
	// fetched again when the engine asks for it.
	//   baseUrl — κ-object directory URL; block URLs are baseUrl + "/b/" + κ (first ":" → "_") + ".gz".
	//   kappas  — iterable array of block κ strings; falsy entries and duplicates are dropped.
	//   conc    — optional lane count; defaults to globalThis.__prefetchConc, then 24. A value that is not a
	//             number ≥ 1 falls back to 24 rather than silently prefetching nothing.
	//   returns — Promise<undefined> that settles once every lane has drained and NEVER rejects; callers that
	//             want fire-and-forget can ignore it.
	function prefetchBlocks(baseUrl, kappas, conc) {
		try {
			const requested = Number(conc || (typeof globalThis !== "undefined" && globalThis.__prefetchConc) || 24);
			const urls = [...new Set(kappas.filter(Boolean))].map((k) => baseUrl + "/b/" + String(k).replace(":", "_") + ".gz");
			const lanes = Math.floor(Math.min(requested >= 1 ? requested : 24, urls.length));
			let next = 0; // shared cursor: each lane claims the next unclaimed URL
			const lane = async () => {
				while (next < urls.length) {
					const url = urls[next++];
					try { await cachedBytes(url); } catch (e) {} // skip; the engine's own read will retry it
				}
			};
			return Promise.all(Array.from({ length: lanes }, lane)).then(() => {}, () => {});
		} catch (e) {
			return Promise.resolve(); // malformed input must never break the load that triggered the warm-up
		}
	}

	// Reshape a raw (gunzipped) block into the byte layout the engine reads. PURE — shared with the delta
	// loader so both paths apply identical per-fmt handling.
	//   fmt "2bit" with rec.fp16 : [N*K/4 packed bytes][N*K/32 fp16 scales] → [same packed bytes][f32 scales]
	//   anything else            : returned verbatim (the SAME array, not a copy)
	//   rec     — tensor record; reads rec.fmt, rec.fp16, rec.N, rec.K.
	//   raw     — Uint8Array of the block; may be a view at any byte offset into a larger buffer.
	//   returns — Uint8Array.
	//   throws  — RangeError when a 2bit+fp16 block is shorter than its record implies.
	export function reshapeTensor(rec, raw) {
		if (!(rec.fmt === "2bit" && rec.fp16)) return raw; // 2bit+f32 (incoherence), q8 (embed), f32 (norms)
		const packedBytes = (rec.N * rec.K) / 4;
		const scaleCount = rec.N * (rec.K / 32);
		let scaleBytes = raw.subarray(packedBytes, packedBytes + scaleCount * 2);
		if (scaleBytes.length < scaleCount * 2) {
			throw new RangeError("2bit block too short: need " + (packedBytes + scaleCount * 2) + " bytes, got " + raw.length);
		}
		// A Uint16Array view must start on a 2-byte boundary. `raw` can be a view at an odd offset, so copy the
		// (small) scale region to a fresh, aligned buffer in that case instead of throwing.
		if (scaleBytes.byteOffset % 2 !== 0) scaleBytes = scaleBytes.slice();
		const f16 = new Uint16Array(scaleBytes.buffer, scaleBytes.byteOffset, scaleCount);
		const out = new Uint8Array(packedBytes + scaleCount * 4);
		out.set(raw.subarray(0, packedBytes), 0);
		const f32 = new Float32Array(out.buffer, packedBytes, scaleCount);
		for (let i = 0; i < scaleCount; i++) f32[i] = f16ToF32(f16[i]);
		return out;
	}

	// Build the engine manifest from the model meta plus normalized tensor records. PURE — shared.
	//   man       — parsed κ-object manifest (architecture fields, mode, optional feature flags).
	//   normRecs  — { name → { N, K, fmt, s? } }; emitted as a list in the object's own key order.
	//   e8lutData — optional codebook payload; attached as `e8lutData` only when truthy.
	//   returns   — a plain object. Modes q4/q3/e8/bitnet are "native-bits": they keep man.bits and get no
	//               2-bit flags. Every other mode reports bits=8 with twoBit/preQuantized set and
	//               incoherent taken from man.incoherent === true.
	export function buildEngineManifest(man, normRecs, e8lutData) {
		const tensors = Object.entries(normRecs).map(([name, rec]) => {
			// blk is false only for f32 records; `s` is carried over only when the record defines it
			const entry = { name, N: rec.N, K: rec.K, blk: rec.fmt !== "f32", fmt: rec.fmt };
			if (rec.s !== undefined) entry.s = rec.s;
			return entry;
		});
		const native = man.mode === "q4" || man.mode === "q3" || man.mode === "e8" || man.mode === "bitnet"; // native-bits κ-object
		return {
			d: man.d, n_heads: man.n_heads, n_kv_heads: man.n_kv_heads, ff: man.ff, vocab: man.vocab, n_layers: man.n_layers, hd: man.hd,
			bits: native ? man.bits : 8, layout: man.layout, rope_base: man.rope_base,
			...(man.maskId !== undefined ? { maskId: man.maskId, diffusion: true } : {}),
			attn_bias: man.attn_bias, qk_norm: man.qk_norm, qk_norm_dim: man.qk_norm_dim, tied: man.tied,
			...(man.sub_norm ? { sub_norm: true } : {}),
			...(man.bitlinear ? { bitlinear: true } : {}),
			...(man.ffn_act ? { ffn_act: man.ffn_act } : {}),
			...(man.moe ? { moe: man.moe } : {}),
			...(native ? {} : { twoBit: true, incoherent: man.incoherent === true, preQuantized: true }),
			tensors,
			...(e8lutData ? { e8lutData } : {}),
		};
	}

	// Best-effort removal of one URL from the persistent κ-cache. Resolves true only when an entry was actually
	// deleted; a missing Cache API or any Cache API error resolves false.
	async function _evictCached(url) {
		try { return await (await caches.open(KCACHE)).delete(url); } catch (e) { return false; }
	}

	// Load a pre-compiled κ-object and return what the engine needs to stream its weights.
	//   baseUrl — directory URL holding manifest.json and the b/<κ>.gz blocks.
	//   opts.expectKappa   — externally pinned manifest κ ("sha256:…", optional "did:holo:" prefix).
	//   opts.allowUnpinned — dev escape hatch: accept the manifest without a pin.
	//   opts.prefetch      — pass false to skip the parallel block warm-up.
	//   returns — { manifest, fetchTensor, info }: the engine manifest, an async name → Uint8Array reader
	//             (an unknown name yields an empty array), and the parsed manifest itself. For a
	//             "holo-delta/1" manifest, whatever holo-load-delta.mjs's loadDeltaObject returns.
	//   throws  — Error when the manifest is unpinned without allowUnpinned, when the manifest κ does not match
	//             the pin, or (from fetchTensor) when a block's bytes do not re-derive to its κ.
	export async function loadKappaObject(baseUrl, opts = {}) {
		// Law L5: the manifest is the ROOT that names every block's κ, so its OWN bytes must re-derive to a pin
		// that comes from OUTSIDE the manifest (catalog/lock). Otherwise a tampered manifest could re-point every
		// block at a forged-but-self-consistent κ. The unpinned case is rejected before any download.
		const pin = opts.expectKappa ? String(opts.expectKappa).replace(/^did:holo:/, "") : null;
		if (!pin && !opts.allowUnpinned) throw new Error("manifest unpinned (Law L5): pass opts.expectKappa (catalog pin) or opts.allowUnpinned for dev");
		const sha256Kappa = async (bytes) => "sha256:" + hex(await crypto.subtle.digest("SHA-256", bytes));
		const manUrl = baseUrl + "/manifest.json";
		let manRaw = await cachedBytes(manUrl); // disk-cached, so a returning user's load needs no network
		if (pin) {
			let manKappa = await sha256Kappa(manRaw);
			// manifest.json's URL is NOT content-addressed, so a cached copy can be stale (or poisoned). On a
			// mismatch drop the cached entry and re-download once before giving up.
			if (manKappa !== pin && (await _evictCached(manUrl))) {
				manRaw = await cachedBytes(manUrl);
				manKappa = await sha256Kappa(manRaw);
			}
			if (manKappa !== pin) {
				await _evictCached(manUrl); // never leave bytes that failed verification in the cache
				throw new Error("manifest κ MISMATCH (Law L5): " + manKappa.slice(0, 24) + "… ≠ pinned " + pin.slice(0, 24) + "…");
			}
		}
		const man = JSON.parse(new TextDecoder().decode(manRaw));
		// FAMILY FINETUNE: a `base-κ + delta` object — reconstruct via the delta loader (same return shape).
		if (man.format === "holo-delta/1") return (await import("./holo-load-delta.mjs")).loadDeltaObject(baseUrl, { ...opts, manifest: man });
		// FAST FIRST LOAD: warm every block into the cache in parallel while the engine builds. Fire-and-forget.
		if (opts.prefetch !== false) { try { prefetchBlocks(baseUrl, Object.values(man.tensors || {}).map((r) => r.kappa)); } catch (e) {} }

		// Law L5: re-derive a block's κ from its gzipped bytes. SHA-256 unless the manifest declares
		// hash==="blake3". For BLAKE3 the GPU digest is tried first (unless globalThis.__gpuVerify===false); if
		// the GPU errors OR disagrees with the expected κ, the CPU digest decides — so a GPU quirk costs speed,
		// never a failed load.
		const deriveKappa = async (gz, kappa) => {
			if (man.hash !== "blake3") return sha256Kappa(gz);
			const gpuAllowed = gpuBlake3Available() && (typeof globalThis === "undefined" || globalThis.__gpuVerify !== false);
			if (gpuAllowed) {
				const t0 = performance.now();
				let digest = null;
				try { digest = await gpuBlake3Hex(gz); } catch (e) {}
				if (digest && "blake3:" + digest === kappa) {
					// verified on the GPU: accumulate count/bytes/ms into globalThis.__vs (stats only, best-effort)
					try { const v = globalThis.__vs || (globalThis.__vs = { n: 0, bytes: 0, ms: 0 }); v.n++; v.bytes += gz.length; v.ms += performance.now() - t0; } catch (e) {}
					return kappa;
				}
			}
			return "blake3:" + blake3hex(gz);
		};

		// Fetch one block by κ, verify it, gunzip it. Decoded blocks are handed over and not retained here; only
		// the gzipped bytes are persisted (by cachedBytes). Every block — cached or fresh — is re-derived, so the
		// cache is untrusted-safe. A copy that fails verification is evicted and re-downloaded once; if that also
		// fails, the bad bytes are evicted again and the load throws.
		const getBlock = async (kappa) => {
			const url = baseUrl + "/b/" + kappa.replace(":", "_") + ".gz";
			let gz = await cachedBytes(url);
			if ((await deriveKappa(gz, kappa)) !== kappa) {
				gz = (await _evictCached(url)) ? await cachedBytes(url) : null;
				if (!gz || (await deriveKappa(gz, kappa)) !== kappa) {
					if (gz) await _evictCached(url);
					throw new Error("κ MISMATCH " + kappa.slice(0, 24));
				}
			}
			return await gunzip(gz);
		};
		const fetchTensor = async (name) => {
			const rec = man.tensors[name];
			if (!rec) return new Uint8Array(0);
			return reshapeTensor(rec, await getBlock(rec.kappa));
		};
		// E₈ codebook (mode e8): the LUT is its own content-addressed block — fetched and κ-verified like any
		// other, then viewed as 2048 float32 values.
		let e8lutData;
		if (man.e8lut) {
			const lut = await getBlock(man.e8lut.replace(/^did:holo:/, ""));
			e8lutData = new Float32Array(lut.buffer, lut.byteOffset, 2048);
		}
		const manifest = buildEngineManifest(man, man.tensors, e8lutData);
		// Bundled tokenizer (SERVERLESS load). A RELATIVE `source` is resolved against the κ-object's own dir.
		// A REMOTE (http/https) `source` is swapped for base + "/tokenizer.gguf" when one HEAD probe shows the
		// κ-object ships that file; otherwise the declared source is kept. Only the parsed object is edited —
		// the manifest bytes (hence the Law-L5 pin) are untouched.
		const base = baseUrl.replace(/\/+$/, "");
		if (man.source && !/^https?:\/\//.test(man.source)) man.source = base + "/" + man.source;
		else if (man.source && /^https?:\/\//.test(man.source)) {
			try { const probe = await fetch(base + "/tokenizer.gguf", { method: "HEAD" }); if (probe && probe.ok) man.source = base + "/tokenizer.gguf"; } catch (e) {}
		}
		return { manifest, fetchTensor, info: man };
	}