Spaces:

HOLOGRAMTECH
/

q

Running

App Files Files Community

q / core /loader.js

Humuhumu33's picture

decode(): report live-path GPU ms/tok for clean CPU/GPU split

d2afec3 verified about 11 hours ago

History Blame Contribute Delete

17.4 kB

	// core/loader.js — model LOADING (the 5 substrate paths) + the model catalog + the
	// browser-cache manager. Lifted faithfully from the original index.html so a model still
	// loads byte-identically; the only change is that DOM status writes become onStatus/onProgress
	// callbacks, and each path RETURNS { gpu, info, manifest, imageKappa } instead of mutating
	// globals. core/engine.js then wraps the returned gpu. (window.__gpu / window.__kd handles are
	// still exposed for the probe + system-monitor panels.)

	import init, { kappa, qvac_load_model, qvac_load_gpu, qvac_tokenize, qvac_continue, qvac_gpu_manifest, qvac_gpu_tensor, qvac_gpu_free, qvac_panic_hook } from "../pkg/holospaces_web.js";
	import { createQvacGPU } from "../qvac-gpu.js?v=62";
	import { modelAsSource } from "./semantic.js"; // C2: a loaded model carries a W3C @type (schema:SoftwareSourceCode)

	// the model κ-object's W3C linked-data view — content-addressed identity (Law L1) + schema.org type.
	const modelLinkedData = (m, root) => modelAsSource({
	name: m.name, family: m.fam, params: m.size, format: m.fmt,
	kappa: root ? (String(root).startsWith("did:") ? root : "did:holo:" + String(root)) : "did:holo:sha256:0",
	});

	// The compiled κ-objects present on disk (models/<name>, built by compile2bit.mjs). Each loads
	// DIRECT off the substrate (verified by re-derivation, no re-quant) via its `kappaUrl`.
	// cap = max NEW tokens per turn; ctx = KV-cache positions allocated on the GPU (the context
	// window — sized so agentic turns with tool schemas + tool responses fit; KV VRAM scales with it).
	export const MODELS = [
	// NATIVELY-TERNARY κ-objects (t2, 1.58 bpw trained-in — see the atlas-bridge witness receipts):
	// Falcon-E: its declared ChatML template STALLS empirically (instant <\|end_of_text\|>); the
	// measured working frame is word-style "User:/Falcon:" with a textual stop (q-falcon-templates sweep).
	{ fam: "Falcon-E", name: "Falcon-E-3B · ternary", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-falcon-e-3b/resolve/main", manifestKappa: "did:holo:sha256:6b753fe8186f2b4194424115c36014698580a2aab8427e9b40365893ac6b77ca", size: "0.63 GB", fmt: "t2 1.58-bit κ", cap: 200, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, userWord: true, stopText: "\nUser:", tools: false, rep: 1.18, kappa: true },
	{ fam: "BitNet", name: "BitNet-2B-4T · ternary", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-bitnet-2b/resolve/main", manifestKappa: "did:holo:sha256:fcf835659d88d2fe6f683cf1ab8de6a6ba6214ea0deeee4b1bcf3da1a4c05412", size: "0.69 GB", fmt: "t2 1.58-bit κ", cap: 900, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, llama3: true, tools: false, bos: true, eosText: "<\|eot_id\|>", rep: 1.05, kappa: true },
	// TriLM: the LARGEST natively-ternary-trained model (Spectra 3.9B, ICLR'25); per-row/channel
	// scale structure → t2r (trit codes + per-256-block scales, exact). BASE model → QA frame + stop.
	{ fam: "TriLM", name: "TriLM-3.9B · ternary", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-trilm-3.9b/resolve/main", manifestKappa: "did:holo:sha256:499032ceb19c0476345a72cf5fea6caec83054c98486c91a5891dfad0d25ea30", size: "0.87 GB", fmt: "t2r 2.1-bit κ", cap: 200, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, stopText: "\nQuestion:", tools: false, rep: 1.18, kappa: true },
	// AGENTIC CODER: Qwen2.5-Coder-7B (q3f) — the Holo Code agent brain. Qwen2.5 arch ⇒ ChatML +
	// agentic tool framing work (capability floor for tool use is ~7B; the small ternary models opt out).
	// Self-contained κ-object: tokenizer bundled (source="tokenizer.gguf"), no external dependency.
	{ fam: "Qwen2.5-Coder", name: "Qwen2.5-Coder-7B · agentic", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-qwen-coder-7b/resolve/main", manifestKappa: "did:holo:sha256:539941cb060c7dd583e2e86697e53f2c5d511d597c65d09d9c780fbded2c3edf", size: "3.4 GB", fmt: "q3f κ", cap: 900, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, code: true, qwen: true, rep: 1.05, kappa: true },
	// MIXTURE-OF-EXPERTS (G5): OLMoE-1B-7B (Allen AI, Apache-2.0) — 64 experts, 8 active/token, ~1.3B
	// active of 7B. The first RESIDENT-MoE κ-object: experts RAM-resident + CPU top-k router (softmax
	// over all 64, no renorm = OLMoE norm_topk_prob:false). q4 (the engine's resident expert FFN path).
	{ fam: "OLMoE", name: "OLMoE-1B-7B · MoE (64×8)", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-olmoe-1b-7b/resolve/main", manifestKappa: "did:holo:sha256:9cf97ec1c761fd4ef51bc0cd4ac37a0cd8eaa11f1b19b3ae6a141486ad3fe5ad", size: "3.6 GB", fmt: "q4 MoE κ", cap: 400, ctx: 3000, kv4: false, gpu: true, gpuOnly: true, chat: true, olmo: true, bos: true, eosText: "<\|endoftext\|>", tools: false, rep: 1.1, kappa: true },
	// DIFFUSION (G6): Dream-7B (Dream-org/Dream-v0-Instruct-7B) — masked-diffusion LM on the Qwen2.5-7B
	// backbone (same dims ⇒ ChatML). NOT autoregressive: generation is iterative bidirectional unmasking
	// over `steps` denoising passes (engine.diffuse / gpu.diffuse), wall-clock fixed by steps not length.
	// maskId 151666 rides in the manifest (never tokenized from text). Greedy ⇒ deterministic ⇒ κ-re-derivable.
	{ fam: "Dream", name: "Dream-7B · diffusion", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-dream-7b/resolve/main", manifestKappa: "did:holo:sha256:7b862931ae088f348f1f7e9ea3adbd418924c2e07e6ddd134f926e5681ad760d", size: "2.9 GB", fmt: "q3f diffusion κ", cap: 192, ctx: 192, kv4: false, gpu: true, gpuOnly: true, chat: true, qwen: true, diffusion: true, steps: 12, rep: 1.0, kappa: true },
	// Qwen κ-objects (q3f/q4) were pruned from disk for space — re-derive via compile2bit, then re-list.
	];
	const kvOf = (m) => Math.max(96, (m.ctx \|\| m.cap) + 8);

	const _sizeGb = (s) => { const n = parseFloat(s) \|\| 0; return /mb/i.test(s) ? n / 1024 : n; };
	// default to the SMALLEST usable model — lowest latency, fastest first answer.
	export const defaultModelIndex = () => (MODELS.map((m, i) => i).filter((i) => !MODELS[i].disabled).sort((a, b) => _sizeGb(MODELS[a].size) - _sizeGb(MODELS[b].size))[0]) ?? 0;

	// ── wasm init (once) + tokenizer re-export so the rest of the app shares this instance ──
	let _initOnce = null;
	export function ready() { if (!_initOnce) _initOnce = init().then(() => { try { qvac_panic_hook(); } catch {} }); return _initOnce; }
	export { qvac_tokenize, qvac_continue, kappa };

	// ── browser-cache model manager (Cache API) — "Get" downloads + keeps; loading uses the copy ──
	export const MCACHE = "holo-q-models";
	const absUrl = (u) => new URL(u, location.href).href;
	let _cachedUrls = new Set();
	export async function refreshCached() { try { const c = await caches.open(MCACHE); _cachedUrls = new Set((await c.keys()).map((r) => r.url)); } catch { _cachedUrls = new Set(); } return _cachedUrls; }
	export const isCached = (m) => !!m.url && _cachedUrls.has(absUrl(m.url));
	export async function deleteCache(m) { try { const c = await caches.open(MCACHE); await c.delete(m.url); } catch {} await refreshCached(); }
	async function modelBytes(m, onStatus) {
	try { const c = await caches.open(MCACHE); const hit = await c.match(m.url); if (hit) return new Uint8Array(await hit.arrayBuffer()); } catch {}
	onStatus?.(`Downloading ${m.name} (${m.size})…`);
	const res = await fetch(m.url); if (!res.ok) { onStatus?.("download failed: HTTP " + res.status); return null; }
	return new Uint8Array(await res.arrayBuffer());
	}

	const noop = () => {};

	// loadModel(entry, { onStatus, onProgress }) → { gpu, info, manifest, imageKappa } \| null
	// `imageKappa` is the VERIFIED content address of the weights when the path provides one
	// (κ-object root, or κ-disk image_kappa); core/engine.js binds it as the receipt's model κ.
	export async function loadModel(m, { onStatus = noop, onProgress = noop } = {}) {
	await ready();
	onStatus(`Loading ${m.name}…`);
	try {
	if (m.gpuOnly && !navigator.gpu) { onStatus("This model needs WebGPU (not available here)."); return null; }
	if (m.kappaUrl) return await loadKappa(m, onStatus, onProgress);
	if (m.kdisk) return await loadModelKDisk(m, onStatus, onProgress);
	if (m.remote) return await loadModelRemote(m, onStatus, onProgress);
	if (m.diskIngest) return await loadModelDisk(m, onStatus, onProgress);
	let gguf = await modelBytes(m, onStatus); if (!gguf) { onStatus("could not load model"); return null; }
	const lr = JSON.parse(m.gpuOnly ? qvac_load_gpu(gguf) : qvac_load_model(gguf));
	gguf = null;
	if (lr.error) { onStatus("model error: " + lr.error); return null; }
	let gpu = null, manifest = null;
	if (navigator.gpu && m.gpu) {
	try {
	onStatus(`Uploading ${m.name} to the GPU…`);
	const bits = m.q4 ? 4 : 8;
	manifest = JSON.parse(qvac_gpu_manifest(bits)); manifest.twoBit = !!window.__twoBit;
	const __qp = new URLSearchParams(location.search).get("stream");
	const __qmode = __qp === null ? undefined : (__qp === "resident" \|\| __qp === "false" ? false : __qp);
	const stream = __qmode ?? window.__stream ?? m.stream ?? false;
	const __ft = (name) => { const raw = qvac_gpu_tensor(name, bits); return window.__weightHook ? window.__weightHook(name, raw, bits, manifest) : raw; };
	gpu = await createQvacGPU(manifest, __ft, kvOf(m), lr.eos ?? 2, stream);
	window.__gpu = gpu; qvac_gpu_free();
	} catch (e) { gpu = null; if (m.gpuOnly) { onStatus("GPU upload failed: " + e); return null; } }
	}
	onStatus("");
	return { gpu, info: lr, manifest, imageKappa: null };
	} catch (e) { console.error("[Q] load failed:", e, e && e.stack); onStatus("could not load model: " + e); return null; }
	}

	// LOAD-DIRECT: a pre-compiled 2-bit/Q4 κ-object (compile2bit.mjs output). Weights arrive ALREADY
	// quantized (no re-quant at load); the tokenizer comes from the source GGUF's header only.
	async function loadKappa(m, onStatus, onProgress) {
	onStatus("Loading κ-object manifest…");
	const ld = await import("../holo-load2bit.mjs");
	// Law L5: pin the manifest κ when the catalog supplies one (m.manifestKappa, or a string m.kappa).
	// Until every model carries a pin, unpinned entries load explicitly (allowUnpinned) — the gap is then
	// a visible data task (populate manifestKappa), not a silent trust of an unauthenticated root.
	const pin = (typeof m.manifestKappa === "string" && m.manifestKappa) \|\| (typeof m.kappa === "string" && m.kappa) \|\| null;
	const { manifest, fetchTensor, info } = await ld.loadKappaObject(m.kappaUrl.replace(/\/+$/, ""), pin ? { expectKappa: pin } : { allowUnpinned: true });
	const ing = await import("../qvac-ingest.mjs");
	onStatus("Building tokenizer (source header, no full download)…");
	const hdr = await ing.readHeader(info.source, ing.rangeReader());
	const lr = JSON.parse(qvac_load_gpu(hdr.headerBytes));
	if (lr.error) { onStatus("tokenizer error: " + lr.error); return null; }
	if (m.eosText) { try { const e = JSON.parse(qvac_tokenize(m.eosText)).ids; if (e && e.length === 1) lr.eos = e[0]; } catch {} } // chat-stop override (e.g. LLaMA-3 <\|eot_id\|> ≠ header eos)
	qvac_gpu_free();
	manifest.kv4 = !!m.kv4; // int4 KV cache (E6) — catalog opt-in
	// MoE forward reads the layer-packed attention (Wb[l]) + RAM-resident experts (readExpert via
	// fetchTensor) — i.e. stream="layer": attention JS-resident & paged per token, experts cached.
	const sm = manifest.moe ? "layer" : (m.stream \|\| window.__kappaStream \|\| false);
	onStatus(`Building engine from κ-object (${info.mode === "q4" ? "native Q4" : info.incoherent ? "incoherent 2-bit" : "LDLQ 2-bit"}, ${sm \|\| "resident"}, no requant)…`);
	const prog = (done, total) => onProgress(done, total, "streaming");
	const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, sm, sm ? prog : null);
	window.__gpu = gpu;
	onStatus("");
	return { gpu, info: lr, manifest, imageKappa: info.root \|\| null, ld: modelLinkedData(m, info.root) };
	}

	// Very-large-model path: the GGUF never enters wasm; only the header does (tokenizer + manifest),
	// then each tensor is streamed off disk (HTTP Range), converted in JS, paged to the GPU per layer.
	async function loadModelDisk(m, onStatus, onProgress) {
	const bits = m.q4 ? 4 : 8;
	const ing = await import("../qvac-ingest.mjs");
	let read = ing.rangeReader();
	try { const cachedResp = await (await caches.open(MCACHE)).match(m.url); if (cachedResp) { const blob = await cachedResp.blob(); read = async (_u, start, len) => new Uint8Array(await blob.slice(start, start + len).arrayBuffer()); } } catch {}
	onStatus(`Reading ${m.name} header…`);
	const hdr = await ing.readHeader(m.url, read);
	const lr = JSON.parse(qvac_load_gpu(hdr.headerBytes));
	if (lr.error) { onStatus("model error: " + lr.error); return null; }
	const manifest = JSON.parse(qvac_gpu_manifest(bits));
	qvac_gpu_free();
	const fetchTensor = ing.makeDiskFetcher({ url: m.url, readRange: read, dataOffset: hdr.dataOffset, tensors: hdr.tensors, manifest, bits });
	const mode = m.stream \|\| "layer";
	onStatus(`Preparing ${m.name} (one-time, streamed off disk)…`);
	const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, mode, (d, t) => onProgress(d, t, "layers"));
	window.__gpu = gpu; onStatus("");
	return { gpu, info: lr, manifest, imageKappa: null };
	}

	// Out-of-core: stream a PRE-BUILT .qvf frames file from the server, one layer per token via HTTP Range.
	async function loadModelRemote(m, onStatus, onProgress) {
	onStatus(`Loading ${m.name} index…`);
	const index = await (await fetch(m.framesUrl + ".json")).json();
	const url = m.framesUrl;
	const rr = async (off, len) => { const r = await fetch(url, { headers: { Range: `bytes=${off}-${off + len - 1}` } }); if (!r.ok && r.status !== 206) throw new Error("HTTP " + r.status); return new Uint8Array(await r.arrayBuffer()); };
	const header = await rr(index.headerOff, index.headerLen);
	const lr = JSON.parse(qvac_load_gpu(header));
	if (lr.error) { onStatus("model error: " + lr.error); return null; }
	const manifest = index.manifest; qvac_gpu_free();
	const fetchTensor = async (name) => { const s = index.singles[name]; return s ? await rr(s.off, s.len) : new Uint8Array(0); };
	const frameStore = { ready: true, read: (off, len) => rr(index.layersOff + off, len), readExpert: (l, e, role) => { const ri = { gate: 0, up: 1, down: 2 }[role]; const off = index.expertsOff + ((l * index.nExperts + e) * 3 + ri) * index.expertBytes; return rr(off, index.expertBytes); } };
	const layersBytes = (index.packStride \|\| 0) * (index.n_layers \|\| 0) + (manifest.moe ? (index.nExperts * 3 * index.expertBytes * index.n_layers) : 0);
	const cacheBudget = window.__cacheGB != null ? window.__cacheGB * 1073741824 : Math.min(layersBytes, 12 * 1073741824);
	onStatus(`Preparing ${m.name} (served off disk)…`);
	const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, "remote", (d, t) => onProgress(d, t, "remote"), frameStore, cacheBudget);
	window.__gpu = gpu; onStatus("");
	return { gpu, info: lr, manifest, imageKappa: null };
	}

	// HOLOGRAM: load through a content-addressed κ-DISK — every sector VERIFIED by re-derivation (Law L3/L5).
	async function loadModelKDisk(m, onStatus, onProgress) {
	onStatus(`Resolving ${m.name} κ-disk…`);
	const index = await (await fetch(m.kdiskUrl)).json();
	const { makeKDisk } = await import("../qvac-kdisk.mjs");
	const bases = window.__kdiskSources \|\| m.kdiskSources \|\| [location.origin];
	const sources = bases.map((b) => b.replace(/\/$/, "") + "/" + (index.dataFile \|\| (m.dataUrl \|\| "").replace(/^\.\//, "")));
	const kd = makeKDisk({ index, sources });
	window.__kd = kd;
	const iv = await kd.verifyImage();
	if (!iv.ok) { onStatus("κ-disk image_kappa mismatch — refusing to load"); return null; }
	const rr = kd.rr, qvf = index.qvf;
	const header = await rr(qvf.headerOff, qvf.headerLen);
	const lr = JSON.parse(qvac_load_gpu(header));
	if (lr.error) { onStatus("model error: " + lr.error); return null; }
	const manifest = qvf.manifest; qvac_gpu_free();
	const fetchTensor = async (name) => { const s = qvf.singles[name]; return s ? await rr(s.off, s.len) : new Uint8Array(0); };
	const frameStore = { ready: true, read: (off, len) => rr(qvf.layersOff + off, len),
	readExpert: async (l, e, role) => { const ri = { gate: 0, up: 1, down: 2 }[role]; const blkOff = qvf.expertsOff + (l * qvf.nExperts + e) * 3 * qvf.expertBytes; const blk = await rr(blkOff, 3 * qvf.expertBytes); return blk.slice(ri * qvf.expertBytes, (ri + 1) * qvf.expertBytes); } };
	const layersBytes = (qvf.packStride \|\| 0) * (qvf.n_layers \|\| 0) + (manifest.moe ? (qvf.nExperts * 3 * qvf.expertBytes * qvf.n_layers) : 0);
	const cacheBudget = window.__cacheGB != null ? window.__cacheGB * 1073741824 : Math.min(layersBytes, 1024 * 1048576);
	onStatus(`Realizing ${m.name} (verified off κ-disk)…`);
	const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, "remote", (d, t) => onProgress(d, t, "κ-disk"), frameStore, cacheBudget);
	window.__gpu = gpu;
	const st = kd.stats(); onStatus(`${index.imageKappa.slice(0, 22)}… · ${st.verified} sectors verified`);
	return { gpu, info: lr, manifest, imageKappa: kd.imageKappa \|\| index.imageKappa \|\| null };
	}