q / core /loader.js
Humuhumu33's picture
decode(): report live-path GPU ms/tok for clean CPU/GPU split
d2afec3 verified
Raw
History Blame Contribute Delete
17.4 kB
// core/loader.js — model LOADING (the 5 substrate paths) + the model catalog + the
// browser-cache manager. Lifted faithfully from the original index.html so a model still
// loads byte-identically; the only change is that DOM status writes become onStatus/onProgress
// callbacks, and each path RETURNS { gpu, info, manifest, imageKappa } instead of mutating
// globals. core/engine.js then wraps the returned gpu. (window.__gpu / window.__kd handles are
// still exposed for the probe + system-monitor panels.)
import init, { kappa, qvac_load_model, qvac_load_gpu, qvac_tokenize, qvac_continue, qvac_gpu_manifest, qvac_gpu_tensor, qvac_gpu_free, qvac_panic_hook } from "../pkg/holospaces_web.js";
import { createQvacGPU } from "../qvac-gpu.js?v=62";
import { modelAsSource } from "./semantic.js"; // C2: a loaded model carries a W3C @type (schema:SoftwareSourceCode)
// W3C linked-data view of a loaded model κ-object: a content-addressed identity (Law L1) plus
// the schema.org type that modelAsSource attaches.
//   m    — catalog entry; name/fam/size/fmt are forwarded as name/family/params/format.
//   root — content address of the weights. A value already starting with "did:" is used as-is,
//          any other truthy value is prefixed with "did:holo:", and a falsy root maps to the
//          placeholder "did:holo:sha256:0".
const modelLinkedData = (m, root) => {
  let identity = "did:holo:sha256:0";
  if (root) {
    const text = String(root);
    identity = text.startsWith("did:") ? text : "did:holo:" + text;
  }
  return modelAsSource({
    name: m.name,
    family: m.fam,
    params: m.size,
    format: m.fmt,
    kappa: identity,
  });
};
// The compiled κ-objects present on disk (models/<name>, built by compile2bit.mjs). Each loads
// DIRECT off the substrate (verified by re-derivation, no re-quant) via its `kappaUrl`.
// cap = max NEW tokens per turn; ctx = KV-cache positions allocated on the GPU (the context
// window — sized so agentic turns with tool schemas + tool responses fit; KV VRAM scales with it).
//
// How THIS file reads an entry (every other field is carried on the entry for consumers outside
// this file and is not interpreted here):
//   kappaUrl           — its presence routes loadModel to loadKappa; trailing slashes are stripped.
//   manifestKappa      — handed to loadKappaObject as `expectKappa` (the manifest pin).
//   kappa              — only a STRING value would act as the fallback pin; the `kappa: true`
//                        used below is a boolean and is therefore never used as a pin.
//   kv4                — copied onto manifest.kv4 before the engine is built.
//   gpuOnly            — loadModel reports a status and returns null when navigator.gpu is missing.
//   eosText            — tokenized after the header loads; replaces the header eos only when it
//                        maps to exactly one token id.
//   ctx / cap          — kvOf sizes the KV cache as (ctx, else cap) + 8, with a floor of 96.
//   size               — parsed by _sizeGb so defaultModelIndex can pick the smallest entry.
//   disabled           — when truthy, the entry is skipped by defaultModelIndex.
//   name/fam/fmt/size  — forwarded to modelAsSource as name/family/format/params.
export const MODELS = [
// NATIVELY-TERNARY κ-objects (t2, 1.58 bpw trained-in — see the atlas-bridge witness receipts):
// Falcon-E: its declared ChatML template STALLS empirically (instant <|end_of_text|>); the
// measured working frame is word-style "User:/Falcon:" with a textual stop (q-falcon-templates sweep).
{ fam: "Falcon-E", name: "Falcon-E-3B · ternary", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-falcon-e-3b/resolve/main", manifestKappa: "did:holo:sha256:6b753fe8186f2b4194424115c36014698580a2aab8427e9b40365893ac6b77ca", size: "0.63 GB", fmt: "t2 1.58-bit κ", cap: 200, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, userWord: true, stopText: "\nUser:", tools: false, rep: 1.18, kappa: true },
{ fam: "BitNet", name: "BitNet-2B-4T · ternary", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-bitnet-2b/resolve/main", manifestKappa: "did:holo:sha256:fcf835659d88d2fe6f683cf1ab8de6a6ba6214ea0deeee4b1bcf3da1a4c05412", size: "0.69 GB", fmt: "t2 1.58-bit κ", cap: 900, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, llama3: true, tools: false, bos: true, eosText: "<|eot_id|>", rep: 1.05, kappa: true },
// TriLM: the LARGEST natively-ternary-trained model (Spectra 3.9B, ICLR'25); per-row/channel
// scale structure → t2r (trit codes + per-256-block scales, exact). BASE model → QA frame + stop.
{ fam: "TriLM", name: "TriLM-3.9B · ternary", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-trilm-3.9b/resolve/main", manifestKappa: "did:holo:sha256:499032ceb19c0476345a72cf5fea6caec83054c98486c91a5891dfad0d25ea30", size: "0.87 GB", fmt: "t2r 2.1-bit κ", cap: 200, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, stopText: "\nQuestion:", tools: false, rep: 1.18, kappa: true },
// AGENTIC CODER: Qwen2.5-Coder-7B (q3f) — the Holo Code agent brain. Qwen2.5 arch ⇒ ChatML +
// agentic tool framing work (capability floor for tool use is ~7B; the small ternary models opt out).
// Self-contained κ-object: tokenizer bundled (source="tokenizer.gguf"), no external dependency.
{ fam: "Qwen2.5-Coder", name: "Qwen2.5-Coder-7B · agentic", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-qwen-coder-7b/resolve/main", manifestKappa: "did:holo:sha256:539941cb060c7dd583e2e86697e53f2c5d511d597c65d09d9c780fbded2c3edf", size: "3.4 GB", fmt: "q3f κ", cap: 900, ctx: 3000, kv4: true, gpu: true, gpuOnly: true, chat: true, code: true, qwen: true, rep: 1.05, kappa: true },
// MIXTURE-OF-EXPERTS (G5): OLMoE-1B-7B (Allen AI, Apache-2.0) — 64 experts, 8 active/token, ~1.3B
// active of 7B. The first RESIDENT-MoE κ-object: experts RAM-resident + CPU top-k router (softmax
// over all 64, no renorm = OLMoE norm_topk_prob:false). q4 (the engine's resident expert FFN path).
{ fam: "OLMoE", name: "OLMoE-1B-7B · MoE (64×8)", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-olmoe-1b-7b/resolve/main", manifestKappa: "did:holo:sha256:9cf97ec1c761fd4ef51bc0cd4ac37a0cd8eaa11f1b19b3ae6a141486ad3fe5ad", size: "3.6 GB", fmt: "q4 MoE κ", cap: 400, ctx: 3000, kv4: false, gpu: true, gpuOnly: true, chat: true, olmo: true, bos: true, eosText: "<|endoftext|>", tools: false, rep: 1.1, kappa: true },
// DIFFUSION (G6): Dream-7B (Dream-org/Dream-v0-Instruct-7B) — masked-diffusion LM on the Qwen2.5-7B
// backbone (same dims ⇒ ChatML). NOT autoregressive: generation is iterative bidirectional unmasking
// over `steps` denoising passes (engine.diffuse / gpu.diffuse), wall-clock fixed by steps not length.
// maskId 151666 rides in the manifest (never tokenized from text). Greedy ⇒ deterministic ⇒ κ-re-derivable.
{ fam: "Dream", name: "Dream-7B · diffusion", kappaUrl: "https://huggingface.co/HOLOGRAMTECH/q-dream-7b/resolve/main", manifestKappa: "did:holo:sha256:7b862931ae088f348f1f7e9ea3adbd418924c2e07e6ddd134f926e5681ad760d", size: "2.9 GB", fmt: "q3f diffusion κ", cap: 192, ctx: 192, kv4: false, gpu: true, gpuOnly: true, chat: true, qwen: true, diffusion: true, steps: 12, rep: 1.0, kappa: true },
// Qwen κ-objects (q3f/q4) were pruned from disk for space — re-derive via compile2bit, then re-list.
];
// KV-cache positions to allocate for catalog entry `m`: the context window (`ctx`, else `cap`)
// plus 8, never below 96. An entry that declares neither field falls back to the 96 floor
// instead of propagating NaN (undefined + 8) into the GPU allocation.
const kvOf = (m) => Math.max(96, (m.ctx || m.cap || 0) + 8);
// Parse a catalog size label into gigabytes. The leading number is read with parseFloat
// (unparseable → 0); a label containing "mb" (any case) is converted from megabytes,
// everything else is taken to be gigabytes already.
const _sizeGb = (s) => {
  const amount = parseFloat(s) || 0;
  if (/mb/i.test(s)) {
    return amount / 1024;
  }
  return amount;
};
// Default to the SMALLEST usable model — lowest latency, fastest first answer.
// Returns the index into MODELS of the non-disabled entry with the smallest parsed size
// (ties keep catalog order), or 0 when no entry is usable.
export const defaultModelIndex = () => {
  const usable = [];
  MODELS.forEach((entry, i) => {
    if (!entry.disabled) usable.push(i);
  });
  usable.sort((a, b) => _sizeGb(MODELS[a].size) - _sizeGb(MODELS[b].size));
  const [smallest] = usable;
  return smallest ?? 0;
};
// ── wasm init (once) + tokenizer re-export so the rest of the app shares this instance ──
let _initOnce = null;
/**
 * Initialise the wasm module once and share the resulting promise with every caller.
 *
 * After `init()` resolves, the panic hook is installed on a best-effort basis (a failure to
 * install it is ignored). If `init()` rejects, the memoized promise is cleared so the NEXT call
 * retries instead of replaying the same failure for the rest of the page's lifetime.
 *
 * @returns {Promise<void>} settles when the wasm module is initialised; rejects with init()'s error.
 */
export function ready() {
  if (!_initOnce) {
    _initOnce = init().then(
      () => {
        try {
          qvac_panic_hook();
        } catch {
          // best-effort: the panic hook is diagnostic only
        }
      },
      (err) => {
        _initOnce = null; // do not cache a failed init — allow a retry
        throw err;
      },
    );
  }
  return _initOnce;
}
export { qvac_tokenize, qvac_continue, kappa };
// ── browser-cache model manager (Cache API) — "Get" downloads + keeps; loading uses the copy ──
// Name of the Cache API bucket that holds downloaded model files.
export const MCACHE = "holo-q-models";
// Resolve a possibly-relative URL against the current page.
const absUrl = (u) => new URL(u, location.href).href;
// Absolute URLs currently present in the bucket, as of the last refreshCached() call.
let _cachedUrls = new Set();

/**
 * Re-read the bucket's keys into the in-memory URL set.
 * Any Cache API failure leaves the set empty rather than throwing.
 * @returns {Promise<Set<string>>} the refreshed set of cached request URLs.
 */
export async function refreshCached() {
  let urls;
  try {
    const store = await caches.open(MCACHE);
    const requests = await store.keys();
    urls = new Set(requests.map((req) => req.url));
  } catch {
    urls = new Set();
  }
  _cachedUrls = urls;
  return _cachedUrls;
}

// True when entry `m` has a `url` whose absolute form was in the bucket at the last refresh.
export const isCached = (m) => Boolean(m.url) && _cachedUrls.has(absUrl(m.url));

/**
 * Remove entry `m`'s file from the bucket (best-effort) and refresh the URL set.
 * @returns {Promise<void>}
 */
export async function deleteCache(m) {
  try {
    const store = await caches.open(MCACHE);
    await store.delete(m.url);
  } catch {
    // best-effort: a missing or unavailable cache is not an error here
  }
  await refreshCached();
}
/**
 * Fetch the raw bytes of entry `m`.
 *
 * The cached copy in the MCACHE bucket is preferred; any failure while reading the cache is
 * ignored and the file is downloaded from `m.url` instead.
 *
 * @param m        catalog entry; reads `url`, `name` and `size`.
 * @param onStatus optional status callback (download notice, HTTP failure).
 * @returns {Promise<Uint8Array|null>} the file bytes, or null when the download is not OK.
 */
async function modelBytes(m, onStatus) {
  try {
    const store = await caches.open(MCACHE);
    const cached = await store.match(m.url);
    if (cached) {
      const buffer = await cached.arrayBuffer();
      return new Uint8Array(buffer);
    }
  } catch {}
  onStatus?.(`Downloading ${m.name} (${m.size})…`);
  const response = await fetch(m.url);
  if (response.ok) {
    const buffer = await response.arrayBuffer();
    return new Uint8Array(buffer);
  }
  onStatus?.("download failed: HTTP " + response.status);
  return null;
}
const noop = () => {};
/**
 * loadModel(entry, { onStatus, onProgress }) → { gpu, info, manifest, imageKappa } | null
 *
 * Dispatches to the loading path selected by the entry's flags, checked in this order:
 * `kappaUrl`, `kdisk`, `remote`, `diskIngest`; with none set, the whole GGUF is fetched
 * (cache first) and loaded through wasm, then uploaded to the GPU when `m.gpu` is set and
 * WebGPU is available.
 *
 * `imageKappa` is the VERIFIED content address of the weights when the path provides one
 * (κ-object root, or κ-disk image_kappa); core/engine.js binds it as the receipt's model κ.
 *
 * Every failure — including a failed wasm initialisation — is reported through `onStatus`
 * and yields `null`; this function does not reject.
 *
 * @param m          catalog entry.
 * @param onStatus   receives human-readable status text ("" clears it).
 * @param onProgress receives (done, total, phase) from the streaming paths.
 */
export async function loadModel(m, { onStatus = noop, onProgress = noop } = {}) {
  try {
    // Inside the try so an init failure follows the same null + onStatus contract as the rest.
    await ready();
    onStatus(`Loading ${m.name}…`);
    if (m.gpuOnly && !navigator.gpu) {
      onStatus("This model needs WebGPU (not available here).");
      return null;
    }
    if (m.kappaUrl) return await loadKappa(m, onStatus, onProgress);
    if (m.kdisk) return await loadModelKDisk(m, onStatus, onProgress);
    if (m.remote) return await loadModelRemote(m, onStatus, onProgress);
    if (m.diskIngest) return await loadModelDisk(m, onStatus, onProgress);

    let gguf = await modelBytes(m, onStatus);
    if (!gguf) {
      onStatus("could not load model");
      return null;
    }
    const lr = JSON.parse(m.gpuOnly ? qvac_load_gpu(gguf) : qvac_load_model(gguf));
    gguf = null; // drop this function's reference to the file bytes
    if (lr.error) {
      onStatus("model error: " + lr.error);
      return null;
    }

    let gpu = null, manifest = null;
    if (navigator.gpu && m.gpu) {
      try {
        onStatus(`Uploading ${m.name} to the GPU…`);
        const bits = m.q4 ? 4 : 8;
        manifest = JSON.parse(qvac_gpu_manifest(bits));
        manifest.twoBit = !!window.__twoBit;
        // Streaming mode precedence: ?stream= query param, then window.__stream, then the entry.
        // "resident" / "false" in the query string force the resident (non-streaming) mode.
        const __qp = new URLSearchParams(location.search).get("stream");
        const __qmode = __qp === null ? undefined : (__qp === "resident" || __qp === "false" ? false : __qp);
        const stream = __qmode ?? window.__stream ?? m.stream ?? false;
        // Tensor fetcher; window.__weightHook, when installed, may replace the raw bytes.
        const __ft = (name) => {
          const raw = qvac_gpu_tensor(name, bits);
          return window.__weightHook ? window.__weightHook(name, raw, bits, manifest) : raw;
        };
        gpu = await createQvacGPU(manifest, __ft, kvOf(m), lr.eos ?? 2, stream);
        window.__gpu = gpu;
        qvac_gpu_free();
      } catch (e) {
        gpu = null;
        if (m.gpuOnly) {
          onStatus("GPU upload failed: " + e);
          return null;
        }
        // Not GPU-only: continue without a GPU engine, but leave a trace of why.
        console.warn("[Q] GPU upload failed, continuing without GPU:", e);
      }
    }
    onStatus("");
    return { gpu, info: lr, manifest, imageKappa: null };
  } catch (e) {
    console.error("[Q] load failed:", e, e && e.stack);
    onStatus("could not load model: " + e);
    return null;
  }
}
// LOAD-DIRECT: a pre-compiled 2-bit/Q4 κ-object (compile2bit.mjs output). Weights arrive ALREADY
// quantized (no re-quant at load); the tokenizer comes from the source GGUF's header only.
// Returns { gpu, info, manifest, imageKappa, ld } or null on a tokenizer error; `imageKappa` is the
// κ-object root reported by the loader (null when absent) and `ld` is its linked-data view.
async function loadKappa(m, onStatus, onProgress) {
  onStatus("Loading κ-object manifest…");
  const loader = await import("../holo-load2bit.mjs");

  // Law L5: pin the manifest κ when the catalog supplies one (m.manifestKappa, or a string m.kappa).
  // Until every model carries a pin, unpinned entries load explicitly (allowUnpinned) — the gap is then
  // a visible data task (populate manifestKappa), not a silent trust of an unauthenticated root.
  const asPin = (v) => (typeof v === "string" && v ? v : null);
  const pin = asPin(m.manifestKappa) ?? asPin(m.kappa);
  const baseUrl = m.kappaUrl.replace(/\/+$/, "");
  const trust = pin ? { expectKappa: pin } : { allowUnpinned: true };
  const { manifest, fetchTensor, info } = await loader.loadKappaObject(baseUrl, trust);

  const ingest = await import("../qvac-ingest.mjs");
  onStatus("Building tokenizer (source header, no full download)…");
  const header = await ingest.readHeader(info.source, ingest.rangeReader());
  const lr = JSON.parse(qvac_load_gpu(header.headerBytes));
  if (lr.error) {
    onStatus("tokenizer error: " + lr.error);
    return null;
  }
  // Chat-stop override (e.g. LLaMA-3 <|eot_id|> ≠ header eos): applied only when the text maps to
  // exactly one token id; any tokenizer failure keeps the header's eos.
  if (m.eosText) {
    try {
      const ids = JSON.parse(qvac_tokenize(m.eosText)).ids;
      if (ids && ids.length === 1) lr.eos = ids[0];
    } catch {}
  }
  qvac_gpu_free();

  manifest.kv4 = !!m.kv4; // int4 KV cache (E6) — catalog opt-in

  // MoE forward reads the layer-packed attention (Wb[l]) + RAM-resident experts (readExpert via
  // fetchTensor) — i.e. stream="layer": attention JS-resident & paged per token, experts cached.
  const streamMode = manifest.moe ? "layer" : (m.stream || window.__kappaStream || false);

  let quantLabel = "LDLQ 2-bit";
  if (info.mode === "q4") quantLabel = "native Q4";
  else if (info.incoherent) quantLabel = "incoherent 2-bit";
  onStatus(`Building engine from κ-object (${quantLabel}, ${streamMode || "resident"}, no requant)…`);

  // Progress is only wired up when a streaming mode is active.
  const reportStreaming = (done, total) => onProgress(done, total, "streaming");
  const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, streamMode, streamMode ? reportStreaming : null);
  window.__gpu = gpu;
  onStatus("");
  return {
    gpu,
    info: lr,
    manifest,
    imageKappa: info.root || null,
    ld: modelLinkedData(m, info.root),
  };
}
// Very-large-model path: the GGUF never enters wasm; only the header does (tokenizer + manifest),
// then each tensor is streamed off disk (HTTP Range), converted in JS, paged to the GPU per layer.
// When the file is already in the MCACHE bucket, ranges are sliced from that cached blob instead.
// Returns { gpu, info, manifest, imageKappa: null }, or null when the header fails to load.
async function loadModelDisk(m, onStatus, onProgress) {
  const bits = m.q4 ? 4 : 8;
  const ingest = await import("../qvac-ingest.mjs");

  let readRange = ingest.rangeReader();
  try {
    const store = await caches.open(MCACHE);
    const cachedResp = await store.match(m.url);
    if (cachedResp) {
      const blob = await cachedResp.blob();
      readRange = async (_u, start, len) => {
        const piece = blob.slice(start, start + len);
        return new Uint8Array(await piece.arrayBuffer());
      };
    }
  } catch {}

  onStatus(`Reading ${m.name} header…`);
  const header = await ingest.readHeader(m.url, readRange);
  const lr = JSON.parse(qvac_load_gpu(header.headerBytes));
  if (lr.error) {
    onStatus("model error: " + lr.error);
    return null;
  }
  const manifest = JSON.parse(qvac_gpu_manifest(bits));
  qvac_gpu_free();

  const fetchTensor = ingest.makeDiskFetcher({
    url: m.url,
    readRange,
    dataOffset: header.dataOffset,
    tensors: header.tensors,
    manifest,
    bits,
  });
  const mode = m.stream || "layer";
  onStatus(`Preparing ${m.name} (one-time, streamed off disk)…`);
  const reportLayers = (d, t) => onProgress(d, t, "layers");
  const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, mode, reportLayers);
  window.__gpu = gpu;
  onStatus("");
  return { gpu, info: lr, manifest, imageKappa: null };
}
// Out-of-core: stream a PRE-BUILT .qvf frames file from the server, one layer per token via HTTP Range.
// The index (`<framesUrl>.json`) supplies the header window, the manifest, the per-tensor
// `singles` table and the layer/expert offsets. Returns { gpu, info, manifest, imageKappa: null },
// or null when the header fails to load. Throws on an HTTP failure (the caller reports it).
async function loadModelRemote(m, onStatus, onProgress) {
  onStatus(`Loading ${m.name} index…`);
  const url = m.framesUrl;
  const indexResp = await fetch(url + ".json");
  // Fail with the HTTP status instead of a JSON parse error on an error page.
  if (!indexResp.ok) throw new Error("HTTP " + indexResp.status);
  const index = await indexResp.json();

  // Read `len` bytes at `off` of the frames file.
  const rr = async (off, len) => {
    const r = await fetch(url, { headers: { Range: `bytes=${off}-${off + len - 1}` } });
    if (!r.ok) throw new Error("HTTP " + r.status);
    const bytes = new Uint8Array(await r.arrayBuffer());
    // 206 = the server honoured the Range header; the body IS the requested window.
    if (r.status === 206 || bytes.length === len) return bytes;
    // Any other OK status means Range was ignored and the body is the whole file: cut the
    // window out rather than handing the entire file back as if it were the window.
    if (bytes.length >= off + len) return bytes.slice(off, off + len);
    throw new Error(`HTTP ${r.status}: expected ${len} bytes at offset ${off}, got ${bytes.length}`);
  };

  const header = await rr(index.headerOff, index.headerLen);
  const lr = JSON.parse(qvac_load_gpu(header));
  if (lr.error) {
    onStatus("model error: " + lr.error);
    return null;
  }
  const manifest = index.manifest;
  qvac_gpu_free();

  // Tensors listed in `singles` are fetched by range; unknown names yield an empty array.
  const fetchTensor = async (name) => {
    const s = index.singles[name];
    return s ? await rr(s.off, s.len) : new Uint8Array(0);
  };
  const frameStore = {
    ready: true,
    read: (off, len) => rr(index.layersOff + off, len),
    // Experts are laid out as [layer][expert][gate, up, down], each `expertBytes` long.
    readExpert: (l, e, role) => {
      const ri = { gate: 0, up: 1, down: 2 }[role];
      const off = index.expertsOff + ((l * index.nExperts + e) * 3 + ri) * index.expertBytes;
      return rr(off, index.expertBytes);
    },
  };
  const layersBytes = (index.packStride || 0) * (index.n_layers || 0) + (manifest.moe ? (index.nExperts * 3 * index.expertBytes * index.n_layers) : 0);
  // Cache budget in bytes: window.__cacheGB (GiB) when set, else all layer bytes capped at 12 GiB.
  const cacheBudget = window.__cacheGB != null ? window.__cacheGB * 1073741824 : Math.min(layersBytes, 12 * 1073741824);
  onStatus(`Preparing ${m.name} (served off disk)…`);
  const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, "remote", (d, t) => onProgress(d, t, "remote"), frameStore, cacheBudget);
  window.__gpu = gpu;
  onStatus("");
  return { gpu, info: lr, manifest, imageKappa: null };
}
// HOLOGRAM: load through a content-addressed κ-DISK — every sector VERIFIED by re-derivation (Law L3/L5).
// The index at m.kdiskUrl describes the disk and its embedded .qvf layout; loading is refused
// when kd.verifyImage() does not report ok. Returns { gpu, info, manifest, imageKappa }, or null
// on an image mismatch or a header error. Throws on an HTTP failure (the caller reports it).
async function loadModelKDisk(m, onStatus, onProgress) {
  onStatus(`Resolving ${m.name} κ-disk…`);
  const indexResp = await fetch(m.kdiskUrl);
  // Fail with the HTTP status instead of a JSON parse error on an error page.
  if (!indexResp.ok) throw new Error("HTTP " + indexResp.status);
  const index = await indexResp.json();
  const { makeKDisk } = await import("../qvac-kdisk.mjs");

  // Source precedence: window.__kdiskSources, then the entry's kdiskSources, then this origin.
  const bases = window.__kdiskSources || m.kdiskSources || [location.origin];
  const sources = bases.map((b) => b.replace(/\/$/, "") + "/" + (index.dataFile || (m.dataUrl || "").replace(/^\.\//, "")));
  const kd = makeKDisk({ index, sources });
  window.__kd = kd;
  const iv = await kd.verifyImage();
  if (!iv.ok) {
    onStatus("κ-disk image_kappa mismatch — refusing to load");
    return null;
  }

  const rr = kd.rr, qvf = index.qvf;
  const header = await rr(qvf.headerOff, qvf.headerLen);
  const lr = JSON.parse(qvac_load_gpu(header));
  if (lr.error) {
    onStatus("model error: " + lr.error);
    return null;
  }
  const manifest = qvf.manifest;
  qvac_gpu_free();

  // Tensors listed in `singles` are read by range; unknown names yield an empty array.
  const fetchTensor = async (name) => {
    const s = qvf.singles[name];
    return s ? await rr(s.off, s.len) : new Uint8Array(0);
  };
  const frameStore = {
    ready: true,
    read: (off, len) => rr(qvf.layersOff + off, len),
    // Reads the expert's whole [gate, up, down] triple in one call, then slices out the role.
    readExpert: async (l, e, role) => {
      const ri = { gate: 0, up: 1, down: 2 }[role];
      const blkOff = qvf.expertsOff + (l * qvf.nExperts + e) * 3 * qvf.expertBytes;
      const blk = await rr(blkOff, 3 * qvf.expertBytes);
      return blk.slice(ri * qvf.expertBytes, (ri + 1) * qvf.expertBytes);
    },
  };
  const layersBytes = (qvf.packStride || 0) * (qvf.n_layers || 0) + (manifest.moe ? (qvf.nExperts * 3 * qvf.expertBytes * qvf.n_layers) : 0);
  // Cache budget in bytes: window.__cacheGB (GiB) when set, else all layer bytes capped at 1 GiB.
  const cacheBudget = window.__cacheGB != null ? window.__cacheGB * 1073741824 : Math.min(layersBytes, 1024 * 1048576);
  onStatus(`Realizing ${m.name} (verified off κ-disk)…`);
  const gpu = await createQvacGPU(manifest, fetchTensor, kvOf(m), lr.eos ?? 2, "remote", (d, t) => onProgress(d, t, "κ-disk"), frameStore, cacheBudget);
  window.__gpu = gpu;

  const imageKappa = kd.imageKappa || index.imageKappa || null;
  const st = kd.stats();
  // index.imageKappa is treated as optional by the return value, so the status line must not
  // dereference it unguarded — a missing field would otherwise throw after the engine is built.
  const shownKappa = String(index.imageKappa ?? imageKappa ?? "").slice(0, 22);
  onStatus(`${shownKappa}… · ${st.verified} sectors verified`);
  return { gpu, info: lr, manifest, imageKappa };
}