// core/engine.js — the inference ENGINE adapter (the only module that touches the wasm
// tokenizer + the WebGPU `gpu` object). DOM-free. Wraps a model that core/loader.js has
// already loaded onto the GPU and exposes a clean, UI-agnostic API:
//
//   const engine = await createEngine(modelEntry, { gpu, info, imageKappa });
//   const { text, outIds } = await engine.generate(ids, { onToken, signal });
//   const rec = await engine.buildReceipt({ ... });   // PROV-O, re-derivable (Law L5)
//
// The token loop, framing, memo and receipt logic are lifted byte-for-byte from the
// original index.html think()/run()/sealReceipt — only the DOM writes are replaced by an
// onToken callback and the running/handedOff flags by an AbortSignal, so output (and the
// receipt κ) is identical to the original app.

import { qvac_tokenize, qvac_continue, kappa } from "../pkg/holospaces_web.js";
import { clean, didHolo, kappaTokens, sealReceipt, verifyIntegrity, idBytes, kappaBytes } from "./kappa.js";

// Millisecond clock for latency stats; reports 0 where `performance` does not exist.
const _perf = () => {
  if (typeof performance === "undefined") return 0;
  return performance.now();
};

// Promise-based timer: resolves (with no value) once `ms` milliseconds have elapsed.
const _sleep = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});

// The engine is itself a content-addressed object — hash the wasm once (lazy).
// `_engineK` caches the in-flight/settled promise, so concurrent first callers share a single
// download + hash instead of each fetching the wasm.
let _engineK = null;

/**
 * κ of the wasm engine binary (`../pkg/holospaces_web_bg.wasm`, resolved against this module).
 *
 * @returns {Promise<string>} whatever `kappaBytes` yields for the wasm bytes, or the sentinel
 *   "did:holo:sha256:(engine unavailable)" when the binary cannot be fetched or hashed.
 *   Never rejects. The first outcome (including the sentinel) is cached for later calls.
 */
export async function engineKappa() {
  if (!_engineK) {
    _engineK = (async () => {
      try {
        const res = await fetch(new URL("../pkg/holospaces_web_bg.wasm", import.meta.url));
        // A 404/500 still resolves with a body (the error page); hashing that would seal a
        // bogus engine κ, so treat any non-2xx response as "unavailable".
        if (!res.ok) throw new Error(`engine wasm fetch failed: HTTP ${res.status}`);
        const bytes = new Uint8Array(await res.arrayBuffer());
        return await kappaBytes(bytes);
      } catch {
        return "did:holo:sha256:(engine unavailable)";
      }
    })();
  }
  return _engineK;
}

export async function createEngine(modelEntry, loaded) {
  const { gpu, info, imageKappa } = loaded;
  const m = modelEntry;
  // Start hashing the wasm right away; the promise is only awaited when a receipt is sealed.
  const engineReady = engineKappa();

  // model κ: prefer the κ-disk's VERIFIED image_kappa (a real content address of the weights,
  // every sector re-derived), normalised to the did:holo:sha256: form; without one, fall back
  // to a κ over the model's declared identity (name, size, format, family).
  let modelKappa;
  if (imageKappa) {
    const digest = String(imageKappa).replace(/^(did:holo:)?sha256:/, "");
    modelKappa = "did:holo:sha256:" + digest;
  } else {
    modelKappa = await didHolo({
      "@type": "schema:SoftwareSourceCode",
      name: m.name,
      size: m.size,
      fmt: m.fmt || "",
      family: m.fam || "",
    });
  }

  // In-memory κ-memo store, surfaced to callers through memoGet/memoHas/memoSet.
  const memo = new Map();
  // Learned speculative drafter (fn(seq,max)=>ids); null → standard decode. Set via setDrafter().
  let _drafter = null;

  // text → token ids via the wasm tokenizer. Best-effort: any failure (or a reply with no
  // `ids` field) yields an empty list rather than throwing.
  const tokenize = (text) => {
    try {
      const parsed = JSON.parse(qvac_tokenize(text));
      return parsed.ids || [];
    } catch {
      return [];
    }
  };

  // token ids → cleaned text. qvac_continue is called with the id list as JSON, three zeros and
  // the id count; only the `text` field of its JSON reply is used. Any failure yields "".
  const detokenize = (ids) => {
    try {
      const reply = JSON.parse(qvac_continue(JSON.stringify(ids), 0, 0, 0, ids.length));
      return clean(reply.text || "");
    } catch {
      return "";
    }
  };

  // live mind κ (blake3, from wasm) over the byte encoding of the ids
  const fingerprint = (ids) => {
    const bytes = idBytes(ids);
    return kappa(bytes);
  };

  // Wrap one user prompt in the chat frame this model expects. `hasHistory` tells whether
  // earlier turns are already in the context; some frames then first close the previous
  // assistant turn. The first matching model flag wins: qwen → llama3 → olmo → userWord → Q/A.
  // (Frames are verbatim from the original run().)
  function frameTurn(prompt, hasHistory) {
    if (m.qwen) {
      // Qwen2/3: ChatML (its <|im_*|> markers are atomic BPE tokens). For Qwen3 an empty
      // <think> block is pre-filled so the model skips thinking and answers directly.
      const closePrev = hasHistory ? "<|im_end|>\n" : "";
      const skipThinking = m.qwen3 ? "<think>\n\n</think>\n\n" : "";
      return `${closePrev}<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n${skipThinking}`;
    }
    if (m.llama3) {
      // LLaMA-3 header template (BitNet b1.58 etc.)
      const closePrev = hasHistory ? "<|eot_id|>" : "";
      return `${closePrev}<|start_header_id|>user<|end_header_id|>\n\n${prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n`;
    }
    if (m.olmo) {
      // OLMo/OLMoE: <|user|>/<|assistant|> role tags (each turn self-delimited; leading bos via m.bos)
      return `<|user|>\n${prompt}\n<|assistant|>\n`;
    }
    if (m.userWord) {
      // word-frame (Falcon-E: its ChatML template stalls EMPIRICALLY; "User:/Falcon:" answers — see q-falcon-templates sweep)
      const lead = hasHistory ? "\n" : "";
      return lead + "User: " + prompt + "\nFalcon:";
    }
    // every other instruction model: plain Q/A frame
    return "Question: " + prompt + "\nAnswer:";
  }

  // The decode parameters recorded in receipts and mixed into memo keys.
  //   decode            "greedy-argmax", or "sampled@t=<temp>" when m.temp > 0
  //   maxTokens         m.cap as declared on the model entry
  //   repetitionPenalty m.rep, falling back to 1.3 — the same fallback generate() decodes with,
  //                     so the recorded value is the one a re-derivation has to replay
  //   template          "chatml" | "llama3" | "qa"
  //   thinking          false only for Qwen3 (whose frame pre-fills an empty think block)
  // NOTE(review): frameTurn() also has OLMo and word ("User:/Falcon:") frames, which are labelled
  // "qa" here. Left as-is because relabelling would change receipts and memo keys for those
  // models — confirm whether that is intended.
  function params() {
    const temp = m.temp || 0;
    let template = "qa";
    if (m.qwen) template = "chatml";
    else if (m.llama3) template = "llama3";
    return {
      decode: temp > 0 ? "sampled@t=" + temp : "greedy-argmax",
      maxTokens: m.cap,
      repetitionPenalty: m.rep ?? 1.3,
      template,
      thinking: !m.qwen3,
    };
  }

  // The streaming token loop. `ids` is the whole running conversation (the mind); generation
  // appends to it. onToken({ text, ids, outIds, stats }) fires per step; signal aborts.
  //
  // Options:
  //   onToken     per-step callback (see above)
  //   signal      AbortSignal-like; checked between steps (`signal.aborted`)
  //   repPenalty  repetition penalty; defaults to m.rep, else 1.3
  //   maxNew      max NEW tokens this call; defaults to m.cap, else 80
  // Resolves to { text, outIds, ids, stats: { ttft, tokps, msExec, ... }, error }. A full
  // context, a decode failure and the degeneration guard are reported through `error` (with
  // whatever was produced so far) rather than thrown.
  //
  // The number of new tokens never exceeds min(maxNew, free KV positions): each decode batch
  // is clamped to what is left of that budget.
  async function generate(ids, { onToken, signal, repPenalty, maxNew } = {}) {
    const rep = repPenalty ?? m.rep ?? 1.3;
    const newCap = maxNew ?? m.cap ?? 80;                       // max NEW tokens this call
    const kvCap = (m.ctx || m.cap || 80) + 8;                   // the engine's KV allocation (loader kvOf)
    const promptLen = ids.length;
    const tStart = _perf();
    let first = true;
    let decodeStart = 0;
    let decodeTok = 0;
    let ttft = 0;
    let tokps = 0;
    let msExec = 0;
    let err = null;
    let outText = "";
    if (promptLen >= kvCap - 1) err = new Error(`context full: ${promptLen} tokens ≥ ${kvCap} KV positions`);
    // One budget for both decode paths: the caller's cap, bounded by the room left below kvCap - 1.
    const budget = Math.min(newCap, kvCap - 1 - promptLen);
    const aborted = () => Boolean(signal && signal.aborted);
    const cutAtStop = (s) => {
      if (!m.stopText) return s;
      const ix = s.indexOf(m.stopText);
      return ix >= 0 ? s.slice(0, ix) : s;
    };

    // SPECULATIVE PATH (opt-in): when a learned drafter is registered and the engine has the batched-verify
    // head, the drafter proposes and the target batch-verifies — output is BYTE-IDENTICAL to greedy decode
    // (greedy verify), streamed via onCommit. Any incompatibility/throw falls straight through to the standard
    // loop below, so default Q (no drafter) and unsupported models are completely unaffected.
    if (!err && (_drafter || globalThis.__spec) && gpu.specDecode && gpu.setDrafter) {
      try {
        gpu.setDrafter(_drafter);
        const out = [];
        const t0 = _perf();
        let ttft2 = 0;
        let tokps2 = 0;
        const seq = await gpu.specDecode(ids.slice(), budget, rep, (tk) => {
          if (aborted()) return false;
          out.push(tk);
          if (!ttft2) ttft2 = _perf() - tStart;
          const dt = _perf() - t0;
          if (dt > 0) tokps2 = out.length / (dt / 1000);
          if (onToken) {
            onToken({
              text: detokenize(out),
              ids: ids.slice(0, promptLen).concat(out),
              outIds: out.slice(),
              stats: { ttft: ttft2, tokps: tokps2, msExec: gpu.timing ? gpu.timing.exec : 0, gpuBytes: gpu.gpuBytes, spec: gpu.specStats ? gpu.specStats() : null },
            });
          }
          return out.length < budget;   // false tells the engine to stop committing
        });
        const outIds = seq.slice(promptLen);
        const text = cutAtStop(detokenize(outIds));
        return { text, outIds, ids: seq, stats: { ttft: ttft2, tokps: tokps2, msExec: gpu.timing ? gpu.timing.exec : 0, spec: gpu.specStats ? gpu.specStats() : null }, error: null };
      } catch (e) {
        // deliberate best-effort: clear the drafter and fall through to the standard decode loop
        try { gpu.setDrafter(null); } catch {}
      }
    }

    // batched GPU decode head (4 B/token readback) when the engine has it, else the plain one.
    // Invoked with .call(gpu, …) so an engine whose methods rely on `this` keeps working.
    const step = gpu.decode || gpu.generate;
    while (!err && !aborted() && ids.length - promptLen < budget) {
      const prevLen = ids.length;
      // One token first (so TTFT = prefill + first token), then batches of up to 6 — never
      // more than the remaining budget, so neither maxNew nor the KV bound is overshot.
      const want = Math.min(first ? 1 : 6, Math.ceil(budget - (prevLen - promptLen)));
      try { ids = await step.call(gpu, ids, want, rep); }
      catch (e) { err = e; break; }
      const dn = ids.length - prevLen;
      if (dn > 0) {
        msExec = gpu.timing ? gpu.timing.exec : msExec;
        if (first) {
          ttft = _perf() - tStart;   // TTFT = prefill + first token
          decodeStart = _perf();
          first = false;
        } else {
          decodeTok += dn;           // steady decode rate (excludes the first token)
          const dt = _perf() - decodeStart;
          if (dt > 0) tokps = decodeTok / (dt / 1000);
        }
      }
      const di = ids.slice(promptLen);
      // incremental detokenize: decode only a bounded window (the dn new tokens + a small left context)
      // and append the delta — O(1) per step, not re-detokenizing the whole growing output (was O(n²)).
      {
        const a = Math.max(promptLen, ids.length - (dn + 8));
        const wf = detokenize(ids.slice(a));
        const wp = a >= ids.length - dn ? "" : detokenize(ids.slice(a, ids.length - dn));
        outText += wf.slice(wp.length);
      }
      const text = cutAtStop(outText);
      const hitStop = text.length !== outText.length;   // word-framed models stop on the next "User:" turn
      if (onToken) onToken({ text, ids: ids.slice(), outIds: di.slice(), stats: { ttft, tokps, msExec, gpuBytes: gpu.gpuBytes } });
      if (hitStop) break;
      if (ids.length <= prevLen) break;    // EOS / no progress
      // degeneration guard: a long run of one repeated character (the repetition collapse of
      // small/experimental quants) will never recover — stop instead of burning the budget.
      if (text.length > 80 && /(.)\1{63}$/.test(text)) { err = new Error("degenerate repetition — stopped"); break; }
      await _sleep(0);   // yield to the event loop so the UI can paint — no artificial throttle (gpu.decode already awaits the GPU)
    }
    // The final text is re-detokenized from the full output rather than taken from the
    // incrementally streamed string.
    const outIds = ids.slice(promptLen);
    const text = cutAtStop(detokenize(outIds));
    return { text, outIds, ids, stats: { ttft, tokps, msExec }, error: err };
  }

  // DIFFUSION decode (Dream-class): iterative bidirectional unmasking over a fixed `steps` budget,
  // wall-clock fixed by steps not output length. Greedy ⇒ deterministic ⇒ κ-re-derivable (Law L5).
  // `ids` is the framed prompt; we diffuse `genLen` masked positions after it. Returns the same shape
  // as generate() so callers (and the brain seam) are agnostic. onToken fires ONCE with the final fill
  // (diffusion has no left-to-right token stream — the whole block resolves together).
  // Two modes: APPEND (genLen masks at the suffix — generation) or FILL (ids ALREADY contain mask ids
  // anywhere → infill/surgical edit, conditioning on BOTH sides; diffusion's structural edge over AR).
  // `causal` flips the parity gate (causal block=1 must equal the sequential engine — validates the pass).
  //
  // Options: genLen (append only; default min(m.cap || 64, room left in m.ctx || 192)),
  // steps (default m.steps, else 12), fill, causal, signal (handed to the engine), onToken.
  // Throws when the loaded engine has no `diffuse`; a rejection from gpu.diffuse propagates.
  // When APPEND is asked to size itself and the prompt already fills the context, nothing is
  // run and the result carries a "context full" `error` (as generate() does) instead of handing
  // the engine a zero/negative mask count.
  async function diffuse(ids, { genLen, steps, fill, causal, signal, onToken } = {}) {
    if (!gpu || !gpu.diffuse) throw new Error("this model has no diffusion engine (load a diffusion κ-object)");
    const S = steps ?? m.steps ?? 12;
    const tStart = _perf();
    const msExecNow = () => (gpu.timing ? gpu.timing.exec : 0);
    const ctxLimit = m.ctx || 192;
    const room = ctxLimit - ids.length - 1;   // positions left for masks, one kept in reserve
    if (!fill && genLen == null && room <= 0) {
      const stats = { ttft: _perf() - tStart, tokps: 0, msExec: msExecNow(), steps: S, fill: false, diff: null };
      return { text: "", outIds: [], ids: ids.slice(), stats, error: new Error(`context full: ${ids.length} tokens leave no room to diffuse within ${ctxLimit} positions`) };
    }
    const gl = fill ? 0 : (genLen ?? Math.min(m.cap || 64, room));
    const seq = await gpu.diffuse(ids, gl, { steps: S, fill: !!fill, causal: !!causal, signal });
    // append → output is the generated suffix; fill → the whole sequence is the answer (a span edited in place)
    const outIds = fill ? seq.slice() : seq.slice(ids.length);
    let text = detokenize(outIds);
    if (m.stopText && !fill) {
      const ix = text.indexOf(m.stopText);
      if (ix >= 0) text = text.slice(0, ix);
    }
    const stats = { ttft: _perf() - tStart, tokps: 0, msExec: msExecNow(), steps: S, fill: !!fill, diff: gpu.diffStats ? gpu.diffStats() : null };
    if (onToken) onToken({ text, ids: seq.slice(), outIds: outIds.slice(), stats });
    return { text, outIds, ids: seq, stats, error: null };
  }

  // κ-memo: identical (context ⊕ prompt ⊕ model ⊕ params) → replay in O(1), no decode.
  // The key is a κ over { κ(ctxIds followed by turnIds), model κ, params }; `p` overrides the
  // model's default params() when truthy.
  const memoKey = async (ctxIds, turnIds, p) => {
    const ctx = await kappaTokens(ctxIds.concat(turnIds));
    return didHolo({ ctx, model: modelKappa, params: p || params() });
  };

  // Assemble and seal the receipt for one turn. The output text is re-derived from `outIds`
  // here (not taken from the caller), params are the model defaults overlaid with
  // `paramsPatch`, and the engine κ is awaited from the hash started at engine creation.
  // Everything else is forwarded to sealReceipt unchanged; its result is returned.
  async function buildReceipt({ promptText, ctxIds, turnIds, outIds, fromMemo, evaluateText, paramsPatch, extraUsed }) {
    const text = detokenize(outIds);
    const effectiveParams = { ...params(), ...(paramsPatch || {}) };
    const engineK = await engineReady;
    return sealReceipt({
      promptText,
      ctxIds,
      turnIds,
      outIds,
      text,
      params: effectiveParams,
      fromMemo,
      modelKappa,
      engineKappa: engineK,
      evaluateText,
      extraUsed,
    });
  }

  // Re-derivation (greedy only): re-run the exact inference and reproduce κ(output) byte-for-byte.
  // `rec` is a sealed receipt; this reads rec.params.decode / .repetitionPenalty, rec.ctxIds,
  // rec.turnIds, rec.outIds (only its length) and rec.body["prov:generated"]["holo:outputTokens"].
  // Resolves to { ok, got, want }, or { ok: false, reason } when no model is loaded or the
  // receipt was produced by sampled decode. A failure inside the replay itself is NOT turned
  // into a result — it rejects, after the engine has been reset.
  async function reDerive(rec) {
    if (!gpu) return { ok: false, reason: "load the model to re-derive" };
    if (/sampled/.test(rec.params.decode)) return { ok: false, reason: "sampled decode — only the κ-binding is verifiable, not re-derivation" };
    try {
      const prompt = rec.ctxIds.concat(rec.turnIds);
      gpu.reset();   // replay from a clean engine state
      // same head as the live path — replay must match byte-for-byte. Invoked with
      // .call(gpu, …) so an engine whose methods rely on `this` keeps working.
      const step = gpu.decode || gpu.generate;
      const seq = await step.call(gpu, prompt, rec.outIds.length, rec.params.repetitionPenalty);
      const got = await kappaTokens(seq.slice(prompt.length));
      const want = rec.body["prov:generated"]["holo:outputTokens"];
      return { ok: got === want, got, want };
    } finally {
      // always leave the engine reset, even when the replay threw; a failing reset is ignored
      try { gpu.reset(); } catch {}
    }
  }

  return {
    model: m, dims: gpu.dims, modelKappa, bosId: info?.bos ?? null, get gpuBytes() { return gpu.gpuBytes; },
    tokenize, detokenize, fingerprint, frameTurn, params,
    generate,
    // register/clear the learned speculative drafter (fn(seq,max)=>ids). Off by default; safe fallback.
    setDrafter: (fn) => { _drafter = fn || null; try { gpu.setDrafter && gpu.setDrafter(_drafter); } catch (e) {} },
    specAvailable: !!(gpu.specDecode && gpu.setDrafter),
    memoKey, memoGet: (k) => memo.get(k), memoHas: (k) => memo.has(k), memoSet: (k, v) => memo.set(k, v),
    buildReceipt, verify: verifyIntegrity, reDerive,
    stats: () => gpu.timing, reset: () => { try { gpu.reset(); } catch {} }, destroy: () => { try { gpu.destroy(); } catch {} },
    _gpu: gpu,   // raw engine handle (diagnostics: per-pass GPU trace via the step() path + window.__profile)
  };
}