Spaces:

forkjoin-ai
/

the-void

Running

Taylor committed 18 days ago

Commit

7336fde

1 Parent(s): c92238b

perf: add WASM SIMD kernels + use Q4_K_M for faster inference

Major changes:
- Bundle simd-kernels-standalone.wasm (14KB) from Aether
- WASM SIMD matVec, rmsNorm, softmax, fusedSiluMul, flashAttention
- Switch from Q8_0 (360MB) to Q4_K_M (210MB, half the work)
- Reduce max_tokens to 50 for snappier demo
- Proper Q4_K dequantization with getScaleMinK4
- Falls back to JS if WASM SIMD unavailable

Files changed (4) hide show

Dockerfile +1 -1
aether-server.mjs +413 -429
app.py +2 -2
simd-kernels.wasm +3 -0

Dockerfile CHANGED Viewed

@@ -13,7 +13,7 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
 # App files
-COPY app.py aether-server.mjs ./
 # Create cache dir
 RUN mkdir -p /tmp/hf_cache

 RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
 # App files
+COPY app.py aether-server.mjs simd-kernels.wasm ./
 # Create cache dir
 RUN mkdir -p /tmp/hf_cache

aether-server.mjs CHANGED Viewed

@@ -1,15 +1,14 @@
 /**
  * Aether Inference Server
  *
- * Standalone Node.js server running SmolLM2-360M inference
- * using Aether's WASM-SIMD kernels. Zero external ML dependencies.
  *
- * The entire inference pipeline is pure TypeScript + WASM:
- *   GGUF parse → Q4_K dequant → WASM-SIMD matVec → RoPE → SwiGLU → sampling
  */
 import { createServer } from 'http';
-import { readFileSync, existsSync, writeFileSync } from 'fs';
 import { execSync } from 'child_process';
 import { fileURLToPath } from 'url';
 import { dirname, join } from 'path';
@@ -17,7 +16,7 @@ import { dirname, join } from 'path';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const PORT = parseInt(process.env.AETHER_PORT || '7861');
-// ─── Model Config (SmolLM2-360M-Instruct, LLaMA family) ────────────────────
 const CONFIG = {
   hiddenDim: 960,
   numLayers: 32,
@@ -33,278 +32,322 @@ const CONFIG = {
   bosToken: 1,
 };
-// ─── Q8_0 Dequantization ────────────────────────────────────────────────────
-// Q8_0: 34 bytes per block of 32 elements (fp16 scale + 32 int8 quants)
-const Q8_0_BLOCK_SIZE = 32;
-const Q8_0_BLOCK_BYTES = 34;
-function fp16ToF32(lo, hi) {
-  const h = lo | (hi << 8);
-  const s = (h >> 15) & 1;
-  const e = (h >> 10) & 0x1f;
-  const f = h & 0x3ff;
-  if (e === 0) return f === 0 ? (s ? -0 : 0) : (s ? -1 : 1) * (f / 1024) * Math.pow(2, -14);
-  if (e === 31) return 0; // clamp NaN/Inf
-  return (s ? -1 : 1) * Math.pow(2, e - 15) * (1 + f / 1024);
 }
-function dequantQ8_0(data, numElements) {
-  const out = new Float32Array(numElements);
-  const numBlocks = Math.ceil(numElements / Q8_0_BLOCK_SIZE);
-  for (let b = 0; b < numBlocks; b++) {
-    const blockOff = b * Q8_0_BLOCK_BYTES;
-    const scale = fp16ToF32(data[blockOff], data[blockOff + 1]);
-    const elemsInBlock = Math.min(Q8_0_BLOCK_SIZE, numElements - b * Q8_0_BLOCK_SIZE);
-    for (let i = 0; i < elemsInBlock; i++) {
-      const qval = data[blockOff + 2 + i]; // uint8, interpret as int8
-      const signed = qval > 127 ? qval - 256 : qval;
-      out[b * Q8_0_BLOCK_SIZE + i] = signed * scale;
-    }
   }
   return out;
 }
 // ─── Q4_K Dequantization ────────────────────────────────────────────────────
 const QK_K = 256;
 const Q4K_BLOCK_BYTES = 144;
 function dequantQ4K(data, numElements) {
   const out = new Float32Array(numElements);
-  const numBlocks = Math.ceil(numElements / QK_K);
   for (let b = 0; b < numBlocks; b++) {
-    const off = b * Q4K_BLOCK_BYTES;
-    const d = fp16ToF32(data[off], data[off + 1]);
-    const dmin = fp16ToF32(data[off + 2], data[off + 3]);
-    const scalesBytes = data.subarray(off + 4, off + 16);
-    const qBytes = data.subarray(off + 16, off + 16 + 128);
-    // Decode 6-bit scales and mins from 12 bytes
-    const scales = new Float32Array(8);
-    const mins = new Float32Array(8);
-    for (let j = 0; j < 4; j++) {
-      scales[j] = (scalesBytes[j] & 0x3f) * d;
-      scales[j + 4] = ((scalesBytes[j + 4] & 0x0f) | ((scalesBytes[j] >> 6) << 4)) * d;
-      mins[j] = (scalesBytes[j + 4] >> 4 | ((scalesBytes[j + 8] & 0x3f) << 4) ? 0 : 1) * dmin;
-    }
-    // Simplified: just use scale * d for each sub-block
-    for (let j = 0; j < 8; j++) {
-      const sc = (scalesBytes[j < 4 ? j : j] & 0x3f) * d;
-      const mn = (scalesBytes[j < 4 ? j + 4 : j] & 0x3f) * dmin;
-      for (let k = 0; k < 32; k++) {
-        const idx = j * 32 + k;
-        if (idx >= QK_K) break;
-        const byteIdx = Math.floor(idx / 2);
-        const nibble = idx % 2 === 0 ? (qBytes[byteIdx] & 0x0f) : (qBytes[byteIdx] >> 4);
-        out[b * QK_K + idx] = nibble * sc - mn;
       }
     }
   }
   return out;
 }
-// Detect quant type by byte count
-function dequantAuto(data, numElements) {
-  const expectedQ8 = Math.ceil(numElements / Q8_0_BLOCK_SIZE) * Q8_0_BLOCK_BYTES;
-  const expectedQ4K = Math.ceil(numElements / QK_K) * Q4K_BLOCK_BYTES;
-  const expectedF32 = numElements * 4;
-  if (Math.abs(data.length - expectedF32) < expectedF32 * 0.05) {
-    return new Float32Array(data.buffer, data.byteOffset, numElements);
-  }
-  if (Math.abs(data.length - expectedQ8) < expectedQ8 * 0.05) {
-    return dequantQ8_0(data, numElements);
-  }
-  if (Math.abs(data.length - expectedQ4K) < expectedQ4K * 0.05) {
-    return dequantQ4K(data, numElements);
   }
-  // Fallback: try Q8_0
-  console.warn(`[Aether] Unknown quant for ${numElements} elems, ${data.length} bytes. Trying Q8_0.`);
-  return dequantQ8_0(data, numElements);
-}
-// ─── GGUF Parser ────────────────────────────────────────────────────────────
-const GGUF_MAGIC = 0x46554747;
-const VT = { UINT8: 0, INT8: 1, UINT16: 2, INT16: 3, UINT32: 4, INT32: 5, FLOAT32: 6, BOOL: 7, STRING: 8, ARRAY: 9, UINT64: 10, INT64: 11, FLOAT64: 12 };
-const GGML_BLOCK_SIZE = { 2:32,3:32,6:32,7:32,8:32,9:32,10:256,11:256,12:256,13:256,14:256,15:256 };
-const GGML_BLOCK_BYTES = { 2:18,3:20,6:22,7:24,8:34,9:36,10:84,11:110,12:144,13:176,14:210,15:292 };
-const GGML_TYPE_SIZE = { 0:4,1:2,16:1,17:2,18:4,19:8,20:8 };
-function calcTensorSize(dims, type) {
-  let n = 1n;
-  for (const d of dims) n *= d;
-  const bs = GGML_BLOCK_SIZE[type];
-  if (bs && GGML_BLOCK_BYTES[type]) return Math.ceil(Number(n) / bs) * GGML_BLOCK_BYTES[type];
-  return Math.ceil(Number(n) * (GGML_TYPE_SIZE[type] ?? 4));
 }
-function readStr(buf, off) {
-  const len = Number(buf.readBigUInt64LE(off));
-  return { v: buf.subarray(off+8, off+8+len).toString('utf8'), o: off+8+len };
 }
-function readVal(buf, off, t) {
-  switch(t) {
-    case VT.UINT8: return { v: buf.readUInt8(off), o: off+1 };
-    case VT.INT8: return { v: buf.readInt8(off), o: off+1 };
-    case VT.UINT16: return { v: buf.readUInt16LE(off), o: off+2 };
-    case VT.INT16: return { v: buf.readInt16LE(off), o: off+2 };
-    case VT.UINT32: return { v: buf.readUInt32LE(off), o: off+4 };
-    case VT.INT32: return { v: buf.readInt32LE(off), o: off+4 };
-    case VT.FLOAT32: return { v: buf.readFloatLE(off), o: off+4 };
-    case VT.BOOL: return { v: buf.readUInt8(off) !== 0, o: off+1 };
-    case VT.STRING: { const r = readStr(buf, off); return { v: r.v, o: r.o }; }
-    case VT.UINT64: return { v: buf.readBigUInt64LE(off), o: off+8 };
-    case VT.INT64: return { v: buf.readBigInt64LE(off), o: off+8 };
-    case VT.FLOAT64: return { v: buf.readDoubleLE(off), o: off+8 };
-    case VT.ARRAY: {
-      const at = buf.readUInt32LE(off);
-      const al = Number(buf.readBigUInt64LE(off+4));
-      let co = off+12;
-      const arr = [];
-      for (let i = 0; i < al; i++) { const r = readVal(buf, co, at); arr.push(r.v); co = r.o; }
-      return { v: arr, o: co };
-    }
-    default: throw new Error(`Unknown GGUF value type: ${t}`);
-  }
 }
-function parseGGUF(buf) {
-  let off = 0;
-  if (buf.readUInt32LE(off) !== GGUF_MAGIC) throw new Error('Not GGUF');
-  off += 4;
-  const version = buf.readUInt32LE(off); off += 4;
-  const tensorCount = Number(buf.readBigUInt64LE(off)); off += 8;
-  const kvCount = Number(buf.readBigUInt64LE(off)); off += 8;
-  let alignment = 32;
-  const metadata = {};
-  for (let i = 0; i < kvCount; i++) {
-    const { v: key, o: o1 } = readStr(buf, off); off = o1;
-    const vt = buf.readUInt32LE(off); off += 4;
-    const { v, o: o2 } = readVal(buf, off, vt); off = o2;
-    metadata[key] = v;
-    if (key === 'general.alignment') alignment = Number(v);
-  }
-  const tensors = [];
-  for (let i = 0; i < tensorCount; i++) {
-    const { v: name, o: o1 } = readStr(buf, off); off = o1;
-    const nDims = buf.readUInt32LE(off); off += 4;
-    const dims = [];
-    for (let d = 0; d < nDims; d++) { dims.push(buf.readBigUInt64LE(off)); off += 8; }
-    const type = buf.readUInt32LE(off); off += 4;
-    const offset = buf.readBigUInt64LE(off); off += 8;
-    const numElements = Number(dims.reduce((a, b) => a * b, 1n));
-    tensors.push({ name, nDims, dims, type, offset, size: calcTensorSize(dims, type), numElements });
-  }
-  const dataOffset = Math.ceil(off / alignment) * alignment;
-  return { version, tensors, dataOffset, metadata };
 }
 // ─── BPE Tokenizer ──────────────────────────────────────────────────────────
 class BPETokenizer {
-  constructor(tokenizerJson) {
-    const model = tokenizerJson.model || {};
-    this.vocab = model.vocab || {};
-    this.reverseVocab = {};
-    for (const [token, id] of Object.entries(this.vocab)) {
-      this.reverseVocab[id] = token;
-    }
-    this.merges = (model.merges || []).map((m, i) => {
-      const [a, b] = m.split(' ');
-      return { a, b, rank: i };
-    });
     this.mergeRanks = {};
-    for (const m of this.merges) {
-      this.mergeRanks[`${m.a} ${m.b}`] = m.rank;
-    }
-    // Added tokens (special tokens)
-    this.addedTokens = {};
-    if (tokenizerJson.added_tokens) {
-      for (const t of tokenizerJson.added_tokens) {
-        this.addedTokens[t.content] = t.id;
-      }
-    }
-    this.vocabSize = Object.keys(this.vocab).length + Object.keys(this.addedTokens).length;
   }
   encode(text) {
-    // Handle special tokens first
-    const specialPattern = /<\|[^|]+\|>/g;
-    const parts = [];
-    let lastIdx = 0;
-    let match;
-    while ((match = specialPattern.exec(text)) !== null) {
-      if (match.index > lastIdx) parts.push({ text: text.slice(lastIdx, match.index), special: false });
-      parts.push({ text: match[0], special: true });
-      lastIdx = match.index + match[0].length;
     }
-    if (lastIdx < text.length) parts.push({ text: text.slice(lastIdx), special: false });
     const tokens = [];
-    for (const part of parts) {
-      if (part.special) {
-        const id = this.addedTokens[part.text] ?? this.vocab[part.text];
-        if (id !== undefined) tokens.push(id);
-        continue;
-      }
-      // Pre-tokenize: split into words (byte-level BPE style)
-      const words = part.text.match(/\S+|\s+/g) || [];
-      for (const word of words) {
-        // Convert to byte-level tokens
-        let symbols = [];
-        for (let i = 0; i < word.length; i++) {
-          const ch = word[i];
-          const id = this.vocab[ch];
-          if (id !== undefined) {
-            symbols.push(ch);
-          } else {
-            // Byte fallback
-            const bytes = Buffer.from(ch, 'utf8');
-            for (const b of bytes) {
-              const hex = `<0x${b.toString(16).toUpperCase().padStart(2, '0')}>`;
-              symbols.push(hex);
-            }
-          }
         }
-        // BPE merge loop
-        while (symbols.length > 1) {
-          let bestRank = Infinity;
-          let bestIdx = -1;
-          for (let i = 0; i < symbols.length - 1; i++) {
-            const key = `${symbols[i]} ${symbols[i+1]}`;
-            const rank = this.mergeRanks[key];
-            if (rank !== undefined && rank < bestRank) {
-              bestRank = rank;
-              bestIdx = i;
-            }
           }
-          if (bestIdx === -1) break;
-          const merged = symbols[bestIdx] + symbols[bestIdx + 1];
-          symbols.splice(bestIdx, 2, merged);
-        }
-        // Map to IDs
-        for (const sym of symbols) {
-          const id = this.vocab[sym] ?? this.addedTokens[sym];
-          if (id !== undefined) tokens.push(id);
         }
       }
     }
     return tokens;
   }
   decode(tokens) {
     const pieces = [];
     for (const t of tokens) {
-      const piece = this.reverseVocab[t];
-      if (piece !== undefined) {
-        // Handle byte tokens like <0xFF>
-        if (piece.startsWith('<0x') && piece.endsWith('>')) {
-          const byte = parseInt(piece.slice(3, -1), 16);
-          pieces.push(String.fromCharCode(byte));
-        } else if (!piece.startsWith('<|')) {
-          pieces.push(piece);
-        }
-      }
     }
     return pieces.join('').replace(/Ġ/g, ' ').replace(/Ċ/g, '\n');
   }
@@ -312,56 +355,17 @@ class BPETokenizer {
 // ─── RoPE ───────────────────────────────────────────────────────────────────
 function applyRoPE(x, headDim, position, theta) {
-  const halfDim = headDim / 2;
-  for (let i = 0; i < halfDim; i++) {
     const freq = 1.0 / Math.pow(theta, (2 * i) / headDim);
     const angle = position * freq;
-    const cos = Math.cos(angle);
-    const sin = Math.sin(angle);
-    const x0 = x[i];
-    const x1 = x[i + halfDim];
     x[i] = x0 * cos - x1 * sin;
-    x[i + halfDim] = x0 * sin + x1 * cos;
   }
 }
-// ─── Pure JS SIMD-style ops (fallback; WASM SIMD used when available) ───────
-function matVec(matrix, vector, rows, cols) {
-  const out = new Float32Array(rows);
-  for (let r = 0; r < rows; r++) {
-    let sum = 0;
-    const rowOff = r * cols;
-    for (let c = 0; c < cols; c++) sum += matrix[rowOff + c] * vector[c];
-    out[r] = sum;
-  }
-  return out;
-}
-function rmsNorm(x, weight, eps) {
-  let ss = 0;
-  for (let i = 0; i < x.length; i++) ss += x[i] * x[i];
-  ss = 1.0 / Math.sqrt(ss / x.length + eps);
-  const out = new Float32Array(x.length);
-  for (let i = 0; i < x.length; i++) out[i] = x[i] * ss * weight[i];
-  return out;
-}
-function silu(x) {
-  const out = new Float32Array(x.length);
-  for (let i = 0; i < x.length; i++) out[i] = x[i] / (1 + Math.exp(-x[i]));
-  return out;
-}
-function softmax(x) {
-  let max = -Infinity;
-  for (let i = 0; i < x.length; i++) if (x[i] > max) max = x[i];
-  const out = new Float32Array(x.length);
-  let sum = 0;
-  for (let i = 0; i < x.length; i++) { out[i] = Math.exp(x[i] - max); sum += out[i]; }
-  for (let i = 0; i < x.length; i++) out[i] /= sum;
-  return out;
-}
 // ─── Model ──────────────────────────────────────────────────────────────────
 let model = null;
@@ -372,78 +376,69 @@ function loadModel(ggufPath, tokenizerPath) {
   const parsed = parseGGUF(buf);
   console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now() - t0}ms`);
-  // Load tokenizer
-  console.log('[Aether] Loading tokenizer...');
   const tokJson = JSON.parse(readFileSync(tokenizerPath, 'utf8'));
   const tokenizer = new BPETokenizer(tokJson);
-  // Extract tensors by name
-  const tensorByName = {};
-  for (const t of parsed.tensors) tensorByName[t.name] = t;
-  // Helper to extract and dequantize a tensor
-  function getTensor(name) {
-    const t = tensorByName[name];
-    if (!t) { console.warn(`[Aether] Missing tensor: ${name}`); return null; }
-    const absOffset = parsed.dataOffset + Number(t.offset);
-    const raw = new Uint8Array(buf.buffer, buf.byteOffset + absOffset, t.size);
     return dequantAuto(raw, t.numElements);
   }
   console.log('[Aether] Dequantizing embeddings...');
-  const tokenEmbd = getTensor('token_embd.weight');
   console.log('[Aether] Dequantizing layers...');
   const layers = [];
   for (let i = 0; i < CONFIG.numLayers; i++) {
     if (i % 8 === 0) console.log(`[Aether]   Layer ${i}/${CONFIG.numLayers}...`);
     layers.push({
-      attnNorm: getTensor(`blk.${i}.attn_norm.weight`),
-      ffnNorm: getTensor(`blk.${i}.ffn_norm.weight`),
-      qProj: getTensor(`blk.${i}.attn_q.weight`),
-      kProj: getTensor(`blk.${i}.attn_k.weight`),
-      vProj: getTensor(`blk.${i}.attn_v.weight`),
-      oProj: getTensor(`blk.${i}.attn_output.weight`),
-      gateProj: getTensor(`blk.${i}.ffn_gate.weight`),
-      upProj: getTensor(`blk.${i}.ffn_up.weight`),
-      downProj: getTensor(`blk.${i}.ffn_down.weight`),
     });
   }
-  console.log('[Aether] Dequantizing output head...');
-  const outputNorm = getTensor('output_norm.weight');
-  let outputWeight = getTensor('output.weight');
-  if (!outputWeight) {
-    console.log('[Aether] No output.weight, using tied embeddings');
-    outputWeight = tokenEmbd;
-  }
   const loadTime = Date.now() - t0;
-  console.log(`[Aether] Model loaded in ${loadTime}ms`);
   model = { tokenEmbd, layers, outputNorm, outputWeight, tokenizer, loadTime };
 }
 // ─── Inference ──────────────────────────────────────────────────────────────
-function generate(prompt, maxTokens = 100) {
   if (!model) throw new Error('Model not loaded');
   const t0 = performance.now();
   const { hiddenDim, numHeads, numKvHeads, headDim, intermediateSize, ropeTheta, rmsNormEps } = CONFIG;
   const kvDim = numKvHeads * headDim;
-  const gqaRatio = numHeads / numKvHeads;
-  // Format as chat
   const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
   const inputTokens = model.tokenizer.encode(chatPrompt);
   const allTokens = [...inputTokens];
-  // KV cache: [layer][position] -> { k, v }
-  const kvCache = Array.from({ length: CONFIG.numLayers }, () => ({ keys: [], values: [] }));
   const tokenTimes = [];
-  // Process all input tokens (prefill) then generate
   for (let step = 0; step < inputTokens.length + maxTokens - 1; step++) {
     const tokenStart = performance.now();
     const pos = step;
@@ -451,103 +446,92 @@ function generate(prompt, maxTokens = 100) {
     // Embed
     const hidden = new Float32Array(hiddenDim);
-    const embOffset = tokenId * hiddenDim;
-    for (let i = 0; i < hiddenDim; i++) hidden[i] = model.tokenEmbd[embOffset + i];
     let x = hidden;
-    // Run through layers
     for (let l = 0; l < CONFIG.numLayers; l++) {
-      const layer = model.layers[l];
       // 1. Attention norm
-      const normed = rmsNorm(x, layer.attnNorm, rmsNormEps);
-      // 2. Q, K, V projections
-      const q = matVec(layer.qProj, normed, hiddenDim, hiddenDim);
-      const k = matVec(layer.kProj, normed, kvDim, hiddenDim);
-      const v = matVec(layer.vProj, normed, kvDim, hiddenDim);
       // 3. RoPE
-      for (let h = 0; h < numHeads; h++) {
         applyRoPE(q.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
-      }
-      for (let h = 0; h < numKvHeads; h++) {
         applyRoPE(k.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
-      }
       // 4. Store in KV cache
       kvCache[l].keys.push(new Float32Array(k));
       kvCache[l].values.push(new Float32Array(v));
-      // 5. Attention with full KV cache
-      const attnOut = new Float32Array(hiddenDim);
       const seqLen = kvCache[l].keys.length;
-      for (let h = 0; h < numHeads; h++) {
-        const kvHead = Math.floor(h / gqaRatio);
-        const qHead = q.subarray(h * headDim, (h + 1) * headDim);
-        // Compute attention scores
-        const scores = new Float32Array(seqLen);
         for (let s = 0; s < seqLen; s++) {
-          const kHead = kvCache[l].keys[s].subarray(kvHead * headDim, (kvHead + 1) * headDim);
-          let dot = 0;
-          for (let d = 0; d < headDim; d++) dot += qHead[d] * kHead[d];
-          scores[s] = dot / Math.sqrt(headDim);
         }
-        // Causal mask: already handled (only see past positions)
-        // Softmax
-        const attnWeights = softmax(scores);
-        // Weighted sum of values
-        for (let s = 0; s < seqLen; s++) {
-          const vHead = kvCache[l].values[s].subarray(kvHead * headDim, (kvHead + 1) * headDim);
-          const w = attnWeights[s];
-          for (let d = 0; d < headDim; d++) {
-            attnOut[h * headDim + d] += w * vHead[d];
           }
         }
       }
-      // 6. Output projection
-      const projected = matVec(layer.oProj, attnOut, hiddenDim, hiddenDim);
-      // 7. Residual
-      const postAttn = new Float32Array(hiddenDim);
-      for (let i = 0; i < hiddenDim; i++) postAttn[i] = x[i] + projected[i];
-      // 8. FFN norm
-      const ffnInput = rmsNorm(postAttn, layer.ffnNorm, rmsNormEps);
-      // 9. SwiGLU MLP
-      const gate = matVec(layer.gateProj, ffnInput, intermediateSize, hiddenDim);
-      const up = matVec(layer.upProj, ffnInput, intermediateSize, hiddenDim);
-      const activated = silu(gate);
-      for (let i = 0; i < intermediateSize; i++) activated[i] *= up[i];
-      const down = matVec(layer.downProj, activated, hiddenDim, intermediateSize);
-      // 10. Residual
-      x = new Float32Array(hiddenDim);
-      for (let i = 0; i < hiddenDim; i++) x[i] = postAttn[i] + down[i];
     }
-    // Only sample if past prefill
     if (step >= inputTokens.length - 1) {
-      // Final norm + LM head
-      const finalNormed = rmsNorm(x, model.outputNorm, rmsNormEps);
-      const logits = matVec(model.outputWeight, finalNormed, CONFIG.vocabSize, hiddenDim);
       // Temperature sampling
-      const temperature = 0.7;
-      for (let i = 0; i < logits.length; i++) logits[i] /= temperature;
-      const probs = softmax(logits);
-      // Top-p sampling
       const indexed = Array.from(probs).map((p, i) => ({ p, i })).sort((a, b) => b.p - a.p);
-      let cumP = 0;
-      let chosen = indexed[0].i;
       const r = Math.random();
       for (const { p, i } of indexed) {
         cumP += p;
@@ -555,73 +539,73 @@ function generate(prompt, maxTokens = 100) {
         if (cumP > 0.9) break;
       }
-      const tokenEnd = performance.now();
-      if (step >= inputTokens.length) tokenTimes.push(tokenEnd - tokenStart);
       if (chosen === CONFIG.eosToken) break;
       allTokens.push(chosen);
     }
   }
   const totalTime = performance.now() - t0;
-  const generatedTokens = allTokens.slice(inputTokens.length);
-  const text = model.tokenizer.decode(generatedTokens);
-  const avgTokenTime = tokenTimes.length > 0 ? tokenTimes.reduce((a, b) => a + b, 0) / tokenTimes.length : 0;
   return {
     text,
-    tokens: generatedTokens.length,
     totalTimeMs: Math.round(totalTime),
-    avgTokenMs: Math.round(avgTokenTime),
     prefillTokens: inputTokens.length,
-    engine: 'Aether WASM-SIMD',
   };
 }
 // ─── HTTP Server ────────────────────────────────────────────────────────────
-function startServer() {
-  const server = createServer((req, res) => {
-    if (req.method === 'POST' && req.url === '/generate') {
-      let body = '';
-      req.on('data', c => body += c);
-      req.on('end', () => {
-        try {
-          const { prompt, max_tokens } = JSON.parse(body);
-          const result = generate(prompt, max_tokens || 100);
-          res.writeHead(200, { 'Content-Type': 'application/json' });
-          res.end(JSON.stringify(result));
-        } catch (e) {
-          res.writeHead(500, { 'Content-Type': 'application/json' });
-          res.end(JSON.stringify({ error: e.message, stack: e.stack }));
-        }
-      });
-    } else if (req.url === '/health') {
-      res.writeHead(200, { 'Content-Type': 'application/json' });
-      res.end(JSON.stringify({ status: 'ok', model: model ? 'loaded' : 'not loaded', loadTime: model?.loadTime }));
-    } else {
-      res.writeHead(404);
-      res.end('Not found');
-    }
-  });
-  server.listen(PORT, '127.0.0.1', () => {
-    console.log(`[Aether] Server listening on http://127.0.0.1:${PORT}`);
-  });
-}
 // ─── Main ───────────────────────────────────────────────────────────────────
-const ggufPath = process.env.GGUF_PATH || join('/tmp/hf_cache', 'buleyean-smollm2-360m-q8_0.gguf');
-const tokenizerPath = process.env.TOKENIZER_PATH || join('/tmp/hf_cache', 'tokenizer.json');
-// Download if needed
-if (!existsSync(ggufPath)) {
-  console.log('[Aether] Downloading GGUF model...');
-  execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
-}
-if (!existsSync(tokenizerPath)) {
-  console.log('[Aether] Downloading tokenizer...');
-  execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
 }
-loadModel(ggufPath, tokenizerPath);
-startServer();

 /**
  * Aether Inference Server
  *
+ * SmolLM2-360M inference using WASM SIMD kernels.
+ * Zero external ML dependencies. Pure JS + 14KB WASM binary.
  *
+ *   GGUF parse → WASM SIMD matVec → RoPE → fusedSiluMul → sampling
  */
 import { createServer } from 'http';
+import { readFileSync, existsSync } from 'fs';
 import { execSync } from 'child_process';
 import { fileURLToPath } from 'url';
 import { dirname, join } from 'path';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const PORT = parseInt(process.env.AETHER_PORT || '7861');
+// ─── Model Config (SmolLM2-360M-Instruct) ──────────────────────────────────
 const CONFIG = {
   hiddenDim: 960,
   numLayers: 32,
   bosToken: 1,
 };
+// ─── WASM SIMD Kernel Loader ────────────────────────────────────────────────
+let simd = null;
/**
 * Load the bundled WASM SIMD kernel module and wrap its exports in a
 * typed-array-in / Float32Array-out API.
 *
 * Every wrapper follows the same sequence: remember the module's heap
 * pointer, allocate scratch regions for the inputs and the output, copy the
 * inputs in, run the kernel, copy the result out, and restore the heap
 * pointer. The restore happens in a `finally`, so a failing copy or a
 * throwing kernel call cannot leave the scratch regions reserved forever.
 *
 * @returns {Promise<object|null>} An object with `matVec`, `rmsNorm`,
 *   `softmax`, `fusedSiluMul`, `add` and `flashAttentionMultiHead`, or `null`
 *   when the binary is missing or cannot be instantiated.
 */
async function loadSIMD() {
  const wasmPath = join(__dirname, 'simd-kernels.wasm');
  if (!existsSync(wasmPath)) {
    console.log('[Aether] WASM SIMD binary not found, using JS fallbacks');
    return null;
  }
  try {
    const wasmBytes = readFileSync(wasmPath);
    const { instance } = await WebAssembly.instantiate(wasmBytes, {
      env: { expf: Math.exp, tanhf: Math.tanh, powf: Math.pow },
    });
    const wasm = instance.exports;
    wasm.resetHeap(65536);
    console.log('[Aether] WASM SIMD kernels loaded (14KB binary)');
    const { memory } = wasm;

    // A fresh view is built on every access so that it never refers to a
    // stale buffer should the module's memory be grown between calls.
    const heapF32 = () => new Float32Array(memory.buffer);
    // `>>> 2` (unsigned) keeps the float index non-negative even if the
    // module hands back a pointer with the top bit set.
    const copyTo = (ptr, f32) => heapF32().set(f32, ptr >>> 2);
    const copyFrom = (ptr, len) => heapF32().slice(ptr >>> 2, (ptr >>> 2) + len);

    /**
     * Run one kernel with temporary heap space.
     * @param {ArrayLike<number>[]} inputs - arrays copied into the heap, in order.
     * @param {number} outLen - number of floats the kernel writes.
     * @param {Function} run - receives the input pointers followed by the output pointer.
     * @returns {Float32Array} a copy of the kernel output.
     */
    function callKernel(inputs, outLen, run) {
      const saved = wasm.getHeapPtr();
      try {
        const inPtrs = inputs.map((arr) => wasm.allocate(arr.byteLength));
        const outPtr = wasm.allocate(outLen * 4);
        inputs.forEach((arr, i) => copyTo(inPtrs[i], arr));
        run(...inPtrs, outPtr);
        return copyFrom(outPtr, outLen);
      } finally {
        // Always release the scratch allocations, even on failure.
        wasm.resetHeap(saved);
      }
    }

    return {
      matVec(matrix, vector, rows, cols) {
        return callKernel([matrix, vector], rows, (mPtr, vPtr, rPtr) =>
          wasm.matVecSimdBatch4(mPtr, vPtr, rPtr, rows, cols));
      },
      rmsNorm(x, weight, eps) {
        return callKernel([x, weight], x.length, (xPtr, wPtr, rPtr) =>
          wasm.rmsNormSimd(xPtr, wPtr, rPtr, x.length, eps));
      },
      softmax(x) {
        return callKernel([x], x.length, (xPtr, rPtr) =>
          wasm.softmaxSimd(xPtr, rPtr, x.length));
      },
      fusedSiluMul(gate, up) {
        return callKernel([gate, up], gate.length, (gPtr, uPtr, rPtr) =>
          wasm.fusedSiluMul(gPtr, uPtr, rPtr, gate.length));
      },
      add(a, b) {
        return callKernel([a, b], a.length, (aPtr, bPtr, rPtr) =>
          wasm.addSimd(aPtr, bPtr, rPtr, a.length));
      },
      flashAttentionMultiHead(query, keys, values, seqLen, numHeads, numKvHeads, headDim) {
        const scale = 1.0 / Math.sqrt(headDim);
        return callKernel([query, keys, values], numHeads * headDim, (qPtr, kPtr, vPtr, rPtr) =>
          wasm.flashAttentionMultiHead(qPtr, kPtr, vPtr, rPtr, seqLen, numHeads, numKvHeads, headDim, scale));
      },
    };
  } catch (e) {
    console.warn(`[Aether] WASM SIMD failed: ${e.message}, using JS fallbacks`);
    return null;
  }
}
+// ─── JS Fallbacks (used if WASM unavailable) ────────────────────────────────
/**
 * Matrix–vector product, plain-JS fallback.
 * Row `r` of `matrix` is read from indices `r * cols .. r * cols + cols - 1`.
 *
 * @param {ArrayLike<number>} matrix - `rows * cols` values.
 * @param {ArrayLike<number>} vector - `cols` values.
 * @param {number} rows
 * @param {number} cols
 * @returns {Float32Array} `rows` dot products.
 */
function matVecJS(matrix, vector, rows, cols) {
  const result = new Float32Array(rows);
  for (let row = 0; row < rows; row++) {
    const base = row * cols;
    let acc = 0;
    for (let col = 0; col < cols; col++) {
      acc += matrix[base + col] * vector[col];
    }
    result[row] = acc;
  }
  return result;
}
/**
 * RMS normalisation, plain-JS fallback.
 * Each element is scaled by `1 / sqrt(mean(x^2) + eps)` and by its weight.
 *
 * @param {ArrayLike<number>} x
 * @param {ArrayLike<number>} weight - per-element gain, same length as `x`.
 * @param {number} eps - added to the mean square before the square root.
 * @returns {Float32Array} normalised copy of `x`.
 */
function rmsNormJS(x, weight, eps) {
  const count = x.length;
  let sumSquares = 0;
  for (let i = 0; i < count; i++) {
    sumSquares += x[i] * x[i];
  }
  const invRms = 1.0 / Math.sqrt(sumSquares / count + eps);
  const normed = new Float32Array(count);
  for (let i = 0; i < count; i++) {
    normed[i] = x[i] * invRms * weight[i];
  }
  return normed;
}
/**
 * Softmax, plain-JS fallback.
 * The maximum is subtracted before exponentiation so large inputs do not
 * overflow `Math.exp`.
 *
 * @param {ArrayLike<number>} x
 * @returns {Float32Array} values that sum to 1 (up to rounding).
 */
function softmaxJS(x) {
  const count = x.length;

  let peak = -Infinity;
  for (let i = 0; i < count; i++) {
    const value = x[i];
    if (value > peak) peak = value;
  }

  const probs = new Float32Array(count);
  let total = 0;
  for (let i = 0; i < count; i++) {
    probs[i] = Math.exp(x[i] - peak);
    // Accumulate the stored (float32-rounded) value, not the double.
    total += probs[i];
  }

  for (let i = 0; i < count; i++) {
    probs[i] /= total;
  }
  return probs;
}
/**
 * Fused SiLU-and-multiply, plain-JS fallback:
 * `out[i] = silu(gate[i]) * up[i]` with `silu(g) = g / (1 + exp(-g))`.
 *
 * @param {ArrayLike<number>} gate
 * @param {ArrayLike<number>} up - same length as `gate`.
 * @returns {Float32Array} element-wise result, length `gate.length`.
 */
function fusedSiluMulJS(gate, up) {
  const count = gate.length;
  const result = new Float32Array(count);
  let i = 0;
  while (i < count) {
    const g = gate[i];
    const activated = g / (1 + Math.exp(-g));
    result[i] = activated * up[i];
    i += 1;
  }
  return result;
}
/**
 * Element-wise sum, plain-JS fallback.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b - read at the same indices as `a`.
 * @returns {Float32Array} `a[i] + b[i]` for every index of `a`.
 */
function addJS(a, b) {
  const count = a.length;
  const sum = new Float32Array(count);
  let i = 0;
  while (i < count) {
    sum[i] = a[i] + b[i];
    i += 1;
  }
  return sum;
}
+// Ops wrapper -- uses WASM SIMD when available, JS fallback otherwise
/**
 * Build the table of numeric kernels used by inference.
 * For each operation the WASM SIMD implementation on `simd` is chosen when
 * present; otherwise the plain-JS fallback is used. `flashAttentionMultiHead`
 * has no JS fallback and is `null` when the WASM version is unavailable.
 *
 * @returns {object} `{ matVec, rmsNorm, softmax, fusedSiluMul, add, flashAttentionMultiHead }`
 */
function ops() {
  const matVec = simd?.matVec || matVecJS;
  const rmsNorm = simd?.rmsNorm || rmsNormJS;
  const softmax = simd?.softmax || softmaxJS;
  const fusedSiluMul = simd?.fusedSiluMul || fusedSiluMulJS;
  const add = simd?.add || addJS;
  const flashAttentionMultiHead = simd?.flashAttentionMultiHead || null;
  return { matVec, rmsNorm, softmax, fusedSiluMul, add, flashAttentionMultiHead };
}
 // ─── Q4_K Dequantization ────────────────────────────────────────────────────
 const QK_K = 256;
 const Q4K_BLOCK_BYTES = 144;
/**
 * Decode a little-endian IEEE 754 half-precision value from two bytes.
 * Infinity and NaN encodings (exponent 31) are clamped to 0, and both signed
 * zeros decode to +0.
 *
 * @param {number} lo - low byte.
 * @param {number} hi - high byte.
 * @returns {number} decoded value.
 */
function fp16(lo, hi) {
  const bits = lo | (hi << 8);
  const signBit = (bits >> 15) & 1;
  const exponent = (bits >> 10) & 0x1f;
  const mantissa = bits & 0x3ff;

  if (exponent === 0) {
    // Zero or subnormal: no implicit leading 1, fixed exponent of -14.
    if (mantissa === 0) return 0;
    return (signBit ? -1 : 1) * (mantissa / 1024) * Math.pow(2, -14);
  }
  if (exponent === 31) {
    return 0;
  }
  return (signBit ? -1 : 1) * Math.pow(2, exponent - 15) * (1 + mantissa / 1024);
}
/**
 * Unpack the 6-bit scale and 6-bit min for sub-block `gi` from the packed
 * scale bytes of a Q4_K block.
 * For `gi < 4` both values are the low 6 bits of one byte each; for higher
 * indices each value is a nibble of `scales[gi + 4]` combined with the top
 * two bits of an earlier byte.
 *
 * @param {number} gi - sub-block index.
 * @param {ArrayLike<number>} scales - packed scale/min bytes.
 * @returns {number[]} `[scale, min]`.
 */
function getScaleMinK4(gi, scales) {
  if (gi < 4) {
    const scale = scales[gi] & 63;
    const min = scales[gi + 4] & 63;
    return [scale, min];
  }
  const packed = scales[gi + 4];
  const scaleTop = scales[gi - 4] >> 6;
  const minTop = scales[gi] >> 6;
  const scale = (packed & 0xf) | (scaleTop << 4);
  const min = (packed >> 4) | (minTop << 4);
  return [scale, min];
}
 /**
  * Dequantizes Q4_K blocks into float32.
  *
  * Per 144-byte block: bytes 0-1 are the fp16 scale `d`, bytes 2-3 the fp16
  * min scale `dmin`, bytes 4-15 the packed sub-block scales/mins, and the
  * remaining 128 bytes hold 256 4-bit quants. Each group of 32 quant bytes
  * yields 64 outputs: the low nibbles fill the first 32, the high nibbles
  * the next 32.
  *
  * Only whole blocks are decoded. A trailing partial block in `data`, or
  * output space that cannot hold a full 256-element block, is left as zeros.
  *
  * @param {Uint8Array} data - Raw Q4_K bytes.
  * @param {number} numElements - Length of the returned array.
  * @returns {Float32Array} Dequantized values.
  */
 function dequantQ4K(data, numElements) {
   const result = new Float32Array(numElements);
   const blockCount = Math.floor(data.length / Q4K_BLOCK_BYTES);
   const pairsPerBlock = QK_K / 64;

   for (let block = 0; block < blockCount; block++) {
     const dst = block * QK_K;
     if (dst + QK_K > numElements) break;

     const src = block * Q4K_BLOCK_BYTES;
     const d = fp16(data[src], data[src + 1]);
     const dmin = fp16(data[src + 2], data[src + 3]);
     const scales = data.subarray(src + 4, src + 16);
     const quants = data.subarray(src + 16, src + Q4K_BLOCK_BYTES);

     for (let pair = 0; pair < pairsPerBlock; pair++) {
       const [scaleLo, minLo] = getScaleMinK4(2 * pair, scales);
       const [scaleHi, minHi] = getScaleMinK4(2 * pair + 1, scales);
       const mulLo = d * scaleLo;
       const mulHi = d * scaleHi;
       const subLo = dmin * minLo;
       const subHi = dmin * minHi;

       const outBase = dst + pair * 64;
       const quantBase = pair * 32;
       for (let lane = 0; lane < 32; lane++) {
         const packed = quants[quantBase + lane];
         result[outBase + lane] = mulLo * (packed & 0x0f) - subLo;
         result[outBase + 32 + lane] = mulHi * (packed >> 4) - subHi;
       }
     }
   }
   return result;
 }
// Q8_0 layout: each block is a 2-byte fp16 scale followed by 32 signed 8-bit quants.
const Q8_BLOCK = 32;
const Q8_BYTES = 34;

/**
 * Dequantizes Q8_0 data into float32 (`value = int8 * blockScale`).
 *
 * @param {ArrayLike<number>} data - Raw Q8_0 bytes (unsigned byte values).
 * @param {number} numElements - Number of values to produce; the last block
 *   may be partially used.
 * @returns {Float32Array} Dequantized values.
 */
function dequantQ8(data, numElements) {
  const result = new Float32Array(numElements);
  const blockCount = Math.ceil(numElements / Q8_BLOCK);

  for (let block = 0; block < blockCount; block++) {
    const src = block * Q8_BYTES;
    const dst = block * Q8_BLOCK;
    const scale = fp16(data[src], data[src + 1]);
    const used = Math.min(Q8_BLOCK, numElements - dst);

    for (let i = 0; i < used; i++) {
      const raw = data[src + 2 + i];
      // Reinterpret the unsigned byte as two's-complement int8.
      const signed = raw > 127 ? raw - 256 : raw;
      result[dst + i] = signed * scale;
    }
  }
  return result;
}
/**
 * Guesses a tensor's storage format from its byte length and dequantizes it.
 *
 * The byte length is compared (within 5%) against the size `numElements`
 * would occupy as raw float32, Q4_K and Q8_0, in that order.
 *
 * Float32 data is returned as a zero-copy view when `data` is 4-byte aligned
 * and long enough. Otherwise the bytes are copied into a fresh, zero-padded
 * buffer, because the `Float32Array(buffer, byteOffset, length)` constructor
 * throws a RangeError for misaligned offsets or short buffers.
 *
 * NOTE(review): a length that matches none of the three layouts is still
 * decoded as Q8_0, which cannot be correct for other quantization types.
 * The tensor's declared GGUF type is the reliable signal; consider passing
 * it in instead of inferring from size.
 *
 * @param {Uint8Array} data - Raw tensor bytes.
 * @param {number} numElements - Logical element count of the tensor.
 * @returns {Float32Array} Float32 values (may alias `data.buffer` for F32 input).
 */
function dequantAuto(data, numElements) {
  const TOLERANCE = 0.05;
  const closeTo = (expectedBytes) =>
    Math.abs(data.length - expectedBytes) < expectedBytes * TOLERANCE;

  const f32Bytes = numElements * Float32Array.BYTES_PER_ELEMENT;
  if (closeTo(f32Bytes)) {
    const aligned = data.byteOffset % Float32Array.BYTES_PER_ELEMENT === 0;
    if (aligned && data.length >= f32Bytes) {
      return new Float32Array(data.buffer, data.byteOffset, numElements);
    }
    // Misaligned or slightly short: copy into an aligned, zero-padded buffer.
    const copy = new Uint8Array(f32Bytes);
    copy.set(data.subarray(0, Math.min(data.length, f32Bytes)));
    return new Float32Array(copy.buffer);
  }

  const q4kBytes = Math.ceil(numElements / QK_K) * Q4K_BLOCK_BYTES;
  if (closeTo(q4kBytes)) return dequantQ4K(data, numElements);

  const q8Bytes = Math.ceil(numElements / Q8_BLOCK) * Q8_BYTES;
  if (!closeTo(q8Bytes)) {
    console.warn(
      `[Aether] dequantAuto: ${data.length} bytes for ${numElements} elements matches no known layout; decoding as Q8_0`
    );
  }
  return dequantQ8(data, numElements);
}
+// ─── GGUF Parser ────────────────────────────────────────────────────────────
/** File magic: the ASCII bytes "GGUF" read as a little-endian uint32. */
const GGUF_MAGIC = 0x46554747;

/** Metadata value-type tags used by `rVal`. */
const VT = {
  UINT8: 0,
  INT8: 1,
  UINT16: 2,
  INT16: 3,
  UINT32: 4,
  INT32: 5,
  FLOAT32: 6,
  BOOL: 7,
  STRING: 8,
  ARRAY: 9,
  UINT64: 10,
  INT64: 11,
  FLOAT64: 12,
};

/** Elements per block, keyed by tensor type id (block-quantized types only). */
const BLK_SZ = {
  2: 32, 3: 32, 6: 32, 7: 32, 8: 32, 9: 32,
  10: 256, 11: 256, 12: 256, 13: 256, 14: 256, 15: 256,
};

/** Bytes per block, keyed by the same tensor type ids as `BLK_SZ`. */
const BLK_BY = {
  2: 18, 3: 20, 6: 22, 7: 24, 8: 34, 9: 36,
  10: 84, 11: 110, 12: 144, 13: 176, 14: 210, 15: 292,
};

/** Bytes per element for non-block tensor types. */
const TY_SZ = { 0: 4, 1: 2, 16: 1, 17: 2, 18: 4, 19: 8, 20: 8 };

/**
 * Computes the byte size of a tensor from its dimensions and type id.
 *
 * Block-quantized types use `ceil(elements / blockSize) * blockBytes`.
 * Everything else uses a per-element width, defaulting to 4 bytes when the
 * type id is unknown.
 *
 * @param {bigint[]} dims - Tensor dimensions.
 * @param {number} type - Tensor type id.
 * @returns {number} Size in bytes.
 */
function calcSz(dims, type) {
  const elementCount = Number(dims.reduce((acc, dim) => acc * dim, 1n));
  const blockSize = BLK_SZ[type];
  const blockBytes = BLK_BY[type];
  if (blockSize && blockBytes) {
    return Math.ceil(elementCount / blockSize) * blockBytes;
  }
  const bytesPerElement = TY_SZ[type] ?? 4;
  return Math.ceil(elementCount * bytesPerElement);
}
/**
 * Reads a length-prefixed UTF-8 string: a little-endian uint64 byte count
 * followed by that many bytes.
 *
 * @param {Buffer} buf - Source buffer.
 * @param {number} off - Offset of the length prefix.
 * @returns {{v: string, o: number}} The string and the offset just past it.
 */
function rStr(buf, off) {
  const byteLength = Number(buf.readBigUInt64LE(off));
  const start = off + 8;
  const end = start + byteLength;
  const text = buf.subarray(start, end).toString('utf8');
  return { v: text, o: end };
}
/**
 * Reads one metadata value of type `t` at offset `off`.
 *
 * Scalars are read little-endian; 64-bit integers come back as BigInt.
 * Arrays are encoded as a uint32 element type, a uint64 count, then the
 * elements back to back, and are read recursively.
 *
 * @param {Buffer} buf - Source buffer.
 * @param {number} off - Offset of the value.
 * @param {number} t - Value-type tag (one of `VT`).
 * @returns {{v: *, o: number}} Decoded value and the offset just past it.
 * @throws {Error} If `t` is not a known tag.
 */
function rVal(buf, off, t) {
  // Helper for fixed-width scalars: pair the value with the advanced offset.
  const fixed = (value, width) => ({ v: value, o: off + width });

  switch (t) {
    case VT.UINT8:
      return fixed(buf.readUInt8(off), 1);
    case VT.INT8:
      return fixed(buf.readInt8(off), 1);
    case VT.UINT16:
      return fixed(buf.readUInt16LE(off), 2);
    case VT.INT16:
      return fixed(buf.readInt16LE(off), 2);
    case VT.UINT32:
      return fixed(buf.readUInt32LE(off), 4);
    case VT.INT32:
      return fixed(buf.readInt32LE(off), 4);
    case VT.FLOAT32:
      return fixed(buf.readFloatLE(off), 4);
    case VT.BOOL:
      return fixed(buf.readUInt8(off) !== 0, 1);
    case VT.STRING: {
      const str = rStr(buf, off);
      return { v: str.v, o: str.o };
    }
    case VT.UINT64:
      return fixed(buf.readBigUInt64LE(off), 8);
    case VT.INT64:
      return fixed(buf.readBigInt64LE(off), 8);
    case VT.FLOAT64:
      return fixed(buf.readDoubleLE(off), 8);
    case VT.ARRAY: {
      const elementType = buf.readUInt32LE(off);
      const count = Number(buf.readBigUInt64LE(off + 4));
      const items = [];
      let cursor = off + 12;
      for (let i = 0; i < count; i++) {
        const item = rVal(buf, cursor, elementType);
        items.push(item.v);
        cursor = item.o;
      }
      return { v: items, o: cursor };
    }
    default:
      throw new Error(`Unknown GGUF type: ${t}`);
  }
}
/**
 * Parses the header of a GGUF file: magic, key/value metadata and the
 * tensor directory.
 *
 * Layout read here: uint32 magic, uint32 version (skipped), uint64 tensor
 * count, uint64 key/value count, the key/value entries, then one record per
 * tensor (name, uint32 rank, rank x uint64 dims, uint32 type, uint64 offset).
 * Only the `general.alignment` metadata key is retained (default 32); it
 * determines where the tensor data region starts.
 *
 * @param {Buffer} buf - Entire file contents.
 * @returns {{tensors: Array<{name: string, dims: bigint[], type: number,
 *   offset: bigint, size: number, numElements: number}>, dataOffset: number}}
 *   Tensor directory plus the offset of the tensor data region, i.e. the end
 *   of the header rounded up to the alignment.
 * @throws {Error} 'Not GGUF' when the magic does not match.
 */
function parseGGUF(buf) {
  let cursor = 0;
  const readU32 = () => {
    const value = buf.readUInt32LE(cursor);
    cursor += 4;
    return value;
  };
  const readU64 = () => {
    const value = buf.readBigUInt64LE(cursor);
    cursor += 8;
    return value;
  };
  const readString = () => {
    const { v, o } = rStr(buf, cursor);
    cursor = o;
    return v;
  };

  if (readU32() !== GGUF_MAGIC) throw new Error('Not GGUF');
  cursor += 4; // format version, unused
  const tensorCount = Number(readU64());
  const kvCount = Number(readU64());

  let alignment = 32;
  for (let i = 0; i < kvCount; i++) {
    const key = readString();
    const valueType = readU32();
    const { v: value, o: next } = rVal(buf, cursor, valueType);
    cursor = next;
    if (key === 'general.alignment') alignment = Number(value);
  }

  const tensors = [];
  for (let i = 0; i < tensorCount; i++) {
    const name = readString();
    const rank = readU32();
    const dims = [];
    for (let d = 0; d < rank; d++) dims.push(readU64());
    const type = readU32();
    const offset = readU64();
    tensors.push({
      name,
      dims,
      type,
      offset,
      size: calcSz(dims, type),
      numElements: Number(dims.reduce((acc, dim) => acc * dim, 1n)),
    });
  }

  return { tensors, dataOffset: Math.ceil(cursor / alignment) * alignment };
}
 // ─── BPE Tokenizer ──────────────────────────────────────────────────────────
 /**
  * Minimal BPE tokenizer driven by a parsed Hugging Face `tokenizer.json`.
  *
  * Reads `model.vocab` (token -> id), `model.merges` (ranked merge rules) and
  * `added_tokens` (special tokens such as `<|im_start|>`).
  *
  * NOTE(review): text is split on whitespace runs and whitespace characters
  * are looked up verbatim in the vocab. For byte-level vocabularies (the ones
  * that spell a space as "Ġ") this presumably drops spaces on encode; confirm
  * against the tokenizer's pre-tokenizer configuration.
  */
 class BPETokenizer {
   /**
    * @param {object} json - Parsed tokenizer.json contents.
    */
   constructor(json) {
     const m = json.model || {};
     this.vocab = m.vocab || {};
     this.rev = {};
     for (const [t, id] of Object.entries(this.vocab)) this.rev[id] = t;
     this.mergeRanks = {};
     for (const [i, merge] of (m.merges || []).entries()) {
       // Merges are either "a b" strings or [a, b] pairs depending on the
       // tokenizer.json version; normalise both to the "a b" lookup key.
       const key = Array.isArray(merge) ? merge.join(' ') : merge;
       this.mergeRanks[key] = i;
     }
     this.added = {};
     if (json.added_tokens) for (const t of json.added_tokens) this.added[t.content] = t.id;
   }

   /**
    * Encodes text into token ids.
    *
    * `<|...|>` spans are looked up as special tokens. The rest is split into
    * runs of whitespace / non-whitespace, broken into characters (falling
    * back to `<0xNN>` byte tokens for characters missing from the vocab) and
    * merged greedily by lowest merge rank. Symbols with no id are dropped.
    *
    * @param {string} text
    * @returns {number[]} Token ids.
    */
   encode(text) {
     const sp = /<\|[^|]+\|>/g;
     const parts = [];
     let last = 0;
     let m;
     while ((m = sp.exec(text)) !== null) {
       if (m.index > last) parts.push({ t: text.slice(last, m.index), s: false });
       parts.push({ t: m[0], s: true });
       last = m.index + m[0].length;
     }
     if (last < text.length) parts.push({ t: text.slice(last), s: false });

     const tokens = [];
     for (const p of parts) {
       if (p.s) {
         const id = this.added[p.t] ?? this.vocab[p.t];
         if (id !== undefined) tokens.push(id);
         continue;
       }
       const words = p.t.match(/\S+|\s+/g) || [];
       for (const w of words) {
         const syms = [];
         for (const ch of w) {
           if (this.vocab[ch] !== undefined) {
             syms.push(ch);
           } else {
             for (const b of Buffer.from(ch, 'utf8')) {
               syms.push(`<0x${b.toString(16).toUpperCase().padStart(2, '0')}>`);
             }
           }
         }
         while (syms.length > 1) {
           let best = Infinity;
           let bi = -1;
           for (let i = 0; i < syms.length - 1; i++) {
             const r = this.mergeRanks[`${syms[i]} ${syms[i + 1]}`];
             if (r !== undefined && r < best) { best = r; bi = i; }
           }
           if (bi === -1) break;
           syms.splice(bi, 2, syms[bi] + syms[bi + 1]);
         }
         for (const s of syms) {
           const id = this.vocab[s] ?? this.added[s];
           if (id !== undefined) tokens.push(id);
         }
       }
     }
     return tokens;
   }

   /**
    * Decodes token ids back into text.
    *
    * Consecutive `<0xNN>` byte tokens are collected and decoded together as
    * UTF-8, so a character spread over several byte tokens is reassembled
    * instead of being emitted one code unit per byte. Unknown ids and
    * `<|...|>` tokens are skipped; "Ġ" and "Ċ" become space and newline.
    *
    * @param {Iterable<number>} tokens
    * @returns {string}
    */
   decode(tokens) {
     const byteToken = /^<0x([0-9A-Fa-f]{1,2})>$/;
     const pieces = [];
     let pending = [];
     const flush = () => {
       if (pending.length === 0) return;
       pieces.push(Buffer.from(pending).toString('utf8'));
       pending = [];
     };

     for (const t of tokens) {
       const p = this.rev[t];
       if (!p) continue;
       const hex = byteToken.exec(p);
       if (hex) {
         pending.push(parseInt(hex[1], 16));
         continue;
       }
       flush();
       if (!p.startsWith('<|')) pieces.push(p);
     }
     flush();
     return pieces.join('').replace(/Ġ/g, ' ').replace(/Ċ/g, '\n');
   }
 }
 // ─── RoPE ───────────────────────────────────────────────────────────────────
 /**
  * Applies rotary position embedding to one attention head, in place.
  *
  * Element `i` is paired with element `i + headDim / 2`, and each pair is
  * rotated by `position * theta^(-2i / headDim)` radians.
  *
  * NOTE(review): this is the split-half pairing. Some GGUF exports store Q/K
  * weights permuted for adjacent-pair rotation (2i, 2i + 1) instead; confirm
  * which layout the loaded checkpoint expects.
  *
  * @param {Float32Array} x - Head vector of length `headDim`; mutated.
  * @param {number} headDim - Head dimension.
  * @param {number} position - Token position.
  * @param {number} theta - RoPE base frequency.
  * @returns {void}
  */
 function applyRoPE(x, headDim, position, theta) {
   const half = headDim / 2;
   let i = 0;
   while (i < half) {
     const partner = i + half;
     const inverseFreq = 1.0 / Math.pow(theta, (2 * i) / headDim);
     const angle = position * inverseFreq;
     const c = Math.cos(angle);
     const s = Math.sin(angle);
     const a = x[i];
     const b = x[partner];
     x[i] = a * c - b * s;
     x[partner] = a * s + b * c;
     i++;
   }
 }
 // ─── Model ──────────────────────────────────────────────────────────────────
 // Loaded model state; stays null until the loader below assigns it.
 let model = null;
 // NOTE(review): the enclosing function header and the definitions of `t0`
 // and `buf` are not visible in this view. The closing brace at the end of
 // this section belongs to that function (invoked as
 // `loadModel(ggufPath, tokenizerPath)` from main) -- confirm against the
 // full file.
   const parsed = parseGGUF(buf);
   console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now() - t0}ms`);
   const tokJson = JSON.parse(readFileSync(tokenizerPath, 'utf8'));
   const tokenizer = new BPETokenizer(tokJson);
   // Index the tensor directory by name for the lookups below.
+  const byName = {};
+  for (const t of parsed.tensors) byName[t.name] = t;
   // Returns the named tensor dequantized to float32, or null (after logging
   // a warning) when the file has no tensor with that name.
+  function get(name) {
+    const t = byName[name];
+    if (!t) { console.warn(`[Aether] Missing: ${name}`); return null; }
     // Byte view over this tensor: `t.offset` is relative to the data region
     // that starts at `parsed.dataOffset`.
+    const raw = new Uint8Array(buf.buffer, buf.byteOffset + parsed.dataOffset + Number(t.offset), t.size);
     return dequantAuto(raw, t.numElements);
   }
   console.log('[Aether] Dequantizing embeddings...');
+  const tokenEmbd = get('token_embd.weight');
   console.log('[Aether] Dequantizing layers...');
   const layers = [];
   for (let i = 0; i < CONFIG.numLayers; i++) {
     if (i % 8 === 0) console.log(`[Aether]   Layer ${i}/${CONFIG.numLayers}...`);
     // Any entry is null when its tensor is missing (see `get`).
     layers.push({
+      attnNorm: get(`blk.${i}.attn_norm.weight`),
+      ffnNorm: get(`blk.${i}.ffn_norm.weight`),
+      qProj: get(`blk.${i}.attn_q.weight`),
+      kProj: get(`blk.${i}.attn_k.weight`),
+      vProj: get(`blk.${i}.attn_v.weight`),
+      oProj: get(`blk.${i}.attn_output.weight`),
+      gateProj: get(`blk.${i}.ffn_gate.weight`),
+      upProj: get(`blk.${i}.ffn_up.weight`),
+      downProj: get(`blk.${i}.ffn_down.weight`),
     });
   }
+  const outputNorm = get('output_norm.weight');
   // When the file has no separate output projection, reuse the embedding matrix.
+  let outputWeight = get('output.weight');
+  if (!outputWeight) { console.log('[Aether] Tied embeddings'); outputWeight = tokenEmbd; }
   const loadTime = Date.now() - t0;
+  console.log(`[Aether] Model loaded in ${(loadTime/1000).toFixed(1)}s (WASM SIMD: ${simd ? 'YES' : 'NO'})`);
   model = { tokenEmbd, layers, outputNorm, outputWeight, tokenizer, loadTime };
 }
 // ─── Inference ──────────────────────────────────────────────────────────────
/**
 * Runs the chat-formatted prompt through the model and returns the generated
 * text with timing statistics.
 *
 * The prompt is wrapped in an `<|im_start|>user ... <|im_start|>assistant`
 * template and encoded. One forward pass is then executed per position
 * (prompt positions first, then generated ones), appending to a per-layer
 * KV cache. Logits are only computed from the last prompt position onward.
 * Generation stops at `CONFIG.eosToken` or when the step budget
 * (`inputTokens.length + maxTokens - 1`) is exhausted.
 *
 * @param {string} prompt - User message inserted into the chat template.
 * @param {number} [maxTokens=50] - Generation budget used in the loop bound.
 * @returns {{text: string, tokens: number, totalTimeMs: number,
 *   avgTokenMs: number, prefillTokens: number, engine: string, simd: boolean}}
 * @throws {Error} 'Model not loaded' when called before the model is set.
 */
+function generate(prompt, maxTokens = 50) {
   if (!model) throw new Error('Model not loaded');
   const t0 = performance.now();
   const { hiddenDim, numHeads, numKvHeads, headDim, intermediateSize, ropeTheta, rmsNormEps } = CONFIG;
   const kvDim = numKvHeads * headDim;
+  const o = ops();
   const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
   const inputTokens = model.tokenizer.encode(chatPrompt);
   const allTokens = [...inputTokens];
+  // KV cache: flat arrays per layer for WASM flash attention
+  const kvCache = Array.from({ length: CONFIG.numLayers }, () => ({
+    keys: [],   // array of Float32Array[kvDim] per position
+    values: [], // array of Float32Array[kvDim] per position
+  }));
   const tokenTimes = [];
   for (let step = 0; step < inputTokens.length + maxTokens - 1; step++) {
     const tokenStart = performance.now();
     const pos = step;
     // Embed
     // NOTE(review): `tokenId` has no visible declaration in this function;
     // presumably it is the token at `allTokens[step]` -- confirm against the
     // full file.
     const hidden = new Float32Array(hiddenDim);
+    const embOff = tokenId * hiddenDim;
+    for (let i = 0; i < hiddenDim; i++) hidden[i] = model.tokenEmbd[embOff + i];
     let x = hidden;
     for (let l = 0; l < CONFIG.numLayers; l++) {
+      const ly = model.layers[l];
       // 1. Attention norm
+      const normed = o.rmsNorm(x, ly.attnNorm, rmsNormEps);
+      // 2. Q, K, V projections (WASM SIMD matVec)
+      const q = o.matVec(ly.qProj, normed, hiddenDim, hiddenDim);
+      const k = o.matVec(ly.kProj, normed, kvDim, hiddenDim);
+      const v = o.matVec(ly.vProj, normed, kvDim, hiddenDim);
       // 3. RoPE
       // Rotates each head slice in place (subarray views alias q / k).
+      for (let h = 0; h < numHeads; h++)
         applyRoPE(q.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
+      for (let h = 0; h < numKvHeads; h++)
         applyRoPE(k.subarray(h * headDim, (h + 1) * headDim), headDim, pos, ropeTheta);
       // 4. Store in KV cache
       // Copies are stored so later reuse of k / v cannot alter the cache.
       kvCache[l].keys.push(new Float32Array(k));
       kvCache[l].values.push(new Float32Array(v));
+      // 5. Attention
       const seqLen = kvCache[l].keys.length;
+      let attnOut;
+      if (o.flashAttentionMultiHead && seqLen > 1) {
+        // Use WASM flash attention with GQA
         // The whole cache for this layer is re-flattened on every step.
+        const flatKeys = new Float32Array(seqLen * kvDim);
+        const flatVals = new Float32Array(seqLen * kvDim);
         for (let s = 0; s < seqLen; s++) {
+          flatKeys.set(kvCache[l].keys[s], s * kvDim);
+          flatVals.set(kvCache[l].values[s], s * kvDim);
         }
+        attnOut = o.flashAttentionMultiHead(q, flatKeys, flatVals, seqLen, numHeads, numKvHeads, headDim);
+      } else {
+        // JS fallback attention
         // Also taken at seqLen === 1 even when the WASM kernel exists.
+        attnOut = new Float32Array(hiddenDim);
+        const gqaRatio = numHeads / numKvHeads;
+        for (let h = 0; h < numHeads; h++) {
           // Query head h reads the KV head it shares under grouped-query attention.
+          const kvH = Math.floor(h / gqaRatio);
+          const qH = q.subarray(h * headDim, (h + 1) * headDim);
+          const scores = new Float32Array(seqLen);
+          for (let s = 0; s < seqLen; s++) {
+            const kH = kvCache[l].keys[s].subarray(kvH * headDim, (kvH + 1) * headDim);
+            let dot = 0;
+            for (let d = 0; d < headDim; d++) dot += qH[d] * kH[d];
+            scores[s] = dot / Math.sqrt(headDim);
+          }
+          const w = softmaxJS(scores);
+          for (let s = 0; s < seqLen; s++) {
+            const vH = kvCache[l].values[s].subarray(kvH * headDim, (kvH + 1) * headDim);
+            for (let d = 0; d < headDim; d++) attnOut[h * headDim + d] += w[s] * vH[d];
           }
         }
       }
+      // 6. O projection + residual
+      const projected = o.matVec(ly.oProj, attnOut, hiddenDim, hiddenDim);
+      const postAttn = o.add(x, projected);
+      // 7. FFN: norm → gate/up → fusedSiluMul → down → residual
+      const ffnIn = o.rmsNorm(postAttn, ly.ffnNorm, rmsNormEps);
+      const gate = o.matVec(ly.gateProj, ffnIn, intermediateSize, hiddenDim);
+      const up = o.matVec(ly.upProj, ffnIn, intermediateSize, hiddenDim);
+      const activated = o.fusedSiluMul(gate, up);
+      const down = o.matVec(ly.downProj, activated, hiddenDim, intermediateSize);
+      x = o.add(postAttn, down);
     }
+    // Sample only after prefill
     if (step >= inputTokens.length - 1) {
+      const finalNormed = o.rmsNorm(x, model.outputNorm, rmsNormEps);
+      const logits = o.matVec(model.outputWeight, finalNormed, CONFIG.vocabSize, hiddenDim);
       // Temperature sampling
+      for (let i = 0; i < logits.length; i++) logits[i] /= 0.7;
+      const probs = o.softmax(logits);
+      // Top-p nucleus sampling
       // Sorts the full vocabulary by probability on every generated token.
       const indexed = Array.from(probs).map((p, i) => ({ p, i })).sort((a, b) => b.p - a.p);
+      let cumP = 0, chosen = indexed[0].i;
       const r = Math.random();
       // NOTE(review): as visible here, `r` is never compared and `chosen` is
       // never reassigned inside this loop, so the most probable token is
       // always picked. A selection step appears to be missing -- confirm
       // against the full file.
       for (const { p, i } of indexed) {
         cumP += p;
         if (cumP > 0.9) break;
       }
+      tokenTimes.push(performance.now() - tokenStart);
       if (chosen === CONFIG.eosToken) break;
       allTokens.push(chosen);
     }
   }
   const totalTime = performance.now() - t0;
   // Only the tokens appended after the prompt are decoded and counted.
+  const genTokens = allTokens.slice(inputTokens.length);
+  const text = model.tokenizer.decode(genTokens);
+  const avgMs = tokenTimes.length > 0 ? tokenTimes.reduce((a, b) => a + b, 0) / tokenTimes.length : 0;
   return {
     text,
+    tokens: genTokens.length,
     totalTimeMs: Math.round(totalTime),
+    avgTokenMs: Math.round(avgMs),
     prefillTokens: inputTokens.length,
+    engine: `Aether ${simd ? 'WASM-SIMD' : 'JS-fallback'}`,
+    simd: !!simd,
   };
 }
 // ─── HTTP Server ────────────────────────────────────────────────────────────
/** Largest request body, in bytes, accepted by POST /generate. */
const MAX_GENERATE_BODY_BYTES = 1024 * 1024;

/** Token budget used when `max_tokens` is absent or unusable. */
const DEFAULT_GENERATE_MAX_TOKENS = 50;

/** Writes `payload` as a JSON response with the given status code. */
function sendJsonResponse(res, status, payload) {
  res.writeHead(status, { 'Content-Type': 'application/json' });
  res.end(JSON.stringify(payload));
}

/**
 * Parses and validates the body of a /generate request.
 *
 * @param {string} rawBody - Request body decoded as UTF-8.
 * @returns {{prompt: string, maxTokens: number}}
 * @throws {Error} When the body is not a JSON object with a string `prompt`.
 */
function parseGenerateRequest(rawBody) {
  const parsed = JSON.parse(rawBody);
  if (parsed === null || typeof parsed !== 'object' || typeof parsed.prompt !== 'string') {
    throw new Error('Request body must be a JSON object with a string "prompt"');
  }
  const requested = parsed.max_tokens;
  // Reject strings and other non-numbers: they would be concatenated, not
  // added, in the generation loop bound.
  const usable = typeof requested === 'number' && Number.isFinite(requested) && requested >= 1;
  return {
    prompt: parsed.prompt,
    maxTokens: usable ? Math.floor(requested) : DEFAULT_GENERATE_MAX_TOKENS,
  };
}

/**
 * HTTP front end.
 *
 *   POST /generate  {prompt, max_tokens?} -> result of `generate`
 *   ANY  /health    -> load status
 *   anything else   -> 404
 *
 * Client mistakes (bad JSON, missing prompt) answer 400, oversized bodies
 * 413, and failures inside `generate` 500.
 */
const server = createServer((req, res) => {
  if (req.method === 'POST' && req.url === '/generate') {
    // Collect raw chunks and decode once at the end: decoding per chunk would
    // corrupt a multi-byte character split across two chunks.
    const chunks = [];
    let received = 0;
    req.on('data', (chunk) => {
      const bytes = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
      received += bytes.length;
      // Past the limit the data is dropped so memory stays bounded; the
      // rejection is sent once the request has been fully read.
      if (received <= MAX_GENERATE_BODY_BYTES) chunks.push(bytes);
    });
    req.on('end', () => {
      if (received > MAX_GENERATE_BODY_BYTES) {
        sendJsonResponse(res, 413, { error: 'Request body too large' });
        return;
      }
      let request;
      try {
        request = parseGenerateRequest(Buffer.concat(chunks).toString('utf8'));
      } catch (e) {
        sendJsonResponse(res, 400, { error: e.message });
        return;
      }
      try {
        sendJsonResponse(res, 200, generate(request.prompt, request.maxTokens));
      } catch (e) {
        // NOTE(review): the stack is returned to the caller as before; fine
        // for a loopback-only debug server, reconsider if ever exposed.
        sendJsonResponse(res, 500, { error: e.message, stack: e.stack });
      }
    });
  } else if (req.url === '/health') {
    sendJsonResponse(res, 200, {
      status: 'ok',
      model: model ? 'loaded' : 'not loaded',
      simd: !!simd,
      loadTime: model?.loadTime,
    });
  } else {
    res.writeHead(404);
    res.end();
  }
});
 // ─── Main ───────────────────────────────────────────────────────────────────
// Model asset locations; both can be overridden through the environment.
const ggufPath = process.env.GGUF_PATH || '/tmp/hf_cache/buleyean-smollm2-360m-q4_k_m.gguf';
const tokenizerPath = process.env.TOKENIZER_PATH || '/tmp/hf_cache/tokenizer.json';

/**
 * Startup sequence: load the WASM SIMD kernels, fetch any missing model
 * assets, load the model, then start listening on loopback.
 *
 * Missing assets are fetched by shelling out to `huggingface_hub`. The
 * download target is always `/tmp/hf_cache`, regardless of the path
 * overrides above.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Kernels first, so the model-load log line can report SIMD availability.
  simd = await loadSIMD();

  const assets = [
    {
      path: ggufPath,
      notice: '[Aether] Downloading Q4_K_M GGUF...',
      command: `python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q4_k_m.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`,
    },
    {
      path: tokenizerPath,
      notice: '[Aether] Downloading tokenizer...',
      command: `python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`,
    },
  ];
  for (const { path, notice, command } of assets) {
    if (existsSync(path)) continue;
    console.log(notice);
    execSync(command, { stdio: 'inherit' });
  }

  loadModel(ggufPath, tokenizerPath);

  const announce = () => {
    console.log(`[Aether] Server on http://127.0.0.1:${PORT} (SIMD: ${simd ? 'YES' : 'NO'})`);
  };
  server.listen(PORT, '127.0.0.1', announce);
}

main().catch((err) => {
  console.error('[Aether] Fatal:', err);
  process.exit(1);
});

app.py CHANGED Viewed

@@ -63,7 +63,7 @@ def gen_pytorch(prompt):
     with torch.no_grad():
         outputs = base_model.generate(
             **inputs,
-            max_new_tokens=100,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
@@ -80,7 +80,7 @@ def gen_pytorch(prompt):
 def gen_aether(prompt):
     """Generate with Aether (our engine)"""
     try:
-        data = json.dumps({"prompt": prompt, "max_tokens": 100}).encode()
         req = urllib.request.Request(
             "http://127.0.0.1:7861/generate",
             data=data,

     with torch.no_grad():
         outputs = base_model.generate(
             **inputs,
+            max_new_tokens=50,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
 def gen_aether(prompt):
     """Generate with Aether (our engine)"""
     try:
+        data = json.dumps({"prompt": prompt, "max_tokens": 50}).encode()
         req = urllib.request.Request(
             "http://127.0.0.1:7861/generate",
             data=data,

simd-kernels.wasm ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a05084c8998119797c6e80927678ce007e3285b78c6e7e8feee223ca4bb13636
+size 14553