/**
 * Aether Inference Server with Glossolalia Decoder
 *
 * SmolLM2-360M inference using WASM SIMD kernels.
 * Two endpoints:
 *   /generate-standard    -- standard top-p sampling
 *   /generate-glossolalia -- temperature-ensemble fork/race/fold
 *
 * NOTE(review): the reviewed copy of this file was whitespace-collapsed and had
 * several spans missing. Every region that had to be reconstructed is marked
 * with a `NOTE(review)` comment; verify those against the original before
 * relying on them.
 */

import { createServer } from 'http';
import { readFileSync, existsSync } from 'fs';
import { execSync } from 'child_process';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';

const __dirname = dirname(fileURLToPath(import.meta.url));
const PORT = Number.parseInt(process.env.AETHER_PORT || '7861', 10);

// ─── SmolLM2-360M Config ────────────────────────────────────────────────────
const C = {
  hiddenDim: 960,
  numLayers: 32,
  numHeads: 15,
  numKvHeads: 5,
  headDim: 64,
  intermediateSize: 2560,
  vocabSize: 49152,
  ropeTheta: 100000.0,
  rmsNormEps: 1e-5,
  eosToken: 2,
};
const kvDim = C.numKvHeads * C.headDim;
const gqaRatio = C.numHeads / C.numKvHeads;

// ─── WASM SIMD ──────────────────────────────────────────────────────────────
let simd = null;

/**
 * Loads `simd-kernels.wasm` from this module's directory and wraps its exports.
 *
 * Each wrapper copies its inputs into the WASM heap, runs the kernel, copies the
 * result out, and restores the heap pointer afterwards (even on throw).
 *
 * @returns {Promise<object|null>} kernel table ({matVec, rmsNorm, softmax,
 *   fusedSiluMul, add}), or null when the file is absent or instantiation fails.
 */
async function loadSIMD() {
  const p = join(__dirname, 'simd-kernels.wasm');
  if (!existsSync(p)) return null;
  try {
    const { instance } = await WebAssembly.instantiate(readFileSync(p), {
      env: { expf: Math.exp, tanhf: Math.tanh, powf: Math.pow },
    });
    const w = instance.exports;
    w.resetHeap(65536);
    const mem = w.memory;
    // A fresh view is created on every access: mem.buffer is replaced when the
    // WASM memory grows, which would detach a cached view.
    const hf = () => new Float32Array(mem.buffer);
    // `>>> 2` (unsigned) keeps the float index non-negative for pointers >= 2 GiB.
    const cp = (ptr, f) => hf().set(f, ptr >>> 2);
    const rd = (ptr, n) => hf().slice(ptr >>> 2, (ptr >>> 2) + n);
    const wrap = (fn) => (...args) => {
      const s = w.getHeapPtr();
      try {
        return fn(s, ...args);
      } finally {
        w.resetHeap(s);
      }
    };
    console.log('[Aether] WASM SIMD loaded');
    return {
      matVec: wrap((s, mat, vec, rows, cols) => {
        // Very large matrices are not copied into the WASM heap; use JS instead.
        if (mat.byteLength > 100_000_000) return matVecJS(mat, vec, rows, cols);
        const mP = w.allocate(mat.byteLength);
        const vP = w.allocate(vec.byteLength);
        const rP = w.allocate(rows * 4);
        cp(mP, mat);
        cp(vP, vec);
        w.matVecSimdBatch4(mP, vP, rP, rows, cols);
        return rd(rP, rows);
      }),
      rmsNorm: wrap((s, x, wt, eps) => {
        const xP = w.allocate(x.byteLength);
        const wP = w.allocate(wt.byteLength);
        const rP = w.allocate(x.byteLength);
        cp(xP, x);
        cp(wP, wt);
        w.rmsNormSimd(xP, wP, rP, x.length, eps);
        return rd(rP, x.length);
      }),
      softmax: wrap((s, x) => {
        const xP = w.allocate(x.byteLength);
        const rP = w.allocate(x.byteLength);
        cp(xP, x);
        w.softmaxSimd(xP, rP, x.length);
        return rd(rP, x.length);
      }),
      fusedSiluMul: wrap((s, g, u) => {
        const gP = w.allocate(g.byteLength);
        const uP = w.allocate(u.byteLength);
        const rP = w.allocate(g.byteLength);
        cp(gP, g);
        cp(uP, u);
        w.fusedSiluMul(gP, uP, rP, g.length);
        return rd(rP, g.length);
      }),
      add: wrap((s, a, b) => {
        const aP = w.allocate(a.byteLength);
        const bP = w.allocate(b.byteLength);
        const rP = w.allocate(a.byteLength);
        cp(aP, a);
        cp(bP, b);
        w.addSimd(aP, bP, rP, a.length);
        return rd(rP, a.length);
      }),
    };
  } catch (e) {
    console.warn('[Aether] WASM failed:', e.message);
    return null;
  }
}

// ─── JS Fallbacks ───────────────────────────────────────────────────────────
// NOTE(review): only fragments of these fallbacks survived in the reviewed copy
// (the head of matVecJS, the max-scan of softmaxJS and the `op` table). The
// bodies below are the textbook definitions — confirm against the original.

/** Row-major matrix (rows x cols) times vector. */
function matVecJS(m, v, rows, cols) {
  const o = new Float32Array(rows);
  for (let r = 0; r < rows; r++) {
    const base = r * cols;
    let s = 0;
    for (let c = 0; c < cols; c++) s += m[base + c] * v[c];
    o[r] = s;
  }
  return o;
}

/** RMS normalisation: x / sqrt(mean(x^2) + eps), scaled element-wise by w. */
function rmsNormJS(x, w, eps) {
  let ss = 0;
  for (let i = 0; i < x.length; i++) ss += x[i] * x[i];
  const inv = 1 / Math.sqrt(ss / x.length + eps);
  const o = new Float32Array(x.length);
  for (let i = 0; i < x.length; i++) o[i] = x[i] * inv * w[i];
  return o;
}

/** Numerically stable softmax (subtracts the maximum before exponentiating). */
function softmaxJS(x) {
  let mx = -Infinity;
  for (let i = 0; i < x.length; i++) if (x[i] > mx) mx = x[i];
  const o = new Float32Array(x.length);
  let s = 0;
  for (let i = 0; i < x.length; i++) {
    o[i] = Math.exp(x[i] - mx);
    s += o[i];
  }
  for (let i = 0; i < x.length; i++) o[i] /= s;
  return o;
}

/** silu(g) * u, element-wise. */
function fusedSiluMulJS(g, u) {
  const o = new Float32Array(g.length);
  for (let i = 0; i < g.length; i++) o[i] = (g[i] / (1 + Math.exp(-g[i]))) * u[i];
  return o;
}

/** Element-wise a + b. */
function addJS(a, b) {
  const o = new Float32Array(a.length);
  for (let i = 0; i < a.length; i++) o[i] = a[i] + b[i];
  return o;
}

/** Kernel table: the WASM kernel when loaded, otherwise the JS fallback. */
const op = () => ({
  matVec: simd?.matVec || matVecJS,
  rmsNorm: simd?.rmsNorm || rmsNormJS,
  softmax: simd?.softmax || softmaxJS,
  fusedSiluMul: simd?.fusedSiluMul || fusedSiluMulJS,
  add: simd?.add || addJS,
});

// ─── Q8_0 Dequant ───────────────────────────────────────────────────────────

/**
 * Decodes an IEEE-754 half float from its two little-endian bytes.
 * Exponent 31 (Inf/NaN) decodes to 0 rather than a non-finite value.
 */
function fp16(lo, hi) {
  const h = lo | (hi << 8);
  const s = (h >> 15) & 1;
  const e = (h >> 10) & 0x1f;
  const f = h & 0x3ff;
  if (e === 0) return f === 0 ? 0 : (s ? -1 : 1) * (f / 1024) * Math.pow(2, -14);
  if (e === 31) return 0;
  return (s ? -1 : 1) * Math.pow(2, e - 15) * (1 + f / 1024);
}

/**
 * Dequantises `n` Q8_0 values.
 * NOTE(review): the block loop was missing from the reviewed copy; this assumes
 * the GGML Q8_0 layout (34-byte blocks: fp16 scale + 32 signed bytes) — confirm.
 */
function dequantQ8(data, n) {
  const o = new Float32Array(n);
  const nb = Math.ceil(n / 32);
  for (let b = 0; b < nb; b++) {
    const off = b * 34;
    const sc = fp16(data[off], data[off + 1]);
    for (let j = 0; j < 32 && b * 32 + j < n; j++) {
      const v = data[off + 2 + j];
      o[b * 32 + j] = (v > 127 ? v - 256 : v) * sc;
    }
  }
  return o;
}

/**
 * Converts raw tensor bytes to Float32Array. Type 0 is read as F32, type 8 as
 * Q8_0, type 1 as F16.
 *
 * @throws {Error} for any other type.
 *   NOTE(review): the original handling of unsupported types was not visible.
 */
function dequantByType(data, n, type) {
  if (type === 0) {
    // A Float32Array view requires a 4-byte aligned offset; copy when it is not.
    if (data.byteOffset % 4 === 0) return new Float32Array(data.buffer, data.byteOffset, n);
    const copy = Uint8Array.prototype.slice.call(data, 0, n * 4);
    return new Float32Array(copy.buffer, copy.byteOffset, n);
  }
  if (type === 8) return dequantQ8(data, n);
  if (type === 1) {
    const o = new Float32Array(n);
    for (let i = 0; i < n; i++) o[i] = fp16(data[i * 2], data[i * 2 + 1]);
    return o;
  }
  throw new Error(`Unsupported tensor type: ${type}`);
}

// ─── GGUF Parser ────────────────────────────────────────────────────────────

// Byte width of fixed-size GGUF metadata value types, indexed by type id
// (8 = string and 9 = array are variable-length and handled separately).
const GGUF_FIXED_SIZE = [1, 1, 2, 2, 4, 4, 4, 1, 0, 0, 8, 8, 8];

/** Byte size of `n` elements of tensor type 0 (F32), 1 (F16) or 8 (Q8_0). */
function tensorByteSize(type, n) {
  if (type === 0) return n * 4;
  if (type === 1) return n * 2;
  if (type === 8) return Math.ceil(n / 32) * 34;
  return 0;
}

/**
 * Parses a GGUF header and tensor directory.
 *
 * NOTE(review): only the tail of this function survived in the reviewed copy
 * (`numElements` from a BigInt dims product; `dataOffset` rounded up to
 * `align`). The rest follows the public GGUF layout — confirm.
 *
 * @param {Uint8Array} buf whole file contents (a Node Buffer works)
 * @returns {{tensors: Array<{name: string, type: number, offset: bigint,
 *   size: number, numElements: number}>, dataOffset: number}}
 * @throws {Error} when the magic number is not "GGUF"
 */
function parseGGUF(buf) {
  const dv = new DataView(buf.buffer, buf.byteOffset, buf.byteLength);
  const utf8 = new TextDecoder();
  let o = 0;
  const u32 = () => {
    const v = dv.getUint32(o, true);
    o += 4;
    return v;
  };
  const u64 = () => {
    const v = dv.getBigUint64(o, true);
    o += 8;
    return v;
  };
  const str = () => {
    const len = Number(u64());
    const s = utf8.decode(new Uint8Array(buf.buffer, buf.byteOffset + o, len));
    o += len;
    return s;
  };
  const skipValue = (type) => {
    if (type === 8) {
      const len = Number(u64());
      o += len;
      return;
    }
    if (type === 9) {
      const elemType = u32();
      const count = Number(u64());
      if (elemType === 8 || elemType === 9) {
        for (let i = 0; i < count; i++) skipValue(elemType);
      } else {
        const size = GGUF_FIXED_SIZE[elemType];
        if (!size) throw new Error(`Unknown GGUF value type: ${elemType}`);
        o += size * count;
      }
      return;
    }
    const size = GGUF_FIXED_SIZE[type];
    if (!size) throw new Error(`Unknown GGUF value type: ${type}`);
    o += size;
  };

  if (u32() !== 0x46554747) throw new Error('Not a GGUF file');
  u32(); // format version (unused)
  const tensorCount = Number(u64());
  const kvCount = Number(u64());

  let align = 32;
  for (let i = 0; i < kvCount; i++) {
    const key = str();
    const type = u32();
    if (key === 'general.alignment' && type === 4) align = u32();
    else skipValue(type);
  }

  const tensors = [];
  for (let i = 0; i < tensorCount; i++) {
    const name = str();
    const nDims = u32();
    const dims = [];
    for (let d = 0; d < nDims; d++) dims.push(u64());
    const type = u32();
    const offset = u64();
    const numElements = Number(dims.reduce((a, b) => a * b, 1n));
    tensors.push({ name, type, offset, size: tensorByteSize(type, numElements), numElements });
  }
  return { tensors, dataOffset: Math.ceil(o / align) * align };
}

// ─── BPE Tokenizer ──────────────────────────────────────────────────────────

/** Own-property lookup so keys such as "constructor" never hit the prototype. */
const lookup = (table, key) => (Object.hasOwn(table, key) ? table[key] : undefined);

/** Minimal BPE tokenizer built from a Hugging Face `tokenizer.json` object. */
class Tok {
  constructor(j) {
    const m = j.model || {};
    this.vocab = m.vocab || {};
    this.rev = {};
    for (const [t, id] of Object.entries(this.vocab)) this.rev[id] = t;
    this.mr = {};
    for (const [i, mg] of (m.merges || []).entries()) {
      // Merges may be stored as "a b" strings or as ["a", "b"] pairs.
      this.mr[Array.isArray(mg) ? mg.join(' ') : mg] = i;
    }
    this.added = {};
    if (j.added_tokens) for (const t of j.added_tokens) this.added[t.content] = t.id;
  }

  /**
   * Encodes text to token ids. `<|...|>` markers are matched as special tokens;
   * the remaining text is BPE-merged.
   *
   * NOTE(review): the middle of this method was missing from the reviewed copy.
   * The space/newline mapping mirrors decode(); the `<0xNN>` byte fallback and
   * the lowest-rank-first merge loop are inferred from surviving fragments.
   */
  encode(text) {
    const sp = /<\|[^|]+\|>/g;
    const parts = [];
    let last = 0;
    let m;
    while ((m = sp.exec(text)) !== null) {
      if (m.index > last) parts.push({ t: text.slice(last, m.index), s: false });
      parts.push({ t: m[0], s: true });
      last = m.index + m[0].length;
    }
    if (last < text.length) parts.push({ t: text.slice(last), s: false });

    const bytes = new TextEncoder();
    const ids = [];
    for (const part of parts) {
      if (part.s) {
        const id = lookup(this.added, part.t) ?? lookup(this.vocab, part.t);
        if (id !== undefined) {
          ids.push(id);
          continue;
        }
        // Unknown marker: fall through and encode it as ordinary text.
      }
      const mapped = part.t.replace(/ /g, 'Ġ').replace(/\n/g, 'Ċ');
      const syms = [];
      for (const ch of mapped) {
        if (lookup(this.vocab, ch) !== undefined) {
          syms.push(ch);
        } else {
          for (const b of bytes.encode(ch)) {
            syms.push(`<0x${b.toString(16).toUpperCase().padStart(2, '0')}>`);
          }
        }
      }
      while (syms.length > 1) {
        let best = Infinity;
        let bi = -1;
        for (let i = 0; i < syms.length - 1; i++) {
          const rank = lookup(this.mr, `${syms[i]} ${syms[i + 1]}`);
          if (rank !== undefined && rank < best) {
            best = rank;
            bi = i;
          }
        }
        if (bi === -1) break;
        syms.splice(bi, 2, syms[bi] + syms[bi + 1]);
      }
      for (const s of syms) {
        const id = lookup(this.vocab, s);
        if (id !== undefined) ids.push(id);
      }
    }
    return ids;
  }

  /**
   * Decodes token ids to text. `<0xNN>` tokens become the character with that
   * code, tokens starting with `<|` are dropped, and Ġ / Ċ map back to space /
   * newline.
   */
  decode(ids) {
    const p = [];
    for (const id of ids) {
      const s = this.rev[id];
      if (s && s.startsWith('<0x') && s.endsWith('>')) {
        p.push(String.fromCharCode(parseInt(s.slice(3, -1), 16)));
      } else if (s && !s.startsWith('<|')) {
        p.push(s);
      }
    }
    return p.join('').replace(/Ġ/g, ' ').replace(/Ċ/g, '\n');
  }
}

// ─── RoPE (LLaMA style: ADJACENT pairs) ─────────────────────────────────────

/**
 * Rotates adjacent element pairs (x[i], x[i+1]) of one head in place.
 *
 * @param {Float32Array} x head vector, mutated
 * @param {number} headDim number of elements to rotate
 * @param {number} position token position
 * @param {number} theta RoPE base
 */
function applyRoPE(x, headDim, position, theta) {
  for (let i = 0; i < headDim; i += 2) {
    const freq = 1.0 / Math.pow(theta, (2 * (i / 2)) / headDim);
    const angle = position * freq;
    const cos = Math.cos(angle);
    const sin = Math.sin(angle);
    const x0 = x[i];
    const x1 = x[i + 1];
    x[i] = x0 * cos - x1 * sin;
    x[i + 1] = x0 * sin + x1 * cos;
  }
}

// ─── Models ─────────────────────────────────────────────────────────────────
const models = {};
let activeModel = null;

/**
 * Reads a GGUF file and tokenizer, dequantises all weights to Float32 and
 * registers the result under `models[name]`.
 *
 * NOTE(review): everything after `const layers=[]` was missing from the reviewed
 * copy. The `blk.N.*` / `output*` tensor names are the llama.cpp GGUF
 * conventions; the fallback of `output.weight` to the embedding matrix (tied
 * weights) and the `activeModel` assignment are assumptions — confirm.
 */
function loadModel(name, ggufPath, tokPath) {
  console.log(`[Aether] Loading ${name}...`);
  const t0 = Date.now();
  const buf = readFileSync(ggufPath);
  const parsed = parseGGUF(buf);
  console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now() - t0}ms`);
  const tokenizer = new Tok(JSON.parse(readFileSync(tokPath, 'utf8')));
  const byName = {};
  for (const t of parsed.tensors) byName[t.name] = t;
  function get(nm) {
    const t = byName[nm];
    if (!t) return null;
    const raw = new Uint8Array(
      buf.buffer,
      buf.byteOffset + parsed.dataOffset + Number(t.offset),
      t.size,
    );
    return dequantByType(raw, t.numElements, t.type);
  }
  console.log('[Aether] Dequantizing...');
  const tokenEmbd = get('token_embd.weight');
  const layers = [];
  for (let i = 0; i < C.numLayers; i++) {
    layers.push({
      attnNorm: get(`blk.${i}.attn_norm.weight`),
      wq: get(`blk.${i}.attn_q.weight`),
      wk: get(`blk.${i}.attn_k.weight`),
      wv: get(`blk.${i}.attn_v.weight`),
      wo: get(`blk.${i}.attn_output.weight`),
      ffnNorm: get(`blk.${i}.ffn_norm.weight`),
      wGate: get(`blk.${i}.ffn_gate.weight`),
      wUp: get(`blk.${i}.ffn_up.weight`),
      wDown: get(`blk.${i}.ffn_down.weight`),
    });
  }
  const outNorm = get('output_norm.weight');
  const outWeight = get('output.weight') || tokenEmbd;
  const loadTime = Date.now() - t0;
  models[name] = { tokenizer, tokenEmbd, layers, outNorm, outWeight, loadTime };
  activeModel = name;
  console.log(`[Aether] ${name} ready in ${loadTime}ms`);
}

/**
 * Returns the loaded model registered under `name`.
 *
 * NOTE(review): this function was missing from the reviewed copy; falling back
 * to `activeModel` for unknown names is an assumption — confirm.
 *
 * @throws {Error} when no model is available.
 */
function getModel(name) {
  // hasOwn: `name` comes from the request body and must not hit the prototype.
  if (Object.hasOwn(models, name)) return models[name];
  if (activeModel !== null && Object.hasOwn(models, activeModel)) return models[activeModel];
  throw new Error(`Unknown model: ${name}`);
}

/** Fresh per-layer key/value cache. */
const newKvCache = () => Array.from({ length: C.numLayers }, () => ({ k: [], v: [] }));

/** Rounds to three decimals, as used throughout the diagnostics. */
const round3 = (v) => Math.round(v * 1000) / 1000;

// ─── Forward Pass ───────────────────────────────────────────────────────────

/**
 * Tokenises the chat-formatted prompt and returns a single-token stepper.
 *
 * NOTE(review): the head of this function and most of the per-layer loop in
 * step() were missing from the reviewed copy. What survived: the
 * `user\n${prompt}<|im_end|>\n<|im_start|>assistant\n` template tail, the
 * embedding lookup, the per-head entropy accumulation, a `sqrt(delta/prevNorm)`
 * layer diagnostic, and the final norm + logits. The layer body is a standard
 * LLaMA block with grouped-query attention; the leading `<|im_start|>` and the
 * diagnostic field names (`layer`, `norm`, `delta`) are assumptions.
 *
 * @returns {{inputTokens: number[], step: Function}}
 */
function forwardPass(prompt, modelName) {
  const model = getModel(modelName);
  const o = op();
  const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
  const inputTokens = model.tokenizer.encode(chatPrompt);
  const attnScale = 1 / Math.sqrt(C.headDim);

  // Returns a function that generates one token at a time
  return {
    inputTokens,
    /**
     * Runs the network on the LAST token of `allToks` at position length-1.
     * Keys/values are stored at `kvC[layer].k[pos]`, so positions must be
     * stepped in order; re-running a position overwrites its cache entry.
     *
     * @param {number[]} allToks tokens so far
     * @param {Array<{k: Float32Array[], v: Float32Array[]}>} kvC per-layer cache
     * @param {boolean} diag collect per-layer diagnostics
     * @returns {{logits: Float32Array, layerNorms: ?Array, attnEntropies: ?Array}}
     */
    step(allToks, kvC, diag) {
      const pos = allToks.length - 1;
      const tid = allToks[allToks.length - 1];
      let x = model.tokenEmbd.slice(tid * C.hiddenDim, (tid + 1) * C.hiddenDim);
      const layerNorms = diag ? [] : null;
      const attnEntropies = diag ? [] : null;

      for (let l = 0; l < C.numLayers; l++) {
        const layer = model.layers[l];
        const cache = kvC[l];

        // Attention
        const normed = o.rmsNorm(x, layer.attnNorm, C.rmsNormEps);
        const q = o.matVec(layer.wq, normed, C.hiddenDim, C.hiddenDim);
        const k = o.matVec(layer.wk, normed, kvDim, C.hiddenDim);
        const v = o.matVec(layer.wv, normed, kvDim, C.hiddenDim);
        for (let h = 0; h < C.numHeads; h++) {
          applyRoPE(q.subarray(h * C.headDim, (h + 1) * C.headDim), C.headDim, pos, C.ropeTheta);
        }
        for (let h = 0; h < C.numKvHeads; h++) {
          applyRoPE(k.subarray(h * C.headDim, (h + 1) * C.headDim), C.headDim, pos, C.ropeTheta);
        }
        cache.k[pos] = k;
        cache.v[pos] = v;
        const seqLen = pos + 1;

        const attnOut = new Float32Array(C.hiddenDim);
        const headEntropies = diag ? [] : null;
        for (let h = 0; h < C.numHeads; h++) {
          const qOff = h * C.headDim;
          const kvOff = Math.floor(h / gqaRatio) * C.headDim; // shared KV head
          const scores = new Float32Array(seqLen);
          for (let s = 0; s < seqLen; s++) {
            const ks = cache.k[s];
            let dot = 0;
            for (let d = 0; d < C.headDim; d++) dot += q[qOff + d] * ks[kvOff + d];
            scores[s] = dot * attnScale;
          }
          const w = o.softmax(scores);
          if (diag) {
            let he = 0;
            for (let s = 0; s < seqLen; s++) if (w[s] > 1e-10) he -= w[s] * Math.log(w[s]);
            headEntropies.push(Math.round(he * 1000) / 1000);
          }
          for (let s = 0; s < seqLen; s++) {
            const vs = cache.v[s];
            const ws = w[s];
            for (let d = 0; d < C.headDim; d++) attnOut[qOff + d] += ws * vs[kvOff + d];
          }
        }
        if (diag) attnEntropies.push(headEntropies);
        const afterAttn = o.add(x, o.matVec(layer.wo, attnOut, C.hiddenDim, C.hiddenDim));

        // Feed-forward (SwiGLU)
        const ffnIn = o.rmsNorm(afterAttn, layer.ffnNorm, C.rmsNormEps);
        const gate = o.matVec(layer.wGate, ffnIn, C.intermediateSize, C.hiddenDim);
        const up = o.matVec(layer.wUp, ffnIn, C.intermediateSize, C.hiddenDim);
        const down = o.matVec(layer.wDown, o.fusedSiluMul(gate, up), C.hiddenDim, C.intermediateSize);
        const next = o.add(afterAttn, down);

        if (diag) {
          let prevNorm = 0;
          let delta = 0;
          let norm = 0;
          for (let i = 0; i < C.hiddenDim; i++) {
            const d = next[i] - x[i];
            prevNorm += x[i] * x[i];
            delta += d * d;
            norm += next[i] * next[i];
          }
          layerNorms.push({
            layer: l,
            norm: round3(Math.sqrt(norm)),
            delta: prevNorm > 0 ? Math.round(Math.sqrt(delta / prevNorm) * 1000) / 1000 : 0,
          });
        }
        x = next;
      }

      const finalNormed = o.rmsNorm(x, model.outNorm, C.rmsNormEps);
      const logits = o.matVec(model.outWeight, finalNormed, C.vocabSize, C.hiddenDim);
      return { logits, layerNorms, attnEntropies };
    },
  };
}

// ─── Sampling Functions ─────────────────────────────────────────────────────

/** logits / max(temperature, 0.01); the clamp avoids division by zero. */
function scaleLogits(logits, temperature) {
  const tau = Math.max(temperature, 0.01);
  const scaled = new Float32Array(logits.length);
  for (let i = 0; i < logits.length; i++) scaled[i] = logits[i] / tau;
  return scaled;
}

/**
 * The k largest probabilities as {p, i}, in descending order; ties keep the
 * lower index first (same order a stable descending sort gives). Single pass
 * instead of sorting the whole vocabulary.
 */
function topK(probs, k) {
  const best = [];
  for (let i = 0; i < probs.length; i++) {
    const p = probs[i];
    let j = best.length;
    while (j > 0 && best[j - 1].p < p) j--;
    if (j < k) {
      best.splice(j, 0, { p, i });
      if (best.length > k) best.pop();
    }
  }
  return best;
}

/**
 * Nucleus sampling: keeps the most probable tokens until their cumulative mass
 * reaches `topP`, then draws from them in proportion to their probability.
 *
 * @returns {number} sampled index
 */
function sampleTopP(probs, topP) {
  const indexed = Array.from(probs)
    .map((p, i) => ({ p, i }))
    .sort((a, b) => b.p - a.p);
  const candidates = [];
  let cumP = 0;
  for (const entry of indexed) {
    cumP += entry.p;
    candidates.push(entry);
    if (cumP >= topP) break;
  }
  const total = candidates.reduce((s, c) => s + c.p, 0);
  const r = Math.random() * total;
  let acc = 0;
  for (const { p, i } of candidates) {
    acc += p;
    if (r < acc) return i;
  }
  return candidates[0].i;
}

/**
 * Temperature + top-p sampling over raw logits.
 * Temperatures below 0.01 are clamped (matching glossolaliaMerge) so that 0
 * behaves as near-greedy instead of producing NaN probabilities.
 */
function sampleStandard(logits, temperature = 0.7, topP = 0.9) {
  return sampleTopP(op().softmax(scaleLogits(logits, temperature)), topP);
}

/**
 * Builds one softmax "agent" per temperature and averages them, weighting each
 * agent by its entropy deficit (1 - H / log V, floored at 1e-8).
 *
 * @returns {{merged: Float32Array, agents: Array, totalW: number}}
 */
function glossolaliaMerge(rawLogits, temperatures = [0.4, 0.7, 1.0]) {
  const V = rawLogits.length;
  const logV = Math.log(V);
  const agents = [];
  for (const tau of temperatures) {
    const probs = softmaxJS(scaleLogits(rawLogits, tau));
    // Shannon entropy
    let h = 0;
    for (let i = 0; i < V; i++) {
      const p = probs[i];
      if (p > 1e-12) h -= p * Math.log(p);
    }
    // Deficit weight: low entropy = high confidence = high weight
    const w = Math.max(1.0 - h / logV, 1e-8); // the sliver
    // Top-5 for diagnostics
    const top5 = topK(probs, 5);
    agents.push({ probs, entropy: h, weight: w, tau, top5 });
  }
  // Merge: weighted average
  const totalW = agents.reduce((s, a) => s + a.weight, 0);
  const merged = new Float32Array(V);
  for (const a of agents) {
    const nw = a.weight / totalW;
    for (let i = 0; i < V; i++) merged[i] += nw * a.probs[i];
  }
  return { merged, agents, totalW };
}

/**
 * Top-p (0.95) sampling on the merged ensemble distribution.
 * `merged` is returned as well so callers do not have to recompute the ensemble.
 *
 * @returns {{tokenId: number, agents: Array, merged: Float32Array}}
 */
function sampleGlossolalia(logits) {
  const { merged, agents } = glossolaliaMerge(logits);
  return { tokenId: sampleTopP(merged, 0.95), agents, merged };
}

// ─── Generation Loops ───────────────────────────────────────────────────────

/**
 * Fills the KV cache for every prompt token except the last. The first decode
 * step processes the last prompt token itself, so including it here would run
 * that position twice.
 */
function prefillPrompt(fwd, promptTokens, kvC) {
  for (let i = 0; i < promptTokens.length - 1; i++) {
    fwd.step(promptTokens.slice(0, i + 1), kvC, false);
  }
}

/** -log2(p) of the chosen token; 99 when the probability is zero. */
const surprisalBits = (p) => (p > 0 ? -Math.log2(p) : 99);

/** Number of tokens with more than 0.1% probability. */
function countCoverage(probs) {
  let n = 0;
  for (let j = 0; j < probs.length; j++) if (probs[j] > 0.001) n++;
  return n;
}

const mean = (xs) => (xs.length > 0 ? xs.reduce((a, b) => a + b, 0) / xs.length : 0);

/**
 * Generates with temperature 0.7 / top-p 0.9 and per-token diagnostics.
 *
 * @returns {object} text, token count, timings and `tokenDiagnostics`
 */
function generateStandard(prompt, maxTokens = 8192, modelName = 'buleyean') {
  const t0 = performance.now();
  const model = getModel(modelName);
  const fwd = forwardPass(prompt, modelName);
  const allTokens = [...fwd.inputTokens];
  const kvC = newKvCache();
  const tokenTimes = [];
  const perTokenInfo = [];
  const o = op();

  prefillPrompt(fwd, allTokens, kvC);

  // Decode
  for (let i = 0; i < maxTokens; i++) {
    const ts = performance.now();
    const { logits, layerNorms, attnEntropies } = fwd.step(allTokens, kvC, true);
    // One softmax serves both the sampler and the diagnostics.
    const probs = o.softmax(scaleLogits(logits, 0.7));
    const chosen = sampleTopP(probs, 0.9);
    const chosenProb = probs[chosen];
    const perplexity = surprisalBits(chosenProb);
    const vocabCoverage = countCoverage(probs);
    const top5 = topK(probs, 5).map((t) => ({
      token: model.tokenizer.decode([t.i]),
      prob: Math.round(t.p * 1000) / 1000,
    }));
    tokenTimes.push(performance.now() - ts);
    perTokenInfo.push({
      perplexity: Math.round(perplexity * 100) / 100,
      chosenProb: Math.round(chosenProb * 1000) / 1000,
      vocabCoverage,
      top5,
      layerNorms,
      attnEntropies,
    });
    if (chosen === C.eosToken) break;
    allTokens.push(chosen);
  }

  const genTokens = allTokens.slice(fwd.inputTokens.length);
  const totalTime = performance.now() - t0;
  return {
    text: model.tokenizer.decode(genTokens),
    tokens: genTokens.length,
    totalTimeMs: Math.round(totalTime),
    avgTokenMs: Math.round(mean(tokenTimes)),
    mode: 'standard',
    temperature: 0.7,
    topP: 0.9,
    tokenDiagnostics: perTokenInfo,
  };
}

/**
 * Generates with the temperature-ensemble sampler and per-token diagnostics.
 *
 * @returns {object} text, token count, timings and `diagnostics`
 */
function generateGlossolalia(prompt, maxTokens = 8192, modelName = 'buleyean') {
  const t0 = performance.now();
  const model = getModel(modelName);
  const fwd = forwardPass(prompt, modelName);
  const allTokens = [...fwd.inputTokens];
  const kvC = newKvCache();
  const tokenTimes = [];
  const perTokenDiag = [];

  prefillPrompt(fwd, allTokens, kvC);

  // Decode with Glossolalia
  for (let i = 0; i < maxTokens; i++) {
    const ts = performance.now();
    const { logits, layerNorms, attnEntropies } = fwd.step(allTokens, kvC, true);
    const { tokenId, agents, merged } = sampleGlossolalia(logits);
    // Token-level surprisal from the merged distribution
    const chosenProb = merged[tokenId] || 0;
    const perplexity = surprisalBits(chosenProb);
    const vocabCoverage = countCoverage(merged);
    tokenTimes.push(performance.now() - ts);
    perTokenDiag.push({
      agents: agents.map((a) => ({
        tau: a.tau,
        entropy: Math.round(a.entropy * 1000) / 1000,
        weight: Math.round(a.weight * 1000) / 1000,
        top3: a.top5.slice(0, 3).map((t) => ({
          token: model.tokenizer.decode([t.i]),
          prob: Math.round(t.p * 1000) / 1000,
        })),
      })),
      perplexity: Math.round(perplexity * 100) / 100,
      chosenProb: Math.round(chosenProb * 1000) / 1000,
      vocabCoverage,
      layerNorms,
      attnEntropies,
    });
    if (tokenId === C.eosToken) break;
    allTokens.push(tokenId);
  }

  const genTokens = allTokens.slice(fwd.inputTokens.length);
  const totalTime = performance.now() - t0;
  return {
    text: model.tokenizer.decode(genTokens),
    tokens: genTokens.length,
    totalTimeMs: Math.round(totalTime),
    avgTokenMs: Math.round(mean(tokenTimes)),
    mode: 'glossolalia',
    temperatures: [0.4, 0.7, 1.0],
    diagnostics: perTokenDiag,
  };
}

// ─── HTTP Server ────────────────────────────────────────────────────────────
const MAX_BODY_BYTES = 1 << 20;

function sendJson(res, status, payload) {
  res.writeHead(status, { 'Content-Type': 'application/json' });
  res.end(JSON.stringify(payload));
}

const server = createServer((req, res) => {
  const handle = (genFn) => {
    // Chunks are kept as Buffers and decoded once: decoding chunk-by-chunk
    // corrupts multi-byte UTF-8 characters split across chunk boundaries.
    const chunks = [];
    let received = 0;
    let rejected = false;
    req.on('data', (c) => {
      if (rejected) return;
      received += c.length;
      if (received > MAX_BODY_BYTES) {
        rejected = true;
        sendJson(res, 413, { error: 'Request body too large' });
        return;
      }
      chunks.push(c);
    });
    req.on('end', () => {
      if (rejected) return;
      let payload;
      try {
        payload = JSON.parse(Buffer.concat(chunks).toString('utf8'));
      } catch (e) {
        sendJson(res, 400, { error: `Invalid JSON body: ${e.message}` });
        return;
      }
      if (payload === null || typeof payload !== 'object' || typeof payload.prompt !== 'string') {
        sendJson(res, 400, { error: 'Body must be a JSON object with a string "prompt"' });
        return;
      }
      const { prompt, max_tokens, model } = payload;
      const requested = Math.floor(Number(max_tokens));
      const maxTokens = Number.isFinite(requested) && requested > 0 ? requested : 256;
      try {
        const result = genFn(prompt, maxTokens, typeof model === 'string' && model ? model : 'buleyean');
        sendJson(res, 200, result);
      } catch (e) {
        console.error('[Aether]', e);
        sendJson(res, 500, { error: e.message });
      }
    });
  };

  if (req.method === 'POST' && req.url === '/generate-standard') handle(generateStandard);
  else if (req.method === 'POST' && req.url === '/generate-glossolalia') handle(generateGlossolalia);
  else if (req.url === '/health') {
    sendJson(res, 200, {
      status: 'ok',
      models: Object.keys(models),
      simd: !!simd,
      loadTimes: Object.fromEntries(Object.entries(models).map(([k, v]) => [k, v.loadTime])),
    });
  } else {
    res.writeHead(404);
    res.end();
  }
});

// ─── Main ───────────────────────────────────────────────────────────────────
const basePath = '/tmp/hf_cache/smollm2-360m-q8_0.gguf';
const bulePath = '/tmp/hf_cache/buleyean-smollm2-360m-q8_0.gguf';
const tokPath = '/tmp/hf_cache/tokenizer.json';

/**
 * Loads the SIMD kernels, downloads any missing model/tokenizer files with
 * `huggingface_hub`, loads both models and starts listening on 127.0.0.1.
 */
async function main() {
  simd = await loadSIMD();

  // Download base model
  if (!existsSync(basePath)) {
    console.log('[Aether] Downloading base SmolLM2-360M Q8_0...');
    execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('bartowski/SmolLM2-360M-Instruct-GGUF', 'SmolLM2-360M-Instruct-Q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache'); import shutil; shutil.move('/tmp/hf_cache/SmolLM2-360M-Instruct-Q8_0.gguf', '${basePath}')"`, { stdio: 'inherit' });
  }

  // Download Buleyean model
  if (!existsSync(bulePath)) {
    console.log('[Aether] Downloading Buleyean SmolLM2-360M Q8_0...');
    execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
  }

  // Download tokenizer
  if (!existsSync(tokPath)) {
    console.log('[Aether] Downloading tokenizer...');
    execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
  }

  // Load both models
  loadModel('base', basePath, tokPath);
  loadModel('buleyean', bulePath, tokPath);

  server.listen(PORT, '127.0.0.1', () =>
    console.log(`[Aether] http://127.0.0.1:${PORT} (SIMD: ${!!simd}, models: ${Object.keys(models).join(', ')})`));
}

main().catch((e) => {
  console.error('[Aether] Fatal:', e);
  process.exit(1);
});