Spaces:
Running
Running
Taylor committed on
Commit Β·
fcac5c7
1
Parent(s): 64ff7cb
feat: Aether v2 with RoPE fix -- PyTorch vs Aether side by side
Browse files. Root cause found and fixed: RoPE was using NeoX style (half-split
pairing) but SmolLM2/LLaMA needs adjacent pairing (x[i], x[i+1]).
This corrupted every attention head in every layer.
Also includes:
- WASM SIMD for matVec, rmsNorm, softmax, fusedSiluMul
- JS fallback for LM head (>100MB) and attention
- Type-aware GGUF dequant (Q8_0 with type field, not byte guessing)
- Parallel execution (whichever engine finishes first shows first)
- Streaming results via Gradio generator
- Dockerfile +6 -3
- aether-server.mjs +231 -460
- app.py +106 -81
Dockerfile
CHANGED
|
@@ -1,14 +1,17 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
COPY requirements.txt .
|
| 6 |
RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
|
| 7 |
|
| 8 |
-
COPY app.py ./
|
| 9 |
|
| 10 |
RUN mkdir -p /tmp/hf_cache
|
| 11 |
-
|
| 12 |
EXPOSE 7860
|
| 13 |
-
|
| 14 |
CMD ["python", "app.py"]
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
+
RUN apt-get update && apt-get install -y curl && \
|
| 4 |
+
curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
|
| 5 |
+
apt-get install -y nodejs && \
|
| 6 |
+
rm -rf /var/lib/apt/lists/*
|
| 7 |
+
|
| 8 |
WORKDIR /app
|
| 9 |
|
| 10 |
COPY requirements.txt .
|
| 11 |
RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
|
| 12 |
|
| 13 |
+
COPY app.py aether-server.mjs simd-kernels.wasm ./
|
| 14 |
|
| 15 |
RUN mkdir -p /tmp/hf_cache
|
|
|
|
| 16 |
EXPOSE 7860
|
|
|
|
| 17 |
CMD ["python", "app.py"]
|
aether-server.mjs
CHANGED
|
@@ -3,8 +3,6 @@
|
|
| 3 |
*
|
| 4 |
* SmolLM2-360M inference using WASM SIMD kernels.
|
| 5 |
* Zero external ML dependencies. Pure JS + 14KB WASM binary.
|
| 6 |
-
*
|
| 7 |
-
* GGUF parse β WASM SIMD matVec β RoPE β fusedSiluMul β sampling
|
| 8 |
*/
|
| 9 |
|
| 10 |
import { createServer } from 'http';
|
|
@@ -16,559 +14,338 @@ import { dirname, join } from 'path';
|
|
| 16 |
const __dirname = dirname(fileURLToPath(import.meta.url));
|
| 17 |
const PORT = parseInt(process.env.AETHER_PORT || '7861');
|
| 18 |
|
| 19 |
-
// βββ
|
| 20 |
-
const
|
| 21 |
-
hiddenDim: 960,
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
numKvHeads: 5,
|
| 25 |
-
headDim: 64,
|
| 26 |
-
intermediateSize: 2560,
|
| 27 |
-
vocabSize: 49152,
|
| 28 |
-
maxSeqLength: 2048,
|
| 29 |
-
ropeTheta: 100000.0,
|
| 30 |
-
rmsNormEps: 1e-5,
|
| 31 |
-
eosToken: 2,
|
| 32 |
-
bosToken: 1,
|
| 33 |
};
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
// βββ WASM SIMD
|
| 36 |
let simd = null;
|
| 37 |
|
| 38 |
async function loadSIMD() {
|
| 39 |
-
const
|
| 40 |
-
if (!existsSync(
|
| 41 |
-
console.log('[Aether] WASM SIMD binary not found, using JS fallbacks');
|
| 42 |
-
return null;
|
| 43 |
-
}
|
| 44 |
-
|
| 45 |
try {
|
| 46 |
-
const
|
| 47 |
-
const { instance } = await WebAssembly.instantiate(wasmBytes, {
|
| 48 |
env: { expf: Math.exp, tanhf: Math.tanh, powf: Math.pow },
|
| 49 |
});
|
| 50 |
-
const
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
const
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
|
|
|
|
| 62 |
return {
|
| 63 |
-
matVec(
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
const
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
const
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
const xPtr = wasm.allocate(x.byteLength);
|
| 98 |
-
const rPtr = wasm.allocate(x.byteLength);
|
| 99 |
-
copyTo(xPtr, x);
|
| 100 |
-
wasm.softmaxSimd(xPtr, rPtr, x.length);
|
| 101 |
-
const result = copyFrom(rPtr, x.length);
|
| 102 |
-
wasm.resetHeap(saved);
|
| 103 |
-
return result;
|
| 104 |
-
},
|
| 105 |
-
fusedSiluMul(gate, up) {
|
| 106 |
-
const saved = wasm.getHeapPtr();
|
| 107 |
-
const gPtr = wasm.allocate(gate.byteLength);
|
| 108 |
-
const uPtr = wasm.allocate(up.byteLength);
|
| 109 |
-
const rPtr = wasm.allocate(gate.byteLength);
|
| 110 |
-
copyTo(gPtr, gate); copyTo(uPtr, up);
|
| 111 |
-
wasm.fusedSiluMul(gPtr, uPtr, rPtr, gate.length);
|
| 112 |
-
const result = copyFrom(rPtr, gate.length);
|
| 113 |
-
wasm.resetHeap(saved);
|
| 114 |
-
return result;
|
| 115 |
-
},
|
| 116 |
-
add(a, b) {
|
| 117 |
-
const saved = wasm.getHeapPtr();
|
| 118 |
-
const aPtr = wasm.allocate(a.byteLength);
|
| 119 |
-
const bPtr = wasm.allocate(b.byteLength);
|
| 120 |
-
const rPtr = wasm.allocate(a.byteLength);
|
| 121 |
-
copyTo(aPtr, a); copyTo(bPtr, b);
|
| 122 |
-
wasm.addSimd(aPtr, bPtr, rPtr, a.length);
|
| 123 |
-
const result = copyFrom(rPtr, a.length);
|
| 124 |
-
wasm.resetHeap(saved);
|
| 125 |
-
return result;
|
| 126 |
-
},
|
| 127 |
-
// flashAttentionMultiHead: not in standalone WASM -- use JS attention
|
| 128 |
-
flashAttentionMultiHead: null,
|
| 129 |
};
|
| 130 |
-
} catch (e) {
|
| 131 |
-
console.warn(`[Aether] WASM SIMD failed: ${e.message}, using JS fallbacks`);
|
| 132 |
-
return null;
|
| 133 |
-
}
|
| 134 |
}
|
| 135 |
|
| 136 |
-
// βββ JS Fallbacks
|
| 137 |
-
function matVecJS(
|
| 138 |
-
const
|
| 139 |
-
for (let r = 0; r < rows; r++) {
|
| 140 |
-
|
| 141 |
-
for (let c = 0; c < cols; c++) sum += matrix[off + c] * vector[c];
|
| 142 |
-
out[r] = sum;
|
| 143 |
-
}
|
| 144 |
-
return out;
|
| 145 |
}
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
for (let i = 0; i < x.length; i++)
|
| 150 |
-
ss = 1.0 / Math.sqrt(ss / x.length + eps);
|
| 151 |
-
const out = new Float32Array(x.length);
|
| 152 |
-
for (let i = 0; i < x.length; i++) out[i] = x[i] * ss * weight[i];
|
| 153 |
-
return out;
|
| 154 |
}
|
| 155 |
-
|
| 156 |
function softmaxJS(x) {
|
| 157 |
-
let
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
let
|
| 161 |
-
for (let i = 0; i < x.length; i++) { out[i] = Math.exp(x[i] - max); sum += out[i]; }
|
| 162 |
-
for (let i = 0; i < x.length; i++) out[i] /= sum;
|
| 163 |
-
return out;
|
| 164 |
}
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
for (let i = 0; i < gate.length; i++) {
|
| 169 |
-
const g = gate[i];
|
| 170 |
-
out[i] = (g / (1 + Math.exp(-g))) * up[i];
|
| 171 |
-
}
|
| 172 |
-
return out;
|
| 173 |
}
|
| 174 |
-
|
| 175 |
function addJS(a, b) {
|
| 176 |
-
const
|
| 177 |
-
for (let i = 0; i < a.length; i++) out[i] = a[i] + b[i];
|
| 178 |
-
return out;
|
| 179 |
}
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
softmax: simd?.softmax || softmaxJS,
|
| 187 |
-
fusedSiluMul: simd?.fusedSiluMul || fusedSiluMulJS,
|
| 188 |
-
add: simd?.add || addJS,
|
| 189 |
-
flashAttentionMultiHead: simd?.flashAttentionMultiHead || null,
|
| 190 |
-
};
|
| 191 |
-
}
|
| 192 |
-
|
| 193 |
-
// βββ Q4_K Dequantization ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 194 |
-
const QK_K = 256;
|
| 195 |
-
const Q4K_BLOCK_BYTES = 144;
|
| 196 |
|
|
|
|
| 197 |
function fp16(lo, hi) {
|
| 198 |
-
const h = lo
|
| 199 |
-
|
| 200 |
-
if (e
|
| 201 |
-
|
| 202 |
-
return (s ? -1 : 1) * Math.pow(2, e - 15) * (1 + f / 1024);
|
| 203 |
-
}
|
| 204 |
-
|
| 205 |
-
function getScaleMinK4(gi, scales) {
|
| 206 |
-
if (gi < 4) return [scales[gi] & 63, scales[gi + 4] & 63];
|
| 207 |
-
return [(scales[gi + 4] & 0xf) | ((scales[gi - 4] >> 6) << 4),
|
| 208 |
-
(scales[gi + 4] >> 4) | ((scales[gi] >> 6) << 4)];
|
| 209 |
}
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
if (outOff + QK_K > numElements) break;
|
| 217 |
-
const bs = b * Q4K_BLOCK_BYTES;
|
| 218 |
-
const d = fp16(data[bs], data[bs + 1]);
|
| 219 |
-
const dmin = fp16(data[bs + 2], data[bs + 3]);
|
| 220 |
-
const scales = data.subarray(bs + 4, bs + 16);
|
| 221 |
-
const qs = data.subarray(bs + 16, bs + Q4K_BLOCK_BYTES);
|
| 222 |
-
let si = 0, qi = 0;
|
| 223 |
-
for (let j = 0; j < QK_K; j += 64) {
|
| 224 |
-
const [sc1, m1] = getScaleMinK4(si, scales);
|
| 225 |
-
const [sc2, m2] = getScaleMinK4(si + 1, scales);
|
| 226 |
-
const d1 = d * sc1, d2 = d * sc2, dm1 = dmin * m1, dm2 = dmin * m2;
|
| 227 |
-
for (let lane = 0; lane < 32; lane++) {
|
| 228 |
-
const qb = qs[qi + lane];
|
| 229 |
-
out[outOff + j + lane] = d1 * (qb & 0x0f) - dm1;
|
| 230 |
-
out[outOff + j + 32 + lane] = d2 * (qb >> 4) - dm2;
|
| 231 |
-
}
|
| 232 |
-
qi += 32; si += 2;
|
| 233 |
-
}
|
| 234 |
-
}
|
| 235 |
-
return out;
|
| 236 |
}
|
|
|
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
for (let b = 0; b < nb; b++) {
|
| 244 |
-
const off = b * Q8_BYTES;
|
| 245 |
-
const scale = fp16(data[off], data[off + 1]);
|
| 246 |
-
const n = Math.min(Q8_BLOCK, numElements - b * Q8_BLOCK);
|
| 247 |
-
for (let i = 0; i < n; i++) {
|
| 248 |
-
const v = data[off + 2 + i]; out[b * Q8_BLOCK + i] = (v > 127 ? v - 256 : v) * scale;
|
| 249 |
-
}
|
| 250 |
-
}
|
| 251 |
-
return out;
|
| 252 |
-
}
|
| 253 |
-
|
| 254 |
-
function dequantAuto(data, numElements) {
|
| 255 |
-
const f32 = numElements * 4, q8 = Math.ceil(numElements / Q8_BLOCK) * Q8_BYTES;
|
| 256 |
-
const q4k = Math.ceil(numElements / QK_K) * Q4K_BLOCK_BYTES;
|
| 257 |
-
if (Math.abs(data.length - f32) < f32 * 0.05) return new Float32Array(data.buffer, data.byteOffset, numElements);
|
| 258 |
-
if (Math.abs(data.length - q4k) < q4k * 0.05) return dequantQ4K(data, numElements);
|
| 259 |
-
if (Math.abs(data.length - q8) < q8 * 0.05) return dequantQ8(data, numElements);
|
| 260 |
-
return dequantQ8(data, numElements);
|
| 261 |
}
|
| 262 |
|
| 263 |
// βββ GGUF Parser ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 264 |
-
const
|
| 265 |
-
const
|
| 266 |
-
const
|
| 267 |
-
const
|
| 268 |
-
const
|
| 269 |
-
|
| 270 |
-
function
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
}
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
case
|
| 278 |
-
case
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
case VT.STRING:{const r=rStr(buf,off);return{v:r.v,o:r.o};}
|
| 282 |
-
case VT.UINT64:return{v:buf.readBigUInt64LE(off),o:off+8};case VT.INT64:return{v:buf.readBigInt64LE(off),o:off+8};
|
| 283 |
-
case VT.FLOAT64:return{v:buf.readDoubleLE(off),o:off+8};
|
| 284 |
-
case VT.ARRAY:{const at=buf.readUInt32LE(off);const al=Number(buf.readBigUInt64LE(off+4));let co=off+12;const arr=[];
|
| 285 |
-
for(let i=0;i<al;i++){const r=rVal(buf,co,at);arr.push(r.v);co=r.o;}return{v:arr,o:co};}
|
| 286 |
-
default:throw new Error(`Unknown GGUF type: ${t}`);
|
| 287 |
}}
|
| 288 |
function parseGGUF(buf){
|
| 289 |
-
let
|
| 290 |
-
|
| 291 |
-
let align=32;for(let i=0;i<kc;i++){const{v:
|
| 292 |
-
const{v,o:o2}=
|
| 293 |
-
const tensors=[];for(let i=0;i<tc;i++){const{v:name,o:o1}=
|
| 294 |
-
const dims=[];for(let d=0;d<nd;d++){dims.push(buf.readBigUInt64LE(
|
| 295 |
-
const offset=buf.readBigUInt64LE(
|
| 296 |
-
tensors.push({name,dims,type,offset,size:
|
| 297 |
-
return{tensors,dataOffset:Math.ceil(
|
| 298 |
}
|
| 299 |
|
| 300 |
// βββ BPE Tokenizer ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 301 |
-
class
|
| 302 |
-
constructor(
|
| 303 |
-
const
|
| 304 |
-
this.
|
| 305 |
-
this.
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
for (const w of words) {
|
| 325 |
-
let syms = [];
|
| 326 |
-
for (const ch of w) {
|
| 327 |
-
if (this.vocab[ch] !== undefined) syms.push(ch);
|
| 328 |
-
else for (const b of Buffer.from(ch, 'utf8')) syms.push(`<0x${b.toString(16).toUpperCase().padStart(2,'0')}>`);
|
| 329 |
-
}
|
| 330 |
-
while (syms.length > 1) {
|
| 331 |
-
let best = Infinity, bi = -1;
|
| 332 |
-
for (let i = 0; i < syms.length - 1; i++) {
|
| 333 |
-
const r = this.mergeRanks[`${syms[i]} ${syms[i+1]}`];
|
| 334 |
-
if (r !== undefined && r < best) { best = r; bi = i; }
|
| 335 |
-
}
|
| 336 |
-
if (bi === -1) break;
|
| 337 |
-
syms.splice(bi, 2, syms[bi] + syms[bi + 1]);
|
| 338 |
-
}
|
| 339 |
-
for (const s of syms) { const id = this.vocab[s] ?? this.added[s]; if (id !== undefined) tokens.push(id); }
|
| 340 |
-
}
|
| 341 |
-
}
|
| 342 |
-
return tokens;
|
| 343 |
-
}
|
| 344 |
-
decode(tokens) {
|
| 345 |
-
const pieces = [];
|
| 346 |
-
for (const t of tokens) {
|
| 347 |
-
const p = this.rev[t];
|
| 348 |
-
if (p && p.startsWith('<0x') && p.endsWith('>')) pieces.push(String.fromCharCode(parseInt(p.slice(3,-1),16)));
|
| 349 |
-
else if (p && !p.startsWith('<|')) pieces.push(p);
|
| 350 |
-
}
|
| 351 |
-
return pieces.join('').replace(/Δ /g, ' ').replace(/Δ/g, '\n');
|
| 352 |
-
}
|
| 353 |
}
|
| 354 |
|
| 355 |
-
// βββ RoPE βββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 356 |
function applyRoPE(x, headDim, position, theta) {
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
const freq = 1.0 / Math.pow(theta, (2 *
|
| 360 |
const angle = position * freq;
|
| 361 |
const cos = Math.cos(angle), sin = Math.sin(angle);
|
| 362 |
-
const x0 = x[i], x1 = x[i +
|
| 363 |
-
x[i]
|
| 364 |
-
x[i +
|
| 365 |
}
|
| 366 |
}
|
| 367 |
|
| 368 |
// βββ Model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 369 |
let model = null;
|
| 370 |
|
| 371 |
-
function loadModel(ggufPath,
|
| 372 |
-
console.log('[Aether] Loading GGUF...', ggufPath);
|
| 373 |
const t0 = Date.now();
|
| 374 |
const buf = readFileSync(ggufPath);
|
| 375 |
const parsed = parseGGUF(buf);
|
| 376 |
-
console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now()
|
| 377 |
|
| 378 |
-
const
|
| 379 |
-
const
|
| 380 |
-
|
| 381 |
-
const byName = {};
|
| 382 |
-
for (const t of parsed.tensors) byName[t.name] = t;
|
| 383 |
|
| 384 |
function get(name) {
|
| 385 |
-
const t = byName[name];
|
| 386 |
-
if (!t) { console.warn(`[Aether] Missing: ${name}`); return null; }
|
| 387 |
const raw = new Uint8Array(buf.buffer, buf.byteOffset + parsed.dataOffset + Number(t.offset), t.size);
|
| 388 |
-
|
| 389 |
-
// Type 0=F32, 1=F16, 8=Q8_0, 12=Q4_K, 14=Q6_K
|
| 390 |
-
if (t.type === 0) return new Float32Array(raw.buffer, raw.byteOffset, t.numElements);
|
| 391 |
-
if (t.type === 8) return dequantQ8(raw, t.numElements);
|
| 392 |
-
if (t.type === 12) return dequantQ4K(raw, t.numElements);
|
| 393 |
-
if (t.type === 1) {
|
| 394 |
-
// F16 -> F32
|
| 395 |
-
const out = new Float32Array(t.numElements);
|
| 396 |
-
for (let i = 0; i < t.numElements; i++) out[i] = fp16(raw[i*2], raw[i*2+1]);
|
| 397 |
-
return out;
|
| 398 |
-
}
|
| 399 |
-
console.warn(`[Aether] Unknown type ${t.type} for ${name}, trying Q8_0`);
|
| 400 |
-
return dequantQ8(raw, t.numElements);
|
| 401 |
}
|
| 402 |
|
| 403 |
-
console.log('[Aether] Dequantizing
|
| 404 |
const tokenEmbd = get('token_embd.weight');
|
| 405 |
-
|
| 406 |
-
console.log('[Aether] Dequantizing layers...');
|
| 407 |
const layers = [];
|
| 408 |
-
for (let i = 0; i <
|
| 409 |
-
if (i % 8 === 0) console.log(`[Aether] Layer ${i}/${
|
| 410 |
layers.push({
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
oProj: get(`blk.${i}.attn_output.weight`),
|
| 417 |
-
gateProj: get(`blk.${i}.ffn_gate.weight`),
|
| 418 |
-
upProj: get(`blk.${i}.ffn_up.weight`),
|
| 419 |
-
downProj: get(`blk.${i}.ffn_down.weight`),
|
| 420 |
});
|
| 421 |
}
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
if (!outputWeight) { console.log('[Aether] Tied embeddings'); outputWeight = tokenEmbd; }
|
| 426 |
-
|
| 427 |
-
const loadTime = Date.now() - t0;
|
| 428 |
-
console.log(`[Aether] Model loaded in ${(loadTime/1000).toFixed(1)}s (WASM SIMD: ${simd ? 'YES' : 'NO'})`);
|
| 429 |
-
model = { tokenEmbd, layers, outputNorm, outputWeight, tokenizer, loadTime };
|
| 430 |
}
|
| 431 |
|
| 432 |
// βββ Inference ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 433 |
function generate(prompt, maxTokens = 50) {
|
| 434 |
-
if (!model) throw new Error('Model not loaded');
|
| 435 |
-
|
| 436 |
const t0 = performance.now();
|
| 437 |
-
const
|
| 438 |
-
const kvDim = numKvHeads * headDim;
|
| 439 |
-
const o = ops();
|
| 440 |
|
| 441 |
const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
|
| 442 |
const inputTokens = model.tokenizer.encode(chatPrompt);
|
| 443 |
const allTokens = [...inputTokens];
|
| 444 |
|
| 445 |
-
|
| 446 |
-
const kvCache = Array.from({ length: CONFIG.numLayers }, () => ({
|
| 447 |
-
keys: [], // array of Float32Array[kvDim] per position
|
| 448 |
-
values: [], // array of Float32Array[kvDim] per position
|
| 449 |
-
}));
|
| 450 |
-
|
| 451 |
const tokenTimes = [];
|
| 452 |
|
| 453 |
for (let step = 0; step < inputTokens.length + maxTokens - 1; step++) {
|
| 454 |
-
const
|
| 455 |
-
const pos = step;
|
| 456 |
-
const tokenId = allTokens[step];
|
| 457 |
|
| 458 |
// Embed
|
| 459 |
-
const
|
| 460 |
-
|
| 461 |
-
for (let i = 0; i < hiddenDim; i++) hidden[i] = model.tokenEmbd[embOff + i];
|
| 462 |
-
|
| 463 |
-
let x = hidden;
|
| 464 |
|
| 465 |
-
for (let l = 0; l <
|
| 466 |
const ly = model.layers[l];
|
| 467 |
|
| 468 |
-
//
|
| 469 |
-
const normed = o.rmsNorm(x, ly.
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
const
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
for (let h = 0; h <
|
| 478 |
-
applyRoPE(
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
kvCache[l].
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
if (o.flashAttentionMultiHead && seqLen > 1) {
|
| 491 |
-
// Use WASM flash attention with GQA
|
| 492 |
-
const flatKeys = new Float32Array(seqLen * kvDim);
|
| 493 |
-
const flatVals = new Float32Array(seqLen * kvDim);
|
| 494 |
for (let s = 0; s < seqLen; s++) {
|
| 495 |
-
|
| 496 |
-
|
|
|
|
| 497 |
}
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
for (let h = 0; h < numHeads; h++) {
|
| 504 |
-
const kvH = Math.floor(h / gqaRatio);
|
| 505 |
-
const qH = q.subarray(h * headDim, (h + 1) * headDim);
|
| 506 |
-
const scores = new Float32Array(seqLen);
|
| 507 |
-
for (let s = 0; s < seqLen; s++) {
|
| 508 |
-
const kH = kvCache[l].keys[s].subarray(kvH * headDim, (kvH + 1) * headDim);
|
| 509 |
-
let dot = 0;
|
| 510 |
-
for (let d = 0; d < headDim; d++) dot += qH[d] * kH[d];
|
| 511 |
-
scores[s] = dot / Math.sqrt(headDim);
|
| 512 |
-
}
|
| 513 |
-
const w = softmaxJS(scores);
|
| 514 |
-
for (let s = 0; s < seqLen; s++) {
|
| 515 |
-
const vH = kvCache[l].values[s].subarray(kvH * headDim, (kvH + 1) * headDim);
|
| 516 |
-
for (let d = 0; d < headDim; d++) attnOut[h * headDim + d] += w[s] * vH[d];
|
| 517 |
-
}
|
| 518 |
}
|
| 519 |
}
|
| 520 |
|
| 521 |
-
|
| 522 |
-
const projected = o.matVec(ly.oProj, attnOut, hiddenDim, hiddenDim);
|
| 523 |
const postAttn = o.add(x, projected);
|
| 524 |
|
| 525 |
-
//
|
| 526 |
-
const ffnIn = o.rmsNorm(postAttn, ly.
|
| 527 |
-
const gate = o.matVec(ly.
|
| 528 |
-
const up = o.matVec(ly.
|
| 529 |
const activated = o.fusedSiluMul(gate, up);
|
| 530 |
-
const down = o.matVec(ly.
|
| 531 |
x = o.add(postAttn, down);
|
| 532 |
}
|
| 533 |
|
| 534 |
-
// Sample only after prefill
|
| 535 |
if (step >= inputTokens.length - 1) {
|
| 536 |
-
const finalNormed = o.rmsNorm(x, model.
|
| 537 |
-
const logits = o.matVec(model.
|
| 538 |
|
| 539 |
-
// Temperature sampling
|
| 540 |
for (let i = 0; i < logits.length; i++) logits[i] /= 0.7;
|
| 541 |
const probs = o.softmax(logits);
|
| 542 |
|
| 543 |
-
// Top-p nucleus sampling
|
| 544 |
const indexed = Array.from(probs).map((p, i) => ({ p, i })).sort((a, b) => b.p - a.p);
|
| 545 |
let cumP = 0, chosen = indexed[0].i;
|
| 546 |
const r = Math.random();
|
| 547 |
-
for (const { p, i } of indexed) {
|
| 548 |
-
cumP += p;
|
| 549 |
-
if (r < cumP) { chosen = i; break; }
|
| 550 |
-
if (cumP > 0.9) break;
|
| 551 |
-
}
|
| 552 |
|
| 553 |
-
tokenTimes.push(performance.now() -
|
| 554 |
-
if (chosen ===
|
| 555 |
allTokens.push(chosen);
|
| 556 |
}
|
| 557 |
}
|
| 558 |
|
| 559 |
const totalTime = performance.now() - t0;
|
| 560 |
const genTokens = allTokens.slice(inputTokens.length);
|
| 561 |
-
const text = model.tokenizer.decode(genTokens);
|
| 562 |
const avgMs = tokenTimes.length > 0 ? tokenTimes.reduce((a, b) => a + b, 0) / tokenTimes.length : 0;
|
| 563 |
|
| 564 |
return {
|
| 565 |
-
text,
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
avgTokenMs: Math.round(avgMs),
|
| 569 |
-
prefillTokens: inputTokens.length,
|
| 570 |
-
engine: `Aether ${simd ? 'WASM-SIMD' : 'JS-fallback'}`,
|
| 571 |
-
simd: !!simd,
|
| 572 |
};
|
| 573 |
}
|
| 574 |
|
|
@@ -584,6 +361,7 @@ const server = createServer((req, res) => {
|
|
| 584 |
res.writeHead(200, { 'Content-Type': 'application/json' });
|
| 585 |
res.end(JSON.stringify(result));
|
| 586 |
} catch (e) {
|
|
|
|
| 587 |
res.writeHead(500, { 'Content-Type': 'application/json' });
|
| 588 |
res.end(JSON.stringify({ error: e.message, stack: e.stack }));
|
| 589 |
}
|
|
@@ -595,28 +373,21 @@ const server = createServer((req, res) => {
|
|
| 595 |
});
|
| 596 |
|
| 597 |
// βββ Main βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 598 |
-
const ggufPath =
|
| 599 |
-
const
|
| 600 |
|
| 601 |
async function main() {
|
| 602 |
-
// Load WASM SIMD first
|
| 603 |
simd = await loadSIMD();
|
| 604 |
-
|
| 605 |
-
// Download model files
|
| 606 |
if (!existsSync(ggufPath)) {
|
| 607 |
console.log('[Aether] Downloading Q8_0 GGUF...');
|
| 608 |
execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
|
| 609 |
}
|
| 610 |
-
if (!existsSync(
|
| 611 |
console.log('[Aether] Downloading tokenizer...');
|
| 612 |
execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
|
| 613 |
}
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
server.listen(PORT, '127.0.0.1', () => {
|
| 618 |
-
console.log(`[Aether] Server on http://127.0.0.1:${PORT} (SIMD: ${simd ? 'YES' : 'NO'})`);
|
| 619 |
-
});
|
| 620 |
}
|
| 621 |
|
| 622 |
main().catch(e => { console.error('[Aether] Fatal:', e); process.exit(1); });
|
|
|
|
| 3 |
*
|
| 4 |
* SmolLM2-360M inference using WASM SIMD kernels.
|
| 5 |
* Zero external ML dependencies. Pure JS + 14KB WASM binary.
|
|
|
|
|
|
|
| 6 |
*/
|
| 7 |
|
| 8 |
import { createServer } from 'http';
|
|
|
|
| 14 |
const __dirname = dirname(fileURLToPath(import.meta.url));
|
| 15 |
const PORT = parseInt(process.env.AETHER_PORT || '7861');
|
| 16 |
|
| 17 |
+
// βββ SmolLM2-360M Config ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 18 |
+
const C = {
|
| 19 |
+
hiddenDim: 960, numLayers: 32, numHeads: 15, numKvHeads: 5,
|
| 20 |
+
headDim: 64, intermediateSize: 2560, vocabSize: 49152,
|
| 21 |
+
ropeTheta: 100000.0, rmsNormEps: 1e-5, eosToken: 2,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
};
|
| 23 |
+
const kvDim = C.numKvHeads * C.headDim; // 320
|
| 24 |
+
const gqaRatio = C.numHeads / C.numKvHeads; // 3
|
| 25 |
|
| 26 |
+
// βββ WASM SIMD ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
let simd = null;
|
| 28 |
|
| 29 |
// Load the optional WASM SIMD kernel binary and wrap its exports in
// JS-friendly helpers. Returns the kernel object, or null when the binary
// is missing or instantiation fails (callers then fall back to the pure-JS
// implementations).
async function loadSIMD() {
  const p = join(__dirname, 'simd-kernels.wasm');
  if (!existsSync(p)) return null;
  try {
    const { instance } = await WebAssembly.instantiate(readFileSync(p), {
      // Math shims the WASM module imports.
      env: { expf: Math.exp, tanhf: Math.tanh, powf: Math.pow },
    });
    const w = instance.exports;
    w.resetHeap(65536); // start the bump allocator above the low 64 KiB
    const mem = w.memory;
    // NOTE(review): hf() rebuilds the Float32Array view on every call —
    // presumably so it stays valid if WASM memory grows; confirm allocate()
    // can actually grow memory.
    const hf = () => new Float32Array(mem.buffer);
    const cp = (ptr, f) => hf().set(f, ptr >> 2);                 // JS -> WASM heap
    const rd = (ptr, n) => hf().slice(ptr >> 2, (ptr >> 2) + n);  // WASM heap -> JS copy

    // Save/restore the bump-allocator pointer around each kernel call so
    // temporary buffers are released even if the kernel throws.
    const wrap = (fn) => (...args) => {
      const s = w.getHeapPtr();
      try { return fn(s, ...args); }
      finally { w.resetHeap(s); }
    };

    console.log('[Aether] WASM SIMD loaded');
    return {
      matVec: wrap((s, mat, vec, rows, cols) => {
        // Huge matrices (e.g. the LM head) skip WASM: copying >100 MB into
        // WASM memory per call costs more than the JS loop.
        if (mat.byteLength > 100_000_000) return matVecJS(mat, vec, rows, cols);
        const mP = w.allocate(mat.byteLength); const vP = w.allocate(vec.byteLength);
        const rP = w.allocate(rows * 4);
        cp(mP, mat); cp(vP, vec);
        w.matVecSimdBatch4(mP, vP, rP, rows, cols);
        return rd(rP, rows);
      }),
      rmsNorm: wrap((s, x, wt, eps) => {
        const xP = w.allocate(x.byteLength); const wP = w.allocate(wt.byteLength);
        const rP = w.allocate(x.byteLength);
        cp(xP, x); cp(wP, wt);
        w.rmsNormSimd(xP, wP, rP, x.length, eps);
        return rd(rP, x.length);
      }),
      softmax: wrap((s, x) => {
        const xP = w.allocate(x.byteLength); const rP = w.allocate(x.byteLength);
        cp(xP, x); w.softmaxSimd(xP, rP, x.length);
        return rd(rP, x.length);
      }),
      fusedSiluMul: wrap((s, g, u) => {
        const gP = w.allocate(g.byteLength); const uP = w.allocate(u.byteLength);
        const rP = w.allocate(g.byteLength);
        cp(gP, g); cp(uP, u);
        w.fusedSiluMul(gP, uP, rP, g.length);
        return rd(rP, g.length);
      }),
      add: wrap((s, a, b) => {
        const aP = w.allocate(a.byteLength); const bP = w.allocate(b.byteLength);
        const rP = w.allocate(a.byteLength);
        cp(aP, a); cp(bP, b);
        w.addSimd(aP, bP, rP, a.length);
        return rd(rP, a.length);
      }),
    };
  } catch (e) { console.warn('[Aether] WASM failed:', e.message); return null; }
}
|
| 88 |
|
| 89 |
+
// βββ JS Fallbacks βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
// Dense row-major matrix–vector product: out[r] = Σ_c m[r*cols + c] * v[c].
// Pure-JS fallback used when the WASM SIMD kernel is unavailable.
function matVecJS(m, v, rows, cols) {
  const out = new Float32Array(rows);
  for (let r = 0; r < rows; r++) {
    const base = r * cols;
    let acc = 0;
    for (let c = 0; c < cols; c++) {
      acc += m[base + c] * v[c];
    }
    out[r] = acc;
  }
  return out;
}
|
| 95 |
+
// RMSNorm: scale x by 1/rms(x) (eps added inside the sqrt for stability)
// and by the per-channel weight w. Returns a new Float32Array; x is untouched.
function rmsNormJS(x, w, eps) {
  const n = x.length;
  let sumSq = 0;
  for (let i = 0; i < n; i++) {
    sumSq += x[i] * x[i];
  }
  const inv = 1.0 / Math.sqrt(sumSq / n + eps);
  const out = new Float32Array(n);
  for (let i = 0; i < n; i++) {
    out[i] = x[i] * inv * w[i];
  }
  return out;
}
|
|
|
|
| 100 |
// Numerically stable softmax: shift by the max before exponentiating so
// large logits cannot overflow. Returns a new Float32Array summing to 1.
function softmaxJS(x) {
  const n = x.length;
  let peak = -Infinity;
  for (let i = 0; i < n; i++) {
    if (x[i] > peak) peak = x[i];
  }
  const out = new Float32Array(n);
  let total = 0;
  for (let i = 0; i < n; i++) {
    out[i] = Math.exp(x[i] - peak);
    total += out[i];
  }
  for (let i = 0; i < n; i++) {
    out[i] /= total;
  }
  return out;
}
|
| 106 |
+
// Fused SwiGLU activation: out[i] = SiLU(g[i]) * u[i],
// where SiLU(v) = v * sigmoid(v) = v / (1 + e^-v).
function fusedSiluMulJS(g, u) {
  const n = g.length;
  const out = new Float32Array(n);
  for (let i = 0; i < n; i++) {
    const v = g[i];
    const denom = 1 + Math.exp(-v);
    out[i] = (v / denom) * u[i];
  }
  return out;
}
|
|
|
|
| 110 |
// Elementwise sum of two equal-length vectors; returns a new Float32Array.
function addJS(a, b) {
  const n = a.length;
  const out = new Float32Array(n);
  for (let i = 0; i < n; i++) {
    out[i] = a[i] + b[i];
  }
  return out;
}
|
| 113 |
|
| 114 |
+
// Resolve the active kernel set: prefer the WASM SIMD kernels when `simd`
// has been populated, otherwise use the pure-JS fallbacks. Evaluated as a
// function so each call sees the current value of `simd` (loadSIMD() is async).
const op = () => ({
  matVec: simd?.matVec || matVecJS, rmsNorm: simd?.rmsNorm || rmsNormJS,
  softmax: simd?.softmax || softmaxJS, fusedSiluMul: simd?.fusedSiluMul || fusedSiluMulJS,
  add: simd?.add || addJS,
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
+
// βββ Q8_0 Dequant βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 121 |
// Decode an IEEE-754 half-precision float from two little-endian bytes.
// Subnormals (exp 0) are handled; Inf/NaN (exp 31) collapse to 0 so a
// corrupt fp16 scale cannot poison a whole tensor with NaNs.
function fp16(lo, hi) {
  const bits = lo | (hi << 8);
  const sign = (bits >> 15) & 1;
  const exp = (bits >> 10) & 0x1f;
  const frac = bits & 0x3ff;
  if (exp === 0) {
    if (frac === 0) return 0;
    return (sign ? -1 : 1) * (frac / 1024) * Math.pow(2, -14); // subnormal
  }
  if (exp === 31) return 0; // Inf/NaN -> 0 by design
  return (sign ? -1 : 1) * Math.pow(2, exp - 15) * (1 + frac / 1024);
}

// Dequantize GGUF Q8_0: 34-byte blocks = fp16 scale (2 bytes) followed by
// 32 signed int8 quants. out[i] = int8 * scale. The final block may be
// partial when n is not a multiple of 32.
function dequantQ8(data, n) {
  const out = new Float32Array(n);
  const blocks = Math.ceil(n / 32);
  for (let b = 0; b < blocks; b++) {
    const base = b * 34;
    const scale = fp16(data[base], data[base + 1]);
    const count = Math.min(32, n - b * 32);
    for (let i = 0; i < count; i++) {
      const raw = data[base + 2 + i];
      const signed = raw > 127 ? raw - 256 : raw; // reinterpret u8 as i8
      out[b * 32 + i] = signed * scale;
    }
  }
  return out;
}
|
| 134 |
+
// Zero-copy Float32Array view over raw F32 tensor bytes (no dequant needed).
function dequantF32(data, n) { return new Float32Array(data.buffer, data.byteOffset, n); }

// Dispatch on the GGUF tensor type field: 0 = F32, 1 = F16, 8 = Q8_0.
// Unknown types fall back to Q8_0 rather than aborting the model load.
function dequantByType(data, n, type) {
  switch (type) {
    case 0:
      return dequantF32(data, n);
    case 8:
      return dequantQ8(data, n);
    case 1: {
      const out = new Float32Array(n);
      for (let i = 0; i < n; i++) {
        out[i] = fp16(data[i * 2], data[i * 2 + 1]);
      }
      return out;
    }
    default:
      return dequantQ8(data, n); // fallback
  }
}
|
| 142 |
|
| 143 |
// βββ GGUF Parser ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 144 |
+
// ─── GGUF container constants and low-level readers ─────────────────────────
const MAGIC = 0x46554747; // 'GGUF' read as a little-endian u32

// Block geometry for quantized tensor types, keyed by GGUF type id:
// BSZ = elements per block, BBY = bytes per block.
const BSZ = { 2: 32, 3: 32, 6: 32, 7: 32, 8: 32, 9: 32, 10: 256, 11: 256, 12: 256, 13: 256, 14: 256, 15: 256 };
const BBY = { 2: 18, 3: 20, 6: 22, 7: 24, 8: 34, 9: 36, 10: 84, 11: 110, 12: 144, 13: 176, 14: 210, 15: 292 };
// Bytes per element for the unquantized types (F32, F16, ints, F64).
const TSZ = { 0: 4, 1: 2, 16: 1, 17: 2, 18: 4, 19: 8, 20: 8 };

// Byte size of a tensor with BigInt dims d and GGUF type t.
function csz(d, t) {
  let count = 1n;
  for (const dim of d) count *= dim;
  const perBlock = BSZ[t];
  if (perBlock && BBY[t]) return Math.ceil(Number(count) / perBlock) * BBY[t];
  return Math.ceil(Number(count) * (TSZ[t] ?? 4));
}

// Read a u64-length-prefixed UTF-8 string at offset o; returns { v, o: next }.
function rs(b, o) {
  const len = Number(b.readBigUInt64LE(o));
  const start = o + 8;
  return { v: b.subarray(start, start + len).toString('utf8'), o: start + len };
}

// Read one GGUF metadata value of type t at offset o; returns { v, o: next }.
function rv(b, o, t) {
  switch (t) {
    case 0: return { v: b.readUInt8(o), o: o + 1 };          // uint8
    case 1: return { v: b.readInt8(o), o: o + 1 };           // int8
    case 2: return { v: b.readUInt16LE(o), o: o + 2 };       // uint16
    case 3: return { v: b.readInt16LE(o), o: o + 2 };        // int16
    case 4: return { v: b.readUInt32LE(o), o: o + 4 };       // uint32
    case 5: return { v: b.readInt32LE(o), o: o + 4 };        // int32
    case 6: return { v: b.readFloatLE(o), o: o + 4 };        // float32
    case 7: return { v: b.readUInt8(o) !== 0, o: o + 1 };    // bool
    case 8: {                                                // string
      const r = rs(b, o);
      return { v: r.v, o: r.o };
    }
    case 10: return { v: b.readBigUInt64LE(o), o: o + 8 };   // uint64
    case 11: return { v: b.readBigInt64LE(o), o: o + 8 };    // int64
    case 12: return { v: b.readDoubleLE(o), o: o + 8 };      // float64
    case 9: {                                                // array: type u32, count u64, items
      const elemType = b.readUInt32LE(o);
      const len = Number(b.readBigUInt64LE(o + 4));
      let cur = o + 12;
      const items = [];
      for (let i = 0; i < len; i++) {
        const r = rv(b, cur, elemType);
        items.push(r.v);
        cur = r.o;
      }
      return { v: items, o: cur };
    }
    default:
      throw new Error(`Unknown GGUF type ${t}`);
  }
}
|
| 162 |
// Parse a GGUF container: header, metadata KV pairs (only 'general.alignment'
// is retained; all others are read just to advance the offset), then the
// tensor directory. Returns { tensors, dataOffset } where each tensor entry
// carries name, BigInt dims, type id, data offset (relative to dataOffset),
// byte size, and element count.
function parseGGUF(buf){
// Magic 'GGUF' (little-endian u32), then a 4-byte version we skip.
let o=0;if(buf.readUInt32LE(o)!==MAGIC)throw new Error('Not GGUF');o+=4;o+=4;
const tc=Number(buf.readBigUInt64LE(o));o+=8;const kc=Number(buf.readBigUInt64LE(o));o+=8;
// Walk every metadata KV to advance o; capture alignment if present.
let align=32;for(let i=0;i<kc;i++){const{v:k,o:o1}=rs(buf,o);o=o1;const vt=buf.readUInt32LE(o);o+=4;
const{v,o:o2}=rv(buf,o,vt);o=o2;if(k==='general.alignment')align=Number(v);}
// Tensor directory: name, n_dims (u32), dims (u64 each), type (u32), offset (u64).
const tensors=[];for(let i=0;i<tc;i++){const{v:name,o:o1}=rs(buf,o);o=o1;const nd=buf.readUInt32LE(o);o+=4;
const dims=[];for(let d=0;d<nd;d++){dims.push(buf.readBigUInt64LE(o));o+=8;}const type=buf.readUInt32LE(o);o+=4;
const offset=buf.readBigUInt64LE(o);o+=8;
tensors.push({name,dims,type,offset,size:csz(dims,type),numElements:Number(dims.reduce((a,b)=>a*b,1n))});}
// Tensor data starts at the next alignment boundary after the directory.
return{tensors,dataOffset:Math.ceil(o/align)*align};
}
|
| 173 |
|
| 174 |
// βββ BPE Tokenizer ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 175 |
+
class Tok {
  /**
   * Minimal byte-fallback BPE tokenizer driven by a HuggingFace
   * tokenizer.json object: builds vocab, reverse vocab, merge ranks,
   * and the added-token table.
   */
  constructor(j) {
    const modelSection = j.model || {};
    this.vocab = modelSection.vocab || {};
    this.rev = {};
    for (const [tokenStr, tokenId] of Object.entries(this.vocab)) {
      this.rev[tokenId] = tokenStr;
    }
    this.mr = {};
    for (const [rank, merge] of (modelSection.merges || []).entries()) {
      this.mr[merge] = rank;
    }
    this.added = {};
    if (j.added_tokens) {
      for (const tok of j.added_tokens) this.added[tok.content] = tok.id;
    }
  }

  /** Encode `text` to token ids, treating <|...|> markers as single tokens. */
  encode(text) {
    // Carve out special-token markers first so BPE never merges across them.
    const special = /<\|[^|]+\|>/g;
    const segments = [];
    let cursor = 0;
    for (const hit of text.matchAll(special)) {
      if (hit.index > cursor) segments.push({ t: text.slice(cursor, hit.index), s: false });
      segments.push({ t: hit[0], s: true });
      cursor = hit.index + hit[0].length;
    }
    if (cursor < text.length) segments.push({ t: text.slice(cursor), s: false });

    const ids = [];
    for (const seg of segments) {
      if (seg.s) {
        const id = this.added[seg.t] ?? this.vocab[seg.t];
        if (id !== undefined) ids.push(id);
        continue;
      }
      // Alternate runs of non-space / space characters, BPE-merged per run.
      for (const run of seg.t.match(/\S+|\s+/g) || []) {
        let pieces = [];
        for (const ch of run) {
          if (this.vocab[ch] !== undefined) {
            pieces.push(ch);
          } else {
            // Byte fallback for characters absent from the vocab.
            for (const byte of Buffer.from(ch, 'utf8')) {
              pieces.push(`<0x${byte.toString(16).toUpperCase().padStart(2, '0')}>`);
            }
          }
        }
        // Greedily apply the lowest-ranked adjacent merge until none applies.
        while (pieces.length > 1) {
          let bestRank = Infinity;
          let bestAt = -1;
          for (let i = 0; i < pieces.length - 1; i++) {
            const rank = this.mr[`${pieces[i]} ${pieces[i + 1]}`];
            if (rank !== undefined && rank < bestRank) { bestRank = rank; bestAt = i; }
          }
          if (bestAt === -1) break;
          pieces.splice(bestAt, 2, pieces[bestAt] + pieces[bestAt + 1]);
        }
        for (const piece of pieces) {
          const id = this.vocab[piece] ?? this.added[piece];
          if (id !== undefined) ids.push(id);
        }
      }
    }
    return ids;
  }

  /** Decode ids to text, expanding byte tokens and dropping <|...|> specials. */
  decode(tokens) {
    const out = [];
    for (const id of tokens) {
      const piece = this.rev[id];
      if (piece && piece.startsWith('<0x') && piece.endsWith('>')) {
        out.push(String.fromCharCode(parseInt(piece.slice(3, -1), 16)));
      } else if (piece && !piece.startsWith('<|')) {
        out.push(piece);
      }
    }
    return out.join('').replace(/Ġ/g, ' ').replace(/Ċ/g, '\n');
  }
}
|
| 199 |
|
| 200 |
+
// ─── RoPE (LLaMA style: ADJACENT pairs) ─────────────────────────────────────
// CRITICAL: SmolLM2/LLaMA pairs (x[i], x[i+1]), NOT (x[k], x[k+half])
// Rotates each adjacent pair of `x` (one head, length >= headDim) in place by
// a position-scaled angle; frequency schedule is theta^(-2k/headDim).
function applyRoPE(x, headDim, position, theta) {
  for (let pair = 0; 2 * pair < headDim; pair++) {
    const i = 2 * pair;
    const freq = 1.0 / Math.pow(theta, (2 * pair) / headDim);
    const angle = position * freq;
    const c = Math.cos(angle);
    const s = Math.sin(angle);
    const even = x[i];
    const odd = x[i + 1];
    x[i] = even * c - odd * s;
    x[i + 1] = even * s + odd * c;
  }
}
|
| 213 |
|
| 214 |
// βββ Model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 215 |
// Singleton model state: populated by loadModel(), read by generate().
let model = null;

// Load the GGUF weights and tokenizer.json from disk, dequantize every tensor
// the forward pass needs, and stash everything in the module-level `model`.
// NOTE(review): relies on file-level helpers defined elsewhere in this file
// (parseGGUF, Tok, dequantByType) and the hyperparameter object `C`.
function loadModel(ggufPath, tokPath) {
  const t0 = Date.now();
  const buf = readFileSync(ggufPath);
  const parsed = parseGGUF(buf);
  console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now()-t0}ms`);

  const tokenizer = new Tok(JSON.parse(readFileSync(tokPath, 'utf8')));
  // Index tensors by name for O(1) lookup in get().
  const byName = {}; for (const t of parsed.tensors) byName[t.name] = t;

  // Dequantize one named tensor straight out of the file buffer.
  // Returns null when the tensor is absent (used for tied embeddings below).
  function get(name) {
    const t = byName[name]; if (!t) return null;
    const raw = new Uint8Array(buf.buffer, buf.byteOffset + parsed.dataOffset + Number(t.offset), t.size);
    return dequantByType(raw, t.numElements, t.type);
  }

  console.log('[Aether] Dequantizing...');
  const tokenEmbd = get('token_embd.weight');
  const layers = [];
  for (let i = 0; i < C.numLayers; i++) {
    // Progress log every 8 layers (dequantization is the slow part of boot).
    if (i % 8 === 0) console.log(`[Aether] Layer ${i}/${C.numLayers}`);
    layers.push({
      // Attention/FFN RMSNorm weights, QKV + output projections,
      // then the SwiGLU gate/up/down matrices.
      an: get(`blk.${i}.attn_norm.weight`), fn: get(`blk.${i}.ffn_norm.weight`),
      qw: get(`blk.${i}.attn_q.weight`), kw: get(`blk.${i}.attn_k.weight`),
      vw: get(`blk.${i}.attn_v.weight`), ow: get(`blk.${i}.attn_output.weight`),
      gw: get(`blk.${i}.ffn_gate.weight`), uw: get(`blk.${i}.ffn_up.weight`),
      dw: get(`blk.${i}.ffn_down.weight`),
    });
  }
  const outNorm = get('output_norm.weight');
  let outWeight = get('output.weight');
  // Checkpoints with tied embeddings ship no separate LM head: reuse token_embd.
  if (!outWeight) { console.log('[Aether] Tied embeddings'); outWeight = tokenEmbd; }

  console.log(`[Aether] Loaded in ${((Date.now()-t0)/1000).toFixed(1)}s`);
  model = { tokenEmbd, layers, outNorm, outWeight, tokenizer, loadTime: Date.now()-t0 };
}
|
| 252 |
|
| 253 |
// βββ Inference ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 254 |
/**
 * Run chat-style generation: wrap `prompt` in ChatML markers, process the
 * prompt tokens and then sample up to `maxTokens` new tokens, one full
 * transformer forward pass per token with a per-layer KV cache.
 *
 * Sampling: temperature 0.7, then a top-p style cutoff — if the uniform draw
 * lands past the top 0.9 probability mass, the argmax token is used instead.
 *
 * @param {string} prompt     user message
 * @param {number} maxTokens  cap on newly generated tokens (default 50)
 * @returns {{text:string, tokens:number, totalTimeMs:number, avgTokenMs:number, engine:string, simd:boolean}}
 *
 * NOTE(review): depends on module-level state defined elsewhere in this file:
 * `model` (from loadModel), `C` (hyperparameters), `op()` (kernel dispatch:
 * WASM SIMD or JS fallback), `kvDim`, `gqaRatio`, `softmaxJS`, `simd`.
 */
function generate(prompt, maxTokens = 50) {
  const t0 = performance.now();
  const o = op();

  const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
  const inputTokens = model.tokenizer.encode(chatPrompt);
  const allTokens = [...inputTokens];

  // One k/v list per layer; entry s holds the (RoPE'd) K and raw V at position s.
  const kvCache = Array.from({ length: C.numLayers }, () => ({ k: [], v: [] }));
  const tokenTimes = [];

  // Steps 0..inputTokens.length-2 only fill the KV cache (prompt processing);
  // sampling starts at the last prompt position and runs maxTokens steps.
  for (let step = 0; step < inputTokens.length + maxTokens - 1; step++) {
    const tStart = performance.now();
    const pos = step, tid = allTokens[step];

    // Embed
    const x0 = model.tokenEmbd.slice(tid * C.hiddenDim, (tid + 1) * C.hiddenDim);
    let x = x0;

    for (let l = 0; l < C.numLayers; l++) {
      const ly = model.layers[l];

      // Attention: norm → QKV → RoPE → attention → O → residual
      const normed = o.rmsNorm(x, ly.an, C.rmsNormEps);
      const q = o.matVec(ly.qw, normed, C.hiddenDim, C.hiddenDim);
      const k = o.matVec(ly.kw, normed, kvDim, C.hiddenDim);
      const v = o.matVec(ly.vw, normed, kvDim, C.hiddenDim);

      // RoPE per head -- LLaMA style (adjacent pairs)
      for (let h = 0; h < C.numHeads; h++)
        applyRoPE(q.subarray(h * C.headDim, (h+1) * C.headDim), C.headDim, pos, C.ropeTheta);
      for (let h = 0; h < C.numKvHeads; h++)
        applyRoPE(k.subarray(h * C.headDim, (h+1) * C.headDim), C.headDim, pos, C.ropeTheta);

      // Copy into the cache: q/k/v buffers are reused by the kernels.
      kvCache[l].k.push(new Float32Array(k));
      kvCache[l].v.push(new Float32Array(v));

      // Multi-head attention with GQA
      const seqLen = kvCache[l].k.length;
      const attnOut = new Float32Array(C.hiddenDim);
      for (let h = 0; h < C.numHeads; h++) {
        // gqaRatio query heads share each KV head.
        const kvH = Math.floor(h / gqaRatio);
        const qH = q.subarray(h * C.headDim, (h+1) * C.headDim);
        const scores = new Float32Array(seqLen);
        for (let s = 0; s < seqLen; s++) {
          const kH = kvCache[l].k[s].subarray(kvH * C.headDim, (kvH+1) * C.headDim);
          let dot = 0; for (let d = 0; d < C.headDim; d++) dot += qH[d] * kH[d];
          // Scaled dot-product: divide by sqrt(headDim).
          scores[s] = dot / Math.sqrt(C.headDim);
        }
        const w = softmaxJS(scores);
        // Weighted sum of cached values into this head's slice of attnOut.
        for (let s = 0; s < seqLen; s++) {
          const vH = kvCache[l].v[s].subarray(kvH * C.headDim, (kvH+1) * C.headDim);
          const wt = w[s];
          for (let d = 0; d < C.headDim; d++) attnOut[h * C.headDim + d] += wt * vH[d];
        }
      }

      const projected = o.matVec(ly.ow, attnOut, C.hiddenDim, C.hiddenDim);
      const postAttn = o.add(x, projected);

      // FFN: norm → gate/up → fusedSiluMul → down → residual
      const ffnIn = o.rmsNorm(postAttn, ly.fn, C.rmsNormEps);
      const gate = o.matVec(ly.gw, ffnIn, C.intermediateSize, C.hiddenDim);
      const up = o.matVec(ly.uw, ffnIn, C.intermediateSize, C.hiddenDim);
      const activated = o.fusedSiluMul(gate, up);
      const down = o.matVec(ly.dw, activated, C.hiddenDim, C.intermediateSize);
      x = o.add(postAttn, down);
    }

    // Only sample once the whole prompt has been processed.
    if (step >= inputTokens.length - 1) {
      const finalNormed = o.rmsNorm(x, model.outNorm, C.rmsNormEps);
      const logits = o.matVec(model.outWeight, finalNormed, C.vocabSize, C.hiddenDim);

      // Temperature 0.7 (divide logits before softmax).
      for (let i = 0; i < logits.length; i++) logits[i] /= 0.7;
      const probs = o.softmax(logits);

      // Sort tokens by probability, then walk the CDF against one uniform draw;
      // falling off the top-0.9 nucleus keeps `chosen` at the argmax.
      const indexed = Array.from(probs).map((p, i) => ({ p, i })).sort((a, b) => b.p - a.p);
      let cumP = 0, chosen = indexed[0].i;
      const r = Math.random();
      for (const { p, i } of indexed) { cumP += p; if (r < cumP) { chosen = i; break; } if (cumP > 0.9) break; }

      tokenTimes.push(performance.now() - tStart);
      if (chosen === C.eosToken) break;
      allTokens.push(chosen);
    }
  }

  const totalTime = performance.now() - t0;
  const genTokens = allTokens.slice(inputTokens.length);
  const avgMs = tokenTimes.length > 0 ? tokenTimes.reduce((a, b) => a + b, 0) / tokenTimes.length : 0;

  return {
    text: model.tokenizer.decode(genTokens), tokens: genTokens.length,
    totalTimeMs: Math.round(totalTime), avgTokenMs: Math.round(avgMs),
    engine: `Aether ${simd ? 'WASM-SIMD' : 'JS'}`, simd: !!simd,
  };
}
|
| 351 |
|
|
|
|
| 361 |
res.writeHead(200, { 'Content-Type': 'application/json' });
|
| 362 |
res.end(JSON.stringify(result));
|
| 363 |
} catch (e) {
|
| 364 |
+
console.error('[Aether] Error:', e);
|
| 365 |
res.writeHead(500, { 'Content-Type': 'application/json' });
|
| 366 |
res.end(JSON.stringify({ error: e.message, stack: e.stack }));
|
| 367 |
}
|
|
|
|
| 373 |
});
|
| 374 |
|
| 375 |
// βββ Main βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 376 |
+
const ggufPath = '/tmp/hf_cache/buleyean-smollm2-360m-q8_0.gguf';
|
| 377 |
+
const tokPath = '/tmp/hf_cache/tokenizer.json';
|
| 378 |
|
| 379 |
async function main() {
|
|
|
|
| 380 |
simd = await loadSIMD();
|
|
|
|
|
|
|
| 381 |
if (!existsSync(ggufPath)) {
|
| 382 |
console.log('[Aether] Downloading Q8_0 GGUF...');
|
| 383 |
execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
|
| 384 |
}
|
| 385 |
+
if (!existsSync(tokPath)) {
|
| 386 |
console.log('[Aether] Downloading tokenizer...');
|
| 387 |
execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
|
| 388 |
}
|
| 389 |
+
loadModel(ggufPath, tokPath);
|
| 390 |
+
server.listen(PORT, '127.0.0.1', () => console.log(`[Aether] http://127.0.0.1:${PORT} (SIMD: ${!!simd})`));
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
}
|
| 392 |
|
| 393 |
main().catch(e => { console.error('[Aether] Fatal:', e); process.exit(1); });
|
app.py
CHANGED
|
@@ -1,52 +1,90 @@
|
|
| 1 |
"""
|
| 2 |
The Void -- Buleyean RL
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
import torch
|
|
|
|
| 8 |
import time
|
|
|
|
|
|
|
|
|
|
| 9 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 10 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
base_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
|
| 14 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 15 |
"HuggingFaceTB/SmolLM2-360M-Instruct",
|
| 16 |
torch_dtype=torch.float32,
|
| 17 |
device_map="cpu",
|
| 18 |
)
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
messages = [{"role": "user", "content": prompt}]
|
| 33 |
-
text =
|
| 34 |
-
inputs =
|
| 35 |
t0 = time.perf_counter()
|
| 36 |
with torch.no_grad():
|
| 37 |
-
outputs =
|
| 38 |
-
**inputs,
|
| 39 |
-
|
| 40 |
-
temperature=0.7,
|
| 41 |
-
top_p=0.9,
|
| 42 |
-
do_sample=True,
|
| 43 |
-
pad_token_id=tokenizer.eos_token_id,
|
| 44 |
)
|
| 45 |
elapsed = time.perf_counter() - t0
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def compare(prompt):
|
|
@@ -54,42 +92,36 @@ def compare(prompt):
|
|
| 54 |
yield "", "", "", ""
|
| 55 |
return
|
| 56 |
|
| 57 |
-
# Run both in parallel -- whichever finishes first shows first
|
| 58 |
base_result = [None]
|
| 59 |
-
|
| 60 |
|
| 61 |
def run_base():
|
| 62 |
-
base_result[0] =
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
bule_result[0] = gen(prompt, bule_model, bule_tokenizer)
|
| 66 |
|
| 67 |
with ThreadPoolExecutor(max_workers=2) as pool:
|
| 68 |
-
futures = {
|
| 69 |
-
pool.submit(run_base): "base",
|
| 70 |
-
pool.submit(run_bule): "bule",
|
| 71 |
-
}
|
| 72 |
for future in as_completed(futures):
|
| 73 |
name = futures[future]
|
| 74 |
-
future.result()
|
| 75 |
if name == "base" and base_result[0]:
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
yield
|
| 81 |
-
elif name == "
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
yield
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
yield bt, vt, f"{b_toks} tokens in {b_t:.1f}s ({b_ms:.0f}ms/tok)", f"{v_toks} tokens in {v_t:.1f}s ({v_ms:.0f}ms/tok)"
|
| 93 |
|
| 94 |
|
| 95 |
CSS = """
|
|
@@ -107,10 +139,10 @@ CSS = """
|
|
| 107 |
#prompt-input > label > span { display: none !important; }
|
| 108 |
#prompt-input textarea { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; color: #fafafa !important; font-size: 1rem !important; padding: 1rem !important; }
|
| 109 |
#prompt-input textarea:focus { border-color: #3b82f6 !important; box-shadow: 0 0 0 2px rgba(59,130,246,0.1) !important; }
|
| 110 |
-
#gen-btn { background: #3b82f6 !important; border: none !important; border-radius: 8px !important; font-weight: 500 !important; font-size: 0.9rem !important; padding: 0.75rem 2rem !important;
|
| 111 |
-
#gen-btn:hover { background: #2563eb !important;
|
| 112 |
-
.prompt-chip { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 6px !important; color: #a1a1aa !important; font-size: 0.85rem !important;
|
| 113 |
-
.prompt-chip:hover { border-color: #3b82f6 !important; color: #fafafa !important;
|
| 114 |
#footer { text-align: center; padding: 2rem 0; border-top: 1px solid #1f1f23; margin-top: 2rem; }
|
| 115 |
#footer p { color: #52525b; font-size: 0.8rem; }
|
| 116 |
#footer a { color: #3b82f6; text-decoration: none; }
|
|
@@ -123,8 +155,8 @@ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zi
|
|
| 123 |
gr.HTML("""
|
| 124 |
<div id="hero">
|
| 125 |
<h1>The <span class="accent">Void</span></h1>
|
| 126 |
-
<p class="subtitle">
|
| 127 |
-
Left: standard
|
| 128 |
</div>
|
| 129 |
""")
|
| 130 |
|
|
@@ -133,47 +165,40 @@ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zi
|
|
| 133 |
|
| 134 |
with gr.Row(equal_height=True):
|
| 135 |
with gr.Column():
|
| 136 |
-
gr.HTML('<p class="base-label">
|
| 137 |
base_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"])
|
| 138 |
base_stats = gr.HTML('<p class="stats-text">--</p>')
|
| 139 |
with gr.Column(min_width=30):
|
| 140 |
gr.HTML('<p style="color:#27272a; text-align:center; padding-top:4rem; font-size:0.75rem; letter-spacing:0.1em;">VS</p>')
|
| 141 |
with gr.Column():
|
| 142 |
-
gr.HTML('<p class="void-label">
|
| 143 |
-
|
| 144 |
-
|
| 145 |
|
| 146 |
def run_compare(prompt_text):
|
| 147 |
-
for
|
| 148 |
-
yield
|
| 149 |
-
base_text,
|
| 150 |
-
bule_text,
|
| 151 |
-
f'<p class="stats-text">{b_stats}</p>',
|
| 152 |
-
f'<p class="stats-text">{a_stats}</p>',
|
| 153 |
-
)
|
| 154 |
|
| 155 |
-
btn.click(run_compare, [prompt], [base_out,
|
| 156 |
-
prompt.submit(run_compare, [prompt], [base_out,
|
| 157 |
|
| 158 |
gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
|
| 159 |
with gr.Row():
|
| 160 |
for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
|
| 161 |
gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
|
| 162 |
fn=lambda x=p: x, outputs=[prompt]
|
| 163 |
-
).then(fn=run_compare, inputs=[prompt], outputs=[base_out,
|
| 164 |
|
| 165 |
gr.HTML("""
|
| 166 |
<div id="footer">
|
| 167 |
<p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">
|
| 168 |
-
SmolLM2-360M
|
|
|
|
| 169 |
</p>
|
| 170 |
<p>
|
| 171 |
<a href="https://forkracefold.com/">Whitepaper</a> ·
|
| 172 |
<a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> ·
|
| 173 |
-
<a href="https://huggingface.co/forkjoin-ai">Models</a>
|
| 174 |
-
<a href="https://huggingface.co/spaces/forkjoin-ai/glossolalia">Glossolalia</a> ·
|
| 175 |
-
<a href="https://huggingface.co/spaces/forkjoin-ai/void-attention">Void Attention</a> ·
|
| 176 |
-
<a href="https://huggingface.co/spaces/forkjoin-ai/metacog">METACOG</a>
|
| 177 |
</p>
|
| 178 |
<p style="margin-top:1rem;">500+ Lean 4 theorems · Zero sorry · <a href="https://forkracefold.com/">φ² = φ + 1</a></p>
|
| 179 |
</div>
|
|
|
|
| 1 |
"""
|
| 2 |
The Void -- Buleyean RL
|
| 3 |
+
PyTorch vs Aether. Same model. Different engines. Let the speed speak.
|
| 4 |
"""
|
| 5 |
|
| 6 |
import json
import os
import subprocess
import time
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 15 |
|
| 16 |
+
# ─── Start Aether sidecar ────────────────────────────────────────────────────
# Launch the Node inference server as a child process. Its stdout/stderr are
# piped so the readiness loop further down can surface its logs.
print("[Void] Starting Aether inference server...", flush=True)
aether_proc = subprocess.Popen(
    ["node", "aether-server.mjs"],
    # Inherit the current environment, pinning the sidecar to port 7861
    # (plain `os.environ` instead of the original `__import__('os')` hack).
    env={**os.environ, "AETHER_PORT": "7861"},
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
)
|
| 24 |
+
|
| 25 |
+
# ─── Load PyTorch model ──────────────────────────────────────────────────────
# The PyTorch side of the comparison: standard CPU inference in float32.
print("[Void] Loading PyTorch base model...", flush=True)
base_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct")
base_model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-360M-Instruct",
    torch_dtype=torch.float32,
    device_map="cpu",
)
print("[Void] PyTorch model ready.", flush=True)
|
| 34 |
+
|
| 35 |
+
# Wait for Aether
# Poll the sidecar's /health endpoint for up to ~180s, draining its piped
# stdout in between so model-load progress shows up in the Space logs.
print("[Void] Waiting for Aether...", flush=True)
import select
for attempt in range(180):
    try:
        req = urllib.request.Request("http://127.0.0.1:7861/health")
        resp = urllib.request.urlopen(req, timeout=2)
        health = json.loads(resp.read())
        if health.get("status") == "ok" and health.get("model") == "loaded":
            print(f"[Void] Aether ready (loaded in {health.get('loadTime')}ms, SIMD: {health.get('simd')})", flush=True)
            break
    except Exception:
        # Best-effort polling: the server simply isn't listening yet.
        pass
    # Non-blocking check for buffered sidecar output (zero-timeout select).
    if aether_proc.stdout and select.select([aether_proc.stdout], [], [], 0)[0]:
        line = aether_proc.stdout.readline()
        if line:
            print(f"  {line.decode().strip()}", flush=True)
    time.sleep(1)
else:
    # for/else: only reached when the loop finished without `break`.
    print("[Void] WARNING: Aether not ready after 180s", flush=True)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def gen_pytorch(prompt):
    """Run the baseline PyTorch model on `prompt`.

    Returns a tuple (reply_text, elapsed_seconds, new_token_count, ms_per_token).
    """
    chat = base_tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}], tokenize=False, add_generation_prompt=True
    )
    encoded = base_tokenizer(chat, return_tensors="pt")
    prompt_len = encoded["input_ids"].shape[1]
    start = time.perf_counter()
    with torch.no_grad():
        generated = base_model.generate(
            **encoded,
            max_new_tokens=50,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=base_tokenizer.eos_token_id,
        )
    elapsed = time.perf_counter() - start
    new_tokens = generated.shape[1] - prompt_len
    reply = base_tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True).strip()
    ms_per_tok = (elapsed * 1000 / new_tokens) if new_tokens > 0 else 0
    return reply, elapsed, new_tokens, ms_per_tok
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def gen_aether(prompt):
    """POST `prompt` to the local Aether sidecar.

    Returns (text, elapsed_seconds, token_count, ms_per_token); on failure
    the text is an "[Aether error: ...]" message and the numbers are 0.
    """
    try:
        payload = json.dumps({"prompt": prompt, "max_tokens": 50}).encode()
        request = urllib.request.Request(
            "http://127.0.0.1:7861/generate",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        response = urllib.request.urlopen(request, timeout=300)
        result = json.loads(response.read())
        return result["text"], result["totalTimeMs"] / 1000, result["tokens"], result["avgTokenMs"]
    except urllib.error.HTTPError as err:
        # The server answered with an error status; surface its JSON detail.
        raw = err.read().decode() if err.fp else str(err)
        try:
            detail = json.loads(raw).get("error", raw[:300])
        except Exception:
            detail = raw[:300]
        return f"[Aether error: {detail}]", 0, 0, 0
    except Exception as err:
        # Connection refused, timeout, bad JSON, ... -- report and keep the UI alive.
        return f"[Aether error: {err}]", 0, 0, 0
|
| 88 |
|
| 89 |
|
| 90 |
def compare(prompt):
|
|
|
|
| 92 |
yield "", "", "", ""
|
| 93 |
return
|
| 94 |
|
|
|
|
| 95 |
base_result = [None]
|
| 96 |
+
aether_result = [None]
|
| 97 |
|
| 98 |
def run_base():
|
| 99 |
+
base_result[0] = gen_pytorch(prompt)
|
| 100 |
+
def run_aether():
|
| 101 |
+
aether_result[0] = gen_aether(prompt)
|
|
|
|
| 102 |
|
| 103 |
with ThreadPoolExecutor(max_workers=2) as pool:
|
| 104 |
+
futures = {pool.submit(run_base): "base", pool.submit(run_aether): "aether"}
|
|
|
|
|
|
|
|
|
|
| 105 |
for future in as_completed(futures):
|
| 106 |
name = futures[future]
|
| 107 |
+
future.result()
|
| 108 |
if name == "base" and base_result[0]:
|
| 109 |
+
t, tm, tk, ms = base_result[0]
|
| 110 |
+
st = f"{tk} tokens in {tm:.1f}s ({ms:.0f}ms/tok)"
|
| 111 |
+
at = aether_result[0][0] if aether_result[0] else "generating..."
|
| 112 |
+
ast = f"{aether_result[0][2]} tokens in {aether_result[0][1]:.1f}s ({aether_result[0][3]:.0f}ms/tok)" if aether_result[0] else "running..."
|
| 113 |
+
yield t, at, st, ast
|
| 114 |
+
elif name == "aether" and aether_result[0]:
|
| 115 |
+
t, tm, tk, ms = aether_result[0]
|
| 116 |
+
st = f"{tk} tokens in {tm:.1f}s ({ms:.0f}ms/tok)"
|
| 117 |
+
bt = base_result[0][0] if base_result[0] else "generating..."
|
| 118 |
+
bst = f"{base_result[0][2]} tokens in {base_result[0][1]:.1f}s ({base_result[0][3]:.0f}ms/tok)" if base_result[0] else "running..."
|
| 119 |
+
yield bt, t, bst, st
|
| 120 |
+
|
| 121 |
+
if base_result[0] and aether_result[0]:
|
| 122 |
+
bt, b_t, b_tk, b_ms = base_result[0]
|
| 123 |
+
at, a_t, a_tk, a_ms = aether_result[0]
|
| 124 |
+
yield bt, at, f"{b_tk} tokens in {b_t:.1f}s ({b_ms:.0f}ms/tok)", f"{a_tk} tokens in {a_t:.1f}s ({a_ms:.0f}ms/tok)"
|
|
|
|
| 125 |
|
| 126 |
|
| 127 |
CSS = """
|
|
|
|
| 139 |
#prompt-input > label > span { display: none !important; }
|
| 140 |
#prompt-input textarea { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; color: #fafafa !important; font-size: 1rem !important; padding: 1rem !important; }
|
| 141 |
#prompt-input textarea:focus { border-color: #3b82f6 !important; box-shadow: 0 0 0 2px rgba(59,130,246,0.1) !important; }
|
| 142 |
+
#gen-btn { background: #3b82f6 !important; border: none !important; border-radius: 8px !important; font-weight: 500 !important; font-size: 0.9rem !important; padding: 0.75rem 2rem !important; }
|
| 143 |
+
#gen-btn:hover { background: #2563eb !important; }
|
| 144 |
+
.prompt-chip { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 6px !important; color: #a1a1aa !important; font-size: 0.85rem !important; }
|
| 145 |
+
.prompt-chip:hover { border-color: #3b82f6 !important; color: #fafafa !important; }
|
| 146 |
#footer { text-align: center; padding: 2rem 0; border-top: 1px solid #1f1f23; margin-top: 2rem; }
|
| 147 |
#footer p { color: #52525b; font-size: 0.8rem; }
|
| 148 |
#footer a { color: #3b82f6; text-decoration: none; }
|
|
|
|
| 155 |
gr.HTML("""
|
| 156 |
<div id="hero">
|
| 157 |
<h1>The <span class="accent">Void</span></h1>
|
| 158 |
+
<p class="subtitle">PyTorch vs Aether. Same prompt. Different engines. Live inference.<br/>
|
| 159 |
+
Left: standard PyTorch CPU. Right: Aether -- pure JS + 14KB WASM SIMD, zero ML dependencies.</p>
|
| 160 |
</div>
|
| 161 |
""")
|
| 162 |
|
|
|
|
| 165 |
|
| 166 |
with gr.Row(equal_height=True):
|
| 167 |
with gr.Column():
|
| 168 |
+
gr.HTML('<p class="base-label">PyTorch (standard)</p>')
|
| 169 |
base_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"])
|
| 170 |
base_stats = gr.HTML('<p class="stats-text">--</p>')
|
| 171 |
with gr.Column(min_width=30):
|
| 172 |
gr.HTML('<p style="color:#27272a; text-align:center; padding-top:4rem; font-size:0.75rem; letter-spacing:0.1em;">VS</p>')
|
| 173 |
with gr.Column():
|
| 174 |
+
gr.HTML('<p class="void-label">Aether (our engine)</p>')
|
| 175 |
+
aether_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"])
|
| 176 |
+
aether_stats = gr.HTML('<p class="stats-text">--</p>')
|
| 177 |
|
| 178 |
def run_compare(prompt_text):
|
| 179 |
+
for bt, at, bs, ast in compare(prompt_text):
|
| 180 |
+
yield bt, at, f'<p class="stats-text">{bs}</p>', f'<p class="stats-text">{ast}</p>'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
+
btn.click(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
|
| 183 |
+
prompt.submit(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
|
| 184 |
|
| 185 |
gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
|
| 186 |
with gr.Row():
|
| 187 |
for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
|
| 188 |
gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
|
| 189 |
fn=lambda x=p: x, outputs=[prompt]
|
| 190 |
+
).then(fn=run_compare, inputs=[prompt], outputs=[base_out, aether_out, base_stats, aether_stats])
|
| 191 |
|
| 192 |
gr.HTML("""
|
| 193 |
<div id="footer">
|
| 194 |
<p style="color:#a1a1aa; font-size:0.85rem; margin-bottom:0.5rem;">
|
| 195 |
+
SmolLM2-360M · Buleyean RL ·
|
| 196 |
+
Left: PyTorch CPU (base model) · Right: Aether WASM-SIMD (Buleyean, zero ML deps)
|
| 197 |
</p>
|
| 198 |
<p>
|
| 199 |
<a href="https://forkracefold.com/">Whitepaper</a> ·
|
| 200 |
<a href="https://github.com/forkjoin-ai/buleyean-rl">Library</a> ·
|
| 201 |
+
<a href="https://huggingface.co/forkjoin-ai">Models</a>
|
|
|
|
|
|
|
|
|
|
| 202 |
</p>
|
| 203 |
<p style="margin-top:1rem;">500+ Lean 4 theorems · Zero sorry · <a href="https://forkracefold.com/">φ² = φ + 1</a></p>
|
| 204 |
</div>
|