Taylor committed on
Commit
382cccc
Β·
1 Parent(s): a902eca

feat: dual model support -- Buleyean (default) vs base, user toggle

Browse files

Loads both SmolLM2-360M models at startup:
- buleyean: void-trained (forkjoin-ai/buleyean-smollm2-360m Q8_0)
- base: standard instruct (bartowski/SmolLM2-360M-Instruct Q8_0)

Radio toggle lets users pick which model to run. Buleyean default.
Same model feeds both standard and glossolalia decoders.
Act 3 (Metacog) builds on this same dual-model + glossolalia foundation.

Files changed (2) hide show
  1. aether-server.mjs +48 -21
  2. app.py +16 -12
aether-server.mjs CHANGED
@@ -105,24 +105,34 @@ function applyRoPE(x, headDim, position, theta) {
105
  }
106
  }
107
 
108
- // ─── Model ──────────────────────────────────────────────────────────────────
109
- let model = null;
110
- function loadModel(ggufPath, tokPath) {
 
 
 
111
  const t0=Date.now();const buf=readFileSync(ggufPath);const parsed=parseGGUF(buf);
112
  console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now()-t0}ms`);
113
  const tokenizer=new Tok(JSON.parse(readFileSync(tokPath,'utf8')));
114
  const byName={};for(const t of parsed.tensors)byName[t.name]=t;
115
- function get(name){const t=byName[name];if(!t)return null;const raw=new Uint8Array(buf.buffer,buf.byteOffset+parsed.dataOffset+Number(t.offset),t.size);return dequantByType(raw,t.numElements,t.type);}
116
  console.log('[Aether] Dequantizing...');const tokenEmbd=get('token_embd.weight');const layers=[];
117
  for(let i=0;i<C.numLayers;i++){if(i%8===0)console.log(`[Aether] Layer ${i}/${C.numLayers}`);layers.push({an:get(`blk.${i}.attn_norm.weight`),fn:get(`blk.${i}.ffn_norm.weight`),qw:get(`blk.${i}.attn_q.weight`),kw:get(`blk.${i}.attn_k.weight`),vw:get(`blk.${i}.attn_v.weight`),ow:get(`blk.${i}.attn_output.weight`),gw:get(`blk.${i}.ffn_gate.weight`),uw:get(`blk.${i}.ffn_up.weight`),dw:get(`blk.${i}.ffn_down.weight`)});}
118
  const outNorm=get('output_norm.weight');let outWeight=get('output.weight');if(!outWeight){console.log('[Aether] Tied embeddings');outWeight=tokenEmbd;}
119
- console.log(`[Aether] Loaded in ${((Date.now()-t0)/1000).toFixed(1)}s`);
120
- model={tokenEmbd,layers,outNorm,outWeight,tokenizer,loadTime:Date.now()-t0};
 
 
 
 
 
 
121
  }
122
 
123
  // ─── Forward Pass (returns raw logits) ──────────────────────────────────────
124
- function forwardPass(prompt) {
125
  const o = op();
 
126
  const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
127
  const inputTokens = model.tokenizer.encode(chatPrompt);
128
  const allTokens = [...inputTokens];
@@ -247,9 +257,10 @@ function sampleGlossolalia(logits) {
247
 
248
  // ─── Generation Loops ───────────────────────────────────────────────────────
249
 
250
- function generateStandard(prompt, maxTokens = 8192) {
251
  const t0 = performance.now();
252
- const fwd = forwardPass(prompt);
 
253
  const allTokens = [...fwd.inputTokens];
254
  const kvC = Array.from({length:C.numLayers},()=>({k:[],v:[]}));
255
  const tokenTimes = [];
@@ -301,9 +312,10 @@ function generateStandard(prompt, maxTokens = 8192) {
301
  };
302
  }
303
 
304
- function generateGlossolalia(prompt, maxTokens = 8192) {
305
  const t0 = performance.now();
306
- const fwd = forwardPass(prompt);
 
307
  const allTokens = [...fwd.inputTokens];
308
  const kvC = Array.from({length:C.numLayers},()=>({k:[],v:[]}));
309
  const tokenTimes = [];
@@ -364,8 +376,8 @@ const server = createServer((req, res) => {
364
  req.on('data', c => body += c);
365
  req.on('end', () => {
366
  try {
367
- const { prompt, max_tokens } = JSON.parse(body);
368
- const result = genFn(prompt, max_tokens || 256);
369
  res.writeHead(200, { 'Content-Type': 'application/json' });
370
  res.end(JSON.stringify(result));
371
  } catch (e) {
@@ -380,26 +392,41 @@ const server = createServer((req, res) => {
380
  else if (req.method==='POST' && req.url==='/generate-glossolalia') handle(generateGlossolalia);
381
  else if (req.url==='/health') {
382
  res.writeHead(200,{'Content-Type':'application/json'});
383
- res.end(JSON.stringify({status:'ok',model:model?'loaded':'not loaded',simd:!!simd,loadTime:model?.loadTime}));
384
  } else { res.writeHead(404); res.end(); }
385
  });
386
 
387
  // ─── Main ───────────────────────────────────────────────────────────────────
388
- const ggufPath='/tmp/hf_cache/smollm2-360m-q8_0.gguf';
389
- const tokPath='/tmp/hf_cache/tokenizer.json';
 
390
 
391
  async function main() {
392
  simd = await loadSIMD();
393
- if (!existsSync(ggufPath)) {
394
- console.log('[Aether] Downloading Q8_0 GGUF...');
395
- execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('bartowski/SmolLM2-360M-Instruct-GGUF', 'SmolLM2-360M-Instruct-Q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache'); import shutil; shutil.move('/tmp/hf_cache/SmolLM2-360M-Instruct-Q8_0.gguf', '${ggufPath}')"`, { stdio: 'inherit' });
 
 
396
  }
 
 
 
 
 
 
 
 
397
  if (!existsSync(tokPath)) {
398
  console.log('[Aether] Downloading tokenizer...');
399
  execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
400
  }
401
- loadModel(ggufPath, tokPath);
402
- server.listen(PORT,'127.0.0.1',()=>console.log(`[Aether] http://127.0.0.1:${PORT} (SIMD: ${!!simd})`));
 
 
 
 
403
  }
404
 
405
  main().catch(e => { console.error('[Aether] Fatal:', e); process.exit(1); });
 
105
  }
106
  }
107
 
108
+ // ─── Models ─────────────────────────────────────────────────────────────────
109
+ const models = {};
110
+ let activeModel = null;
111
+
112
+ function loadModel(name, ggufPath, tokPath) {
113
+ console.log(`[Aether] Loading ${name}...`);
114
  const t0=Date.now();const buf=readFileSync(ggufPath);const parsed=parseGGUF(buf);
115
  console.log(`[Aether] Parsed ${parsed.tensors.length} tensors in ${Date.now()-t0}ms`);
116
  const tokenizer=new Tok(JSON.parse(readFileSync(tokPath,'utf8')));
117
  const byName={};for(const t of parsed.tensors)byName[t.name]=t;
118
+ function get(nm){const t=byName[nm];if(!t)return null;const raw=new Uint8Array(buf.buffer,buf.byteOffset+parsed.dataOffset+Number(t.offset),t.size);return dequantByType(raw,t.numElements,t.type);}
119
  console.log('[Aether] Dequantizing...');const tokenEmbd=get('token_embd.weight');const layers=[];
120
  for(let i=0;i<C.numLayers;i++){if(i%8===0)console.log(`[Aether] Layer ${i}/${C.numLayers}`);layers.push({an:get(`blk.${i}.attn_norm.weight`),fn:get(`blk.${i}.ffn_norm.weight`),qw:get(`blk.${i}.attn_q.weight`),kw:get(`blk.${i}.attn_k.weight`),vw:get(`blk.${i}.attn_v.weight`),ow:get(`blk.${i}.attn_output.weight`),gw:get(`blk.${i}.ffn_gate.weight`),uw:get(`blk.${i}.ffn_up.weight`),dw:get(`blk.${i}.ffn_down.weight`)});}
121
  const outNorm=get('output_norm.weight');let outWeight=get('output.weight');if(!outWeight){console.log('[Aether] Tied embeddings');outWeight=tokenEmbd;}
122
+ const loadTime=Date.now()-t0;
123
+ console.log(`[Aether] ${name} loaded in ${(loadTime/1000).toFixed(1)}s`);
124
+ models[name]={tokenEmbd,layers,outNorm,outWeight,tokenizer,loadTime,name};
125
+ return models[name];
126
+ }
127
+
128
+ function getModel(name) {
129
+ return models[name] || models['base'] || Object.values(models)[0];
130
  }
131
 
132
  // ─── Forward Pass (returns raw logits) ──────────────────────────────────────
133
+ function forwardPass(prompt, modelName) {
134
  const o = op();
135
+ const model = getModel(modelName);
136
  const chatPrompt = `<|im_start|>user\n${prompt}<|im_end|>\n<|im_start|>assistant\n`;
137
  const inputTokens = model.tokenizer.encode(chatPrompt);
138
  const allTokens = [...inputTokens];
 
257
 
258
  // ─── Generation Loops ───────────────────────────────────────────────────────
259
 
260
+ function generateStandard(prompt, maxTokens = 8192, modelName = 'buleyean') {
261
  const t0 = performance.now();
262
+ const model = getModel(modelName);
263
+ const fwd = forwardPass(prompt, modelName);
264
  const allTokens = [...fwd.inputTokens];
265
  const kvC = Array.from({length:C.numLayers},()=>({k:[],v:[]}));
266
  const tokenTimes = [];
 
312
  };
313
  }
314
 
315
+ function generateGlossolalia(prompt, maxTokens = 8192, modelName = 'buleyean') {
316
  const t0 = performance.now();
317
+ const model = getModel(modelName);
318
+ const fwd = forwardPass(prompt, modelName);
319
  const allTokens = [...fwd.inputTokens];
320
  const kvC = Array.from({length:C.numLayers},()=>({k:[],v:[]}));
321
  const tokenTimes = [];
 
376
  req.on('data', c => body += c);
377
  req.on('end', () => {
378
  try {
379
+ const { prompt, max_tokens, model } = JSON.parse(body);
380
+ const result = genFn(prompt, max_tokens || 256, model || 'buleyean');
381
  res.writeHead(200, { 'Content-Type': 'application/json' });
382
  res.end(JSON.stringify(result));
383
  } catch (e) {
 
392
  else if (req.method==='POST' && req.url==='/generate-glossolalia') handle(generateGlossolalia);
393
  else if (req.url==='/health') {
394
  res.writeHead(200,{'Content-Type':'application/json'});
395
+ res.end(JSON.stringify({status:'ok',models:Object.keys(models),simd:!!simd,loadTimes:Object.fromEntries(Object.entries(models).map(([k,v])=>[k,v.loadTime]))}));
396
  } else { res.writeHead(404); res.end(); }
397
  });
398
 
399
  // ─── Main ───────────────────────────────────────────────────────────────────
400
+ const basePath = '/tmp/hf_cache/smollm2-360m-q8_0.gguf';
401
+ const bulePath = '/tmp/hf_cache/buleyean-smollm2-360m-q8_0.gguf';
402
+ const tokPath = '/tmp/hf_cache/tokenizer.json';
403
 
404
  async function main() {
405
  simd = await loadSIMD();
406
+
407
+ // Download base model
408
+ if (!existsSync(basePath)) {
409
+ console.log('[Aether] Downloading base SmolLM2-360M Q8_0...');
410
+ execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('bartowski/SmolLM2-360M-Instruct-GGUF', 'SmolLM2-360M-Instruct-Q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache'); import shutil; shutil.move('/tmp/hf_cache/SmolLM2-360M-Instruct-Q8_0.gguf', '${basePath}')"`, { stdio: 'inherit' });
411
  }
412
+
413
+ // Download Buleyean model
414
+ if (!existsSync(bulePath)) {
415
+ console.log('[Aether] Downloading Buleyean SmolLM2-360M Q8_0...');
416
+ execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('forkjoin-ai/buleyean-smollm2-360m', 'buleyean-smollm2-360m-q8_0.gguf', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
417
+ }
418
+
419
+ // Download tokenizer
420
  if (!existsSync(tokPath)) {
421
  console.log('[Aether] Downloading tokenizer...');
422
  execSync(`python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download('HuggingFaceTB/SmolLM2-360M-Instruct', 'tokenizer.json', cache_dir='/tmp/hf_cache', local_dir='/tmp/hf_cache')"`, { stdio: 'inherit' });
423
  }
424
+
425
+ // Load both models
426
+ loadModel('base', basePath, tokPath);
427
+ loadModel('buleyean', bulePath, tokPath);
428
+
429
+ server.listen(PORT, '127.0.0.1', () => console.log(`[Aether] http://127.0.0.1:${PORT} (SIMD: ${!!simd}, models: ${Object.keys(models).join(', ')})`));
430
  }
431
 
432
  main().catch(e => { console.error('[Aether] Fatal:', e); process.exit(1); });
app.py CHANGED
@@ -30,7 +30,7 @@ for attempt in range(180):
30
  req = urllib.request.Request("http://127.0.0.1:7861/health")
31
  resp = urllib.request.urlopen(req, timeout=2)
32
  health = json.loads(resp.read())
33
- if health.get("status") == "ok" and health.get("model") == "loaded":
34
  print(f"[Glossolalia] Aether ready ({health.get('loadTime')}ms, SIMD: {health.get('simd')})", flush=True)
35
  break
36
  except Exception:
@@ -44,9 +44,9 @@ else:
44
  print("[Glossolalia] WARNING: Aether not ready after 180s", flush=True)
45
 
46
 
47
- def call_aether(endpoint, prompt, max_tokens=256):
48
  try:
49
- data = json.dumps({"prompt": prompt, "max_tokens": max_tokens}).encode()
50
  req = urllib.request.Request(
51
  f"http://127.0.0.1:7861/{endpoint}", data=data,
52
  headers={"Content-Type": "application/json"},
@@ -125,7 +125,7 @@ def format_layer_health(diag_list):
125
 
126
  # ─── Compare Function ────────────────────────────────────────────────────────
127
 
128
- def compare(prompt, max_tokens):
129
  empty = ("", "", "", "", "", "", "")
130
  if not prompt or not prompt.strip():
131
  yield empty
@@ -136,9 +136,9 @@ def compare(prompt, max_tokens):
136
  glo_result = [None]
137
 
138
  def run_std():
139
- std_result[0] = call_aether("generate-standard", prompt, max_tokens)
140
  def run_glo():
141
- glo_result[0] = call_aether("generate-glossolalia", prompt, max_tokens)
142
 
143
  def fmt_stats(r):
144
  if not r:
@@ -209,7 +209,9 @@ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="purple", neutral_hue="
209
 
210
  with gr.Row():
211
  prompt = gr.Textbox(elem_id="prompt-input", placeholder="What is the shape of failure?", lines=2, label="Prompt", show_label=False, interactive=True, scale=4)
212
- max_tok = gr.Slider(minimum=8, maximum=512, value=64, step=1, label="Max tokens", scale=1)
 
 
213
 
214
  btn = gr.Button("Generate", elem_id="gen-btn", variant="primary")
215
 
@@ -236,20 +238,22 @@ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="purple", neutral_hue="
236
 
237
  outputs = [std_out, glo_out, std_stats, glo_stats, glo_diag, std_diag, layer_health]
238
 
239
- def run(prompt_text, max_tokens):
240
- for vals in compare(prompt_text, max_tokens):
 
 
241
  st, gt, ss, gs, gd, sd, lh = vals
242
  yield st, gt, f'<p class="stats-text">{ss}</p>', f'<p class="stats-text">{gs}</p>', gd, sd, lh
243
 
244
- btn.click(run, [prompt, max_tok], outputs)
245
- prompt.submit(run, [prompt, max_tok], outputs)
246
 
247
  gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
248
  with gr.Row():
249
  for p in ["What is the shape of failure?", "The theory of everything begins with", "If silence had a color", "Write a haiku about parallel universes"]:
250
  gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
251
  fn=lambda x=p: x, outputs=[prompt]
252
- ).then(fn=run, inputs=[prompt, max_tok], outputs=outputs)
253
 
254
  gr.HTML("""
255
  <div id="footer">
 
30
  req = urllib.request.Request("http://127.0.0.1:7861/health")
31
  resp = urllib.request.urlopen(req, timeout=2)
32
  health = json.loads(resp.read())
33
+ if health.get("status") == "ok" and health.get("models"):
34
  print(f"[Glossolalia] Aether ready ({health.get('loadTimes')}, SIMD: {health.get('simd')})", flush=True)
35
  break
36
  except Exception:
 
44
  print("[Glossolalia] WARNING: Aether not ready after 180s", flush=True)
45
 
46
 
47
+ def call_aether(endpoint, prompt, max_tokens=256, model_name="buleyean"):
48
  try:
49
+ data = json.dumps({"prompt": prompt, "max_tokens": max_tokens, "model": model_name}).encode()
50
  req = urllib.request.Request(
51
  f"http://127.0.0.1:7861/{endpoint}", data=data,
52
  headers={"Content-Type": "application/json"},
 
125
 
126
  # ─── Compare Function ────────────────────────────────────────────────────────
127
 
128
+ def compare(prompt, max_tokens, model_name="buleyean"):
129
  empty = ("", "", "", "", "", "", "")
130
  if not prompt or not prompt.strip():
131
  yield empty
 
136
  glo_result = [None]
137
 
138
  def run_std():
139
+ std_result[0] = call_aether("generate-standard", prompt, max_tokens, model_name)
140
  def run_glo():
141
+ glo_result[0] = call_aether("generate-glossolalia", prompt, max_tokens, model_name)
142
 
143
  def fmt_stats(r):
144
  if not r:
 
209
 
210
  with gr.Row():
211
  prompt = gr.Textbox(elem_id="prompt-input", placeholder="What is the shape of failure?", lines=2, label="Prompt", show_label=False, interactive=True, scale=4)
212
+ with gr.Column(scale=1):
213
+ model_choice = gr.Radio(choices=["buleyean", "base"], value="buleyean", label="Model", info="Buleyean = void-trained, Base = standard instruct")
214
+ max_tok = gr.Slider(minimum=8, maximum=512, value=64, step=1, label="Max tokens")
215
 
216
  btn = gr.Button("Generate", elem_id="gen-btn", variant="primary")
217
 
 
238
 
239
  outputs = [std_out, glo_out, std_stats, glo_stats, glo_diag, std_diag, layer_health]
240
 
241
+ inputs = [prompt, max_tok, model_choice]
242
+
243
+ def run(prompt_text, max_tokens, model_name):
244
+ for vals in compare(prompt_text, max_tokens, model_name):
245
  st, gt, ss, gs, gd, sd, lh = vals
246
  yield st, gt, f'<p class="stats-text">{ss}</p>', f'<p class="stats-text">{gs}</p>', gd, sd, lh
247
 
248
+ btn.click(run, inputs, outputs)
249
+ prompt.submit(run, inputs, outputs)
250
 
251
  gr.HTML('<p style="color:#52525b; font-size:0.8rem; margin-top:1.5rem; margin-bottom:0.5rem;">Try these:</p>')
252
  with gr.Row():
253
  for p in ["What is the shape of failure?", "The theory of everything begins with", "If silence had a color", "Write a haiku about parallel universes"]:
254
  gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
255
  fn=lambda x=p: x, outputs=[prompt]
256
+ ).then(fn=run, inputs=inputs, outputs=outputs)
257
 
258
  gr.HTML("""
259
  <div id="footer">