""" Aether -- Pure Engine Speed Comparison Same model. Same prompt. Same tokens. Different engine. PyTorch CPU vs Aether WASM-SIMD. Let the ms/tok speak. """ import gradio as gr import torch import json import time import subprocess import urllib.request import urllib.error import select from concurrent.futures import ThreadPoolExecutor, as_completed from transformers import AutoModelForCausalLM, AutoTokenizer print("[Aether] Starting Aether sidecar...", flush=True) aether_proc = subprocess.Popen( ["node", "aether-server.mjs"], env={**__import__('os').environ, "AETHER_PORT": "7861"}, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) print("[Aether] Loading PyTorch model...", flush=True) tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-360M-Instruct") pytorch_model = AutoModelForCausalLM.from_pretrained( "HuggingFaceTB/SmolLM2-360M-Instruct", torch_dtype=torch.float32, device_map="cpu", ) print("[Aether] PyTorch ready.", flush=True) print("[Aether] Waiting for Aether engine...", flush=True) for attempt in range(180): try: req = urllib.request.Request("http://127.0.0.1:7861/health") resp = urllib.request.urlopen(req, timeout=2) health = json.loads(resp.read()) if health.get("status") == "ok" and health.get("model") == "loaded": print(f"[Aether] Engine ready ({health.get('loadTime')}ms, SIMD: {health.get('simd')})", flush=True) break except Exception: pass if aether_proc.stdout and select.select([aether_proc.stdout], [], [], 0)[0]: line = aether_proc.stdout.readline() if line: print(f" {line.decode().strip()}", flush=True) time.sleep(1) def gen_pytorch(prompt, max_tokens): messages = [{"role": "user", "content": prompt}] text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors="pt") t0 = time.perf_counter() with torch.no_grad(): outputs = pytorch_model.generate( **inputs, max_new_tokens=max_tokens, temperature=0.7, top_p=0.9, do_sample=True, pad_token_id=tokenizer.eos_token_id, ) elapsed = time.perf_counter() - t0 n = outputs.shape[1] - inputs["input_ids"].shape[1] text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip() return text, elapsed, n, (elapsed * 1000 / n) if n > 0 else 0 def gen_aether(prompt, max_tokens): try: data = json.dumps({"prompt": prompt, "max_tokens": max_tokens}).encode() req = urllib.request.Request("http://127.0.0.1:7861/generate", data=data, headers={"Content-Type": "application/json"}) resp = urllib.request.urlopen(req, timeout=600) r = json.loads(resp.read()) return r["text"], r["totalTimeMs"] / 1000, r["tokens"], r["avgTokenMs"] except urllib.error.HTTPError as e: body = e.read().decode() if e.fp else str(e) try: detail = json.loads(body).get("error", body[:300]) except Exception: detail = body[:300] return f"[Error: {detail}]", 0, 0, 0 except Exception as e: return f"[Error: {e}]", 0, 0, 0 def compare(prompt, max_tokens): empty = ("", "", "", "") if not prompt or not prompt.strip(): yield empty return max_tokens = int(max_tokens) pt_result = [None] ae_result = [None] def run_pt(): pt_result[0] = gen_pytorch(prompt, max_tokens) def run_ae(): ae_result[0] = gen_aether(prompt, max_tokens) def fmt(r): if not r: return "running..." return f"{r[2]} tokens in {r[1]:.1f}s ({r[3]:.0f}ms/tok)" def build(): pt, ae = pt_result[0], ae_result[0] return ( pt[0] if pt else "generating...", ae[0] if ae else "generating...", fmt(pt), fmt(ae), ) with ThreadPoolExecutor(max_workers=2) as pool: futures = {pool.submit(run_pt): "pt", pool.submit(run_ae): "ae"} for future in as_completed(futures): future.result() yield build() yield build() CSS = """ .gradio-container { max-width: 1060px !important; margin: 0 auto !important; } .gradio-container, .dark { background: #09090b !important; } #hero { text-align: center; padding: 2rem 0 1rem; } #hero h1 { font-size: 2.5rem; font-weight: 300; letter-spacing: -0.02em; color: #fafafa; margin: 0; } #hero .accent { color: #06b6d4; } #hero .subtitle { color: #71717a; font-size: 0.95rem; margin-top: 0.5rem; } .response-card { background: #0c0c0f !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; } .response-card textarea { background: #0c0c0f !important; border: none !important; color: #e4e4e7 !important; font-size: 0.95rem !important; line-height: 1.6 !important; } .pt-label { color: #71717a !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; } .ae-label { color: #06b6d4 !important; font-size: 0.8rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; font-weight: 500 !important; } .stats-text { font-family: 'SF Mono', 'Fira Code', monospace !important; font-size: 0.85rem !important; color: #52525b !important; } #prompt-input > label > span { display: none !important; } #prompt-input textarea { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 8px !important; color: #fafafa !important; font-size: 1rem !important; padding: 1rem !important; } #prompt-input textarea:focus { border-color: #06b6d4 !important; } #gen-btn { background: #06b6d4 !important; border: none !important; border-radius: 8px !important; font-weight: 500 !important; font-size: 0.9rem !important; padding: 0.75rem 2rem !important; color: #09090b !important; } .prompt-chip { background: #111114 !important; border: 1px solid #1f1f23 !important; border-radius: 6px !important; color: #a1a1aa !important; font-size: 0.85rem !important; } .prompt-chip:hover { border-color: #06b6d4 !important; color: #fafafa !important; } #footer { text-align: center; padding: 2rem 0; border-top: 1px solid #1f1f23; margin-top: 2rem; } #footer p { color: #52525b; font-size: 0.8rem; } #footer a { color: #06b6d4; text-decoration: none; } footer.svelte-1ax1toq { display: none !important; } .built-with { display: none !important; } """ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="cyan", neutral_hue="zinc"), title="Aether") as demo: gr.HTML("""
Pure engine speed comparison. Same model (SmolLM2-360M-Instruct). Same prompt. Same tokens.
Left: PyTorch CPU (2.8GB runtime, CUDA/MKL optimized).
Right: Aether (14KB WASM binary, pure JS + SIMD128, zero ML dependencies).
Both generate in parallel. Whichever finishes first shows first.
PyTorch CPU (standard)
') pt_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"]) pt_stats = gr.HTML('--
') with gr.Column(min_width=30): gr.HTML('VS
') with gr.Column(): gr.HTML('Aether WASM-SIMD (14KB)
') ae_out = gr.Textbox(lines=10, show_label=False, interactive=False, elem_classes=["response-card"]) ae_stats = gr.HTML('--
') outputs = [pt_out, ae_out, pt_stats, ae_stats] inputs = [prompt, max_tok] def run(p, mt): for pt, ae, ps, aes in compare(p, mt): yield pt, ae, f'{ps}
', f'{aes}
' btn.click(run, inputs, outputs) prompt.submit(run, inputs, outputs) gr.HTML('Try these:
') with gr.Row(): for p in ["hello", "What is the shape of failure?", "Write a haiku about parallel universes.", "Explain entropy to a five-year-old."]: gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click( fn=lambda x=p: x, outputs=[prompt] ).then(fn=run, inputs=inputs, outputs=outputs) gr.HTML(""" """) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)