Spaces:
Sleeping
Sleeping
Taylor committed on
Commit ·
e5e1d2b
1
Parent(s): 90bf42d
fix: remove missing WASM flashAttention + stream results independently
Browse files
1. flashAttentionMultiHead not in standalone WASM binary -- use JS
attention fallback (matVec is the real bottleneck, not attention)
2. Use Gradio generator pattern so PyTorch result shows immediately
when done, then Aether result shows when it finishes. No more
both flashing at the same time.
- aether-server.mjs +2 -13
- app.py +18 -15
aether-server.mjs
CHANGED
|
@@ -124,19 +124,8 @@ async function loadSIMD() {
|
|
| 124 |
wasm.resetHeap(saved);
|
| 125 |
return result;
|
| 126 |
},
|
| 127 |
-
flashAttentionMultiHead
|
| 128 |
-
|
| 129 |
-
const scale = 1.0 / Math.sqrt(headDim);
|
| 130 |
-
const qPtr = wasm.allocate(query.byteLength);
|
| 131 |
-
const kPtr = wasm.allocate(keys.byteLength);
|
| 132 |
-
const vPtr = wasm.allocate(values.byteLength);
|
| 133 |
-
const rPtr = wasm.allocate(numHeads * headDim * 4);
|
| 134 |
-
copyTo(qPtr, query); copyTo(kPtr, keys); copyTo(vPtr, values);
|
| 135 |
-
wasm.flashAttentionMultiHead(qPtr, kPtr, vPtr, rPtr, seqLen, numHeads, numKvHeads, headDim, scale);
|
| 136 |
-
const result = copyFrom(rPtr, numHeads * headDim);
|
| 137 |
-
wasm.resetHeap(saved);
|
| 138 |
-
return result;
|
| 139 |
-
},
|
| 140 |
};
|
| 141 |
} catch (e) {
|
| 142 |
console.warn(`[Aether] WASM SIMD failed: ${e.message}, using JS fallbacks`);
|
|
|
|
| 124 |
wasm.resetHeap(saved);
|
| 125 |
return result;
|
| 126 |
},
|
| 127 |
+
// flashAttentionMultiHead: not in standalone WASM -- use JS attention
|
| 128 |
+
flashAttentionMultiHead: null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
};
|
| 130 |
} catch (e) {
|
| 131 |
console.warn(`[Aether] WASM SIMD failed: ${e.message}, using JS fallbacks`);
|
app.py
CHANGED
|
@@ -108,17 +108,20 @@ def gen_aether(prompt):
|
|
| 108 |
|
| 109 |
|
| 110 |
def compare(prompt):
|
|
|
|
| 111 |
if not prompt or not prompt.strip():
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
-
# Run
|
| 115 |
base_text, base_time, base_toks, base_ms = gen_pytorch(prompt)
|
| 116 |
-
aether_text, aether_time, aether_toks, aether_ms = gen_aether(prompt)
|
| 117 |
-
|
| 118 |
base_stats = f"{base_toks} tokens in {base_time:.1f}s ({base_ms:.0f}ms/tok)"
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
|
| 124 |
CSS = """
|
|
@@ -193,13 +196,13 @@ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zi
|
|
| 193 |
aether_stats = gr.HTML('<p class="stats-text">--</p>')
|
| 194 |
|
| 195 |
def run_compare(prompt_text):
|
| 196 |
-
base_text, aether_text, b_stats, a_stats
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
|
| 204 |
btn.click(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
|
| 205 |
prompt.submit(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
|
|
@@ -208,8 +211,8 @@ with gr.Blocks(css=CSS, theme=gr.themes.Base(primary_hue="blue", neutral_hue="zi
|
|
| 208 |
with gr.Row():
|
| 209 |
for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
|
| 210 |
gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
|
| 211 |
-
fn=lambda x=p:
|
| 212 |
-
).then(fn=
|
| 213 |
|
| 214 |
gr.HTML("""
|
| 215 |
<div id="footer">
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
def compare(prompt):
|
| 111 |
+
"""Generator: yields results as each engine finishes."""
|
| 112 |
if not prompt or not prompt.strip():
|
| 113 |
+
yield "", "", "", ""
|
| 114 |
+
return
|
| 115 |
|
| 116 |
+
# Run PyTorch first, show immediately
|
| 117 |
base_text, base_time, base_toks, base_ms = gen_pytorch(prompt)
|
|
|
|
|
|
|
| 118 |
base_stats = f"{base_toks} tokens in {base_time:.1f}s ({base_ms:.0f}ms/tok)"
|
| 119 |
+
yield base_text, "generating...", base_stats, "running..."
|
| 120 |
|
| 121 |
+
# Then run Aether, show when done
|
| 122 |
+
aether_text, aether_time, aether_toks, aether_ms = gen_aether(prompt)
|
| 123 |
+
aether_stats = f"{aether_toks} tokens in {aether_time:.1f}s ({aether_ms:.0f}ms/tok)"
|
| 124 |
+
yield base_text, aether_text, base_stats, aether_stats
|
| 125 |
|
| 126 |
|
| 127 |
CSS = """
|
|
|
|
| 196 |
aether_stats = gr.HTML('<p class="stats-text">--</p>')
|
| 197 |
|
| 198 |
def run_compare(prompt_text):
|
| 199 |
+
for base_text, aether_text, b_stats, a_stats in compare(prompt_text):
|
| 200 |
+
yield (
|
| 201 |
+
base_text,
|
| 202 |
+
aether_text,
|
| 203 |
+
f'<p class="stats-text">{b_stats}</p>',
|
| 204 |
+
f'<p class="stats-text">{a_stats}</p>',
|
| 205 |
+
)
|
| 206 |
|
| 207 |
btn.click(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
|
| 208 |
prompt.submit(run_compare, [prompt], [base_out, aether_out, base_stats, aether_stats])
|
|
|
|
| 211 |
with gr.Row():
|
| 212 |
for p in ["hello", "How are you feeling?", "I've been anxious lately.", "Write a haiku about failure.", "What is the meaning of life?"]:
|
| 213 |
gr.Button(p, size="sm", elem_classes=["prompt-chip"]).click(
|
| 214 |
+
fn=lambda x=p: x, outputs=[prompt]
|
| 215 |
+
).then(fn=run_compare, inputs=[prompt], outputs=[base_out, aether_out, base_stats, aether_stats])
|
| 216 |
|
| 217 |
gr.HTML("""
|
| 218 |
<div id="footer">
|