grapheneaffiliates committed on
Commit
06e4588
·
verified ·
1 Parent(s): 26849c5

Upload python/benchmark_h4_vs_softmax.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. python/benchmark_h4_vs_softmax.py +373 -0
python/benchmark_h4_vs_softmax.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark: H4 geometric attention vs standard softmax attention.
3
+
4
+ Compares wall-clock time, peak memory, and attention score quality
5
+ at various context lengths to find the empirical crossover point
6
+ where H4's O(log t) chamber lookup beats softmax's O(t^2) matmul.
7
+
8
+ Now includes Rust-accelerated backend (h4_rust) when available.
9
+ """
10
+
11
+ import math
12
+ import time
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ import numpy as np
17
+ import sys
18
+ import os
19
+
20
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
+
22
+ from h4_hybrid_attention import H4AttentionLayer
23
+ from utils.chamber_index import compute_chamber_ids
24
+
25
+ # Rust backend detection
26
+ try:
27
+ import h4_rust
28
+ RUST_AVAILABLE = True
29
+ except ImportError:
30
+ RUST_AVAILABLE = False
31
+
32
+
33
class SoftmaxAttentionLayer(nn.Module):
    """Standard multi-head scaled dot-product attention for comparison.

    Applies a causal (upper-triangular) mask so only past positions are
    attended, matching an autoregressive setting.

    Args:
        d_model: Input/output embedding dimension; must be divisible by n_heads.
        n_heads: Number of attention heads.
        d_value: Per-head value dimension (decoupled from d_model // n_heads).
        dropout: Dropout probability applied to the attention weights
            (inactive under ``eval()`` or when 0.0, so the default preserves
            previous behavior).
    """

    def __init__(self, d_model: int, n_heads: int = 8, d_value: int = 16, dropout: float = 0.0):
        super().__init__()
        # Fail fast on an invalid head configuration instead of silently
        # truncating d_head via integer division.
        if d_model % n_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.d_value = d_value
        self.scale = 1.0 / math.sqrt(self.d_head)

        self.W_q = nn.Linear(d_model, self.d_head * n_heads, bias=False)
        self.W_k = nn.Linear(d_model, self.d_head * n_heads, bias=False)
        self.W_v = nn.Linear(d_model, d_value * n_heads, bias=False)
        self.W_out = nn.Linear(d_value * n_heads, d_model, bias=False)
        # BUG FIX: the original accepted `dropout` but never used it.
        self.attn_dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Causal multi-head attention: (B, T, d_model) -> (B, T, d_model)."""
        B, T, D = x.shape
        # Project and reshape to (B, heads, T, head_dim).
        Q = self.W_q(x).view(B, T, self.n_heads, self.d_head).permute(0, 2, 1, 3)
        K = self.W_k(x).view(B, T, self.n_heads, self.d_head).permute(0, 2, 1, 3)
        V = self.W_v(x).view(B, T, self.n_heads, self.d_value).permute(0, 2, 1, 3)

        # (B, H, T, T) scores; mask out future positions (strict upper triangle).
        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        mask = torch.triu(torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1)
        scores.masked_fill_(mask.unsqueeze(0).unsqueeze(0), float('-inf'))

        attn = self.attn_dropout(F.softmax(scores, dim=-1))
        out = torch.matmul(attn, V)
        # Merge heads back: (B, T, n_heads * d_value) -> (B, T, d_model).
        out = out.permute(0, 2, 1, 3).reshape(B, T, -1)
        return self.W_out(out)
62
+
63
+
64
def benchmark_forward_pass(layer, x, n_warmup=2, n_runs=5, **kwargs):
    """Time forward pass, return mean and std in milliseconds."""
    # Untimed warmup calls (lazy initialization, caches, autotuning).
    for _ in range(n_warmup):
        layer(x, **kwargs)

    samples = []
    for _ in range(n_runs):
        start = time.perf_counter()
        layer(x, **kwargs)
        samples.append((time.perf_counter() - start) * 1000)

    return np.mean(samples), np.std(samples)
77
+
78
+
79
def benchmark_rust_topk(keys_np, queries_np, k, n_warmup=2, n_runs=5):
    """
    Benchmark Rust h4_rust.query_topk on raw numpy arrays.
    Returns mean and std in milliseconds, or (None, None) if the Rust
    backend is unavailable.
    """
    if not RUST_AVAILABLE:
        return None, None

    # The Rust binding expects float64 input.
    keys64 = keys_np.astype(np.float64)
    queries64 = queries_np.astype(np.float64)

    # Untimed warmup calls.
    for _ in range(n_warmup):
        h4_rust.query_topk(keys64, queries64, k)

    elapsed = []
    for _ in range(n_runs):
        start = time.perf_counter()
        h4_rust.query_topk(keys64, queries64, k)
        elapsed.append((time.perf_counter() - start) * 1000)

    return np.mean(elapsed), np.std(elapsed)
102
+
103
+
104
def benchmark_numpy_topk(keys_np, queries_np, k, n_warmup=2, n_runs=5):
    """
    Benchmark pure-numpy brute-force top-k for comparison.
    Returns mean and std in milliseconds.
    """
    keys = keys_np.astype(np.float64)
    queries = queries_np.astype(np.float64)

    def _unit_rows(mat):
        # Row-normalize, guarding against division by (near-)zero norms.
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        norms[norms < 1e-12] = 1.0
        return mat / norms

    keys_normed = _unit_rows(keys)
    queries_normed = _unit_rows(queries)

    def _topk_once():
        # Deliberately brute-force: full dot-product matrix + full argsort.
        dots = queries_normed @ keys_normed.T
        return np.argsort(-dots, axis=1)[:, :k]

    # Untimed warmup iterations.
    for _ in range(n_warmup):
        _topk_once()

    elapsed = []
    for _ in range(n_runs):
        start = time.perf_counter()
        _topk_once()
        elapsed.append((time.perf_counter() - start) * 1000)

    return np.mean(elapsed), np.std(elapsed)
135
+
136
+
137
def compare_attention_patterns(h4_layer, softmax_layer, x):
    """
    Compare attention score distributions between H4 and softmax.
    Returns correlation coefficient.
    """
    B, T, D = x.shape

    out_h4 = h4_layer(x, use_tree=False)
    out_sm = softmax_layer(x)

    flat_h4 = out_h4.detach().flatten()
    flat_sm = out_sm.detach().flatten()

    # A (near-)constant output makes Pearson correlation undefined; report 0.
    if flat_h4.std() < 1e-8 or flat_sm.std() < 1e-8:
        return 0.0

    return torch.corrcoef(torch.stack([flat_h4, flat_sm]))[0, 1].item()
155
+
156
+
157
def main():
    """Run the three-part H4 vs. softmax benchmark suite and print a report.

    Part 1 times full attention layers (softmax vs. H4 with and without the
    chamber tree). Part 2 times raw top-k queries (Rust vs. NumPy). Part 3
    times chamber-index computation (Rust vs. torch). Parts 2 and 3 are
    skipped when the optional ``h4_rust`` extension is not importable.
    """
    # Fixed seeds so repeated runs generate identical inputs.
    torch.manual_seed(42)
    np.random.seed(42)

    # Shared model / benchmark hyperparameters.
    d_model = 64
    n_heads = 8
    d_value = 16
    batch_size = 1
    top_k = 32

    # Part 1 uses the full H4 attention layer (Python tree), so keep lengths moderate
    layer_seq_lengths = [64, 128, 256, 512, 1024]

    # Part 2 tests raw Rust top-k at extended lengths
    rust_seq_lengths = [512, 1024, 2048, 4096, 8192, 16384]

    print("=" * 100)
    print("H4 Geometric Attention vs Standard Softmax Attention -- Benchmark")
    print("=" * 100)
    print(f"d_model={d_model}, n_heads={n_heads}, d_value={d_value}, batch_size={batch_size}, top_k={top_k}")
    print(f"Rust backend (h4_rust): {'AVAILABLE' if RUST_AVAILABLE else 'NOT AVAILABLE (install with: cd rust && maturin develop --release)'}")
    print()

    # Create layers
    h4_layer = H4AttentionLayer(d_model, n_heads, d_value, top_k=top_k)
    softmax_layer = SoftmaxAttentionLayer(d_model, n_heads, d_value)

    # Eval mode: disables dropout/training-only behavior during timing.
    h4_layer.eval()
    softmax_layer.eval()

    # ============================================================
    # Part 1: Full attention layer benchmark (softmax vs H4)
    # ============================================================
    print("-" * 100)
    print("PART 1: Full Attention Layer Forward Pass (ms)")
    print("-" * 100)

    results = []

    header = f"{'seq_len':>8} | {'softmax_ms':>12} | {'h4_full_ms':>12} | {'h4_tree_ms':>12} | {'tree/full':>10} | {'corr':>8}"
    print(header)
    print("-" * len(header))

    for T in layer_seq_lengths:
        x = torch.randn(batch_size, T, d_model)

        with torch.no_grad():
            sm_mean, sm_std = benchmark_forward_pass(softmax_layer, x)
            h4_full_mean, h4_full_std = benchmark_forward_pass(h4_layer, x, use_tree=False)

            # Tree mode is slow in Python, so only time it beyond the
            # shortest length and with fewer runs.
            if T > 64:
                h4_tree_mean, h4_tree_std = benchmark_forward_pass(h4_layer, x, use_tree=True, n_runs=3)
            else:
                h4_tree_mean = h4_full_mean
                h4_tree_std = h4_full_std

            corr = compare_attention_patterns(h4_layer, softmax_layer, x)
        # Guard against division by ~0 for sub-millisecond timings.
        ratio = h4_tree_mean / max(h4_full_mean, 0.001)

        print(f"{T:8d} | {sm_mean:10.1f}+/-{sm_std:3.1f} | {h4_full_mean:10.1f}+/-{h4_full_std:3.1f} | {h4_tree_mean:10.1f}+/-{h4_tree_std:3.1f} | {ratio:10.3f} | {corr:8.4f}")

        results.append({
            'seq_len': T,
            'softmax_ms': sm_mean,
            'h4_full_ms': h4_full_mean,
            'h4_tree_ms': h4_tree_mean,
            'tree_vs_full_ratio': ratio,
            'output_correlation': corr,
        })

    # ============================================================
    # Part 2: Raw top-k benchmark (Rust vs NumPy)
    # ============================================================
    print()
    print("-" * 100)
    print("PART 2: Raw Top-k Query Benchmark — Rust h4_rust vs NumPy (ms)")
    print(" (One attention head: n_queries=64 queries against n_keys keys, k=32)")
    print("-" * 100)

    n_queries = 64
    k = 32

    if RUST_AVAILABLE:
        header2 = f"{'n_keys':>8} | {'numpy_ms':>12} | {'rust_ms':>12} | {'speedup':>10}"
        print(header2)
        print("-" * len(header2))

        rust_results = []
        for T in rust_seq_lengths:
            # 4-d vectors: H4 operates in R^4.
            keys_np = np.random.randn(T, 4).astype(np.float64)
            queries_np = np.random.randn(n_queries, 4).astype(np.float64)

            np_mean, np_std = benchmark_numpy_topk(keys_np, queries_np, k)
            rust_mean, rust_std = benchmark_rust_topk(keys_np, queries_np, k)

            speedup = np_mean / max(rust_mean, 0.001) if rust_mean else 0.0

            print(f"{T:8d} | {np_mean:10.3f}+/-{np_std:3.3f} | {rust_mean:10.3f}+/-{rust_std:3.3f} | {speedup:9.1f}x")

            rust_results.append({
                'n_keys': T,
                'numpy_ms': np_mean,
                'rust_ms': rust_mean,
                'speedup': speedup,
            })
    else:
        print(" [SKIPPED] Rust backend not available.")
        print(" Install with: cd rust && maturin develop --release")
        rust_results = []

    # ============================================================
    # Part 3: Chamber index computation benchmark
    # ============================================================
    print()
    print("-" * 100)
    print("PART 3: Chamber Index Computation — Rust vs NumPy (ms)")
    print("-" * 100)

    if RUST_AVAILABLE:
        roots = h4_rust.get_simple_roots()  # (4, 4) f64
        header3 = f"{'n_vectors':>10} | {'numpy_ms':>12} | {'rust_ms':>12} | {'speedup':>10}"
        print(header3)
        print("-" * len(header3))

        for n_vecs in [1000, 10000, 100000]:
            vecs = np.random.randn(n_vecs, 4).astype(np.float64)
            roots_torch = torch.from_numpy(roots).float()

            # NumPy/torch chamber IDs
            vecs_torch = torch.from_numpy(vecs).float()
            # Warmup
            for _ in range(2):
                _ = compute_chamber_ids(vecs_torch, roots_torch)

            times_np = []
            for _ in range(5):
                t0 = time.perf_counter()
                _ = compute_chamber_ids(vecs_torch, roots_torch)
                t1 = time.perf_counter()
                times_np.append((t1 - t0) * 1000)
            np_mean = np.mean(times_np)
            np_std_val = np.std(times_np)

            # Rust chamber IDs
            for _ in range(2):
                _ = h4_rust.chamber_indices(vecs, roots)

            times_rust = []
            for _ in range(5):
                t0 = time.perf_counter()
                _ = h4_rust.chamber_indices(vecs, roots)
                t1 = time.perf_counter()
                times_rust.append((t1 - t0) * 1000)
            rust_mean = np.mean(times_rust)
            rust_std_val = np.std(times_rust)

            speedup = np_mean / max(rust_mean, 0.001)
            print(f"{n_vecs:10d} | {np_mean:10.3f}+/-{np_std_val:3.3f} | {rust_mean:10.3f}+/-{rust_std_val:3.3f} | {speedup:9.1f}x")

            # Verify correctness: Rust and torch should agree
            # NOTE(review): ids_torch is computed but never compared against
            # ids_rust — presumably because the bit ordering differs (see
            # note below); confirm whether an element-wise check is intended.
            ids_torch = compute_chamber_ids(vecs_torch, roots_torch).numpy()
            ids_rust = h4_rust.chamber_indices(vecs, roots)
            # Note: bit ordering may differ, just check both produce valid 0-15 range
            assert ids_rust.min() >= 0 and ids_rust.max() <= 15, "Rust chamber IDs out of range"
    else:
        print(" [SKIPPED] Rust backend not available.")

    # ============================================================
    # Summary
    # ============================================================
    print()
    print("=" * 100)
    print("SUMMARY")
    print("=" * 100)

    # Scaling analysis from Part 1
    if len(results) >= 2:
        sm_times = [(r['seq_len'], r['softmax_ms']) for r in results]
        h4_times = [(r['seq_len'], r['h4_tree_ms']) for r in results]

        # Empirical scaling exponent: slope of log(time) vs log(seq_len)
        # between the first and last measured lengths.
        sm_exp = math.log(sm_times[-1][1] / max(sm_times[0][1], 0.01)) / math.log(sm_times[-1][0] / sm_times[0][0])
        h4_exp = math.log(h4_times[-1][1] / max(h4_times[0][1], 0.01)) / math.log(h4_times[-1][0] / h4_times[0][0])

        print(f" Softmax scaling exponent: ~{sm_exp:.2f} (expect ~2.0 for O(t^2))")
        print(f" H4 tree scaling exponent: ~{h4_exp:.2f} (expect ~0 for O(log t), higher due to Python overhead)")

        # First sequence length at which the H4 tree path beats softmax.
        crossover = None
        for r in results:
            if r['h4_tree_ms'] < r['softmax_ms']:
                crossover = r['seq_len']
                break

        if crossover:
            print(f" H4 tree becomes faster than softmax at seq_len={crossover}")
        else:
            print(" Softmax is faster at all tested layer-level lengths")
            print(" (H4 tree overhead dominates at small/medium lengths due to Python ChamberTree)")

    if RUST_AVAILABLE and rust_results:
        print()
        print(" Rust backend top-k performance:")
        for r in rust_results[:6]:
            print(f" n_keys={r['n_keys']:>6d}: Rust {r['rust_ms']:.3f}ms vs NumPy {r['numpy_ms']:.3f}ms ({r['speedup']:.1f}x)")
    elif not RUST_AVAILABLE:
        print()
        print(" Rust backend was NOT available for this run.")
        print(" To enable: cd rust && maturin develop --release")

    print()
    print(" Note: The Python ChamberTree has high constant factors.")
    print(" The Rust h4_rust backend shows raw computation speedups.")
    print(" Full Rust-accelerated attention layer is the next step.")
    print("=" * 100)
370
+
371
+
372
if __name__ == '__main__':
    # Entry point when run directly as a script.
    main()