h4-polytopic-attention / python /benchmark_h4_vs_softmax.py

Upload python/benchmark_h4_vs_softmax.py with huggingface_hub

06e4588 verified 8 days ago

13.2 kB

	"""
	Benchmark: H4 geometric attention vs standard softmax attention.

	Compares wall-clock time and output agreement (correlation between the two
	layers' outputs) at various context lengths to find the empirical crossover
	point where H4's O(log t) chamber lookup beats softmax's O(t^2) matmul.

	Now includes Rust-accelerated backend (h4_rust) when available.
	"""

	import math
	import time
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	import sys
	import os

	sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

	from h4_hybrid_attention import H4AttentionLayer
	from utils.chamber_index import compute_chamber_ids

	# Optional Rust backend: RUST_AVAILABLE records whether the h4_rust
	# extension module could be imported.
	try:
	    import h4_rust
	except ImportError:
	    RUST_AVAILABLE = False
	else:
	    RUST_AVAILABLE = True


	class SoftmaxAttentionLayer(nn.Module):
	    """Causal multi-head scaled dot-product attention, used as the baseline.

	    Args:
	        d_model: Width of the input and output features.
	        n_heads: Number of attention heads. The per-head query/key width is
	            ``d_model // n_heads``.
	        d_value: Per-head value width.
	        dropout: Dropout probability applied to the attention weights.
	            The default of 0.0 leaves the weights unchanged.
	    """

	    def __init__(self, d_model: int, n_heads: int = 8, d_value: int = 16, dropout: float = 0.0):
	        super().__init__()
	        self.n_heads = n_heads
	        self.d_head = d_model // n_heads
	        self.d_value = d_value
	        self.scale = 1.0 / math.sqrt(self.d_head)

	        self.W_q = nn.Linear(d_model, self.d_head * n_heads, bias=False)
	        self.W_k = nn.Linear(d_model, self.d_head * n_heads, bias=False)
	        self.W_v = nn.Linear(d_model, d_value * n_heads, bias=False)
	        self.W_out = nn.Linear(d_value * n_heads, d_model, bias=False)

	        # The `dropout` argument used to be accepted but ignored; honor it.
	        # nn.Dropout has no parameters, so the state_dict is unchanged.
	        self.attn_dropout = nn.Dropout(dropout)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply causal self-attention.

	        Args:
	            x: Tensor of shape (B, T, d_model).

	        Returns:
	            Tensor of shape (B, T, d_model).
	        """
	        B, T, _ = x.shape

	        # Project and split into heads: (B, n_heads, T, head_width).
	        Q = self.W_q(x).view(B, T, self.n_heads, self.d_head).permute(0, 2, 1, 3)
	        K = self.W_k(x).view(B, T, self.n_heads, self.d_head).permute(0, 2, 1, 3)
	        V = self.W_v(x).view(B, T, self.n_heads, self.d_value).permute(0, 2, 1, 3)

	        scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale

	        # Strictly-upper-triangular mask: position i may not attend to j > i.
	        mask = torch.triu(torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1)
	        scores.masked_fill_(mask.unsqueeze(0).unsqueeze(0), float('-inf'))

	        attn = self.attn_dropout(F.softmax(scores, dim=-1))
	        out = torch.matmul(attn, V)

	        # Merge heads back: (B, T, n_heads * d_value).
	        out = out.permute(0, 2, 1, 3).reshape(B, T, -1)
	        return self.W_out(out)


	def benchmark_forward_pass(layer, x, n_warmup=2, n_runs=5, **kwargs):
	    """Measure how long ``layer(x, **kwargs)`` takes.

	    Makes ``n_warmup`` untimed calls, then ``n_runs`` timed calls, and
	    returns ``(mean, std)`` of the timed calls in milliseconds.
	    """
	    # Untimed calls so one-off setup costs stay out of the measurement.
	    for _ in range(n_warmup):
	        layer(x, **kwargs)

	    elapsed_ms = []
	    for _ in range(n_runs):
	        start = time.perf_counter()
	        layer(x, **kwargs)
	        stop = time.perf_counter()
	        elapsed_ms.append(1000 * (stop - start))

	    mean_ms = np.mean(elapsed_ms)
	    std_ms = np.std(elapsed_ms)
	    return mean_ms, std_ms


	def benchmark_rust_topk(keys_np, queries_np, k, n_warmup=2, n_runs=5):
	    """
	    Time ``h4_rust.query_topk(keys, queries, k)`` on float64 copies of the inputs.

	    Returns ``(mean, std)`` in milliseconds over ``n_runs`` timed calls, or
	    ``(None, None)`` when the Rust backend is not available.
	    """
	    if RUST_AVAILABLE:
	        keys64 = keys_np.astype(np.float64)
	        queries64 = queries_np.astype(np.float64)

	        # Untimed warmup calls
	        for _ in range(n_warmup):
	            h4_rust.query_topk(keys64, queries64, k)

	        elapsed_ms = []
	        for _ in range(n_runs):
	            start = time.perf_counter()
	            h4_rust.query_topk(keys64, queries64, k)
	            stop = time.perf_counter()
	            elapsed_ms.append(1000 * (stop - start))

	        return np.mean(elapsed_ms), np.std(elapsed_ms)

	    return None, None


	def benchmark_numpy_topk(keys_np, queries_np, k, n_warmup=2, n_runs=5):
	    """
	    Time a brute-force NumPy top-k: all query/key dot products of the
	    row-normalized inputs, followed by a full argsort truncated to ``k``.

	    Normalization happens once, outside the timed region.
	    Returns ``(mean, std)`` in milliseconds over ``n_runs`` timed repetitions.
	    """
	    # Row-normalize keys; rows with norm below 1e-12 are divided by 1 instead.
	    keys64 = keys_np.astype(np.float64)
	    key_lengths = np.linalg.norm(keys64, axis=1, keepdims=True)
	    unit_keys = keys64 / np.where(key_lengths < 1e-12, 1.0, key_lengths)

	    # Same treatment for the queries.
	    queries64 = queries_np.astype(np.float64)
	    query_lengths = np.linalg.norm(queries64, axis=1, keepdims=True)
	    unit_queries = queries64 / np.where(query_lengths < 1e-12, 1.0, query_lengths)

	    # Untimed warmup repetitions
	    for _ in range(n_warmup):
	        sims = unit_queries @ unit_keys.T
	        np.argsort(-sims, axis=1)[:, :k]

	    elapsed_ms = []
	    for _ in range(n_runs):
	        start = time.perf_counter()
	        sims = unit_queries @ unit_keys.T
	        np.argsort(-sims, axis=1)[:, :k]
	        stop = time.perf_counter()
	        elapsed_ms.append(1000 * (stop - start))

	    return np.mean(elapsed_ms), np.std(elapsed_ms)


	def compare_attention_patterns(h4_layer, softmax_layer, x):
	    """
	    Correlate the outputs of the two layers on the same input.

	    Calls ``h4_layer(x, use_tree=False)`` and ``softmax_layer(x)``, flattens
	    both results, and returns their Pearson correlation coefficient as a
	    Python float. Returns 0.0 when either flattened output has a standard
	    deviation below 1e-8.
	    """
	    # Unpacking doubles as a check that x has exactly three dimensions.
	    _batch, _seq_len, _width = x.shape

	    h4_flat = h4_layer(x, use_tree=False).detach().flatten()
	    sm_flat = softmax_layer(x).detach().flatten()

	    # A (near-)constant output has no meaningful correlation.
	    for flat in (h4_flat, sm_flat):
	        if flat.std() < 1e-8:
	            return 0.0

	    paired = torch.stack([h4_flat, sm_flat])
	    return torch.corrcoef(paired)[0, 1].item()


	def _scaling_exponent(points):
	    """Estimate a power-law exponent from the first and last (n, ms) points.

	    Computes log(ms_last / ms_first) / log(n_last / n_first); ms_first is
	    floored at 0.01 before dividing.
	    """
	    n_first, ms_first = points[0]
	    n_last, ms_last = points[-1]
	    return math.log(ms_last / max(ms_first, 0.01)) / math.log(n_last / n_first)


	def _benchmark_layers(h4_layer, softmax_layer, seq_lengths, batch_size, d_model):
	    """Part 1: time both attention layers at each sequence length.

	    Prints one table row per length and returns the rows as dicts with keys
	    seq_len, softmax_ms, h4_full_ms, h4_tree_ms, tree_vs_full_ratio and
	    output_correlation.
	    """
	    print("-" * 100)
	    print("PART 1: Full Attention Layer Forward Pass (ms)")
	    print("-" * 100)

	    header = (
	        f"{'seq_len':>8} | {'softmax_ms':>12} | {'h4_full_ms':>12} | "
	        f"{'h4_tree_ms':>12} | {'tree/full':>10} | {'corr':>8}"
	    )
	    print(header)
	    print("-" * len(header))

	    results = []
	    for T in seq_lengths:
	        x = torch.randn(batch_size, T, d_model)

	        with torch.no_grad():
	            sm_mean, sm_std = benchmark_forward_pass(softmax_layer, x)
	            h4_full_mean, h4_full_std = benchmark_forward_pass(h4_layer, x, use_tree=False)

	            if T > 64:
	                h4_tree_mean, h4_tree_std = benchmark_forward_pass(h4_layer, x, use_tree=True, n_runs=3)
	            else:
	                # The tree path is not timed at T <= 64; reuse the
	                # use_tree=False numbers for that row.
	                h4_tree_mean = h4_full_mean
	                h4_tree_std = h4_full_std

	            corr = compare_attention_patterns(h4_layer, softmax_layer, x)

	        # Floor the denominator so a ~0 ms timing cannot divide by zero.
	        ratio = h4_tree_mean / max(h4_full_mean, 0.001)

	        print(
	            f"{T:8d} | {sm_mean:10.1f}+/-{sm_std:3.1f} | "
	            f"{h4_full_mean:10.1f}+/-{h4_full_std:3.1f} | "
	            f"{h4_tree_mean:10.1f}+/-{h4_tree_std:3.1f} | "
	            f"{ratio:10.3f} | {corr:8.4f}"
	        )

	        results.append({
	            'seq_len': T,
	            'softmax_ms': sm_mean,
	            'h4_full_ms': h4_full_mean,
	            'h4_tree_ms': h4_tree_mean,
	            'tree_vs_full_ratio': ratio,
	            'output_correlation': corr,
	        })

	    return results


	def _benchmark_topk(seq_lengths, n_queries=64, k=32):
	    """Part 2: time h4_rust top-k against the NumPy brute-force top-k.

	    Returns one dict per key count (n_keys, numpy_ms, rust_ms, speedup), or
	    an empty list when the Rust backend is not available.
	    """
	    print("-" * 100)
	    print("PART 2: Raw Top-k Query Benchmark — Rust h4_rust vs NumPy (ms)")
	    print(f" (One attention head: n_queries={n_queries} queries against n_keys keys, k={k})")
	    print("-" * 100)

	    if not RUST_AVAILABLE:
	        print(" [SKIPPED] Rust backend not available.")
	        print(" Install with: cd rust && maturin develop --release")
	        return []

	    header = f"{'n_keys':>8} | {'numpy_ms':>12} | {'rust_ms':>12} | {'speedup':>10}"
	    print(header)
	    print("-" * len(header))

	    rust_results = []
	    for T in seq_lengths:
	        keys_np = np.random.randn(T, 4).astype(np.float64)
	        queries_np = np.random.randn(n_queries, 4).astype(np.float64)

	        np_mean, np_std = benchmark_numpy_topk(keys_np, queries_np, k)
	        rust_mean, rust_std = benchmark_rust_topk(keys_np, queries_np, k)

	        speedup = np_mean / max(rust_mean, 0.001) if rust_mean else 0.0

	        print(f"{T:8d} | {np_mean:10.3f}+/-{np_std:3.3f} | {rust_mean:10.3f}+/-{rust_std:3.3f} | {speedup:9.1f}x")

	        rust_results.append({
	            'n_keys': T,
	            'numpy_ms': np_mean,
	            'rust_ms': rust_mean,
	            'speedup': speedup,
	        })

	    return rust_results


	def _benchmark_chamber_ids(vector_counts=(1000, 10000, 100000)):
	    """Part 3: time compute_chamber_ids (torch) against h4_rust.chamber_indices.

	    Prints one table row per vector count. Does nothing beyond printing a
	    notice when the Rust backend is not available.
	    """
	    print("-" * 100)
	    print("PART 3: Chamber Index Computation — Rust vs NumPy (ms)")
	    print("-" * 100)

	    if not RUST_AVAILABLE:
	        print(" [SKIPPED] Rust backend not available.")
	        return

	    roots = h4_rust.get_simple_roots()  # (4, 4) f64
	    # Loop-invariant, so converted once rather than per vector count.
	    roots_torch = torch.from_numpy(roots).float()

	    header = f"{'n_vectors':>10} | {'numpy_ms':>12} | {'rust_ms':>12} | {'speedup':>10}"
	    print(header)
	    print("-" * len(header))

	    for n_vecs in vector_counts:
	        vecs = np.random.randn(n_vecs, 4).astype(np.float64)
	        vecs_torch = torch.from_numpy(vecs).float()

	        # NumPy/torch chamber IDs: 2 warmup calls, 5 timed calls.
	        for _ in range(2):
	            compute_chamber_ids(vecs_torch, roots_torch)

	        times_np = []
	        for _ in range(5):
	            t0 = time.perf_counter()
	            compute_chamber_ids(vecs_torch, roots_torch)
	            t1 = time.perf_counter()
	            times_np.append((t1 - t0) * 1000)
	        np_mean = np.mean(times_np)
	        np_std_val = np.std(times_np)

	        # Rust chamber IDs: same warmup/timed split.
	        for _ in range(2):
	            h4_rust.chamber_indices(vecs, roots)

	        times_rust = []
	        for _ in range(5):
	            t0 = time.perf_counter()
	            h4_rust.chamber_indices(vecs, roots)
	            t1 = time.perf_counter()
	            times_rust.append((t1 - t0) * 1000)
	        rust_mean = np.mean(times_rust)
	        rust_std_val = np.std(times_rust)

	        speedup = np_mean / max(rust_mean, 0.001)
	        print(f"{n_vecs:10d} | {np_mean:10.3f}+/-{np_std_val:3.3f} | {rust_mean:10.3f}+/-{rust_std_val:3.3f} | {speedup:9.1f}x")

	        # Sanity check only: bit ordering may differ between the two
	        # implementations, so just confirm the Rust IDs lie in 0..15.
	        # Raised explicitly so the check survives `python -O`.
	        ids_rust = h4_rust.chamber_indices(vecs, roots)
	        if ids_rust.min() < 0 or ids_rust.max() > 15:
	            raise AssertionError("Rust chamber IDs out of range")


	def _print_summary(results, rust_results):
	    """Print scaling exponents, the first crossover length and the Rust recap."""
	    print("=" * 100)
	    print("SUMMARY")
	    print("=" * 100)

	    # Scaling analysis from Part 1 (needs at least two lengths).
	    if len(results) >= 2:
	        sm_exp = _scaling_exponent([(r['seq_len'], r['softmax_ms']) for r in results])
	        h4_exp = _scaling_exponent([(r['seq_len'], r['h4_tree_ms']) for r in results])

	        print(f" Softmax scaling exponent: ~{sm_exp:.2f} (expect ~2.0 for O(t^2))")
	        print(f" H4 tree scaling exponent: ~{h4_exp:.2f} (expect ~0 for O(log t), higher due to Python overhead)")

	    # First tested length at which the tree path beats softmax, if any.
	    crossover = next(
	        (r['seq_len'] for r in results if r['h4_tree_ms'] < r['softmax_ms']),
	        None,
	    )

	    if crossover:
	        print(f" H4 tree becomes faster than softmax at seq_len={crossover}")
	    else:
	        print(" Softmax is faster at all tested layer-level lengths")
	        print(" (H4 tree overhead dominates at small/medium lengths due to Python ChamberTree)")

	    if RUST_AVAILABLE and rust_results:
	        print()
	        print(" Rust backend top-k performance:")
	        for r in rust_results[:6]:
	            print(f" n_keys={r['n_keys']:>6d}: Rust {r['rust_ms']:.3f}ms vs NumPy {r['numpy_ms']:.3f}ms ({r['speedup']:.1f}x)")
	    elif not RUST_AVAILABLE:
	        print()
	        print(" Rust backend was NOT available for this run.")
	        print(" To enable: cd rust && maturin develop --release")

	    print()
	    print(" Note: The Python ChamberTree has high constant factors.")
	    print(" The Rust h4_rust backend shows raw computation speedups.")
	    print(" Full Rust-accelerated attention layer is the next step.")
	    print("=" * 100)


	def main():
	    """Run all three benchmark parts and print the tables and summary.

	    Seeds torch and NumPy with 42 so the random inputs are reproducible.
	    """
	    torch.manual_seed(42)
	    np.random.seed(42)

	    d_model = 64
	    n_heads = 8
	    d_value = 16
	    batch_size = 1
	    top_k = 32

	    # Part 1 uses the full H4 attention layer (Python tree), so keep lengths moderate
	    layer_seq_lengths = [64, 128, 256, 512, 1024]

	    # Part 2 tests raw Rust top-k at extended lengths
	    rust_seq_lengths = [512, 1024, 2048, 4096, 8192, 16384]

	    print("=" * 100)
	    print("H4 Geometric Attention vs Standard Softmax Attention -- Benchmark")
	    print("=" * 100)
	    print(f"d_model={d_model}, n_heads={n_heads}, d_value={d_value}, batch_size={batch_size}, top_k={top_k}")
	    print(f"Rust backend (h4_rust): {'AVAILABLE' if RUST_AVAILABLE else 'NOT AVAILABLE (install with: cd rust && maturin develop --release)'}")
	    print()

	    # Create layers (H4 first, matching the original RNG consumption order)
	    h4_layer = H4AttentionLayer(d_model, n_heads, d_value, top_k=top_k)
	    softmax_layer = SoftmaxAttentionLayer(d_model, n_heads, d_value)

	    h4_layer.eval()
	    softmax_layer.eval()

	    results = _benchmark_layers(h4_layer, softmax_layer, layer_seq_lengths, batch_size, d_model)

	    print()
	    rust_results = _benchmark_topk(rust_seq_lengths, n_queries=64, k=32)

	    print()
	    _benchmark_chamber_ids()

	    print()
	    _print_summary(results, rust_results)


	# Run the benchmark only when this file is executed as a script.
	if __name__ == "__main__":
	    main()