"""GPU vs CPU Benchmark — wall-clock comparison across network sizes. Usage: python benchmarks/gpu_benchmark.py """ import sys import os import time sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import neurocore as nc try: import torch HAS_CUDA = torch.cuda.is_available() except ImportError: HAS_CUDA = False def build_network(n_neurons, fan_out=4, weight=200, seed=42): """Build a network with fixed fan-out connectivity.""" net = nc.Network() pop = net.population(n_neurons, params={"threshold": 500, "leak": 3}) net.connect(pop, pop, topology="fixed_fan_out", fan_out=fan_out, weight=weight, seed=seed) return net, pop def time_cpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5): """Time CPU simulator execution (includes stimulus injection).""" sim = nc.Simulator() sim.deploy(net) start = time.perf_counter() for t in range(stim_steps): sim.inject(pop[:stim_neurons], current=1200) sim.run(1) result = sim.run(timesteps - stim_steps) elapsed = time.perf_counter() - start return elapsed, result.total_spikes def time_gpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5, device=None): """Time GPU simulator execution (includes stimulus injection).""" sim = nc.GpuSimulator(device=device) sim.deploy(net) # Warm up CUDA (1 throwaway step, then redeploy for fair comparison) sim.run(1) torch.cuda.synchronize(sim.device) sim.close() # Fresh deploy for timed run sim = nc.GpuSimulator(device=device) sim.deploy(net) start = time.perf_counter() for t in range(stim_steps): sim.inject(pop[:stim_neurons], current=1200) sim.run(1) result = sim.run(timesteps - stim_steps) torch.cuda.synchronize(sim.device) elapsed = time.perf_counter() - start sim.close() return elapsed, result.total_spikes def main(): if not HAS_CUDA: print("CUDA not available. Cannot run GPU benchmark.") return device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0") gpu_name = torch.cuda.get_device_name(device) vram = torch.cuda.get_device_properties(device).total_memory / 1e9 print(f"GPU: {gpu_name} ({vram:.1f} GB)") print() print("=" * 72) print(" Part 1: CPU vs GPU Wall-Clock (50 timesteps, fan_out=4)") print("=" * 72) print(f"{'Neurons':>8} {'Synapses':>10} {'CPU (s)':>10} {'GPU (s)':>10} {'Speedup':>8}") print("-" * 72) configs = [ (64, 4), (256, 4), (1024, 4), (4096, 4), (8192, 4), (16384, 4), (32768, 4), ] for n_neurons, fan_out in configs: try: net, pop = build_network(n_neurons, fan_out=fan_out) synapses = n_neurons * fan_out if n_neurons <= 8192: cpu_time, _ = time_cpu(net, pop) else: cpu_time = float('inf') gpu_time, _ = time_gpu(net, pop, device=device) speedup = cpu_time / gpu_time if gpu_time > 0 else float('inf') cpu_str = f"{cpu_time:10.4f}" if cpu_time < float('inf') else " n/a" print(f"{n_neurons:>8} {synapses:>10} {cpu_str} {gpu_time:10.4f} {speedup:7.1f}x") except Exception as e: print(f"{n_neurons:>8} {'FAILED':>10} {e}") print() print("=" * 72) print(" Part 2: Denser Networks (50 timesteps, fan_out=8)") print("=" * 72) print(f"{'Neurons':>8} {'Synapses':>10} {'CPU (s)':>10} {'GPU (s)':>10} {'Speedup':>8}") print("-" * 72) dense_configs = [ (256, 8), (512, 8), (1024, 8), (4096, 8), ] for n_neurons, fan_out in dense_configs: try: net, pop = build_network(n_neurons, fan_out=fan_out) synapses = n_neurons * fan_out if n_neurons <= 4096: cpu_time, _ = time_cpu(net, pop) else: cpu_time = float('inf') gpu_time, _ = time_gpu(net, pop, device=device) speedup = cpu_time / gpu_time if gpu_time > 0 else float('inf') cpu_str = f"{cpu_time:10.4f}" if cpu_time < float('inf') else " n/a" print(f"{n_neurons:>8} {synapses:>10} {cpu_str} {gpu_time:10.4f} {speedup:7.1f}x") except Exception as e: print(f"{n_neurons:>8} {'FAILED':>10} {e}") print() print("=" * 72) print(" Part 3: GPU-Only Large Scale (100 timesteps)") print("=" * 72) hdr = f"{'Neurons':>8} {'Fan-out':>8} {'Synapses':>10} {'Time (s)':>10} {'ts/sec':>8}" print(hdr) print("-" * 72) large_configs = [ (16384, 4), (32768, 4), (65536, 4), (131072, 4), ] for n_neurons, fan_out in large_configs: try: net, pop = build_network(n_neurons, fan_out=fan_out) gpu_time, _ = time_gpu(net, pop, timesteps=100, device=device) ts_per_sec = 100 / gpu_time if gpu_time > 0 else float('inf') print(f"{n_neurons:>8} {fan_out:>8} {n_neurons * fan_out:>10} {gpu_time:10.4f} {ts_per_sec:7.0f}") except Exception as e: print(f"{n_neurons:>8} {fan_out:>8} {n_neurons * fan_out:>10} FAILED: {e}") print() print("Benchmark complete.") if __name__ == "__main__": main()