| | """GPU vs CPU Benchmark — wall-clock comparison across network sizes.
|
| |
|
| | Usage:
|
| | python benchmarks/gpu_benchmark.py
|
| | """
|
| |
|
| | import sys
|
| | import os
|
| | import time
|
| |
|
| | sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| |
|
| | import neurocore as nc
|
| |
|
| | try:
|
| | import torch
|
| | HAS_CUDA = torch.cuda.is_available()
|
| | except ImportError:
|
| | HAS_CUDA = False
|
| |
|
| |
|
def build_network(n_neurons, fan_out=4, weight=200, seed=42):
    """Create a one-population network that is connected back onto itself.

    Args:
        n_neurons: Size of the single population.
        fan_out: ``fan_out`` value passed to the ``fixed_fan_out`` topology.
        weight: Weight passed to ``connect``.
        seed: Seed passed to ``connect``.

    Returns:
        A ``(network, population)`` tuple.
    """
    neuron_params = {"threshold": 500, "leak": 3}

    network = nc.Network()
    population = network.population(n_neurons, params=neuron_params)

    # Source and target are the same population (recurrent wiring).
    network.connect(
        population,
        population,
        topology="fixed_fan_out",
        fan_out=fan_out,
        weight=weight,
        seed=seed,
    )
    return network, population
|
| |
|
| |
|
def time_cpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5):
    """Measure wall-clock time of a CPU simulation run.

    The first ``stim_steps`` steps each inject current into the first
    ``stim_neurons`` neurons of ``pop`` and advance one step; the remaining
    ``timesteps - stim_steps`` steps run in a single call. Deployment happens
    before the clock starts; injection is inside the timed region.

    Returns:
        A ``(elapsed_seconds, total_spikes)`` tuple, where ``total_spikes``
        is read from the result of the final ``run`` call.
    """
    cpu_sim = nc.Simulator()
    cpu_sim.deploy(net)

    t0 = time.perf_counter()

    # Stimulation phase: one injection followed by one step, repeated.
    for _ in range(stim_steps):
        cpu_sim.inject(pop[:stim_neurons], current=1200)
        cpu_sim.run(1)

    # Free-running phase: everything that is left of the step budget.
    outcome = cpu_sim.run(timesteps - stim_steps)

    t1 = time.perf_counter()
    return t1 - t0, outcome.total_spikes
|
| |
|
| |
|
def time_gpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5, device=None):
    """Time GPU simulator execution (includes stimulus injection).

    A throwaway simulator is deployed, stepped once and synchronized before
    the measured run, so the measured run starts on a fresh simulator.
    Both simulators are closed even when a step raises, so a failed
    configuration does not leave a simulator open while the caller moves on
    to the next one.

    Args:
        net: Network to deploy.
        pop: Population whose first ``stim_neurons`` entries are stimulated.
        timesteps: Total number of steps in the measured run.
        stim_neurons: Number of leading neurons of ``pop`` that get current.
        stim_steps: Number of initial steps that inject current.
        device: Device passed to ``nc.GpuSimulator``.

    Returns:
        A ``(elapsed_seconds, total_spikes)`` tuple, where ``total_spikes``
        is read from the result of the final ``run`` call.
    """
    # Warm-up pass on a separate simulator; its timing is discarded.
    # NOTE(review): presumably this absorbs one-time CUDA start-up cost.
    warmup = nc.GpuSimulator(device=device)
    try:
        warmup.deploy(net)
        warmup.run(1)
        torch.cuda.synchronize(warmup.device)
    finally:
        warmup.close()

    sim = nc.GpuSimulator(device=device)
    try:
        sim.deploy(net)

        start = time.perf_counter()
        for _ in range(stim_steps):
            sim.inject(pop[:stim_neurons], current=1200)
            sim.run(1)
        result = sim.run(timesteps - stim_steps)
        # Wait for queued GPU work to finish before stopping the clock.
        torch.cuda.synchronize(sim.device)
        elapsed = time.perf_counter() - start
    finally:
        sim.close()

    return elapsed, result.total_spikes
|
| |
|
| |
|
def _print_comparison_table(title, configs, cpu_max_neurons, device):
    """Run and print one CPU-vs-GPU timing table.

    Args:
        title: Banner line printed between the two ``=`` rules.
        configs: Iterable of ``(n_neurons, fan_out)`` pairs to benchmark.
        cpu_max_neurons: Largest network size that is also timed on the CPU.
            Larger networks are timed on the GPU only and show ``n/a`` in
            the CPU and speedup columns.
        device: Device handed to ``time_gpu``.
    """
    print("=" * 72)
    print(title)
    print("=" * 72)
    print(f"{'Neurons':>8} {'Synapses':>10} {'CPU (s)':>10} {'GPU (s)':>10} {'Speedup':>8}")
    print("-" * 72)

    for n_neurons, fan_out in configs:
        # One failing configuration is reported on its own row and must not
        # abort the remaining rows, hence the broad catch at this boundary.
        try:
            net, pop = build_network(n_neurons, fan_out=fan_out)
            synapses = n_neurons * fan_out

            cpu_time = None
            if n_neurons <= cpu_max_neurons:
                cpu_time, _ = time_cpu(net, pop)

            gpu_time, _ = time_gpu(net, pop, device=device)

            if cpu_time is None:
                # No CPU measurement: a speedup would be meaningless, so both
                # cells read "n/a", padded to their column widths.
                cpu_str = f"{'n/a':>10}"
                speedup_str = f"{'n/a':>8}"
            else:
                speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
                cpu_str = f"{cpu_time:10.4f}"
                speedup_str = f"{speedup:7.1f}x"

            print(f"{n_neurons:>8} {synapses:>10} {cpu_str} {gpu_time:10.4f} {speedup_str}")
        except Exception as e:
            print(f"{n_neurons:>8} {'FAILED':>10} {e}")


def main():
    """Run the three benchmark parts and print their result tables.

    Part 1 and Part 2 compare CPU and GPU wall-clock time for fan-out 4 and
    fan-out 8 networks; Part 3 times larger networks on the GPU only and
    reports timesteps per second. Returns early with a message when CUDA is
    not available.
    """
    if not HAS_CUDA:
        print("CUDA not available. Cannot run GPU benchmark.")
        return

    # Uses the second CUDA device when more than one is present.
    # NOTE(review): reason for preferring cuda:1 is not visible here - confirm.
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
    gpu_name = torch.cuda.get_device_name(device)
    vram = torch.cuda.get_device_properties(device).total_memory / 1e9
    print(f"GPU: {gpu_name} ({vram:.1f} GB)")
    print()

    _print_comparison_table(
        " Part 1: CPU vs GPU Wall-Clock (50 timesteps, fan_out=4)",
        [
            (64, 4),
            (256, 4),
            (1024, 4),
            (4096, 4),
            (8192, 4),
            (16384, 4),
            (32768, 4),
        ],
        cpu_max_neurons=8192,
        device=device,
    )

    print()
    _print_comparison_table(
        " Part 2: Denser Networks (50 timesteps, fan_out=8)",
        [
            (256, 8),
            (512, 8),
            (1024, 8),
            (4096, 8),
        ],
        cpu_max_neurons=4096,
        device=device,
    )

    print()
    print("=" * 72)
    print(" Part 3: GPU-Only Large Scale (100 timesteps)")
    print("=" * 72)
    hdr = f"{'Neurons':>8} {'Fan-out':>8} {'Synapses':>10} {'Time (s)':>10} {'ts/sec':>8}"
    print(hdr)
    print("-" * 72)

    large_configs = [
        (16384, 4),
        (32768, 4),
        (65536, 4),
        (131072, 4),
    ]
    # Single source of truth for the run length and the throughput figure.
    large_timesteps = 100

    for n_neurons, fan_out in large_configs:
        try:
            net, pop = build_network(n_neurons, fan_out=fan_out)
            gpu_time, _ = time_gpu(net, pop, timesteps=large_timesteps, device=device)
            ts_per_sec = large_timesteps / gpu_time if gpu_time > 0 else float("inf")
            print(f"{n_neurons:>8} {fan_out:>8} {n_neurons * fan_out:>10} {gpu_time:10.4f} {ts_per_sec:7.0f}")
        except Exception as e:
            print(f"{n_neurons:>8} {fan_out:>8} {n_neurons * fan_out:>10} FAILED: {e}")

    print()
    print("Benchmark complete.")
|
| |
|
| |
|
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
| |
|