catalyst-n1 / sdk /benchmarks /gpu_benchmark.py
mrwabbit's picture
Initial upload: Catalyst N1 open source neuromorphic processor RTL
e4cdd5f verified
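"""GPU vs CPU Benchmark — wall-clock comparison across network sizes.

Requires PyTorch with an available CUDA device; without one the script
prints a message and exits without measuring anything.

Usage:
    python benchmarks/gpu_benchmark.py
"""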
import sys
import os
import time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import neurocore as nc
try:
import torch
HAS_CUDA = torch.cuda.is_available()
except ImportError:
HAS_CUDA = False
def build_network(n_neurons, fan_out=4, weight=200, seed=42):
    """Create a network holding one population that is connected to itself.

    The population is created with ``threshold=500`` and ``leak=3`` and is
    wired to itself using the ``"fixed_fan_out"`` topology.

    Args:
        n_neurons: Number of neurons in the population.
        fan_out: Forwarded as ``fan_out`` to ``Network.connect``.
        weight: Forwarded as ``weight`` to ``Network.connect``.
        seed: Forwarded as ``seed`` to ``Network.connect``.

    Returns:
        A ``(network, population)`` tuple.
    """
    network = nc.Network()
    neuron_params = {"threshold": 500, "leak": 3}
    population = network.population(n_neurons, params=neuron_params)

    # Recurrent wiring: source and target are the same population.
    wiring = {
        "topology": "fixed_fan_out",
        "fan_out": fan_out,
        "weight": weight,
        "seed": seed,
    }
    network.connect(population, population, **wiring)
    return network, population
def time_cpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5):
    """Measure the wall-clock time of a CPU simulation run.

    Deployment happens before the clock starts. The timed region covers
    ``stim_steps`` single-step runs, each preceded by injecting a current of
    1200 into the first ``stim_neurons`` neurons of ``pop``, followed by one
    run of the remaining ``timesteps - stim_steps`` steps.

    Args:
        net: Network to deploy on the simulator.
        pop: Population that is sliced to pick the stimulated neurons.
        timesteps: Total number of simulated steps.
        stim_neurons: How many leading neurons of ``pop`` receive current.
        stim_steps: How many initial steps receive stimulus.

    Returns:
        ``(elapsed_seconds, total_spikes)`` where ``total_spikes`` is read
        from the result of the final ``run`` call.
    """
    simulator = nc.Simulator()
    simulator.deploy(net)

    free_run_steps = timesteps - stim_steps

    t_begin = time.perf_counter()
    for _ in range(stim_steps):
        # The slice stays inside the loop so it remains part of the timing.
        simulator.inject(pop[:stim_neurons], current=1200)
        simulator.run(1)
    outcome = simulator.run(free_run_steps)
    t_end = time.perf_counter()

    return t_end - t_begin, outcome.total_spikes
def time_gpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5, device=None):
    """Measure the wall-clock time of a GPU simulation run.

    A throwaway simulator is deployed and stepped once before the timed run
    so that one-time CUDA start-up cost is not measured. A fresh simulator is
    then deployed (outside the timed region) and driven exactly like the CPU
    benchmark: ``stim_steps`` single-step runs with a current of 1200 injected
    into the first ``stim_neurons`` neurons of ``pop``, then one run of the
    remaining ``timesteps - stim_steps`` steps.

    Both simulators are closed in ``finally`` blocks, so a failure in
    ``deploy``/``inject``/``run`` does not leave a simulator open while the
    caller moves on to the next configuration.

    Args:
        net: Network to deploy on the simulator.
        pop: Population that is sliced to pick the stimulated neurons.
        timesteps: Total number of simulated steps.
        stim_neurons: How many leading neurons of ``pop`` receive current.
        stim_steps: How many initial steps receive stimulus.
        device: Forwarded to ``nc.GpuSimulator(device=...)``.

    Returns:
        ``(elapsed_seconds, total_spikes)`` where ``total_spikes`` is read
        from the result of the final ``run`` call.
    """
    # Warm up CUDA (1 throwaway step, then redeploy for fair comparison).
    warmup_sim = nc.GpuSimulator(device=device)
    try:
        warmup_sim.deploy(net)
        warmup_sim.run(1)
        torch.cuda.synchronize(warmup_sim.device)
    finally:
        warmup_sim.close()

    # Fresh deploy for the timed run.
    sim = nc.GpuSimulator(device=device)
    try:
        sim.deploy(net)
        start = time.perf_counter()
        for _ in range(stim_steps):
            sim.inject(pop[:stim_neurons], current=1200)
            sim.run(1)
        result = sim.run(timesteps - stim_steps)
        # Wait for outstanding work on the device before stopping the clock.
        torch.cuda.synchronize(sim.device)
        elapsed = time.perf_counter() - start
    finally:
        sim.close()
    return elapsed, result.total_spikes
def _print_banner(title):
    """Print a section title framed by 72-character rules."""
    print("=" * 72)
    print(title)
    print("=" * 72)


def _format_comparison_row(n_neurons, synapses, cpu_time, gpu_time):
    """Format one CPU-vs-GPU row; ``cpu_time=None`` means the CPU run was skipped."""
    if cpu_time is None:
        # No CPU measurement: show "n/a" in both derived columns, padded to
        # the column widths, instead of a misaligned cell and an "inf" speedup.
        cpu_cell = f"{'n/a':>10}"
        speedup_cell = f"{'n/a':>8}"
    else:
        speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
        cpu_cell = f"{cpu_time:10.4f}"
        speedup_cell = f"{speedup:7.1f}x"
    return f"{n_neurons:>8} {synapses:>10} {cpu_cell} {gpu_time:10.4f} {speedup_cell}"


def _run_comparison(title, configs, cpu_max_neurons, device):
    """Print one CPU-vs-GPU table; CPU is timed only up to ``cpu_max_neurons``."""
    _print_banner(title)
    print(f"{'Neurons':>8} {'Synapses':>10} {'CPU (s)':>10} {'GPU (s)':>10} {'Speedup':>8}")
    print("-" * 72)
    for n_neurons, fan_out in configs:
        # Best effort per row: a failing configuration is reported and the
        # remaining configurations still run.
        try:
            net, pop = build_network(n_neurons, fan_out=fan_out)
            synapses = n_neurons * fan_out
            cpu_time = None
            if n_neurons <= cpu_max_neurons:
                cpu_time, _ = time_cpu(net, pop)
            gpu_time, _ = time_gpu(net, pop, device=device)
            print(_format_comparison_row(n_neurons, synapses, cpu_time, gpu_time))
        except Exception as e:
            print(f"{n_neurons:>8} {'FAILED':>10} {e}")
    print()


def main():
    """Run the three benchmark tables and print them to stdout.

    Prints a message and returns immediately when CUDA is unavailable.
    Otherwise selects ``cuda:1`` when more than one device is present
    (``cuda:0`` if not), prints the device name and total memory, then runs:

    1. CPU vs GPU at fan-out 4 (CPU timed only up to 8192 neurons).
    2. CPU vs GPU at fan-out 8.
    3. GPU-only throughput over 100 timesteps for large networks.
    """
    if not HAS_CUDA:
        print("CUDA not available. Cannot run GPU benchmark.")
        return

    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
    gpu_name = torch.cuda.get_device_name(device)
    vram = torch.cuda.get_device_properties(device).total_memory / 1e9
    print(f"GPU: {gpu_name} ({vram:.1f} GB)")
    print()

    _run_comparison(
        " Part 1: CPU vs GPU Wall-Clock (50 timesteps, fan_out=4)",
        [
            (64, 4),
            (256, 4),
            (1024, 4),
            (4096, 4),
            (8192, 4),
            (16384, 4),
            (32768, 4),
        ],
        cpu_max_neurons=8192,
        device=device,
    )

    _run_comparison(
        " Part 2: Denser Networks (50 timesteps, fan_out=8)",
        [
            (256, 8),
            (512, 8),
            (1024, 8),
            (4096, 8),
        ],
        cpu_max_neurons=4096,
        device=device,
    )

    large_timesteps = 100
    _print_banner(f" Part 3: GPU-Only Large Scale ({large_timesteps} timesteps)")
    print(f"{'Neurons':>8} {'Fan-out':>8} {'Synapses':>10} {'Time (s)':>10} {'ts/sec':>8}")
    print("-" * 72)
    large_configs = [
        (16384, 4),
        (32768, 4),
        (65536, 4),
        (131072, 4),
    ]
    for n_neurons, fan_out in large_configs:
        synapses = n_neurons * fan_out
        try:
            net, pop = build_network(n_neurons, fan_out=fan_out)
            gpu_time, _ = time_gpu(net, pop, timesteps=large_timesteps, device=device)
            ts_per_sec = large_timesteps / gpu_time if gpu_time > 0 else float("inf")
            # Width 8 matches the 'ts/sec' header column.
            print(f"{n_neurons:>8} {fan_out:>8} {synapses:>10} {gpu_time:10.4f} {ts_per_sec:8.0f}")
        except Exception as e:
            print(f"{n_neurons:>8} {fan_out:>8} {synapses:>10} FAILED: {e}")
    print()
    print("Benchmark complete.")
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()