"""GPU vs CPU Benchmark — wall-clock comparison across network sizes.

Usage:
    python benchmarks/gpu_benchmark.py
"""

import sys
import os
import time

# Put this file's parent directory first on the import path so the
# `neurocore` import below can be resolved from there (presumably the
# repository root, given the `benchmarks/` path in the usage note — confirm).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import neurocore as nc

# torch is treated as optional: HAS_CUDA is True only when torch imports
# successfully and reports that CUDA is available.
try:
    import torch
    HAS_CUDA = torch.cuda.is_available()
except ImportError:
    # torch is not installed; main() checks HAS_CUDA and exits early.
    HAS_CUDA = False


def build_network(n_neurons, fan_out=4, weight=200, seed=42):
    """Create a network holding one population connected back onto itself.

    Args:
        n_neurons: Number of neurons requested for the population.
        fan_out: Forwarded to ``connect`` as ``fan_out`` for the
            ``"fixed_fan_out"`` topology.
        weight: Forwarded to ``connect`` as ``weight``.
        seed: Forwarded to ``connect`` as ``seed``.

    Returns:
        A ``(network, population)`` tuple.
    """
    neuron_params = {"threshold": 500, "leak": 3}
    wiring = dict(topology="fixed_fan_out", fan_out=fan_out,
                  weight=weight, seed=seed)

    network = nc.Network()
    population = network.population(n_neurons, params=neuron_params)
    # Source and target are the same population (recurrent wiring).
    network.connect(population, population, **wiring)
    return network, population


def time_cpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5):
    """Measure wall-clock time of a run on the CPU simulator.

    The timed region covers ``stim_steps`` single-step runs, each preceded by
    a current injection into the first ``stim_neurons`` entries of ``pop``,
    followed by one run of the remaining ``timesteps - stim_steps`` steps.
    Deploying the network is not timed.

    Args:
        net: Network to deploy.
        pop: Population whose leading slice receives the stimulus.
        timesteps: Total number of simulated steps.
        stim_neurons: Size of the stimulated slice of ``pop``.
        stim_steps: Number of leading steps that receive the stimulus.

    Returns:
        ``(elapsed_seconds, total_spikes)``, where ``total_spikes`` is read
        from the result of the final ``run`` call.
    """
    simulator = nc.Simulator()
    simulator.deploy(net)
    free_running_steps = timesteps - stim_steps

    t_begin = time.perf_counter()
    for _ in range(stim_steps):
        simulator.inject(pop[:stim_neurons], current=1200)
        simulator.run(1)
    outcome = simulator.run(free_running_steps)
    t_end = time.perf_counter()

    return t_end - t_begin, outcome.total_spikes


def time_gpu(net, pop, timesteps=50, stim_neurons=16, stim_steps=5, device=None):
    """Time GPU simulator execution (includes stimulus injection).

    A throwaway simulator is deployed and stepped once first so that one-time
    CUDA start-up work is not charged to the measurement; the timed run then
    uses a freshly deployed simulator. Deployment itself is not timed.

    Both simulators are closed in ``finally`` blocks, so a failure during
    deploy/run (for example on a configuration that is too large) does not
    leave an unclosed simulator behind for the caller's next measurement.

    Args:
        net: Network to deploy.
        pop: Population whose leading slice receives the stimulus.
        timesteps: Total number of simulated steps in the timed run.
        stim_neurons: Size of the stimulated slice of ``pop``.
        stim_steps: Number of leading steps that receive the stimulus.
        device: Forwarded to ``nc.GpuSimulator``.

    Returns:
        ``(elapsed_seconds, total_spikes)``, where ``total_spikes`` is read
        from the result of the final ``run`` call.
    """
    # Warm up CUDA (1 throwaway step, then redeploy for fair comparison).
    warmup = nc.GpuSimulator(device=device)
    try:
        warmup.deploy(net)
        warmup.run(1)
        torch.cuda.synchronize(warmup.device)
    finally:
        warmup.close()

    # Fresh deploy for the timed run.
    sim = nc.GpuSimulator(device=device)
    try:
        sim.deploy(net)

        start = time.perf_counter()
        for _ in range(stim_steps):
            sim.inject(pop[:stim_neurons], current=1200)
            sim.run(1)
        result = sim.run(timesteps - stim_steps)
        # Wait for outstanding work on the device before stopping the clock.
        torch.cuda.synchronize(sim.device)
        elapsed = time.perf_counter() - start
    finally:
        sim.close()
    return elapsed, result.total_spikes


def _print_section(title, header):
    """Print a section banner followed by the column header row."""
    print("=" * 72)
    print(f"  {title}")
    print("=" * 72)
    print(header)
    print("-" * 72)


def _run_comparison(configs, device, max_cpu_neurons):
    """Time each ``(n_neurons, fan_out)`` config on CPU and GPU, one row each.

    The CPU run is skipped for networks with more than ``max_cpu_neurons``
    neurons; in that case both the CPU and the speedup columns show ``n/a``.
    """
    for n_neurons, fan_out in configs:
        # Broad catch is deliberate: one failing size must not abort the
        # remaining rows of the benchmark table.
        try:
            net, pop = build_network(n_neurons, fan_out=fan_out)
            synapses = n_neurons * fan_out

            cpu_time = None
            if n_neurons <= max_cpu_neurons:
                cpu_time, _ = time_cpu(net, pop)

            gpu_time, _ = time_gpu(net, pop, device=device)

            if cpu_time is None:
                # No CPU measurement, so a speedup ratio would be meaningless.
                cpu_str = f"{'n/a':>10}"
                speedup_str = f"{'n/a':>8}"
            else:
                speedup = cpu_time / gpu_time if gpu_time > 0 else float('inf')
                cpu_str = f"{cpu_time:10.4f}"
                speedup_str = f"{speedup:7.1f}x"

            print(f"{n_neurons:>8}  {synapses:>10}  {cpu_str}  {gpu_time:10.4f}  {speedup_str}")
        except Exception as e:
            print(f"{n_neurons:>8}  {'FAILED':>10}  {e}")


def main():
    """Run the three benchmark sections and print their result tables.

    Part 1 and Part 2 compare CPU and GPU wall-clock time at fan-out 4 and 8;
    Part 3 times the GPU alone on larger networks for 100 timesteps. Returns
    early with a message when CUDA is not available.
    """
    if not HAS_CUDA:
        print("CUDA not available. Cannot run GPU benchmark.")
        return

    # Uses the second CUDA device when more than one is visible, otherwise
    # the first. NOTE(review): reason for preferring cuda:1 is not stated.
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
    gpu_name = torch.cuda.get_device_name(device)
    vram = torch.cuda.get_device_properties(device).total_memory / 1e9
    print(f"GPU: {gpu_name} ({vram:.1f} GB)")
    print()

    comparison_header = (
        f"{'Neurons':>8}  {'Synapses':>10}  {'CPU (s)':>10}  {'GPU (s)':>10}  {'Speedup':>8}"
    )

    _print_section("Part 1: CPU vs GPU Wall-Clock (50 timesteps, fan_out=4)",
                   comparison_header)
    configs = [
        (64, 4),
        (256, 4),
        (1024, 4),
        (4096, 4),
        (8192, 4),
        (16384, 4),
        (32768, 4),
    ]
    _run_comparison(configs, device, max_cpu_neurons=8192)

    print()
    _print_section("Part 2: Denser Networks (50 timesteps, fan_out=8)",
                   comparison_header)
    dense_configs = [
        (256, 8),
        (512, 8),
        (1024, 8),
        (4096, 8),
    ]
    _run_comparison(dense_configs, device, max_cpu_neurons=4096)

    print()
    hdr = f"{'Neurons':>8}  {'Fan-out':>8}  {'Synapses':>10}  {'Time (s)':>10}  {'ts/sec':>8}"
    _print_section("Part 3: GPU-Only Large Scale (100 timesteps)", hdr)
    large_configs = [
        (16384, 4),
        (32768, 4),
        (65536, 4),
        (131072, 4),
    ]

    for n_neurons, fan_out in large_configs:
        # Same policy as the comparison tables: report the failure, keep going.
        try:
            net, pop = build_network(n_neurons, fan_out=fan_out)
            gpu_time, _ = time_gpu(net, pop, timesteps=100, device=device)
            ts_per_sec = 100 / gpu_time if gpu_time > 0 else float('inf')
            print(f"{n_neurons:>8}  {fan_out:>8}  {n_neurons * fan_out:>10}  {gpu_time:10.4f}  {ts_per_sec:7.0f}")
        except Exception as e:
            print(f"{n_neurons:>8}  {fan_out:>8}  {n_neurons * fan_out:>10}  FAILED: {e}")

    print()
    print("Benchmark complete.")


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()