"""
benchmark/bench_layer1.py
--------------------------
Run this first when you connect your RunPod instance.
Times every Layer 1 component against its baseline and prints the speedup.

Usage:
    python benchmark/bench_layer1.py
"""

import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import torch
import time
from kernels.rank_estimator import sketch_rank, estimate_prune_counts
from kernels.varlen_packing  import pack_varlen_batch
from kernels.sparse_attn     import sparse_vision_attn


def timeit(fn, n_warmup=5, n_runs=50, device="cpu"):
    """Return the mean wall-clock time of one ``fn()`` call, in milliseconds.

    ``fn`` is first called ``n_warmup`` times untimed, then ``n_runs`` times
    inside a single ``time.perf_counter`` window.

    Args:
        fn: Zero-argument callable to measure; its return value is discarded.
        n_warmup: Number of untimed calls made before the timed window.
        n_runs: Number of timed calls; the elapsed time is divided by it.
        device: Device the workload runs on. Any value whose string form
            starts with ``"cuda"`` (``"cuda"``, ``"cuda:0"``, a CUDA
            ``torch.device``) is treated as a CUDA device.

    Returns:
        Mean time per call in milliseconds, as a float.

    Raises:
        ZeroDivisionError: If ``n_runs`` is 0.
    """
    # Match "cuda", "cuda:0", ... alike: an exact `== "cuda"` comparison would
    # silently skip synchronisation for indexed devices, and the timer would
    # then not wait for queued GPU work to finish.
    on_cuda = str(device).startswith("cuda")

    for _ in range(n_warmup):
        fn()
    if on_cuda:
        # Drain warm-up work so it is not billed to the timed window.
        torch.cuda.synchronize()

    t0 = time.perf_counter()
    for _ in range(n_runs):
        fn()
    if on_cuda:
        # Wait for the timed calls to actually complete before stopping.
        torch.cuda.synchronize()
    elapsed_s = time.perf_counter() - t0

    return elapsed_s / n_runs * 1000


def bench_rank(device):
    """Print a table comparing ``torch.linalg.matrix_rank`` with ``sketch_rank``.

    For each ``(B, T, V)`` configuration a ``torch.rand`` tensor of that shape
    is created on ``device`` and normalised so every slice along the last
    dimension sums to 1. The baseline (column ``SVD``) calls
    ``torch.linalg.matrix_rank`` once per batch element; the candidate
    (column ``Sketch``) calls ``sketch_rank`` on the whole batch. ``MaxErr``
    is the largest absolute difference between the two rank results, taken
    from one extra call of each after timing.

    Args:
        device: Device on which tensors are created; also forwarded to
            ``timeit`` so it can synchronise CUDA work.
    """
    print("\n── Rank Estimator ───────────────────────────────────────────────")
    print(f"{'Config':<30} {'SVD':>10} {'Sketch':>10} {'Speedup':>10} {'MaxErr':>10}")
    print("─" * 75)

    def exact_rank(batch):
        # Per-sample baseline, stacked into one tensor.
        # NOTE(review): torch.linalg.matrix_rank also accepts batched input;
        # the per-sample loop is kept so the measured baseline is unchanged.
        return torch.stack([torch.linalg.matrix_rank(m) for m in batch])

    for B, T, V in [(1, 77, 196), (4, 77, 196), (8, 77, 196), (8, 128, 576)]:
        P = torch.rand(B, T, V, device=device)
        P = P / P.sum(dim=-1, keepdim=True)

        svd_ms = timeit(lambda: exact_rank(P), device=device)
        sketch_ms = timeit(lambda: sketch_rank(P), device=device)

        # Accuracy check on a fresh call of each method.
        # Assumes sketch_rank returns one rank per batch element — TODO confirm.
        r_svd = exact_rank(P).float()
        r_skc = sketch_rank(P).float()
        err = (r_svd - r_skc).abs().max().item()

        # Pad the whole config label to the 30-column header width so rows
        # line up with the header and with each other regardless of how many
        # digits B, T and V have.
        label = f"B={B} T={T} V={V}"
        print(f"{label:<30} {svd_ms:>9.1f}ms {sketch_ms:>9.1f}ms {svd_ms/sketch_ms:>9.1f}x {err:>10.0f}")


def bench_packing(device):
    """Print a table timing ``pad_sequence`` against ``pack_varlen_batch``.

    For each configuration, one ``torch.randn`` tensor of shape
    ``(length, D)`` is created per entry in the length list. Both functions
    are then timed on that same list of tensors. The ``Mem`` column is the
    signed percentage change in element count from the padded layout
    (``max(lens) * B * D``) to the packed layout (``sum(lens) * D``), so a
    negative value means packing holds fewer elements.

    Args:
        device: Device on which tensors are created; also forwarded to
            ``timeit``.
    """
    print("\n── Varlen Packing ───────────────────────────────────────────────")
    print(f"{'Config':<35} {'pad_seq':>10} {'pack':>10} {'Speedup':>10} {'Mem':>10}")
    print("─" * 80)

    from torch.nn.utils.rnn import pad_sequence

    configs = (
        (4, 768, [120, 80, 100, 90]),
        (8, 768, [160, 80, 90, 110, 140, 70, 130, 100]),
    )

    for batch, dim, seq_lens in configs:
        seqs = [torch.randn(n, dim, device=device) for n in seq_lens]

        def run_pad():
            return pad_sequence(seqs, batch_first=True)

        def run_pack():
            return pack_varlen_batch(seqs)

        # Baseline first, candidate second.
        pad_ms = timeit(run_pad, device=device)
        pack_ms = timeit(run_pack, device=device)

        shortest, longest = min(seq_lens), max(seq_lens)
        packed_elems = sum(seq_lens) * dim
        padded_elems = longest * batch * dim
        mem_delta_pct = (packed_elems / padded_elems - 1) * 100
        speedup = pad_ms / pack_ms

        row_label = f"B={batch} D={dim} lens=[{shortest}..{longest}]"
        print(f"{row_label:<35} {pad_ms:>9.2f}ms {pack_ms:>9.2f}ms {speedup:>9.1f}x {mem_delta_pct:>+9.0f}%")


def bench_sparse_attn(device):
    """Print a table timing dense patch-text scores against ``sparse_vision_attn``.

    For each ``(B, N_vis, K, T, D)`` configuration, random patch features
    ``(B, N_vis, D)`` and text features ``(B, T, D)`` are drawn, and ``K``
    patch indices per batch element are taken from a random permutation.
    The dense baseline is ``bmm(patch, text^T) * D**-0.5``. The candidate is
    ``sparse_vision_attn(patch, text, kept, use_triton=False)``. ``MaxErr``
    compares the candidate with the dense result gathered at the kept
    indices.

    Args:
        device: Device on which tensors are created; also forwarded to
            ``timeit``.
    """
    print("\n── Sparse Attention ─────────────────────────────────────────────")
    print(f"{'Config':<38} {'Dense':>10} {'Sparse':>10} {'Speedup':>10} {'MaxErr':>10}")
    print("─" * 83)

    configs = (
        (1, 196, 80, 77, 768),
        (4, 196, 80, 77, 768),
        (8, 196, 80, 77, 768),
        (8, 576, 127, 77, 1024),
    )

    for batch, n_patches, n_keep, n_text, dim in configs:
        patch_feats = torch.randn(batch, n_patches, dim, device=device)
        text_feats = torch.randn(batch, n_text, dim, device=device)
        keep_rows = [
            torch.randperm(n_patches, device=device)[:n_keep]
            for _ in range(batch)
        ]
        kept_idx = torch.stack(keep_rows)

        inv_sqrt_dim = dim ** -0.5

        def dense_scores():
            return torch.bmm(patch_feats, text_feats.transpose(1, 2)) * inv_sqrt_dim

        def sparse_scores():
            return sparse_vision_attn(patch_feats, text_feats, kept_idx, use_triton=False)

        dense_ms = timeit(dense_scores, device=device)
        sparse_ms = timeit(sparse_scores, device=device)

        # Accuracy check: pick the kept patch rows out of the dense scores.
        # Assumes sparse_vision_attn returns (B, K, T) scores in `kept` order
        # — TODO confirm against the kernel.
        dense_out = dense_scores()
        sparse_out = sparse_scores()
        gather_idx = kept_idx.unsqueeze(-1).expand(batch, n_keep, n_text)
        max_err = (dense_out.gather(1, gather_idx) - sparse_out).abs().max().item()

        speedup = dense_ms / sparse_ms
        row_label = f"B={batch} N={n_patches} K={n_keep} T={n_text} D={dim}"
        print(f"{row_label:<38} {dense_ms:>9.2f}ms {sparse_ms:>9.2f}ms {speedup:>9.1f}x {max_err:>10.2e}")


if __name__ == "__main__":
    # Use the GPU when PyTorch can see one; otherwise run everything on CPU.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print(f"\nSparseVLM Layer 1 Benchmark | Device: {device}")

    if use_cuda:
        # Report device 0 so the printed numbers can be tied to the hardware.
        gpu_name = torch.cuda.get_device_name(0)
        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU: {gpu_name} | VRAM: {vram_gb:.1f}GB")

    # Run the three benchmark tables in a fixed order.
    for bench in (bench_rank, bench_packing, bench_sparse_attn):
        bench(device)

    print("\n── Done. Replace README.md benchmark table with these numbers. ──\n")