"""Benchmark TurboQuant memory savings and throughput."""

import sys
sys.path.insert(0, "/home/azureuser/turboquant")

import torch
import time
from types import SimpleNamespace
from transformers.cache_utils import DynamicCache, Cache, DynamicLayer
from turboquant.cache import TurboQuantCache, TurboQuantLayer


def _measure_cache_bytes(build_cache, num_layers: int, kv_shape: tuple[int, ...], device: str) -> int:
    """Return the CUDA bytes attributable to a freshly built, fully populated cache.

    Args:
        build_cache: Zero-argument callable returning the cache to measure. It is
            invoked *after* the baseline reading, so device memory allocated while
            constructing the cache is included in the result.
        num_layers: Number of layers to populate via ``cache.update(k, v, layer_idx)``.
        kv_shape: Shape of the random bfloat16 key/value tensors fed to each layer.
        device: Torch device string on which the inputs are created.

    Returns:
        ``torch.cuda.memory_allocated()`` after filling the cache minus the
        reading taken before the cache was built.
    """
    torch.cuda.empty_cache()
    # Peak stats are reset as well, so anyone inspecting them afterwards sees this run only.
    torch.cuda.reset_peak_memory_stats()
    baseline = torch.cuda.memory_allocated()

    cache = build_cache()
    k = v = None  # keeps the ``del`` below valid when num_layers == 0
    for layer_idx in range(num_layers):
        k = torch.randn(*kv_shape, device=device, dtype=torch.bfloat16)
        v = torch.randn(*kv_shape, device=device, dtype=torch.bfloat16)
        cache.update(k, v, layer_idx)

    # Drop our references to the last layer's inputs before reading the allocator.
    # Otherwise two full-size temporaries can be counted as cache memory here and
    # then leak into the baseline of the next measurement.
    del k, v
    used = torch.cuda.memory_allocated() - baseline

    # Release the cache so the next measurement starts from a clean allocator.
    del cache
    torch.cuda.empty_cache()
    return used


def benchmark_memory(num_layers: int = 64, num_kv_heads: int = 8, head_dim: int = 128,
                     context_lengths: list[int] = None, skip_layers: set[int] = None):
    """Print a table comparing CUDA memory of ``DynamicCache`` vs. a TurboQuant cache.

    For every context length, each cache is filled with random bfloat16 key/value
    tensors of shape ``(1, num_kv_heads, seq_len, head_dim)`` for all layers, and
    the growth in ``torch.cuda.memory_allocated()`` is reported.

    Args:
        num_layers: Number of cache layers to populate.
        num_kv_heads: Number of key/value heads per layer.
        head_dim: Size of each head vector (passed to ``TurboQuantLayer`` as ``dim``).
        context_lengths: Sequence lengths to benchmark. Defaults to
            ``[1024, 4096, 8192, 16384, 32768]``.
        skip_layers: Layer indices that use a plain ``DynamicLayer`` instead of a
            ``TurboQuantLayer`` in the TurboQuant cache. Defaults to ``{0, 1}``.

    Returns:
        None. Results are printed to stdout. Requires a CUDA device.
    """
    if context_lengths is None:
        context_lengths = [1024, 4096, 8192, 16384, 32768]
    if skip_layers is None:
        skip_layers = {0, 1}

    device = "cuda"
    batch = 1

    def build_turboquant_cache():
        # Mixed cache: skipped layers stay as DynamicLayer, the rest are 4-bit
        # TurboQuantLayer instances with a distinct seed per layer.
        layers = [
            DynamicLayer() if i in skip_layers else TurboQuantLayer(
                dim=head_dim, nbits=4, residual_length=1, device=device, layer_seed=42 + i
            )
            for i in range(num_layers)
        ]
        return Cache(layers=layers)

    print(f"{'Context':>8} | {'DynamicCache':>14} | {'TurboQuant':>14} | {'Compression':>12} | {'Savings':>10}")
    print("-" * 72)

    for seq_len in context_lengths:
        kv_shape = (batch, num_kv_heads, seq_len, head_dim)

        mem_dynamic = _measure_cache_bytes(DynamicCache, num_layers, kv_shape, device)
        mem_tq = _measure_cache_bytes(build_turboquant_cache, num_layers, kv_shape, device)

        # max(..., 1) guards the ratio against a zero-byte reading.
        ratio = mem_dynamic / max(mem_tq, 1)
        savings = (mem_dynamic - mem_tq) / 1024**2

        print(f"{seq_len:>8} | {mem_dynamic/1024**2:>11.1f} MB | {mem_tq/1024**2:>11.1f} MB | "
              f"{ratio:>10.2f}x | {savings:>7.1f} MB")


def benchmark_throughput(num_layers: int = 64, num_kv_heads: int = 8, head_dim: int = 128,
                         context_lengths: list[int] = None):
    """Print quantize / dequantize latency and throughput for a single TurboQuant layer.

    For every sequence length, a random bfloat16 tensor of shape
    ``(1, num_kv_heads, seq_len, head_dim)`` is quantized and dequantized with the
    layer's ``quantizer``. Each operation is warmed up, then timed over 10 runs
    with a CUDA synchronize after every call.

    Args:
        num_layers: Not used by this function (only one layer is benchmarked);
            kept so existing callers keep working.
        num_kv_heads: Number of key/value heads in the test tensor.
        head_dim: Size of each head vector (passed to ``TurboQuantLayer`` as ``dim``).
        context_lengths: Sequence lengths to benchmark. Defaults to
            ``[1024, 4096, 16384, 32768]``.

    Returns:
        None. Results are printed to stdout. Requires a CUDA device.
    """
    if context_lengths is None:
        context_lengths = [1024, 4096, 16384, 32768]

    device = "cuda"
    batch = 1
    warmup_runs = 3
    timed_runs = 10

    print(f"\n{'Operation':>20} | {'Seq Len':>8} | {'Time (ms)':>10} | {'Throughput':>15}")
    print("-" * 65)

    quantizer_layer = TurboQuantLayer(dim=head_dim, nbits=4, residual_length=1, device=device, layer_seed=42)

    for seq_len in context_lengths:
        # Only one input tensor is needed: the same quantizer call is timed
        # regardless of whether the data represents keys or values.
        k = torch.randn(batch, num_kv_heads, seq_len, head_dim, device=device, dtype=torch.bfloat16)

        # Warmup so one-time costs are not attributed to the timed runs.
        for _ in range(warmup_runs):
            packed, norms = quantizer_layer.quantizer.quantize(k)
            _ = quantizer_layer.quantizer.dequantize(packed, norms)
        torch.cuda.synchronize()

        # Quantize timing: synchronize after each call so the wall clock covers
        # the GPU work rather than just the launch.
        start = time.perf_counter()
        for _ in range(timed_runs):
            packed, norms = quantizer_layer.quantizer.quantize(k)
            torch.cuda.synchronize()
        quant_time = (time.perf_counter() - start) / timed_runs * 1000

        # Dequantize timing, reusing the output of the last quantize call.
        start = time.perf_counter()
        for _ in range(timed_runs):
            _ = quantizer_layer.quantizer.dequantize(packed, norms)
            torch.cuda.synchronize()
        dequant_time = (time.perf_counter() - start) / timed_runs * 1000

        # Times are in milliseconds, hence the *1000 to report vectors per second.
        n_vectors = batch * num_kv_heads * seq_len
        print(f"{'Quantize':>20} | {seq_len:>8} | {quant_time:>8.2f} ms | {n_vectors/quant_time*1000:>12.0f} vec/s")
        print(f"{'Dequantize':>20} | {seq_len:>8} | {dequant_time:>8.2f} ms | {n_vectors/dequant_time*1000:>12.0f} vec/s")


if __name__ == "__main__":
    # Horizontal rule framing each section banner.
    rule = "=" * 72

    # Memory section: the banner describes the default arguments of benchmark_memory().
    print(
        rule,
        "TurboQuant Memory Benchmark — Qwen2.5-32B Configuration",
        "  64 layers, 8 KV heads, head_dim=128, 4-bit, skip layers {0,1}",
        rule,
        sep="\n",
    )
    benchmark_memory()

    # Throughput section, separated from the memory table by a blank line.
    print(
        f"\n{rule}",
        "TurboQuant Throughput Benchmark (single layer)",
        rule,
        sep="\n",
    )
    benchmark_throughput()