"""Benchmark TurboQuant memory savings and throughput."""

import sys
sys.path.insert(0, "/home/azureuser/turboquant")

import torch
import time
from types import SimpleNamespace
from transformers.cache_utils import DynamicCache, Cache, DynamicLayer
from turboquant.cache import TurboQuantCache, TurboQuantLayer


def benchmark_memory(num_layers: int = 64, num_kv_heads: int = 8, head_dim: int = 128,
                     context_lengths: list[int] | None = None,
                     skip_layers: set[int] | None = None):
    """Compare KV-cache memory usage of DynamicCache vs. TurboQuantCache.

    Fills each cache with random bf16 key/value tensors for every layer at
    several context lengths, then reports the allocated-memory delta of each
    cache, the compression ratio, and the absolute savings in MB.

    Args:
        num_layers: Number of transformer layers to simulate.
        num_kv_heads: Number of KV attention heads per layer.
        head_dim: Per-head dimension of the key/value vectors.
        context_lengths: Sequence lengths to benchmark; defaults to
            [1024, 4096, 8192, 16384, 32768].
        skip_layers: Layer indices kept in full precision (plain
            DynamicLayer); defaults to {0, 1}.
    """
    if context_lengths is None:
        context_lengths = [1024, 4096, 8192, 16384, 32768]
    if skip_layers is None:
        skip_layers = {0, 1}

    device = "cuda"
    batch = 1

    print(f"{'Context':>8} | {'DynamicCache':>14} | {'TurboQuant':>14} | {'Compression':>12} | {'Savings':>10}")
    print("-" * 72)

    for seq_len in context_lengths:
        # --- DynamicCache (full-precision baseline) ---
        torch.cuda.empty_cache()
        mem_before = torch.cuda.memory_allocated()

        dyn_cache = DynamicCache()
        for layer_idx in range(num_layers):
            k = torch.randn(batch, num_kv_heads, seq_len, head_dim, device=device, dtype=torch.bfloat16)
            v = torch.randn(batch, num_kv_heads, seq_len, head_dim, device=device, dtype=torch.bfloat16)
            dyn_cache.update(k, v, layer_idx)
        if num_layers:
            # Drop the raw input tensors from the last iteration so only
            # cache-held storage is measured (they are not part of the cache
            # and would otherwise inflate the reading).
            del k, v
        mem_dynamic = torch.cuda.memory_allocated() - mem_before
        del dyn_cache
        torch.cuda.empty_cache()

        # --- TurboQuantCache (quantized) ---
        mem_before = torch.cuda.memory_allocated()

        # Layers in skip_layers stay full precision; the rest are 4-bit.
        layers = []
        for i in range(num_layers):
            if i in skip_layers:
                layers.append(DynamicLayer())
            else:
                layers.append(TurboQuantLayer(
                    dim=head_dim, nbits=4, residual_length=1, device=device, layer_seed=42 + i
                ))
        tq_cache = Cache(layers=layers)

        for layer_idx in range(num_layers):
            k = torch.randn(batch, num_kv_heads, seq_len, head_dim, device=device, dtype=torch.bfloat16)
            v = torch.randn(batch, num_kv_heads, seq_len, head_dim, device=device, dtype=torch.bfloat16)
            tq_cache.update(k, v, layer_idx)
        if num_layers:
            # Same correction as above: exclude the last raw k/v tensors.
            # This matters much more here, since the quantized footprint is
            # small relative to two full-precision bf16 tensors.
            del k, v
        mem_tq = torch.cuda.memory_allocated() - mem_before
        del tq_cache
        torch.cuda.empty_cache()

        # max(..., 1) guards against division by zero if the quantized cache
        # happens to report 0 allocated bytes.
        ratio = mem_dynamic / max(mem_tq, 1)
        savings = (mem_dynamic - mem_tq) / 1024**2

        print(f"{seq_len:>8} | {mem_dynamic/1024**2:>11.1f} MB | {mem_tq/1024**2:>11.1f} MB | "
              f"{ratio:>10.2f}x | {savings:>7.1f} MB")


def benchmark_throughput(num_layers: int = 64, num_kv_heads: int = 8, head_dim: int = 128):
    """Benchmark quantization and dequantization throughput of one layer.

    Times ``quantize``/``dequantize`` on a random bf16 key tensor at several
    sequence lengths and prints per-operation latency and vectors/second.

    Args:
        num_layers: Unused here; kept for signature parity with
            ``benchmark_memory``.
        num_kv_heads: Number of KV attention heads in the test tensor.
        head_dim: Per-head dimension of the test tensor.
    """
    device = "cuda"
    batch = 1
    n_iters = 10  # timed repetitions per measurement

    print(f"\n{'Operation':>20} | {'Seq Len':>8} | {'Time (ms)':>10} | {'Throughput':>15}")
    print("-" * 65)

    quantizer_layer = TurboQuantLayer(dim=head_dim, nbits=4, residual_length=1, device=device, layer_seed=42)
    quantizer = quantizer_layer.quantizer  # hoist attribute lookup out of the loops

    for seq_len in [1024, 4096, 16384, 32768]:
        # Only keys are quantized in this benchmark. (The previous version
        # also allocated an unused value tensor, wasting GPU memory.)
        k = torch.randn(batch, num_kv_heads, seq_len, head_dim, device=device, dtype=torch.bfloat16)

        # Warmup: trigger kernel compilation / allocator caching before timing.
        for _ in range(3):
            packed, norms = quantizer.quantize(k)
            _ = quantizer.dequantize(packed, norms)
        torch.cuda.synchronize()

        # Quantize timing — synchronize each iteration so async GPU work is
        # included in the wall-clock measurement.
        start = time.perf_counter()
        for _ in range(n_iters):
            packed, norms = quantizer.quantize(k)
            torch.cuda.synchronize()
        quant_time = (time.perf_counter() - start) / n_iters * 1000

        # Dequantize timing, reusing the last quantized output.
        start = time.perf_counter()
        for _ in range(n_iters):
            _ = quantizer.dequantize(packed, norms)
            torch.cuda.synchronize()
        dequant_time = (time.perf_counter() - start) / n_iters * 1000

        n_vectors = batch * num_kv_heads * seq_len
        print(f"{'Quantize':>20} | {seq_len:>8} | {quant_time:>8.2f} ms | {n_vectors/quant_time*1000:>12.0f} vec/s")
        print(f"{'Dequantize':>20} | {seq_len:>8} | {dequant_time:>8.2f} ms | {n_vectors/dequant_time*1000:>12.0f} vec/s")


if __name__ == "__main__":
    print("=" * 72)
    print("TurboQuant Memory Benchmark — Qwen2.5-32B Configuration")
    print("  64 layers, 8 KV heads, head_dim=128, 4-bit, skip layers {0,1}")
    print("=" * 72)

    benchmark_memory()

    print("\n" + "=" * 72)
    print("TurboQuant Throughput Benchmark (single layer)")
    print("=" * 72)

    benchmark_throughput()