#!/usr/bin/env python3
"""Measure realized SpectralQuant KV-cache footprint.
This is not a throughput benchmark. It initializes the real vLLM engine and
prints the actual KV cache tensor allocation from the live kv_cache_config,
then compares it to the equivalent fp8 full-head KV cache layout for Gemma 4.
"""

from __future__ import annotations

from dataclasses import is_dataclass
from typing import Any

from vllm import LLM


def _get_attr(obj: Any, name: str, default: Any = None) -> Any:
    return getattr(obj, name, default)


def _bytes(num: float) -> str:
    """Render a byte count as GiB when it is at least 1 GiB, else MiB."""
    mib = num / (1024**2)
    gib = num / (1024**3)
    if gib >= 1:
        return f"{gib:.3f} GiB"
    return f"{mib:.3f} MiB"


def _tensor_bytes(tensors: dict[str, Any]) -> int:
    """Sum the allocated bytes across a dict of torch tensors."""
    total = 0
    for tensor in tensors.values():
        total += tensor.numel() * tensor.element_size()
    return total


def _iter_layer_specs(cfg: Any):
    """Yield (layer_name, spec) pairs from every KV cache group, resolving
    per-layer specs when the group spec carries a kv_cache_specs mapping."""
    for group in cfg.kv_cache_groups:
        spec = group.kv_cache_spec
        per_layer = _get_attr(spec, "kv_cache_specs")
        if per_layer is None:
            # Uniform group: all layers share a single spec.
            for layer_name in group.layer_names:
                yield layer_name, spec
        else:
            # Grouped spec with a per-layer mapping.
            for layer_name in group.layer_names:
                yield layer_name, per_layer[layer_name]


def _worker_from_llm(llm: LLM) -> Any:
    """Reach through the engine executor to the driver worker."""
    executor = llm.llm_engine.model_executor
    return executor.driver_worker.worker


def main() -> None:
    # Tiny context and batch limits: we only need the engine to allocate its
    # KV cache, not to serve traffic, so keep startup cheap.
    llm = LLM(
        model="Intel/gemma-4-31B-it-int4-AutoRound",
        spectral_calibration="/workspace/gemmacut/results_it/spectral_sidecar_chat_v2.pt",
        spectral_quantize=True,
        kv_cache_dtype="fp8_e4m3",
        max_model_len=512,
        max_num_batched_tokens=512,
        max_num_seqs=1,
        gpu_memory_utilization=0.8,
        compilation_config={"compile_sizes": []},
    )

    worker = _worker_from_llm(llm)
    runner = worker.model_runner
    cfg = runner.kv_cache_config

    # Persistent KV tensor allocation exactly as the engine made it.
    kv_tensor_bytes = sum(t.size for t in cfg.kv_cache_tensors)
    kv_tensor_per_block = kv_tensor_bytes / cfg.num_blocks
print("MEASURE spectral_quantize=True kv_cache_dtype=fp8_e4m3")
print(f"num_blocks={cfg.num_blocks}")
print(f"kv_cache_tensors={len(cfg.kv_cache_tensors)}")
print(
f"kv_tensor_bytes={kv_tensor_bytes} ({_bytes(kv_tensor_bytes)}) "
f"per_block={kv_tensor_per_block:.0f}"
)
    for i, tensor in enumerate(cfg.kv_cache_tensors):
        if i >= 5:
            remaining = len(cfg.kv_cache_tensors) - i
            print(f"tensor[{i}..] {remaining} more tensors omitted")
            break
        print(
            f"tensor[{i}] size={tensor.size} shared_by={len(tensor.shared_by)} "
            f"first={tensor.shared_by[:2]}"
        )
    for i, group in enumerate(cfg.kv_cache_groups):
        spec = group.kv_cache_spec
        fields = {
            "layers": len(group.layer_names),
            "spec": type(spec).__name__,
            "page_size": _get_attr(spec, "page_size_bytes"),
            "real_page_size": _get_attr(spec, "real_page_size_bytes"),
            "block_size": _get_attr(spec, "block_size"),
            "num_kv_heads": _get_attr(spec, "num_kv_heads"),
            "head_size": _get_attr(spec, "head_size"),
        }
        if is_dataclass(spec):
            fields["repr"] = repr(spec)
        print(f"group[{i}] {fields}")

    # Inspect the spectral module's buffers now that the engine is up.
    from vllm.v1.attention import spectral

    norm = spectral._NORM_BUFFER
    norm_bytes = 0 if norm is None else norm.numel() * norm.element_size()
    norm_per_block = norm_bytes / cfg.num_blocks if cfg.num_blocks else 0
    print(
        f"norm_buffer_bytes={norm_bytes} ({_bytes(norm_bytes)}) "
        f"per_block={norm_per_block:.0f} shape={None if norm is None else tuple(norm.shape)}"
    )
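
    # The norm buffer persists alongside the KV cache, so it is charged to the
    # spectral footprint below; the dequant/rotate scratch is transient and is
    # only reported here for context.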
    scratch_bytes = 0
    scratch_bytes += _tensor_bytes(spectral._DEQUANT_KEY_BUF)
    scratch_bytes += _tensor_bytes(spectral._DEQUANT_VAL_BUF)
    scratch_bytes += _tensor_bytes(spectral._DEQUANT_REMAP)
    scratch_bytes += _tensor_bytes(spectral._DEQUANT_ACTIVE_MASK)
    scratch_bytes += _tensor_bytes(spectral._DEQUANT_BLOCK_LIST)
    scratch_bytes += _tensor_bytes(spectral._ROTATE_BUF_K)
    scratch_bytes += _tensor_bytes(spectral._ROTATE_BUF_V)
    scratch_bytes += _tensor_bytes(spectral._ROTATE_NORMS_K)
    scratch_bytes += _tensor_bytes(spectral._ROTATE_NORMS_V)
    print(
        f"scratch_dequant_rotate_bytes={scratch_bytes} ({_bytes(scratch_bytes)}) "
        "not counted as persistent KV cache"
    )

    # Full-head fp8 baseline from the active sidecar geometry. This keeps the
    # comparison valid for both shared/padded and per-layer KV tensor layouts.
    baseline_per_block = 0
    baseline_layers = 0
    cal = spectral.get_calibration()
    if cal is not None:
        for layer_name, spec in _iter_layer_specs(cfg):
            layer_idx = spectral._extract_layer_index(layer_name)
            lc = cal.get_layer(layer_idx)
            if lc is None:
                # Layer not covered by the sidecar: charge its real page size.
                real_page_size = _get_attr(spec, "real_page_size_bytes", 0)
                baseline_per_block += real_page_size
            else:
                # K and V planes at 1 byte per fp8 element.
                baseline_per_block += (
                    2 * spec.block_size * lc.num_kv_heads * lc.head_dim
                )
            baseline_layers += 1
    if baseline_per_block == 0:
        # Fallback for unexpected engines where the spectral calibration is
        # not visible. This is the Gemma 4 text-layer full-head fp8 geometry:
        # 50 layers with 16 KV heads of dim 256 plus 10 layers with 4 KV heads
        # of dim 512.
        block_sizes = [
            _get_attr(group.kv_cache_spec, "block_size")
            for group in cfg.kv_cache_groups
            if _get_attr(group.kv_cache_spec, "block_size") is not None
        ]
        base_block_size = min(block_sizes) if block_sizes else 16
        baseline_per_block = (
            50 * 2 * base_block_size * 16 * 256
            + 10 * 2 * base_block_size * 4 * 512
        )
        baseline_layers = 60
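        # Illustrative arithmetic, assuming the common vLLM default
        # block_size=16 (not read from the engine here):
        #   50 * 2 * 16 * 16 * 256 = 6,553,600 bytes
        #   10 * 2 * 16 *  4 * 512 =   655,360 bytes
        # i.e. 7,208,960 bytes (~6.875 MiB) of full-head fp8 KV per block
        # across all 60 layers.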

    baseline_total = baseline_per_block * cfg.num_blocks
    codebook_plus_norm = kv_tensor_bytes + norm_bytes
    print(
        f"baseline_fp8_full_head_bytes={baseline_total} ({_bytes(baseline_total)}) "
        f"per_block={baseline_per_block} layers={baseline_layers}"
    )
    print(
        f"codebook_kv_plus_norm_bytes={codebook_plus_norm} "
        f"({_bytes(codebook_plus_norm)}) "
        f"per_block={kv_tensor_per_block + norm_per_block:.0f}"
    )
    print(
        "compression_vs_fp8_full_head "
        f"kv_tensor_only={baseline_total / kv_tensor_bytes:.4f}x "
        f"savings={100 * (1 - kv_tensor_bytes / baseline_total):.2f}%"
    )
    print(
        "compression_vs_fp8_full_head "
        f"kv_plus_norm={baseline_total / codebook_plus_norm:.4f}x "
        f"savings={100 * (1 - codebook_plus_norm / baseline_total):.2f}%"
    )
    # bf16 stores 2 bytes per element, so the bf16 full-head baseline is
    # exactly twice the fp8 one.
    print(
        "compression_vs_bf16_full_head "
        f"kv_plus_norm={(2 * baseline_total) / codebook_plus_norm:.4f}x "
        f"savings={100 * (1 - codebook_plus_norm / (2 * baseline_total)):.2f}%"
    )


if __name__ == "__main__":
    main()