# turboquant/scripts/benchmark_models.py
# (Hugging Face file-viewer header removed: uploaded by vivekvar via
#  huggingface_hub, commit d4ec3e8 — it was not valid Python.)
"""
Comprehensive TurboQuant benchmark across model families and sizes.
Tests: Qwen, Llama, Gemma, Phi, Mistral — 7B to 72B.
For each model:
1. Architecture analysis (layers, heads, KV heads, head_dim)
2. Outlier layer detection (key norm distribution)
3. Output quality (greedy decode comparison)
4. Memory savings at multiple context lengths
5. Prefill logit fidelity
"""
import sys
sys.path.insert(0, "/home/azureuser/turboquant")
import torch
import time
import json
import gc
import os
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from turboquant.cache import TurboQuantCache
# Incrementally-updated JSON results file; main() resumes from it on restart.
RESULTS_FILE = "/home/azureuser/turboquant/benchmark_results.json"

# Models to benchmark. The size field is only used by the free-disk-space
# check in benchmark_model() before attempting a download.
MODELS = [
    # (name, hf_id, approx_4bit_size_gb)
    ("Qwen2.5-7B", "Qwen/Qwen2.5-7B-Instruct", 5),
    ("Llama-3.1-8B", "meta-llama/Llama-3.1-8B-Instruct", 5),
    ("Gemma-2-9B", "google/gemma-2-9b-it", 6),
    ("Phi-4-14B", "microsoft/phi-4", 9),
    ("Qwen2.5-32B", "Qwen/Qwen2.5-32B-Instruct", 19),
    ("Llama-3.3-70B", "meta-llama/Llama-3.3-70B-Instruct", 38),
    ("Qwen2.5-72B", "Qwen/Qwen2.5-72B-Instruct", 40),
]

# Short prompts used for the greedy-decode output-quality comparison.
PROMPTS = [
    "Explain quantum computing in simple terms.",
    "Write a Python function to check if a number is prime.",
    "What causes the northern lights?",
]

# Context lengths (in tokens) at which memory savings are measured.
CONTEXT_LENGTHS = [1024, 4096, 8192]

# Filler passage repeated to build long-context prompts for the memory test.
PASSAGE = (
    "The history of artificial intelligence began in antiquity, with myths, stories "
    "and rumors of artificial beings endowed with intelligence or consciousness by "
    "master craftsmen. The seeds of modern AI were planted by philosophers who attempted "
    "to describe the process of human thinking as the mechanical manipulation of symbols. "
    "This work culminated in the invention of the programmable digital computer in the 1940s, "
    "a machine based on the abstract essence of mathematical reasoning. "
)
def cleanup_model():
    """Free GPU memory between model tests.

    Collects Python garbage first (so dead CUDA tensors become freeable),
    returns cached allocator blocks to the device, then resets the peak
    memory counter that the memory benchmarks read.
    """
    # Order matters: gc must run before empty_cache so freed tensors'
    # blocks are actually back in the caching allocator when it is emptied.
    for release_step in (gc.collect,
                         torch.cuda.empty_cache,
                         torch.cuda.reset_peak_memory_stats):
        release_step()
def load_model(model_id):
    """Load *model_id* as a 4-bit NF4-quantized causal LM plus its tokenizer.

    Returns (model, tokenizer). The model is sharded across available
    devices via device_map="auto" with bfloat16 compute.
    """
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        dtype=torch.bfloat16,
        quantization_config=quant_cfg,
    )
    return mdl, tok
def get_architecture_info(model, config):
    """Extract architecture details from a model config into a flat dict.

    *model* is accepted for interface compatibility but not read here.
    head_dim is taken from the config when present (e.g. Gemma-2 sets it
    explicitly) and otherwise derived as hidden_size // num_attention_heads.
    """
    # Multimodal configs nest the decoder settings; unwrap when possible.
    if hasattr(config, "get_text_config"):
        tc = config.get_text_config(decoder=True)
    else:
        tc = config
    heads = getattr(tc, "num_attention_heads", None)
    hidden = getattr(tc, "hidden_size", None)
    head_dim = getattr(tc, "head_dim", None)
    if head_dim is None and hidden and heads:
        head_dim = hidden // heads
    return {
        "num_layers": getattr(tc, "num_hidden_layers", None),
        "hidden_size": hidden,
        "num_attention_heads": heads,
        "num_kv_heads": getattr(tc, "num_key_value_heads", heads),
        "head_dim": head_dim,
        "model_type": getattr(tc, "model_type", "unknown"),
        "max_position_embeddings": getattr(tc, "max_position_embeddings", None),
        "rope_theta": getattr(tc, "rope_theta", None),
        "torch_dtype": str(getattr(tc, "torch_dtype", "unknown")),
        # Current GPU allocation at call time (0 if CUDA is uninitialized).
        "model_memory_gb": torch.cuda.memory_allocated() / 1024**3,
    }
def analyze_layer_norms(model, tokenizer):
    """Probe per-layer key-cache norms on a short sentence to find outliers.

    A layer is flagged as an outlier when its mean key norm exceeds 5x the
    median across layers; those layers get skipped during quantization.
    """
    probe = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model(probe.input_ids, use_cache=True)
    cache = out.past_key_values
    norms = []
    for layer in cache.layers:
        keys = layer.keys
        if keys is None or keys.numel() == 0:
            norms.append(0.0)  # layer produced no cached keys
        else:
            norms.append(round(keys.float().norm(dim=-1).mean().item(), 2))
    # Upper-median of the sorted norms; robust to a few extreme layers.
    median_norm = sorted(norms)[len(norms) // 2]
    outliers = [idx for idx, val in enumerate(norms) if val > 5.0 * median_norm]
    peak = max(norms)
    peak_layer = norms.index(peak)
    del out, cache
    cleanup_model()
    return {
        "median_norm": round(median_norm, 2),
        "max_norm": round(peak, 2),
        "max_norm_layer": peak_layer,
        "max_to_median_ratio": round(peak / median_norm, 2) if median_norm > 0 else 0,
        "outlier_layers": outliers,
        "all_norms_first5": norms[:5],
        "all_norms_last3": norms[-3:],
    }
def test_output_quality(model, tokenizer, skip_layers):
    """Greedy-decode each PROMPTS entry with and without TurboQuantCache.

    Returns one report dict per prompt with exact-match flag, character
    divergence point, token-level match percentage, and output previews.
    """
    reports = []
    for prompt in PROMPTS:
        enc = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = enc.input_ids.shape[1]
        # Baseline: default (fp16/bf16) KV cache.
        with torch.no_grad():
            baseline_ids = model.generate(**enc, max_new_tokens=100, do_sample=False)
        baseline_text = tokenizer.decode(baseline_ids[0][prompt_len:], skip_special_tokens=True)
        cleanup_model()
        # TurboQuant: 4-bit cache with outlier layers skipped.
        tq_cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                   device="cuda", skip_layers=skip_layers)
        with torch.no_grad():
            tq_ids = model.generate(**enc, max_new_tokens=100, do_sample=False,
                                    past_key_values=tq_cache)
        tq_text = tokenizer.decode(tq_ids[0][prompt_len:], skip_special_tokens=True)
        cleanup_model()
        # First character index where the decodings differ; defaults to the
        # full common length when one text is a prefix of the other.
        diverge = next(
            (i for i, (a, b) in enumerate(zip(baseline_text, tq_text)) if a != b),
            min(len(baseline_text), len(tq_text)),
        )
        base_tokens = tokenizer.encode(baseline_text)
        tq_tokens = tokenizer.encode(tq_text)
        same = sum(a == b for a, b in zip(base_tokens, tq_tokens))
        denom = max(len(base_tokens), len(tq_tokens))
        reports.append({
            "prompt": prompt,
            "exact_match": baseline_text == tq_text,
            "diverge_at_char": diverge,
            "total_chars": len(baseline_text),
            "token_match_pct": round(100 * same / denom, 1) if denom > 0 else 100,
            "default_output": baseline_text[:200],
            "turboquant_output": tq_text[:200],
            "both_coherent": True,  # Manual check flag
        })
    return reports
def test_memory_savings(model, tokenizer, skip_layers, arch_info):
    """Measure peak GPU memory at each CONTEXT_LENGTHS entry, default vs TurboQuant.

    *arch_info* is accepted for interface compatibility but not read here.
    """
    # Tokens per PASSAGE repetition is loop-invariant; compute it once.
    passage_tokens = len(tokenizer.encode(PASSAGE))
    rows = []
    for target_ctx in CONTEXT_LENGTHS:
        repeats = target_ctx // passage_tokens + 1
        long_prompt = PASSAGE * repeats + "\n\nSummarize the above in 2 sentences."
        enc = tokenizer(long_prompt, return_tensors="pt", truncation=True,
                        max_length=target_ctx).to(model.device)
        ctx_len = enc.input_ids.shape[1]
        # --- Default cache ---
        cleanup_model()
        torch.cuda.reset_peak_memory_stats()
        with torch.no_grad():
            ids_default = model.generate(**enc, max_new_tokens=30, do_sample=False)
        peak_default = torch.cuda.max_memory_allocated()
        text_default = tokenizer.decode(ids_default[0][ctx_len:], skip_special_tokens=True)
        cleanup_model()
        # --- TurboQuant cache ---
        tq_cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                   device="cuda", skip_layers=skip_layers)
        torch.cuda.reset_peak_memory_stats()
        with torch.no_grad():
            ids_tq = model.generate(**enc, max_new_tokens=30, do_sample=False,
                                    past_key_values=tq_cache)
        peak_tq = torch.cuda.max_memory_allocated()
        text_tq = tokenizer.decode(ids_tq[0][ctx_len:], skip_special_tokens=True)
        cleanup_model()
        rows.append({
            "context_length": ctx_len,
            "peak_default_gb": round(peak_default / 1024**3, 2),
            "peak_turboquant_gb": round(peak_tq / 1024**3, 2),
            "saved_mb": round((peak_default - peak_tq) / 1024**2, 0),
            "output_match": text_default[:100] == text_tq[:100],
        })
    return rows
def test_prefill_logits(model, tokenizer, skip_layers):
    """Compare last-position prefill logits with and without TurboQuantCache.

    These should be near-identical since the first cache call returns the
    original keys/values.

    Returns a dict with max/mean absolute logit difference, whether the
    argmax token agrees, and the decoded baseline top-1 token.
    """
    prompt = "The meaning of life is"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out_d = model(inputs.input_ids, use_cache=True)
        logits_d = out_d.logits[0, -1].float()
    cleanup_model()
    cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                            device="cuda", skip_layers=skip_layers)
    # BUG FIX: this forward pass previously ran OUTSIDE torch.no_grad()
    # (unlike the baseline above), so it built an autograd graph — wasting
    # GPU memory and making the two passes asymmetric.
    with torch.no_grad():
        out_t = model(inputs.input_ids, use_cache=True, past_key_values=cache)
        logits_t = out_t.logits[0, -1].float()
    cleanup_model()
    diff = (logits_d - logits_t).abs()
    top1_d = logits_d.argmax().item()
    top1_t = logits_t.argmax().item()
    return {
        "max_logit_diff": round(diff.max().item(), 6),
        "mean_logit_diff": round(diff.mean().item(), 6),
        "same_top1": top1_d == top1_t,
        "top1_token": tokenizer.decode([top1_d]),
    }
def benchmark_model(model_name, model_id, approx_size):
    """Run the full benchmark suite for one model.

    Parameters:
        model_name: short display name used in logs and the summary table.
        model_id: Hugging Face repo id to download and load.
        approx_size: rough 4-bit on-disk size in GB, used for the disk check.

    Returns:
        A result dict whose "status" is "success" or "error", or None when
        there is not enough free disk space to attempt the download.
    """
    print(f"\n{'='*70}")
    print(f" BENCHMARKING: {model_name} ({model_id})")
    print(f"{'='*70}")
    # Check disk space before a multi-GB download (+10GB headroom).
    import shutil
    free_gb = shutil.disk_usage("/").free / 1024**3
    if free_gb < approx_size + 10:
        print(f" SKIP: Only {free_gb:.0f}GB free, need ~{approx_size+10}GB")
        return None
    result = {"model_name": model_name, "model_id": model_id}
    # Pre-bind so the finally block never hits an unbound name. (The
    # original used `del model, tokenizer` guarded by a bare `except: pass`,
    # which silently swallowed any cleanup error.)
    model = tokenizer = None
    try:
        # Load
        print(f" Loading model...")
        model, tokenizer = load_model(model_id)
        print(f" Loaded: {torch.cuda.memory_allocated()/1024**3:.1f} GB on GPU")
        # Architecture
        print(f" Analyzing architecture...")
        result["architecture"] = get_architecture_info(model, model.config)
        print(f" Layers={result['architecture']['num_layers']}, "
              f"KV heads={result['architecture']['num_kv_heads']}, "
              f"head_dim={result['architecture']['head_dim']}")
        # TurboQuant needs an even head_dim; bail out early otherwise.
        head_dim = result["architecture"]["head_dim"]
        if head_dim is None or head_dim % 2 != 0:
            print(f" SKIP: Unsupported head_dim={head_dim}")
            return result  # finally block handles GPU cleanup
        # Layer norms
        print(f" Analyzing layer norms...")
        result["layer_norms"] = analyze_layer_norms(model, tokenizer)
        skip = set(result["layer_norms"]["outlier_layers"])
        print(f" Median={result['layer_norms']['median_norm']}, "
              f"Max={result['layer_norms']['max_norm']} (layer {result['layer_norms']['max_norm_layer']}), "
              f"Ratio={result['layer_norms']['max_to_median_ratio']}x, "
              f"Skip layers={skip}")
        # Prefill logits
        print(f" Testing prefill logit fidelity...")
        result["prefill_logits"] = test_prefill_logits(model, tokenizer, skip)
        print(f" Max diff={result['prefill_logits']['max_logit_diff']}, "
              f"Same top-1={result['prefill_logits']['same_top1']}")
        # Output quality
        print(f" Testing output quality ({len(PROMPTS)} prompts)...")
        result["quality"] = test_output_quality(model, tokenizer, skip)
        for q in result["quality"]:
            print(f" '{q['prompt'][:40]}...' → diverge@{q['diverge_at_char']}, "
                  f"tokens={q['token_match_pct']}%")
        # Memory
        print(f" Testing memory savings...")
        result["memory"] = test_memory_savings(model, tokenizer, skip, result["architecture"])
        for m in result["memory"]:
            print(f" {m['context_length']}tok: "
                  f"{m['peak_default_gb']}GB → {m['peak_turboquant_gb']}GB "
                  f"(saved {m['saved_mb']}MB)")
        result["status"] = "success"
    except Exception as e:
        print(f" ERROR: {e}")
        result["status"] = "error"
        result["error"] = str(e)
    finally:
        # Drop the references, then release cached GPU memory.
        model = tokenizer = None
        cleanup_model()
        # NOTE(review): the original computed the HF hub cache directory here
        # ("clear HF cache for this model to save disk") but never deleted
        # anything — the assignment was dead code. Implement eviction
        # explicitly if disk space becomes a problem.
        print(f" Cleaned up GPU memory")
    return result
def main():
    """Benchmark every entry in MODELS, persisting results after each one,
    then print a summary table of all accumulated results."""
    all_results = []
    tested = set()
    # Resume support: reload prior results and skip models that succeeded.
    if Path(RESULTS_FILE).exists():
        with open(RESULTS_FILE) as f:
            all_results = json.load(f)
        tested = {r["model_id"] for r in all_results if r.get("status") == "success"}
    for model_name, model_id, approx_size in MODELS:
        if model_id in tested:
            print(f"\n SKIP {model_name}: already tested")
            continue
        result = benchmark_model(model_name, model_id, approx_size)
        if result:
            # Drop any stale (failed) entry for this model before appending.
            all_results = [r for r in all_results if r.get("model_id") != model_id]
            all_results.append(result)
        # Persist after every model so a crash loses at most one run.
        with open(RESULTS_FILE, "w") as f:
            json.dump(all_results, f, indent=2, default=str)
        print(f" Results saved to {RESULTS_FILE}")
    # Summary table
    print(f"\n{'='*90}")
    print(f" SUMMARY: TurboQuant Benchmark Results")
    print(f"{'='*90}")
    print(f"{'Model':<20} {'Layers':>6} {'KV/Hd':>6} {'HeadDim':>7} "
          f"{'Outliers':>8} {'Prefill':>8} {'Quality':>8} {'Saved@8K':>10}")
    print("-" * 90)
    for entry in all_results:
        if entry.get("status") != "success":
            print(f"{entry['model_name']:<20} {'ERROR':>6}")
            continue
        arch = entry["architecture"]
        norms = entry["layer_norms"]
        prefill = entry["prefill_logits"]
        quality = entry["quality"]
        mem = entry.get("memory", [])
        avg_diverge = sum(q["diverge_at_char"] for q in quality) / len(quality) if quality else 0
        saved_8k = next((m["saved_mb"] for m in mem if m["context_length"] >= 8000), "N/A")
        if prefill["max_logit_diff"] == 0:
            prefill_str = "exact"
        else:
            prefill_str = f"{prefill['max_logit_diff']:.4f}"
        if saved_8k == "N/A":
            saved_str = "N/A"
        else:
            saved_str = f"{saved_8k}MB"
        print(f"{entry['model_name']:<20} {arch['num_layers']:>6} {arch['num_kv_heads']:>6} "
              f"{arch['head_dim']:>7} {len(norms['outlier_layers']):>8} "
              f"{prefill_str:>8} "
              f"{avg_diverge:>7.0f}ch {saved_str:>10}")
# Run the full benchmark suite when executed as a script.
if __name__ == "__main__":
    main()