| """ |
| Comprehensive TurboQuant benchmark across model families and sizes. |
| Tests: Qwen, Llama, Gemma, Phi, Mistral — 7B to 72B. |
| |
| For each model: |
| 1. Architecture analysis (layers, heads, KV heads, head_dim) |
| 2. Outlier layer detection (key norm distribution) |
| 3. Output quality (greedy decode comparison) |
| 4. Memory savings at multiple context lengths |
| 5. Prefill logit fidelity |
| """ |
|
|
# Make the local TurboQuant checkout importable before any project imports run.
import sys
sys.path.insert(0, "/home/azureuser/turboquant")
|
|
| import torch |
| import time |
| import json |
| import gc |
| import os |
| from pathlib import Path |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig |
| from turboquant.cache import TurboQuantCache |
|
|
# JSON file where per-model results are persisted (enables resume after a crash).
RESULTS_FILE = "/home/azureuser/turboquant/benchmark_results.json"


# (display name, HuggingFace model id, approximate weight size in GB).
# The size is used as a disk-space precheck before attempting a download.
MODELS = [

    ("Qwen2.5-7B", "Qwen/Qwen2.5-7B-Instruct", 5),
    ("Llama-3.1-8B", "meta-llama/Llama-3.1-8B-Instruct", 5),
    ("Gemma-2-9B", "google/gemma-2-9b-it", 6),
    ("Phi-4-14B", "microsoft/phi-4", 9),
    ("Qwen2.5-32B", "Qwen/Qwen2.5-32B-Instruct", 19),
    ("Llama-3.3-70B", "meta-llama/Llama-3.3-70B-Instruct", 38),
    ("Qwen2.5-72B", "Qwen/Qwen2.5-72B-Instruct", 40),
]


# Short prompts used for the greedy-decode output-quality comparison.
PROMPTS = [
    "Explain quantum computing in simple terms.",
    "Write a Python function to check if a number is prime.",
    "What causes the northern lights?",
]


# Context lengths (in tokens) at which memory savings are measured.
CONTEXT_LENGTHS = [1024, 4096, 8192]


# Filler passage repeated to build long prompts for the memory tests.
PASSAGE = (
    "The history of artificial intelligence began in antiquity, with myths, stories "
    "and rumors of artificial beings endowed with intelligence or consciousness by "
    "master craftsmen. The seeds of modern AI were planted by philosophers who attempted "
    "to describe the process of human thinking as the mechanical manipulation of symbols. "
    "This work culminated in the invention of the programmable digital computer in the 1940s, "
    "a machine based on the abstract essence of mathematical reasoning. "
)
|
|
|
|
def cleanup_model():
    """Reclaim GPU memory between tests.

    Runs Python GC first so dropped tensors are actually collectible, then
    returns cached allocator blocks to the driver and resets the peak-memory
    counter used by the memory benchmarks.
    """
    for release in (gc.collect,
                    torch.cuda.empty_cache,
                    torch.cuda.reset_peak_memory_stats):
        release()
|
|
|
|
def load_model(model_id):
    """Load *model_id* quantized to 4-bit NF4 via bitsandbytes.

    Returns a ``(model, tokenizer)`` pair with the model sharded across
    available devices (``device_map="auto"``) and bf16 compute.
    """
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        dtype=torch.bfloat16,
        quantization_config=quant_cfg,
    )
    return mdl, tok
|
|
|
|
def get_architecture_info(model, config):
    """Extract architecture details from a model config.

    Unwraps multimodal configs via ``get_text_config`` when available, derives
    ``head_dim`` from hidden_size / num_attention_heads when the config does
    not declare it, and records the current GPU allocation in GB.
    """
    # Multimodal configs nest the decoder config; plain configs are used as-is.
    tc = config
    if hasattr(config, "get_text_config"):
        tc = config.get_text_config(decoder=True)

    n_heads = getattr(tc, "num_attention_heads", None)
    hidden = getattr(tc, "hidden_size", None)

    head_dim = getattr(tc, "head_dim", None)
    if head_dim is None and hidden and n_heads:
        head_dim = hidden // n_heads

    return {
        "num_layers": getattr(tc, "num_hidden_layers", None),
        "hidden_size": hidden,
        "num_attention_heads": n_heads,
        # GQA models declare fewer KV heads; fall back to full MHA head count.
        "num_kv_heads": getattr(tc, "num_key_value_heads", n_heads),
        "head_dim": head_dim,
        "model_type": getattr(tc, "model_type", "unknown"),
        "max_position_embeddings": getattr(tc, "max_position_embeddings", None),
        "rope_theta": getattr(tc, "rope_theta", None),
        "torch_dtype": str(getattr(tc, "torch_dtype", "unknown")),
        "model_memory_gb": torch.cuda.memory_allocated() / 1024**3,
    }
|
|
|
|
def analyze_layer_norms(model, tokenizer):
    """Run a short calibration forward pass and summarize per-layer key norms.

    Layers whose mean key-vector norm exceeds 5x the median are flagged as
    outliers — candidates to exclude from quantization.
    """
    enc = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model(enc.input_ids, use_cache=True)
    cache = out.past_key_values

    # Mean L2 norm of the cached keys, per layer (0.0 for empty layers).
    norms = []
    for layer in cache.layers:
        keys = layer.keys
        if keys is not None and keys.numel() > 0:
            norms.append(round(keys.float().norm(dim=-1).mean().item(), 2))
        else:
            norms.append(0.0)

    median_norm = sorted(norms)[len(norms) // 2]
    max_norm = max(norms)
    outliers = [i for i, n in enumerate(norms) if n > 5.0 * median_norm]

    del out, cache
    cleanup_model()

    return {
        "median_norm": round(median_norm, 2),
        "max_norm": round(max_norm, 2),
        "max_norm_layer": norms.index(max_norm),
        "max_to_median_ratio": round(max_norm / median_norm, 2) if median_norm > 0 else 0,
        "outlier_layers": outliers,
        "all_norms_first5": norms[:5],
        "all_norms_last3": norms[-3:],
    }
|
|
|
|
def test_output_quality(model, tokenizer, skip_layers):
    """Greedy-decode each prompt with the default cache and with TurboQuant.

    Reports, per prompt: exact-match flag, first divergent character index,
    token-level agreement percentage, and 200-char output previews.
    """
    reports = []
    for prompt in PROMPTS:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        n_input = inputs.input_ids.shape[1]

        # Baseline: default KV cache.
        with torch.no_grad():
            out_d = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        text_d = tokenizer.decode(out_d[0][n_input:], skip_special_tokens=True)
        cleanup_model()

        # TurboQuant: 4-bit quantized cache with the detected outlier layers skipped.
        cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                device="cuda", skip_layers=skip_layers)
        with torch.no_grad():
            out_t = model.generate(**inputs, max_new_tokens=100, do_sample=False,
                                   past_key_values=cache)
        text_t = tokenizer.decode(out_t[0][n_input:], skip_special_tokens=True)
        cleanup_model()

        # First character index where the outputs disagree; when one output is
        # a prefix of the other, fall back to the shorter length.
        diverge = next(
            (i for i, (a, b) in enumerate(zip(text_d, text_t)) if a != b),
            min(len(text_d), len(text_t)),
        )

        # Token-level agreement over the aligned prefix.
        toks_d = tokenizer.encode(text_d)
        toks_t = tokenizer.encode(text_t)
        matching = sum(a == b for a, b in zip(toks_d, toks_t))
        total = max(len(toks_d), len(toks_t))

        reports.append({
            "prompt": prompt,
            "exact_match": text_d == text_t,
            "diverge_at_char": diverge,
            "total_chars": len(text_d),
            "token_match_pct": round(100 * matching / total, 1) if total > 0 else 100,
            "default_output": text_d[:200],
            "turboquant_output": text_t[:200],
            "both_coherent": True,
        })

    return reports
|
|
|
|
def test_memory_savings(model, tokenizer, skip_layers, arch_info):
    """Measure peak GPU memory with and without TurboQuant per context length.

    NOTE: *arch_info* is accepted for signature compatibility but is not used
    by the measurement itself.
    """
    rows = []

    for target_ctx in CONTEXT_LENGTHS:
        # Repeat the filler passage until the prompt exceeds the target length,
        # then let the tokenizer truncate it back down to exactly target_ctx.
        repeats = target_ctx // len(tokenizer.encode(PASSAGE)) + 1
        long_prompt = PASSAGE * repeats + "\n\nSummarize the above in 2 sentences."
        inputs = tokenizer(long_prompt, return_tensors="pt", truncation=True,
                           max_length=target_ctx).to(model.device)
        actual_len = inputs.input_ids.shape[1]

        # Baseline: default KV cache.
        cleanup_model()
        torch.cuda.reset_peak_memory_stats()
        with torch.no_grad():
            out_d = model.generate(**inputs, max_new_tokens=30, do_sample=False)
        peak_d = torch.cuda.max_memory_allocated()
        text_d = tokenizer.decode(out_d[0][actual_len:], skip_special_tokens=True)
        cleanup_model()

        # TurboQuant cache.
        cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                device="cuda", skip_layers=skip_layers)
        torch.cuda.reset_peak_memory_stats()
        with torch.no_grad():
            out_t = model.generate(**inputs, max_new_tokens=30, do_sample=False,
                                   past_key_values=cache)
        peak_t = torch.cuda.max_memory_allocated()
        text_t = tokenizer.decode(out_t[0][actual_len:], skip_special_tokens=True)
        cleanup_model()

        rows.append({
            "context_length": actual_len,
            "peak_default_gb": round(peak_d / 1024**3, 2),
            "peak_turboquant_gb": round(peak_t / 1024**3, 2),
            "saved_mb": round((peak_d - peak_t) / 1024**2, 0),
            # Compare only 100-char prefixes: a rough sanity check, not exact match.
            "output_match": text_d[:100] == text_t[:100],
        })

    return rows
|
|
|
|
def test_prefill_logits(model, tokenizer, skip_layers):
    """Compare prefill logits between the default cache and TurboQuantCache.

    On the first forward call TurboQuant returns the original (unquantized)
    keys/values, so last-token logits should be near-identical.

    Returns a dict with max/mean absolute logit difference, whether the
    argmax (top-1) token agrees, and the decoded top-1 token.
    """
    prompt = "The meaning of life is"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Baseline forward pass with the default cache.
    with torch.no_grad():
        out_d = model(inputs.input_ids, use_cache=True)
    logits_d = out_d.logits[0, -1].float()
    cleanup_model()

    cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                            device="cuda", skip_layers=skip_layers)
    # Fix: this pass previously ran OUTSIDE torch.no_grad(), unlike the
    # baseline, building an autograd graph that wasted memory and made the
    # two measurements inconsistent.
    with torch.no_grad():
        out_t = model(inputs.input_ids, use_cache=True, past_key_values=cache)
    logits_t = out_t.logits[0, -1].float()
    cleanup_model()

    diff = (logits_d - logits_t).abs()
    top1_d = logits_d.argmax().item()
    top1_t = logits_t.argmax().item()

    return {
        "max_logit_diff": round(diff.max().item(), 6),
        "mean_logit_diff": round(diff.mean().item(), 6),
        "same_top1": top1_d == top1_t,
        "top1_token": tokenizer.decode([top1_d]),
    }
|
|
|
|
def benchmark_model(model_name, model_id, approx_size):
    """Run the full benchmark suite for one model.

    Steps: disk-space precheck, load, architecture analysis, outlier-layer
    detection, prefill logit fidelity, output quality, memory savings.

    Returns None when skipped for lack of disk space; otherwise a result dict
    with ``status`` set to "success" or "error" (with the error message).
    GPU memory is always cleaned up, even on failure.
    """
    print(f"\n{'='*70}")
    print(f" BENCHMARKING: {model_name} ({model_id})")
    print(f"{'='*70}")

    # Disk-space precheck: weights plus ~10 GB of headroom must fit.
    import shutil
    free_gb = shutil.disk_usage("/").free / 1024**3
    if free_gb < approx_size + 10:
        print(f" SKIP: Only {free_gb:.0f}GB free, need ~{approx_size+10}GB")
        return None

    result = {"model_name": model_name, "model_id": model_id}

    try:
        print(" Loading model...")
        model, tokenizer = load_model(model_id)
        print(f" Loaded: {torch.cuda.memory_allocated()/1024**3:.1f} GB on GPU")

        print(" Analyzing architecture...")
        result["architecture"] = get_architecture_info(model, model.config)
        print(f" Layers={result['architecture']['num_layers']}, "
              f"KV heads={result['architecture']['num_kv_heads']}, "
              f"head_dim={result['architecture']['head_dim']}")

        # TurboQuant requires an even head_dim; bail out early otherwise.
        head_dim = result["architecture"]["head_dim"]
        if head_dim is None or head_dim % 2 != 0:
            print(f" SKIP: Unsupported head_dim={head_dim}")
            del model, tokenizer
            cleanup_model()
            return result

        print(" Analyzing layer norms...")
        result["layer_norms"] = analyze_layer_norms(model, tokenizer)
        skip = set(result["layer_norms"]["outlier_layers"])
        print(f" Median={result['layer_norms']['median_norm']}, "
              f"Max={result['layer_norms']['max_norm']} (layer {result['layer_norms']['max_norm_layer']}), "
              f"Ratio={result['layer_norms']['max_to_median_ratio']}x, "
              f"Skip layers={skip}")

        print(" Testing prefill logit fidelity...")
        result["prefill_logits"] = test_prefill_logits(model, tokenizer, skip)
        print(f" Max diff={result['prefill_logits']['max_logit_diff']}, "
              f"Same top-1={result['prefill_logits']['same_top1']}")

        print(f" Testing output quality ({len(PROMPTS)} prompts)...")
        result["quality"] = test_output_quality(model, tokenizer, skip)
        for q in result["quality"]:
            print(f" '{q['prompt'][:40]}...' → diverge@{q['diverge_at_char']}, "
                  f"tokens={q['token_match_pct']}%")

        print(" Testing memory savings...")
        result["memory"] = test_memory_savings(model, tokenizer, skip, result["architecture"])
        for m in result["memory"]:
            print(f" {m['context_length']}tok: "
                  f"{m['peak_default_gb']}GB → {m['peak_turboquant_gb']}GB "
                  f"(saved {m['saved_mb']}MB)")

        result["status"] = "success"

    except Exception as e:
        # Boundary handler: record the failure and move on to the next model.
        print(f" ERROR: {e}")
        result["status"] = "error"
        result["error"] = str(e)

    finally:
        # `model`/`tokenizer` are unbound if load_model itself failed; a bare
        # `except:` here previously swallowed everything (incl. KeyboardInterrupt).
        try:
            del model, tokenizer
        except NameError:
            pass
        cleanup_model()
        print(" Cleaned up GPU memory")

    return result
|
|
|
|
def main():
    """Benchmark every model in MODELS, resuming past successful runs.

    Results are persisted to RESULTS_FILE after each model so an interrupted
    run loses at most one model's work; finishes with a summary table.
    """
    all_results = []

    # Resume support: reload prior results; models with status "success" are
    # skipped, while errored/partial ones are re-run and replaced.
    if Path(RESULTS_FILE).exists():
        with open(RESULTS_FILE) as f:
            all_results = json.load(f)
        tested = {r["model_id"] for r in all_results if r.get("status") == "success"}
    else:
        tested = set()

    for model_name, model_id, approx_size in MODELS:
        if model_id in tested:
            print(f"\n SKIP {model_name}: already tested")
            continue

        # benchmark_model returns None when skipped (e.g. insufficient disk).
        result = benchmark_model(model_name, model_id, approx_size)
        if result:
            # Drop any stale entry for this model before appending the new one.
            all_results = [r for r in all_results if r.get("model_id") != model_id]
            all_results.append(result)

            # Persist immediately; default=str handles non-JSON types (e.g. sets).
            with open(RESULTS_FILE, "w") as f:
                json.dump(all_results, f, indent=2, default=str)
            print(f" Results saved to {RESULTS_FILE}")

    # Final summary table across all recorded results.
    print(f"\n{'='*90}")
    print(f" SUMMARY: TurboQuant Benchmark Results")
    print(f"{'='*90}")
    print(f"{'Model':<20} {'Layers':>6} {'KV/Hd':>6} {'HeadDim':>7} "
          f"{'Outliers':>8} {'Prefill':>8} {'Quality':>8} {'Saved@8K':>10}")
    print("-" * 90)

    for r in all_results:
        if r.get("status") != "success":
            print(f"{r['model_name']:<20} {'ERROR':>6}")
            continue

        arch = r["architecture"]
        norms = r["layer_norms"]
        prefill = r["prefill_logits"]
        quality = r["quality"]
        mem = r.get("memory", [])

        # Average character index at which outputs diverge (higher = better).
        avg_diverge = sum(q["diverge_at_char"] for q in quality) / len(quality) if quality else 0
        # Memory saved at the first context length >= 8000 tokens, if measured.
        saved_8k = next((m["saved_mb"] for m in mem if m["context_length"] >= 8000), "N/A")

        prefill_str = "exact" if prefill["max_logit_diff"] == 0 else f"{prefill['max_logit_diff']:.4f}"
        saved_str = "N/A" if saved_8k == "N/A" else f"{saved_8k}MB"
        print(f"{r['model_name']:<20} {arch['num_layers']:>6} {arch['num_kv_heads']:>6} "
              f"{arch['head_dim']:>7} {len(norms['outlier_layers']):>8} "
              f"{prefill_str:>8} "
              f"{avg_diverge:>7.0f}ch {saved_str:>10}")
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|
|