"""
Comprehensive TurboQuant benchmark across model families and sizes.
Tests Qwen, Llama, Gemma, and Phi models from 7B to 72B.

For each model:
1. Architecture analysis (layers, heads, KV heads, head_dim)
2. Outlier layer detection (key norm distribution)
3. Output quality (greedy decode comparison)
4. Memory savings at multiple context lengths
5. Prefill logit fidelity
"""

import sys
sys.path.insert(0, "/home/azureuser/turboquant")

import torch
import time
import json
import gc
import os
import shutil
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from turboquant.cache import TurboQuantCache

RESULTS_FILE = "/home/azureuser/turboquant/benchmark_results.json"

MODELS = [
    # (name, hf_id, approx_4bit_size_gb)
    ("Qwen2.5-7B", "Qwen/Qwen2.5-7B-Instruct", 5),
    ("Llama-3.1-8B", "meta-llama/Llama-3.1-8B-Instruct", 5),
    ("Gemma-2-9B", "google/gemma-2-9b-it", 6),
    ("Phi-4-14B", "microsoft/phi-4", 9),
    ("Qwen2.5-32B", "Qwen/Qwen2.5-32B-Instruct", 19),
    ("Llama-3.3-70B", "meta-llama/Llama-3.3-70B-Instruct", 38),
    ("Qwen2.5-72B", "Qwen/Qwen2.5-72B-Instruct", 40),
]
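# approx_4bit_size_gb is only used for the free-disk-space check before each download.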

PROMPTS = [
    "Explain quantum computing in simple terms.",
    "Write a Python function to check if a number is prime.",
    "What causes the northern lights?",
]

CONTEXT_LENGTHS = [1024, 4096, 8192]

PASSAGE = (
    "The history of artificial intelligence began in antiquity, with myths, stories "
    "and rumors of artificial beings endowed with intelligence or consciousness by "
    "master craftsmen. The seeds of modern AI were planted by philosophers who attempted "
    "to describe the process of human thinking as the mechanical manipulation of symbols. "
    "This work culminated in the invention of the programmable digital computer in the 1940s, "
    "a machine based on the abstract essence of mathematical reasoning. "
)
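# Repeated as many times as needed to build prompts at each target context length.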


def cleanup_model():
    """Free GPU memory between model tests."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


def load_model(model_id):
    """Load model in 4-bit with bitsandbytes."""
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
        ),
    )
    return model, tokenizer


def get_architecture_info(model, config):
    """Extract architecture details."""
    tc = config.get_text_config(decoder=True) if hasattr(config, "get_text_config") else config
    info = {
        "num_layers": getattr(tc, "num_hidden_layers", None),
        "hidden_size": getattr(tc, "hidden_size", None),
        "num_attention_heads": getattr(tc, "num_attention_heads", None),
        "num_kv_heads": getattr(tc, "num_key_value_heads", getattr(tc, "num_attention_heads", None)),
        "head_dim": None,
        "model_type": getattr(tc, "model_type", "unknown"),
        "max_position_embeddings": getattr(tc, "max_position_embeddings", None),
        "rope_theta": getattr(tc, "rope_theta", None),
        "torch_dtype": str(getattr(tc, "torch_dtype", "unknown")),
    }
    # Some models (Gemma-2) have explicit head_dim different from hidden_size/num_heads
    info["head_dim"] = getattr(tc, "head_dim", None)
    if info["head_dim"] is None and info["hidden_size"] and info["num_attention_heads"]:
        info["head_dim"] = info["hidden_size"] // info["num_attention_heads"]
    info["model_memory_gb"] = torch.cuda.memory_allocated() / 1024**3
    return info


def analyze_layer_norms(model, tokenizer):
    """Run calibration to find outlier layer norms."""
    inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model(inputs.input_ids, use_cache=True)

    cache = out.past_key_values
    norms = []
    for i in range(len(cache.layers)):
        k = cache.layers[i].keys
        if k is not None and k.numel() > 0:
            norms.append(round(k.float().norm(dim=-1).mean().item(), 2))
        else:
            norms.append(0.0)

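    # Rough median of the per-layer mean key norms; any layer whose norm exceeds
    # 5x the median is flagged as an outlier and later excluded from quantization
    # via skip_layers.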
    median_norm = sorted(norms)[len(norms) // 2]
    outlier_layers = [i for i, n in enumerate(norms) if n > 5.0 * median_norm]
    max_norm = max(norms)
    max_layer = norms.index(max_norm)

    del out, cache
    cleanup_model()

    return {
        "median_norm": round(median_norm, 2),
        "max_norm": round(max_norm, 2),
        "max_norm_layer": max_layer,
        "max_to_median_ratio": round(max_norm / median_norm, 2) if median_norm > 0 else 0,
        "outlier_layers": outlier_layers,
        "all_norms_first5": norms[:5],
        "all_norms_last3": norms[-3:],
    }


def test_output_quality(model, tokenizer, skip_layers):
    """Compare outputs on test prompts."""
    results = []
    for prompt in PROMPTS:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        n_input = inputs.input_ids.shape[1]

        with torch.no_grad():
            out_d = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        text_d = tokenizer.decode(out_d[0][n_input:], skip_special_tokens=True)
        cleanup_model()

        cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                device="cuda", skip_layers=skip_layers)
        with torch.no_grad():
            out_t = model.generate(**inputs, max_new_tokens=100, do_sample=False,
                                   past_key_values=cache)
        text_t = tokenizer.decode(out_t[0][n_input:], skip_special_tokens=True)
        cleanup_model()

        # Find divergence
        diverge = min(len(text_d), len(text_t))
        for i, (a, b) in enumerate(zip(text_d, text_t)):
            if a != b:
                diverge = i
                break

        # Token-level match
        toks_d = tokenizer.encode(text_d)
        toks_t = tokenizer.encode(text_t)
        matching = sum(a == b for a, b in zip(toks_d, toks_t))
        total = max(len(toks_d), len(toks_t))

        results.append({
            "prompt": prompt,
            "exact_match": text_d == text_t,
            "diverge_at_char": diverge,
            "total_chars": len(text_d),
            "token_match_pct": round(100 * matching / total, 1) if total > 0 else 100,
            "default_output": text_d[:200],
            "turboquant_output": text_t[:200],
            "both_coherent": True,  # Manual check flag
        })

    return results


def test_memory_savings(model, tokenizer, skip_layers, arch_info):
    """Measure memory at different context lengths."""
    results = []

    for target_ctx in CONTEXT_LENGTHS:
        n_repeats = target_ctx // len(tokenizer.encode(PASSAGE)) + 1
        long_prompt = PASSAGE * n_repeats + "\n\nSummarize the above in 2 sentences."
        inputs = tokenizer(long_prompt, return_tensors="pt", truncation=True,
                           max_length=target_ctx).to(model.device)
        actual_len = inputs.input_ids.shape[1]

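        # Peak allocated GPU memory is measured separately for the default cache and
        # the TurboQuant cache on the same prompt; the difference approximates the
        # KV-cache saving.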
        # Default
        cleanup_model()
        torch.cuda.reset_peak_memory_stats()
        with torch.no_grad():
            out_d = model.generate(**inputs, max_new_tokens=30, do_sample=False)
        peak_d = torch.cuda.max_memory_allocated()
        text_d = tokenizer.decode(out_d[0][actual_len:], skip_special_tokens=True)
        cleanup_model()

        # TurboQuant
        cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                device="cuda", skip_layers=skip_layers)
        torch.cuda.reset_peak_memory_stats()
        with torch.no_grad():
            out_t = model.generate(**inputs, max_new_tokens=30, do_sample=False,
                                   past_key_values=cache)
        peak_t = torch.cuda.max_memory_allocated()
        text_t = tokenizer.decode(out_t[0][actual_len:], skip_special_tokens=True)
        cleanup_model()

        saved_mb = (peak_d - peak_t) / 1024**2

        results.append({
            "context_length": actual_len,
            "peak_default_gb": round(peak_d / 1024**3, 2),
            "peak_turboquant_gb": round(peak_t / 1024**3, 2),
            "saved_mb": round(saved_mb, 0),
            "output_match": text_d[:100] == text_t[:100],
        })

    return results


def test_prefill_logits(model, tokenizer, skip_layers):
    """Compare prefill logits (should be near-identical since first call returns originals)."""
    prompt = "The meaning of life is"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out_d = model(inputs.input_ids, use_cache=True)
        logits_d = out_d.logits[0, -1].float()
        cleanup_model()

        cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                device="cuda", skip_layers=skip_layers)
        out_t = model(inputs.input_ids, use_cache=True, past_key_values=cache)
        logits_t = out_t.logits[0, -1].float()
        cleanup_model()

    diff = (logits_d - logits_t).abs()
    top1_d = logits_d.argmax().item()
    top1_t = logits_t.argmax().item()

    return {
        "max_logit_diff": round(diff.max().item(), 6),
        "mean_logit_diff": round(diff.mean().item(), 6),
        "same_top1": top1_d == top1_t,
        "top1_token": tokenizer.decode([top1_d]),
    }


def benchmark_model(model_name, model_id, approx_size):
    """Run full benchmark for one model."""
    print(f"\n{'='*70}")
    print(f"  BENCHMARKING: {model_name} ({model_id})")
    print(f"{'='*70}")

    # Check free disk space before downloading the model
    free_gb = shutil.disk_usage("/").free / 1024**3
    if free_gb < approx_size + 10:
        print(f"  SKIP: Only {free_gb:.0f}GB free, need ~{approx_size+10}GB")
        return None

    result = {"model_name": model_name, "model_id": model_id}

    try:
        # Load
        print(f"  Loading model...")
        model, tokenizer = load_model(model_id)
        print(f"  Loaded: {torch.cuda.memory_allocated()/1024**3:.1f} GB on GPU")

        # Architecture
        print(f"  Analyzing architecture...")
        result["architecture"] = get_architecture_info(model, model.config)
        print(f"    Layers={result['architecture']['num_layers']}, "
              f"KV heads={result['architecture']['num_kv_heads']}, "
              f"head_dim={result['architecture']['head_dim']}")

        # Check head_dim compatibility
        head_dim = result["architecture"]["head_dim"]
        if head_dim is None or head_dim % 2 != 0:
            print(f"  SKIP: Unsupported head_dim={head_dim}")
            del model, tokenizer
            cleanup_model()
            return result

        # Layer norms
        print(f"  Analyzing layer norms...")
        result["layer_norms"] = analyze_layer_norms(model, tokenizer)
        skip = set(result["layer_norms"]["outlier_layers"])
        print(f"    Median={result['layer_norms']['median_norm']}, "
              f"Max={result['layer_norms']['max_norm']} (layer {result['layer_norms']['max_norm_layer']}), "
              f"Ratio={result['layer_norms']['max_to_median_ratio']}x, "
              f"Skip layers={skip}")

        # Prefill logits
        print(f"  Testing prefill logit fidelity...")
        result["prefill_logits"] = test_prefill_logits(model, tokenizer, skip)
        print(f"    Max diff={result['prefill_logits']['max_logit_diff']}, "
              f"Same top-1={result['prefill_logits']['same_top1']}")

        # Output quality
        print(f"  Testing output quality ({len(PROMPTS)} prompts)...")
        result["quality"] = test_output_quality(model, tokenizer, skip)
        for q in result["quality"]:
            print(f"    '{q['prompt'][:40]}...' → diverge@{q['diverge_at_char']}, "
                  f"tokens={q['token_match_pct']}%")

        # Memory
        print(f"  Testing memory savings...")
        result["memory"] = test_memory_savings(model, tokenizer, skip, result["architecture"])
        for m in result["memory"]:
            print(f"    {m['context_length']}tok: "
                  f"{m['peak_default_gb']}GB → {m['peak_turboquant_gb']}GB "
                  f"(saved {m['saved_mb']}MB)")

        result["status"] = "success"

    except Exception as e:
        print(f"  ERROR: {e}")
        result["status"] = "error"
        result["error"] = str(e)

    finally:
        # Cleanup
        try:
            del model, tokenizer
        except NameError:
            pass
        cleanup_model()
        # Clear the HF hub cache for this model to free disk space for the next one
        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
        model_cache = os.path.join(cache_dir, "models--" + model_id.replace("/", "--"))
        shutil.rmtree(model_cache, ignore_errors=True)
        print(f"  Cleaned up GPU memory and cleared the model's hub cache")

    return result


def main():
    all_results = []

    # Load existing results if any
    if Path(RESULTS_FILE).exists():
        with open(RESULTS_FILE) as f:
            all_results = json.load(f)
        tested = {r["model_id"] for r in all_results if r.get("status") == "success"}
    else:
        tested = set()

    for model_name, model_id, approx_size in MODELS:
        if model_id in tested:
            print(f"\n  SKIP {model_name}: already tested")
            continue

        result = benchmark_model(model_name, model_id, approx_size)
        if result:
            # Remove any previous failed result for this model
            all_results = [r for r in all_results if r.get("model_id") != model_id]
            all_results.append(result)

            # Save after each model
            with open(RESULTS_FILE, "w") as f:
                json.dump(all_results, f, indent=2, default=str)
            print(f"  Results saved to {RESULTS_FILE}")

    # Print summary table
    print(f"\n{'='*90}")
    print(f"  SUMMARY: TurboQuant Benchmark Results")
    print(f"{'='*90}")
    print(f"{'Model':<20} {'Layers':>6} {'KV/Hd':>6} {'HeadDim':>7} "
          f"{'Outliers':>8} {'Prefill':>8} {'Quality':>8} {'Saved@8K':>10}")
    print("-" * 90)

    for r in all_results:
        if r.get("status") != "success":
            print(f"{r['model_name']:<20} {'ERROR':>6}")
            continue

        arch = r["architecture"]
        norms = r["layer_norms"]
        prefill = r["prefill_logits"]
        quality = r["quality"]
        mem = r.get("memory", [])

        avg_diverge = sum(q["diverge_at_char"] for q in quality) / len(quality) if quality else 0
        saved_8k = next((m["saved_mb"] for m in mem if m["context_length"] >= 8000), "N/A")

        prefill_str = "exact" if prefill["max_logit_diff"] == 0 else f"{prefill['max_logit_diff']:.4f}"
        saved_str = "N/A" if saved_8k == "N/A" else f"{saved_8k}MB"
        print(f"{r['model_name']:<20} {arch['num_layers']:>6} {arch['num_kv_heads']:>6} "
              f"{arch['head_dim']:>7} {len(norms['outlier_layers']):>8} "
              f"{prefill_str:>8} "
              f"{avg_diverge:>7.0f}ch {saved_str:>10}")


if __name__ == "__main__":
    main()
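
# Usage note (hypothetical filename): running `python benchmark_turboquant.py` works
# through MODELS in order, appends each result to benchmark_results.json, and skips
# models that already completed successfully on a previous run.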