#!/usr/bin/env python3
"""
Real Model Benchmark: FireEcho vs HuggingFace
==============================================
Loads Qwen2-0.5B into both HuggingFace and FireEcho, validates correctness,
then benchmarks generation tok/s, TTFT, and VRAM across prompt lengths.
Usage:
python benchmark_real_model.py
python benchmark_real_model.py --model Qwen/Qwen2-0.5B --prompt-lengths 128 512 2048
"""
import argparse
import gc
import sys
import time
from typing import Dict, List, Tuple

import torch
import torch.nn.functional as F

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _cuda_sync():
    if torch.cuda.is_available():
        torch.cuda.synchronize()


def _peak_vram_mb() -> float:
    if torch.cuda.is_available():
        return torch.cuda.max_memory_allocated() / (1024 ** 2)
    return 0.0


def _reset_peak_vram():
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()


def _timed_cuda(fn, warmup: int = 2, repeats: int = 5) -> float:
    """Run *fn* several times, return the median wall-time in seconds.

    Uses CUDA events on GPU; falls back to time.perf_counter() on CPU.
    """
    for _ in range(warmup):
        fn()
    _cuda_sync()
    times = []
    for _ in range(repeats):
        if torch.cuda.is_available():
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            fn()
            end.record()
            _cuda_sync()
            times.append(start.elapsed_time(end) / 1000.0)  # ms -> s
        else:
            t0 = time.perf_counter()
            fn()
            times.append(time.perf_counter() - t0)
    times.sort()
    return times[len(times) // 2]  # median
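# Timing notes: CUDA kernel launches are asynchronous, so wall-clock timing
# without synchronization undercounts GPU work. _timed_cuda brackets each run
# with CUDA events (or perf_counter on CPU), discards warmup iterations that
# absorb one-off costs (kernel compilation, allocator growth), and reports
# the median to damp scheduler jitter.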
# ---------------------------------------------------------------------------
# 1. Load models
# ---------------------------------------------------------------------------
def load_hf_model(model_name: str, dtype=torch.bfloat16, device='cuda'):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    print(f"\n[HF] Loading {model_name} ...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=dtype, device_map=device,
        trust_remote_code=True, attn_implementation="sdpa",
    )
    model.eval()
    params_m = sum(p.numel() for p in model.parameters()) / 1e6
    print(f"[HF] {params_m:.1f}M params, dtype={dtype}, device={device}")
    return model, tokenizer
def load_fireecho(model_name: str, dtype=torch.bfloat16, device='cuda',
                  use_goliath: bool = False, goliath_bits: int = 4):
    sys.path.insert(0, '.')
    from fireecho_kernel import FireEchoEngine, FireEchoConfig
    if use_goliath:
        # Read the base HF config, then build a quantised FireEcho config
        from transformers import AutoConfig
        hf_cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        config = FireEchoConfig(
            dim=hf_cfg.hidden_size,
            num_heads=hf_cfg.num_attention_heads,
            num_kv_heads=getattr(hf_cfg, 'num_key_value_heads',
                                 hf_cfg.num_attention_heads),
            num_layers=hf_cfg.num_hidden_layers,
            vocab_size=hf_cfg.vocab_size,
            intermediate_size=hf_cfg.intermediate_size,
            max_seq_len=min(getattr(hf_cfg, 'max_position_embeddings', 4096),
                            32768),
            rope_theta=getattr(hf_cfg, 'rope_theta', 10000.0),
            attn_bias=getattr(hf_cfg, 'attention_bias', False),
            tie_word_embeddings=getattr(hf_cfg, 'tie_word_embeddings', False),
            use_nvfp4=True,
            use_goliath=True,
            goliath_bits=goliath_bits,
            use_hebbian=False,
        )
        tag = f"FP{goliath_bits}"
        print(f"\n[FE-{tag}] Loading {model_name} (Goliath {tag}) ...")
        engine = FireEchoEngine.from_pretrained(model_name, config=config,
                                                dtype=dtype, device=device)
    else:
        tag = "BF16"
        print(f"\n[FE-{tag}] Loading {model_name} ...")
        engine = FireEchoEngine.from_pretrained(model_name, dtype=dtype,
                                                device=device)
    engine.eval()
    params_m = sum(p.numel() for p in engine.parameters()) / 1e6
    print(f"[FE-{tag}] {params_m:.1f}M params")
    return engine
# ---------------------------------------------------------------------------
# 2. Correctness validation
# ---------------------------------------------------------------------------
def validate_correctness(hf_model, fe_engine, tokenizer, device='cuda',
                         seq_len: int = 128) -> Dict:
    """Compare HF vs FireEcho logits on the same input."""
    prompt = "The quick brown fox jumps over the lazy dog. " * 20
    input_ids = tokenizer(prompt, return_tensors='pt',
                          max_length=seq_len, truncation=True).input_ids.to(device)
    actual_len = input_ids.shape[1]
    with torch.no_grad():
        hf_logits = hf_model(input_ids).logits  # [1, S, V]
        fe_logits = fe_engine(input_ids)        # [1, S, V]
    # Top-1 match rate
    hf_top1 = hf_logits.argmax(dim=-1)  # [1, S]
    fe_top1 = fe_logits.argmax(dim=-1)
    match_rate = (hf_top1 == fe_top1).float().mean().item()
    # Numerical distance
    max_abs_diff = (hf_logits - fe_logits).abs().max().item()
    cos_sim = F.cosine_similarity(
        hf_logits.view(-1, hf_logits.shape[-1]).float(),
        fe_logits.view(-1, fe_logits.shape[-1]).float(),
        dim=-1,
    ).mean().item()
    return {
        'seq_len': actual_len,
        'top1_match': match_rate,
        'max_abs_diff': max_abs_diff,
        'cosine_sim': cos_sim,
    }
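# validate_correctness keeps both models resident at once, which is fine for
# small models; run_benchmark below performs an equivalent check against
# CPU-cached reference logits instead, so the HF model can be freed before
# each FireEcho config is loaded.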
# ---------------------------------------------------------------------------
# 3. Benchmark helpers
# ---------------------------------------------------------------------------
@torch.no_grad()
def bench_prefill(model, input_ids, is_hf: bool) -> Tuple[float, float]:
    """Measure TTFT (time-to-first-token) and peak VRAM for prefill."""
    _reset_peak_vram()
    _cuda_sync()

    def _run():
        # HF and FireEcho share the same forward-call signature, so `is_hf`
        # is kept only for symmetry with bench_decode.
        model(input_ids)

    ttft = _timed_cuda(_run, warmup=2, repeats=5)
    vram = _peak_vram_mb()
    return ttft, vram
@torch.no_grad()
def bench_decode(model, input_ids, max_new_tokens: int, is_hf: bool,
                 tokenizer=None) -> Tuple[float, float]:
    """Measure decode tok/s and peak VRAM."""
    _reset_peak_vram()
    _cuda_sync()

    def _run():
        if is_hf:
            model.generate(input_ids, max_new_tokens=max_new_tokens,
                           do_sample=False, use_cache=True)
        else:
            model.generate(input_ids, max_new_tokens=max_new_tokens,
                           temperature=0.0, top_k=1, use_cache=True)

    elapsed = _timed_cuda(_run, warmup=1, repeats=3)
    tok_per_s = max_new_tokens / elapsed
    vram = _peak_vram_mb()
    return tok_per_s, vram
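# Measurement note: tok/s above assumes the full `max_new_tokens` were
# generated. Greedy HF generate() can stop early at EOS, which would inflate
# the reported rate; passing min_new_tokens=max_new_tokens is one way to force
# full-length generation if exact counts matter.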
def make_input(tokenizer, seq_len: int, device='cuda') -> torch.Tensor:
    """Create an input_ids tensor of the desired length."""
    # Repeat a seed passage until we reach the desired token length
    seed = ("The quick brown fox jumps over the lazy dog. "
            "In a distant land, ancient scholars studied the stars. ")
    text = seed * ((seq_len // 20) + 1)
    ids = tokenizer(text, return_tensors='pt',
                    max_length=seq_len, truncation=True).input_ids.to(device)
    return ids
# ---------------------------------------------------------------------------
# 4. Run full benchmark
# ---------------------------------------------------------------------------
def _free_model(model):
    """Move model to CPU and free GPU memory."""
    if model is not None:
        model.cpu()
        del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
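# Note: `del model` only drops the local reference; the caller must also
# release its own reference (e.g. set it to None, as run_benchmark does)
# before empty_cache() can actually return the weights' VRAM.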
def run_benchmark(model_name: str, prompt_lengths: List[int],
                  max_new_tokens: int, device: str, dtype):
    results = []
    # Configs to benchmark (loaded and freed one at a time to avoid OOM)
    config_specs = [
        ('HF-BF16', True, {}),
        ('FE-BF16', False, {}),
        ('FE-FP4', False, {'use_goliath': True, 'goliath_bits': 4}),
        ('FE-FP8', False, {'use_goliath': True, 'goliath_bits': 8}),
    ]

    # --- Load HF for tokenizer + reference logits ---
    hf_model, tokenizer = load_hf_model(model_name, dtype=dtype, device=device)

    # Generate reference logits for correctness validation (then free HF)
    print("\n" + "=" * 70)
    print("CORRECTNESS VALIDATION (vs HuggingFace)")
    print("=" * 70)
    ref_prompt = "The quick brown fox jumps over the lazy dog. " * 20
    ref_ids = tokenizer(ref_prompt, return_tensors='pt',
                        max_length=128, truncation=True).input_ids.to(device)
    with torch.no_grad():
        ref_logits = hf_model(ref_ids).logits.cpu()  # keep reference on CPU
    ref_top1 = ref_logits.argmax(dim=-1)
    _free_model(hf_model)
    hf_model = None

    # Validate each FE config against the saved reference
    for name, is_hf, fe_kwargs in config_specs:
        if is_hf:
            continue
        try:
            fe_model = load_fireecho(model_name, dtype=dtype, device=device, **fe_kwargs)
            with torch.no_grad():
                fe_logits = fe_model(ref_ids).cpu()
            fe_top1 = fe_logits.argmax(dim=-1)
            match_rate = (ref_top1 == fe_top1).float().mean().item()
            cos_sim = F.cosine_similarity(
                ref_logits.view(-1, ref_logits.shape[-1]).float(),
                fe_logits.view(-1, fe_logits.shape[-1]).float(),
                dim=-1).mean().item()
            max_diff = (ref_logits - fe_logits).abs().max().item()
            status = "PASS" if match_rate > 0.90 else "FAIL"
            print(f" {name}: top1={match_rate:.3f} "
                  f"cos_sim={cos_sim:.5f} "
                  f"max_diff={max_diff:.4f} [{status}]")
            _free_model(fe_model)
            fe_model = None  # drop our reference so VRAM is actually reclaimed
        except Exception as e:
            print(f" {name}: ERROR - {e}")
    del ref_logits, ref_top1
    gc.collect()

    # --- Benchmark (one config at a time to avoid OOM on large models) ---
    print("\n" + "=" * 70)
    print(f"INFERENCE BENCHMARK (decode {max_new_tokens} tokens)")
    print("=" * 70)
    printed_headers = set()
    for name, is_hf, fe_kwargs in config_specs:
        try:
            if is_hf:
                model, _ = load_hf_model(model_name, dtype=dtype, device=device)
            else:
                model = load_fireecho(model_name, dtype=dtype, device=device, **fe_kwargs)
        except Exception as e:
            print(f"\n[WARN] {name} load failed: {e}")
            continue
        for seq_len in prompt_lengths:
            input_ids = make_input(tokenizer, seq_len, device)
            actual_len = input_ids.shape[1]
            if actual_len not in printed_headers:
                print(f"\n--- Prompt length: {actual_len} tokens ---")
                print(f"{'Config':<12} {'TTFT(ms)':>10} {'Tok/s':>10} "
                      f"{'Prefill MB':>12} {'Decode MB':>12}")
                print("-" * 60)
                printed_headers.add(actual_len)
            try:
                if not is_hf and hasattr(model, 'reset_cache'):
                    model.reset_cache()
                ttft, pre_vram = bench_prefill(model, input_ids, is_hf)
                if not is_hf and hasattr(model, 'reset_cache'):
                    model.reset_cache()
                tok_s, dec_vram = bench_decode(
                    model, input_ids, max_new_tokens, is_hf, tokenizer)
                print(f"{name:<12} {ttft*1000:>10.1f} {tok_s:>10.1f} "
                      f"{pre_vram:>12.1f} {dec_vram:>12.1f}")
                results.append({
                    'config': name, 'prompt_len': actual_len,
                    'ttft_ms': ttft * 1000, 'tok_s': tok_s,
                    'prefill_vram_mb': pre_vram, 'decode_vram_mb': dec_vram,
                })
            except Exception as e:
                print(f"{name:<12} {'ERROR':>10} - {e}")
        _free_model(model)
        model = None  # drop our reference before loading the next config

    # --- Summary table ---
    print("\n" + "=" * 70)
    print("SUMMARY TABLE")
    print("=" * 70)
    print(f"{'Config':<12} {'Prompt':>7} {'TTFT(ms)':>10} {'Tok/s':>10} "
          f"{'Peak VRAM':>12}")
    print("-" * 55)
    for r in results:
        print(f"{r['config']:<12} {r['prompt_len']:>7} "
              f"{r['ttft_ms']:>10.1f} {r['tok_s']:>10.1f} "
              f"{r['decode_vram_mb']:>12.1f}")
    return results
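# run_benchmark returns its rows as plain dicts, so persisting them for later
# comparison is straightforward. A minimal sketch (not called anywhere in this
# script; the helper name and default path are illustrative assumptions):
def save_results_json(results: List[Dict], path: str = "benchmark_results.json"):
    """Write benchmark rows to a JSON file."""
    import json
    with open(path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Saved {len(results)} rows to {path}")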
# ---------------------------------------------------------------------------
# 5. Generation demo
# ---------------------------------------------------------------------------
def generation_demo(model_name: str, device: str, dtype):
    """Show side-by-side generation from both engines."""
    hf_model, tokenizer = load_hf_model(model_name, dtype=dtype, device=device)
    fe_engine = load_fireecho(model_name, dtype=dtype, device=device)
    prompt = "Once upon a time in a land far away,"
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    print("\n" + "=" * 70)
    print(f"GENERATION DEMO (prompt: {prompt!r})")
    print("=" * 70)
    # HuggingFace (greedy)
    with torch.no_grad():
        hf_out = hf_model.generate(input_ids, max_new_tokens=60,
                                   do_sample=False, use_cache=True)
    hf_text = tokenizer.decode(hf_out[0], skip_special_tokens=True)
    print(f"\n[HF] {hf_text}")
    # FireEcho (greedy: temperature=0, top_k=1)
    fe_engine.reset_cache()
    with torch.no_grad():
        fe_out = fe_engine.generate(input_ids, max_new_tokens=60,
                                    temperature=0.0, top_k=1, use_cache=True)
    fe_text = tokenizer.decode(fe_out[0], skip_special_tokens=True)
    print(f"[FE] {fe_text}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    parser = argparse.ArgumentParser(
        description="Benchmark FireEcho vs HuggingFace on a real model")
    parser.add_argument('--model', default='Qwen/Qwen2-0.5B',
                        help='HuggingFace model name')
    parser.add_argument('--prompt-lengths', nargs='+', type=int,
                        default=[128, 512, 2048],
                        help='Prompt lengths to benchmark')
    parser.add_argument('--max-new-tokens', type=int, default=100,
                        help='Tokens to generate per benchmark')
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--dtype', default='bfloat16',
                        choices=['bfloat16', 'float16', 'float32'])
    parser.add_argument('--demo', action='store_true',
                        help='Run generation demo only')
    args = parser.parse_args()

    dtype_map = {
        'bfloat16': torch.bfloat16,
        'float16': torch.float16,
        'float32': torch.float32,
    }
    dtype = dtype_map[args.dtype]

    if not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        args.device = 'cpu'

    print(f"Model: {args.model}")
    print(f"Device: {args.device}")
    print(f"Dtype: {dtype}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    if args.demo:
        generation_demo(args.model, args.device, dtype)
    else:
        run_benchmark(args.model, args.prompt_lengths,
                      args.max_new_tokens, args.device, dtype)


if __name__ == '__main__':
    main()