"""
Real Model Benchmark: FireEcho vs HuggingFace
==============================================
Loads Qwen2-0.5B into both HuggingFace and FireEcho, validates correctness,
then benchmarks generation tok/s, TTFT, and VRAM across prompt lengths.

Usage:
    python benchmark_real_model.py
    python benchmark_real_model.py --model Qwen/Qwen2-0.5B --prompt-lengths 128 512 2048
"""
| |
|
| | import argparse |
| | import sys |
| | import time |
| | from typing import Dict, List, Optional, Tuple |
| |
|
| | import torch |
| | import torch.nn.functional as F |
| |
|
| | |
| | |
| | |
| |
|
| | def _cuda_sync(): |
| | if torch.cuda.is_available(): |
| | torch.cuda.synchronize() |
| |
|
| |
|
| | def _peak_vram_mb() -> float: |
| | if torch.cuda.is_available(): |
| | return torch.cuda.max_memory_allocated() / (1024 ** 2) |
| | return 0.0 |
| |
|
| |
|
| | def _reset_peak_vram(): |
| | if torch.cuda.is_available(): |
| | torch.cuda.reset_peak_memory_stats() |
| |
|
| |
|
| | def _timed_cuda(fn, warmup: int = 2, repeats: int = 5) -> float: |
| | """Run *fn* with CUDA events, return median wall-time in seconds.""" |
| | for _ in range(warmup): |
| | fn() |
| | _cuda_sync() |
| |
|
| | times = [] |
| | for _ in range(repeats): |
| | start = torch.cuda.Event(enable_timing=True) |
| | end = torch.cuda.Event(enable_timing=True) |
| | start.record() |
| | fn() |
| | end.record() |
| | _cuda_sync() |
| | times.append(start.elapsed_time(end) / 1000.0) |
| | times.sort() |
| | return times[len(times) // 2] |
| |
|
| |
|
| | |
| | |
| | |
| |
|
def load_hf_model(model_name: str, dtype=torch.bfloat16, device='cuda'):
    """Load *model_name* via HuggingFace transformers.

    Returns (model, tokenizer); the model is placed on *device* in eval mode
    with SDPA attention.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print(f"\n[HF] Loading {model_name} ...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map=device,
        trust_remote_code=True,
        attn_implementation="sdpa",
    )
    model.eval()

    total = 0
    for p in model.parameters():
        total += p.numel()
    params_m = total / 1e6
    print(f"[HF] {params_m:.1f}M params, dtype={dtype}, device={device}")
    return model, tokenizer
| |
|
| |
|
def load_fireecho(model_name: str, dtype=torch.bfloat16, device='cuda',
                  use_goliath: bool = False, goliath_bits: int = 4):
    """Load *model_name* into a FireEcho engine.

    With *use_goliath* set, the HF config is translated into a FireEchoConfig
    that enables Goliath quantization at *goliath_bits* bits; otherwise the
    engine loads with its default (BF16) configuration.  Returns the engine
    in eval mode.
    """
    sys.path.insert(0, '.')  # make the local fireecho_kernel package importable
    from fireecho_kernel import FireEchoEngine, FireEchoConfig

    if not use_goliath:
        tag = "BF16"
        print(f"\n[FE-{tag}] Loading {model_name} ...")
        engine = FireEchoEngine.from_pretrained(model_name, dtype=dtype,
                                                device=device)
    else:
        from transformers import AutoConfig
        base = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
        heads = base.num_attention_heads
        config = FireEchoConfig(
            dim=base.hidden_size,
            num_heads=heads,
            # Fall back to MHA when the config has no GQA head count.
            num_kv_heads=getattr(base, 'num_key_value_heads', heads),
            num_layers=base.num_hidden_layers,
            vocab_size=base.vocab_size,
            intermediate_size=base.intermediate_size,
            # Cap the context length at 32k regardless of the HF config.
            max_seq_len=min(getattr(base, 'max_position_embeddings', 4096),
                            32768),
            rope_theta=getattr(base, 'rope_theta', 10000.0),
            attn_bias=getattr(base, 'attention_bias', False),
            tie_word_embeddings=getattr(base, 'tie_word_embeddings', False),
            use_nvfp4=True,
            use_goliath=True,
            goliath_bits=goliath_bits,
            use_hebbian=False,
        )
        tag = f"FP{goliath_bits}"
        print(f"\n[FE-{tag}] Loading {model_name} (Goliath {tag}) ...")
        engine = FireEchoEngine.from_pretrained(model_name, config=config,
                                                dtype=dtype, device=device)

    engine.eval()
    params_m = sum(p.numel() for p in engine.parameters()) / 1e6
    print(f"[FE-{tag}] {params_m:.1f}M params")
    return engine
| |
|
| |
|
| | |
| | |
| | |
| |
|
def validate_correctness(hf_model, fe_engine, tokenizer, device='cuda',
                         seq_len: int = 128) -> Dict:
    """Compare HF vs FireEcho logits on the same input.

    Returns a dict with the actual tokenized length, greedy top-1 agreement
    rate, maximum absolute logit difference, and mean per-position cosine
    similarity between the two logit tensors.
    """
    text = "The quick brown fox jumps over the lazy dog. " * 20
    encoded = tokenizer(text, return_tensors='pt',
                        max_length=seq_len, truncation=True)
    input_ids = encoded.input_ids.to(device)

    with torch.no_grad():
        ref = hf_model(input_ids).logits   # HF wraps logits in an output object
        cand = fe_engine(input_ids)        # FireEcho returns the raw tensor

    # Greedy-decoding agreement: fraction of positions with identical argmax.
    agreement = (ref.argmax(dim=-1) == cand.argmax(dim=-1)).float().mean().item()

    # Numerical closeness of the raw logits.
    worst_diff = (ref - cand).abs().max().item()
    similarity = F.cosine_similarity(
        ref.view(-1, ref.shape[-1]).float(),
        cand.view(-1, cand.shape[-1]).float(),
        dim=-1,
    ).mean().item()

    return {
        'seq_len': input_ids.shape[1],
        'top1_match': agreement,
        'max_abs_diff': worst_diff,
        'cosine_sim': similarity,
    }
| |
|
| |
|
| | |
| | |
| | |
| |
|
@torch.no_grad()
def bench_prefill(model, input_ids, is_hf: bool) -> Tuple[float, float]:
    """Measure TTFT (time-to-first-token) and peak VRAM for prefill.

    Args:
        model: engine whose forward pass is timed via ``model(input_ids)``.
        input_ids: prompt tensor for a single prefill forward.
        is_hf: unused here — both engines take the same forward call (the
            original had an if/else with byte-identical arms); kept for
            interface symmetry with :func:`bench_decode`.

    Returns:
        (median prefill time in seconds, peak VRAM in MB).
    """
    _reset_peak_vram()
    _cuda_sync()

    ttft = _timed_cuda(lambda: model(input_ids), warmup=2, repeats=5)
    vram = _peak_vram_mb()
    return ttft, vram
| |
|
| |
|
@torch.no_grad()
def bench_decode(model, input_ids, max_new_tokens: int, is_hf: bool,
                 tokenizer=None) -> Tuple[float, float]:
    """Measure decode tok/s and peak VRAM.

    HF and FireEcho expose different greedy-decoding knobs, hence the two
    generate() call shapes.  *tokenizer* is unused; it is kept for interface
    compatibility.  Returns (tokens per second, peak VRAM in MB).
    """
    _reset_peak_vram()
    _cuda_sync()

    if is_hf:
        def _step():
            model.generate(input_ids, max_new_tokens=max_new_tokens,
                           do_sample=False, use_cache=True)
    else:
        def _step():
            model.generate(input_ids, max_new_tokens=max_new_tokens,
                           temperature=0.0, top_k=1, use_cache=True)

    elapsed = _timed_cuda(_step, warmup=1, repeats=3)
    return max_new_tokens / elapsed, _peak_vram_mb()
| |
|
| |
|
def make_input(tokenizer, seq_len: int, device='cuda') -> torch.Tensor:
    """Create an input_ids tensor of the desired length.

    Repeats a fixed seed sentence until it comfortably exceeds *seq_len*
    tokens, then lets the tokenizer truncate to exactly *seq_len*.
    """
    seed = ("The quick brown fox jumps over the lazy dog. "
            "In a distant land, ancient scholars studied the stars. ")
    repeated = seed * ((seq_len // 20) + 1)
    encoded = tokenizer(repeated, return_tensors='pt',
                        max_length=seq_len, truncation=True)
    return encoded.input_ids.to(device)
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | def _free_model(model): |
| | """Move model to CPU and free GPU memory.""" |
| | if model is not None: |
| | model.cpu() |
| | del model |
| | import gc |
| | gc.collect() |
| | if torch.cuda.is_available(): |
| | torch.cuda.empty_cache() |
| | torch.cuda.reset_peak_memory_stats() |
| |
|
| |
|
def run_benchmark(model_name: str, prompt_lengths: List[int],
                  max_new_tokens: int, device: str, dtype):
    """Validate FireEcho variants against an HF reference, then benchmark all.

    Phase 1 caches reference logits from the HF model, frees it, and checks
    each FireEcho configuration against them.  Phase 2 reloads each
    configuration and measures TTFT, decode tok/s, and peak VRAM for every
    prompt length, printing tables as it goes.

    Returns:
        List of per-(config, prompt-length) result dicts.
    """
    results = []

    # (display name, is_huggingface, extra kwargs for load_fireecho)
    config_specs = [
        ('HF-BF16', True, {}),
        ('FE-BF16', False, {}),
        ('FE-FP4', False, {'use_goliath': True, 'goliath_bits': 4}),
        ('FE-FP8', False, {'use_goliath': True, 'goliath_bits': 8}),
    ]

    # Load HF once up front; its tokenizer is reused for every later config.
    hf_model, tokenizer = load_hf_model(model_name, dtype=dtype, device=device)

    print("\n" + "=" * 70)
    print("CORRECTNESS VALIDATION (vs HuggingFace)")
    print("=" * 70)

    ref_prompt = "The quick brown fox jumps over the lazy dog. " * 20
    ref_ids = tokenizer(ref_prompt, return_tensors='pt',
                        max_length=128, truncation=True).input_ids.to(device)
    with torch.no_grad():
        ref_logits = hf_model(ref_ids).logits.cpu()
    ref_top1 = ref_logits.argmax(dim=-1)

    # Free the HF model before loading FireEcho variants — the cached CPU
    # logits are all phase 1 needs from it.
    _free_model(hf_model)
    hf_model = None

    # Phase 1: compare each FireEcho variant against the cached reference.
    for name, is_hf, fe_kwargs in config_specs:
        if is_hf:
            continue
        try:
            fe_model = load_fireecho(model_name, dtype=dtype, device=device, **fe_kwargs)
            with torch.no_grad():
                fe_logits = fe_model(ref_ids).cpu()
            fe_top1 = fe_logits.argmax(dim=-1)
            match_rate = (ref_top1 == fe_top1).float().mean().item()
            cos_sim = F.cosine_similarity(
                ref_logits.view(-1, ref_logits.shape[-1]).float(),
                fe_logits.view(-1, fe_logits.shape[-1]).float(),
                dim=-1).mean().item()
            max_diff = (ref_logits - fe_logits).abs().max().item()
            # 90% greedy top-1 agreement is the pass bar; quantized variants
            # are not expected to match exactly.
            status = "PASS" if match_rate > 0.90 else "FAIL"
            print(f" {name}: top1={match_rate:.3f} "
                  f"cos_sim={cos_sim:.5f} "
                  f"max_diff={max_diff:.4f} [{status}]")
            _free_model(fe_model)
        except Exception as e:
            # Best-effort: a broken variant is reported but doesn't abort the run.
            print(f" {name}: ERROR - {e}")

    del ref_logits, ref_top1
    import gc; gc.collect()

    print("\n" + "=" * 70)
    print(f"INFERENCE BENCHMARK (decode {max_new_tokens} tokens)")
    print("=" * 70)

    # Track which prompt-length section headers were printed so each header
    # appears once across all configurations.
    printed_headers = set()

    # Phase 2: reload each config and sweep the prompt lengths.
    for name, is_hf, fe_kwargs in config_specs:
        try:
            if is_hf:
                model, _ = load_hf_model(model_name, dtype=dtype, device=device)
            else:
                model = load_fireecho(model_name, dtype=dtype, device=device, **fe_kwargs)
        except Exception as e:
            print(f"\n[WARN] {name} load failed: {e}")
            continue

        for seq_len in prompt_lengths:
            input_ids = make_input(tokenizer, seq_len, device)
            # May differ from the requested seq_len after tokenization.
            actual_len = input_ids.shape[1]

            if actual_len not in printed_headers:
                print(f"\n--- Prompt length: {actual_len} tokens ---")
                print(f"{'Config':<12} {'TTFT(ms)':>10} {'Tok/s':>10} "
                      f"{'Prefill MB':>12} {'Decode MB':>12}")
                print("-" * 60)
                printed_headers.add(actual_len)

            try:
                # Clear any cached KV state so runs don't contaminate each other.
                if not is_hf and hasattr(model, 'reset_cache'):
                    model.reset_cache()

                ttft, pre_vram = bench_prefill(model, input_ids, is_hf)

                if not is_hf and hasattr(model, 'reset_cache'):
                    model.reset_cache()

                tok_s, dec_vram = bench_decode(
                    model, input_ids, max_new_tokens, is_hf, tokenizer)

                print(f"{name:<12} {ttft*1000:>10.1f} {tok_s:>10.1f} "
                      f"{pre_vram:>12.1f} {dec_vram:>12.1f}")

                results.append({
                    'config': name, 'prompt_len': actual_len,
                    'ttft_ms': ttft * 1000, 'tok_s': tok_s,
                    'prefill_vram_mb': pre_vram, 'decode_vram_mb': dec_vram,
                })
            except Exception as e:
                # An OOM/failure at one length still lets the others run.
                print(f"{name:<12} {'ERROR':>10} - {e}")

        _free_model(model)

    print("\n" + "=" * 70)
    print("SUMMARY TABLE")
    print("=" * 70)
    print(f"{'Config':<12} {'Prompt':>7} {'TTFT(ms)':>10} {'Tok/s':>10} "
          f"{'Peak VRAM':>12}")
    print("-" * 55)
    for r in results:
        print(f"{r['config']:<12} {r['prompt_len']:>7} "
              f"{r['ttft_ms']:>10.1f} {r['tok_s']:>10.1f} "
              f"{r['decode_vram_mb']:>12.1f}")

    return results
| |
|
| |
|
| | |
| | |
| | |
| |
|
def generation_demo(model_name: str, device: str, dtype):
    """Show side-by-side greedy generation from both engines.

    Loads both models, decodes 60 tokens greedily from a fixed prompt with
    each, and prints the two completions for visual comparison.
    """
    hf_model, tokenizer = load_hf_model(model_name, dtype=dtype, device=device)

    # load_fireecho already inserts '.' into sys.path, imports the engine,
    # and calls .eval() — the original redundantly repeated all three here.
    fe_engine = load_fireecho(model_name, dtype=dtype, device=device)

    prompt = "Once upon a time in a land far away,"
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

    print("\n" + "=" * 70)
    print(f"GENERATION DEMO (prompt: {prompt!r})")
    print("=" * 70)

    # Greedy decode with HuggingFace.
    with torch.no_grad():
        hf_out = hf_model.generate(input_ids, max_new_tokens=60,
                                   do_sample=False, use_cache=True)
    hf_text = tokenizer.decode(hf_out[0], skip_special_tokens=True)
    print(f"\n[HF] {hf_text}")

    # Greedy decode with FireEcho (temperature=0.0 / top_k=1).
    fe_engine.reset_cache()
    with torch.no_grad():
        fe_out = fe_engine.generate(input_ids, max_new_tokens=60,
                                    temperature=0.0, top_k=1, use_cache=True)
    fe_text = tokenizer.decode(fe_out[0], skip_special_tokens=True)
    print(f"[FE] {fe_text}")
| |
|
| |
|
| | |
| | |
| | |
| |
|
def main():
    """CLI entry point: parse arguments, report environment, dispatch."""
    parser = argparse.ArgumentParser(
        description="Benchmark FireEcho vs HuggingFace on a real model")
    parser.add_argument('--model', default='Qwen/Qwen2-0.5B',
                        help='HuggingFace model name')
    parser.add_argument('--prompt-lengths', nargs='+', type=int,
                        default=[128, 512, 2048],
                        help='Prompt lengths to benchmark')
    parser.add_argument('--max-new-tokens', type=int, default=100,
                        help='Tokens to generate per benchmark')
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--dtype', default='bfloat16',
                        choices=['bfloat16', 'float16', 'float32'])
    parser.add_argument('--demo', action='store_true',
                        help='Run generation demo only')
    args = parser.parse_args()

    dtype = {
        'bfloat16': torch.bfloat16,
        'float16': torch.float16,
        'float32': torch.float32,
    }[args.dtype]

    cuda_ok = torch.cuda.is_available()
    if not cuda_ok:
        print("CUDA not available, falling back to CPU")
        args.device = 'cpu'

    print(f"Model: {args.model}")
    print(f"Device: {args.device}")
    print(f"Dtype: {dtype}")
    if cuda_ok:
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    if args.demo:
        generation_demo(args.model, args.device, dtype)
    else:
        run_benchmark(args.model, args.prompt_lengths,
                      args.max_new_tokens, args.device, dtype)
| |
|
| |
|
# Standard script entry guard: run the CLI only when executed directly.
if __name__ == '__main__':
    main()
| |
|