| | """ |
| | Benchmark Generation — Prefill + Decode across Goliath configs |
| | ============================================================== |
| | |
| | Measures: |
| | - Prefill throughput (tok/s) |
| | - Decode throughput (tok/s) and per-token latency (ms) |
| | - VRAM usage (GB) |
| | |
| | Configs tested: |
| | - Goliath FP4 (goliath_bits=4) |
| | - Goliath FP8 (goliath_bits=8) |
| | - Goliath Auto (goliath_bits='auto') |
| | - Legacy path (use_goliath=False) |
| | |
| | Context lengths: 512, 2048, 8192 |
| | |
| | Usage: |
| | python3 benchmark_generation.py |
| | """ |
| |
|
| | import gc |
| | import sys |
| | import time |
| | import torch |
| |
|
| | |
| | import os |
| | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| |
|
| | from fireecho_kernel import FireEchoConfig, FireEchoEngine, _GOLIATH_AVAILABLE |
| |
|
# Capability probe for native FP4 tensor cores: only importable when the
# Goliath backend is present; otherwise expose a None placeholder so
# callers can test for it with `is not None`.
_can_use_goliath_dot_scaled = None
if _GOLIATH_AVAILABLE:
    from goliath_kernel import _can_use_goliath_dot_scaled
| |
|
| |
|
| | |
| | |
| | |
| |
|
def create_bench_engine(goliath_bits=4, use_goliath=True, num_layers=8):
    """Build a 7B-scale FireEcho engine (reduced layer count) on CUDA.

    Args:
        goliath_bits: Goliath precision mode (4, 8, or 'auto').
        use_goliath: whether to route through the Goliath path at all.
        num_layers: transformer layer count (reduced from a full 7B so
            benchmark runs fit in VRAM and finish quickly).

    Returns:
        A FireEchoEngine in eval mode on the default CUDA device.
    """
    # 7B-class dimensions (Llama-style: 4096 dim, 32 heads, GQA with 8 KV
    # heads, 11008 FFN); modality/Hebbian extras are disabled so only the
    # quantized text path is measured.
    cfg_kwargs = dict(
        dim=4096,
        num_heads=32,
        num_kv_heads=8,
        num_layers=num_layers,
        vocab_size=32000,
        intermediate_size=11008,
        max_seq_len=16384,
        max_kv_blocks=1024,
        use_nvfp4=True,
        quantize_weights=True,
        goliath_bits=goliath_bits,
        use_goliath=use_goliath,
        use_hebbian=False,
        use_vision=False,
        use_audio=False,
    )
    engine = FireEchoEngine(FireEchoConfig(**cfg_kwargs)).cuda()
    engine.eval()
    return engine
| |
|
| |
|
| | |
| | |
| | |
| |
|
def bench_prefill(engine, seq_len, warmup=3, iters=5):
    """Time the prefill pass (a single forward over the whole prompt).

    Args:
        engine: FireEcho engine under test.
        seq_len: prompt length in tokens.
        warmup: untimed iterations run before measuring.
        iters: timed iterations averaged into the result.

    Returns:
        dict with 'ms' (mean latency per prefill), 'tok_s' (throughput),
        and 'vram_gb' (currently allocated CUDA memory).
    """
    input_ids = torch.randint(0, 32000, (1, seq_len), device='cuda')

    def run_once():
        # Fresh cache each pass so every iteration measures the same work.
        engine.reset_cache()
        with torch.no_grad():
            engine(input_ids, use_cache=False)

    for _ in range(warmup):
        run_once()
    torch.cuda.synchronize()

    # CUDA events time GPU work without forcing a sync per iteration.
    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)

    start_evt.record()
    for _ in range(iters):
        run_once()
    end_evt.record()
    torch.cuda.synchronize()

    avg_ms = start_evt.elapsed_time(end_evt) / iters
    return {
        'ms': avg_ms,
        'tok_s': seq_len / (avg_ms / 1000.0),
        'vram_gb': torch.cuda.memory_allocated() / 1e9,
    }
| |
|
| |
|
def bench_decode(engine, prompt_len, num_decode_tokens=50, warmup=2):
    """Time token-by-token generation following a prefill.

    NOTE(review): generate() is invoked with use_cache=False here —
    presumably the engine manages its KV blocks internally (it exposes
    reset_cache()); confirm this is the intended decode path and not an
    accidental cache bypass.

    Args:
        engine: FireEcho engine under test.
        prompt_len: prompt length in tokens.
        num_decode_tokens: tokens to generate during the timed run.
        warmup: untimed short generations run first.

    Returns:
        dict with 'total_ms', 'per_token_ms', 'tok_s', 'vram_gb',
        and 'gen_tokens' (tokens actually produced).
    """
    prompt = torch.randint(0, 32000, (1, prompt_len), device='cuda')

    # Warmup: short generations to trigger any lazy kernel compilation.
    for _ in range(warmup):
        engine.reset_cache()
        with torch.no_grad():
            engine.generate(prompt, max_new_tokens=5, use_cache=False)
    torch.cuda.synchronize()

    # Return freed blocks to the allocator so the VRAM reading below
    # reflects this run, not warmup leftovers.
    gc.collect()
    torch.cuda.empty_cache()

    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)

    engine.reset_cache()
    start_evt.record()
    with torch.no_grad():
        output = engine.generate(prompt, max_new_tokens=num_decode_tokens,
                                 use_cache=False)
    end_evt.record()
    torch.cuda.synchronize()

    # generate() may stop early (e.g. EOS), so derive the count from the
    # output rather than assuming num_decode_tokens.
    gen_tokens = output.shape[1] - prompt_len
    total_ms = start_evt.elapsed_time(end_evt)
    return {
        'total_ms': total_ms,
        'per_token_ms': total_ms / max(gen_tokens, 1),
        'tok_s': gen_tokens / (total_ms / 1000.0) if total_ms > 0 else 0.0,
        'vram_gb': torch.cuda.memory_allocated() / 1e9,
        'gen_tokens': gen_tokens,
    }
| |
|
| |
|
| | |
| | |
| | |
| |
|
def _cleanup(engine):
    """Drop the engine and return its VRAM to the allocator pool."""
    del engine
    gc.collect()
    torch.cuda.empty_cache()


def _print_section(title):
    """Print a dashed section banner around *title*."""
    print("-" * 85)
    print(title)
    print("-" * 85)


def _run_sweep(configs, context_lengths, bench_fn, fmt_row):
    """Run bench_fn over every (config, context length) pair.

    Builds one engine per config, prints one table row per context
    length via fmt_row(cfg_name, ctx, result), and tears the engine
    down before the next config so VRAM readings don't bleed across
    configs. Errors are reported inline rather than aborting the sweep.
    """
    for cfg_name, cfg_kwargs in configs:
        try:
            engine = create_bench_engine(**cfg_kwargs)
        except Exception as e:
            # Engine construction failed (e.g. unsupported bits mode on
            # this GPU); report and continue with the next config.
            print(f"{cfg_name:<16} | {'ERROR':>5} | {str(e)[:40]}")
            continue

        for ctx in context_lengths:
            try:
                r = bench_fn(engine, ctx)
                print(fmt_row(cfg_name, ctx, r))
            except Exception as e:
                print(f"{cfg_name:<16} | {ctx:>5} | ERROR: {str(e)[:30]}")

        _cleanup(engine)


def main():
    """Entry point: benchmark prefill and decode across Goliath configs.

    Prints two tables (prefill throughput, decode latency/throughput)
    for each config in FP4 / FP8 / auto / legacy mode at three context
    lengths. Returns early with a message when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        print("CUDA not available.")
        return

    props = torch.cuda.get_device_properties(0)
    print("=" * 85)
    print("GENERATION BENCHMARK — Goliath FP4/FP8 Configs")
    print("=" * 85)
    print(f"GPU: {props.name}")
    print(f"VRAM: {props.total_memory / 1e9:.1f} GB")
    print(f"Goliath available: {_GOLIATH_AVAILABLE}")
    # The probe is None when the Goliath backend could not be imported.
    if _can_use_goliath_dot_scaled is not None:
        print(f"Goliath dot_scaled (native FP4 TCs): {_can_use_goliath_dot_scaled()}")
    print()

    configs = [
        ('Goliath FP4', dict(goliath_bits=4, use_goliath=True)),
        ('Goliath FP8', dict(goliath_bits=8, use_goliath=True)),
        ('Goliath Auto', dict(goliath_bits='auto', use_goliath=True)),
        ('Legacy path', dict(goliath_bits=4, use_goliath=False)),
    ]
    context_lengths = [512, 2048, 8192]

    _print_section("PREFILL BENCHMARK")
    header = f"{'Config':<16} | {'Ctx':>5} | {'Prefill ms':>11} | {'Prefill tok/s':>14} | {'VRAM GB':>8}"
    print(header)
    print("-" * len(header))
    _run_sweep(
        configs, context_lengths, bench_prefill,
        lambda name, ctx, r: (
            f"{name:<16} | {ctx:>5} | {r['ms']:>9.2f}ms | "
            f"{r['tok_s']:>12,.0f} | {r['vram_gb']:>7.2f}"
        ),
    )

    print()
    _print_section("DECODE BENCHMARK (50 tokens)")
    header = f"{'Config':<16} | {'Ctx':>5} | {'Decode tok/s':>13} | {'ms/token':>9} | {'VRAM GB':>8}"
    print(header)
    print("-" * len(header))
    _run_sweep(
        configs, context_lengths, bench_decode,
        lambda name, ctx, r: (
            f"{name:<16} | {ctx:>5} | {r['tok_s']:>11,.1f} | "
            f"{r['per_token_ms']:>7.2f}ms | {r['vram_gb']:>7.2f}"
        ),
    )

    print()
    print("=" * 85)
    print("BENCHMARK COMPLETE")
    print("=" * 85)
| |
|
| |
|
# Run the benchmark suite only when executed as a script (not on import).
if __name__ == "__main__":
    main()
| |
|