| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ |
| | FireEcho Full-Stack Benchmark — Path B: Every Optimization Stacked |
| | =================================================================== |
| | Part of the FireEcho Engine — Custom inference kernel for NVIDIA Blackwell |
| | Copyright (c) 2025-2026 Echo (FireEcho Project). All rights reserved. |
| | |
| | Stacks ALL FireEcho architecture optimizations and benchmarks each layer: |
| | |
| | Already in baseline: |
| | - Goliath FP4 packed MoE (dequant-matmul Triton kernels) |
| | - Fused SwiGLU+Down (1 kernel launch, not 3) |
| | - FlashDecode attention (Triton online softmax) |
| | - Flat KV cache (zero torch.cat, pre-allocated) |
| | |
| | Layer 0: Baseline (all above) — current ~37 tok/s |
| | Layer 1: + FP8 KV cache (half attention bandwidth) |
| | Layer 2: + L2 prefetch (next layer pre-staged in L2 cache) |
| | Layer 3: + Atlas Ban & Pick + MoDES (8→~5 experts + skip easy tokens) |
| | Layer 4: + FE-XC cold expert demotion (5.3x faster 2-bit codebook kernel) |
| | Layer 5: + INT2 coldest expert demotion (2-bit scalar) |
| | Layer 6: + CUDA Graph decode (zero Python overhead, single graph replay) |
| | |
| | Target: 15.8ms → ~8ms base forward = 125+ tok/s (no speculation) |
| | |
| | Usage: |
| | PYTHONUNBUFFERED=1 python benchmark_fullstack.py |
| | """ |
| |
|
| | import sys, os, time, argparse, torch |
| | sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) |
| |
|
| | from hebbian_finetune_demo import load_engine |
| |
|
# Location of the model checkpoint handed to load_engine().
# Set the FIREECHO_MODEL_PATH environment variable to point somewhere else
# without editing this file; the historical location stays the default.
MODEL_PATH = os.environ.get(
    "FIREECHO_MODEL_PATH",
    "/run/media/echo/Echo/ECHO/training/Prototype Fireecho/model/Qwen3-Omni-30B-A3B-Instruct",
)

# Fixed prompt set reused by every benchmark layer so throughput numbers are
# measured on the same inputs each time.
TEST_PROMPTS = [
    "Explain the theory of general relativity in simple terms.",
    "Write a Python function to find the longest palindromic substring.",
    "What are the main differences between TCP and UDP protocols?",
    "Describe the process of photosynthesis step by step.",
    "What caused the fall of the Roman Empire?",
    "How does a compiler optimize code?",
    "Explain how public key cryptography works.",
    "What is the difference between a stack and a queue?",
]
| |
|
| |
|
def benchmark_generate(engine, tokenizer, prompts, max_tokens=100, warmup=3,
                       label="Standard"):
    """Benchmark ``engine.generate()`` with the current engine config.

    Runs ``warmup`` untimed 20-token generations on the first prompt, then
    times one generation per prompt. Each timed call is bracketed by
    ``torch.cuda.synchronize()`` so the wall-clock interval covers the queued
    GPU work. A per-prompt line and a final average are printed.

    Args:
        engine: Object exposing ``generate(input_ids, max_new_tokens=...,
            temperature=..., top_k=..., top_p=...)`` whose result has a
            ``.shape`` with the sequence length at index 1.
        tokenizer: Object exposing ``encode(text, return_tensors='pt')``
            returning something with ``.cuda()`` and ``.shape``.
        prompts: Sequence of prompt strings. May be empty.
        max_tokens: ``max_new_tokens`` passed to each timed generation.
        warmup: Number of untimed warmup generations.
        label: Name printed in the banner and summary line.

    Returns:
        The mean of the per-prompt tokens/second values, or ``0.0`` when
        ``prompts`` is empty.
    """
    print(f"\n{'=' * 60}")
    print(f"Benchmark: {label}")
    print(f"{'=' * 60}")

    # With no prompts there is nothing to warm up on (prompts[0]) and nothing
    # to average, so report zero throughput instead of raising.
    if not prompts:
        print(" No prompts supplied; skipping benchmark.")
        return 0.0

    # Untimed warmup runs on the first prompt.
    for i in range(warmup):
        ids = tokenizer.encode(prompts[0], return_tensors='pt').cuda()
        engine.generate(ids, max_new_tokens=20, temperature=0.0, top_k=0, top_p=1.0)
        print(f" Warmup {i+1}/{warmup}")

    results = []
    for prompt in prompts:
        input_ids = tokenizer.encode(prompt, return_tensors='pt').cuda()
        prompt_len = input_ids.shape[1]

        # Drain pending GPU work so it is not billed to this prompt.
        torch.cuda.synchronize()
        t0 = time.perf_counter()

        output = engine.generate(
            input_ids, max_new_tokens=max_tokens, temperature=0.0,
            top_k=0, top_p=1.0)

        # Wait for the generation's GPU work before stopping the clock.
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - t0

        # Output length minus prompt length is the number of new tokens.
        gen_len = output.shape[1] - prompt_len
        tok_s = gen_len / elapsed if elapsed > 0 else 0.0

        results.append({
            'prompt': prompt[:50],
            'gen_len': gen_len,
            'elapsed': elapsed,
            'tok_s': tok_s,
        })
        print(f" [{gen_len:3d} tok] {tok_s:6.1f} tok/s | {prompt[:50]}...")

    # Note: this is the mean of per-prompt rates, not total tokens / total time.
    avg_tok_s = sum(r['tok_s'] for r in results) / len(results)
    avg_gen = sum(r['gen_len'] for r in results) / len(results)
    print(f"\n >> {label}: {avg_tok_s:.1f} tok/s avg, {avg_gen:.0f} tokens/prompt")
    return avg_tok_s
| |
|
| |
|
def _refresh_expert_tiers(engine, flag_attr):
    """Call ``update_expert_tiers()`` on every layer's FFN and count flagged experts.

    Layers without an ``ffn`` attribute are skipped, matching the guard used
    when the layers are first configured in ``main()``.

    Args:
        engine: Object exposing an iterable ``layers``.
        flag_attr: Name of the per-FFN mask attribute to sum
            (e.g. ``'_expert_is_fexc'``).

    Returns:
        Sum of ``getattr(ffn, flag_attr).sum().item()`` over all layers that
        have that attribute.
    """
    flagged = 0
    for layer in engine.layers:
        ffn = getattr(layer, 'ffn', None)
        if hasattr(ffn, 'update_expert_tiers'):
            ffn.update_expert_tiers()
        if hasattr(ffn, flag_attr):
            flagged += getattr(ffn, flag_attr).sum().item()
    return flagged


def _speedup(tok_s, base):
    """Return ``tok_s / base``, or 0 when the baseline is not positive."""
    return tok_s / base if base > 0 else 0


def _ms_per_token(tok_s):
    """Convert tokens/second to milliseconds/token (``inf`` for non-positive rates)."""
    return 1000 / tok_s if tok_s > 0 else float('inf')


def main():
    """Run the full-stack benchmark: enable one optimization at a time and measure.

    Parses CLI options, loads the engine, then benchmarks TEST_PROMPTS after
    each step (baseline, FP8 KV cache, L2 prefetch, Atlas Ban & Pick, FE-XC
    demotion, INT2 demotion, CUDA Graph decode). Optimizations are cumulative:
    nothing is disabled between steps. Finally prints a summary table and
    writes it to ``fullstack_benchmark_results.txt`` next to this script.
    """
    parser = argparse.ArgumentParser(description="FireEcho Full-Stack Benchmark")
    parser.add_argument('--model-path', default=MODEL_PATH,
                        help='Path to the model checkpoint (default: MODEL_PATH)')
    parser.add_argument('--max-tokens', type=int, default=200)
    parser.add_argument('--warmup', type=int, default=3)
    parser.add_argument('--atlas-prompts', type=int, default=50,
                        help='Number of prompts for Atlas profiling')
    parser.add_argument('--ban-ratio', type=float, default=0.25,
                        help='Atlas Ban & Pick: fraction of experts to ban')
    parser.add_argument('--modes-threshold', type=float, default=2.0,
                        help='Atlas MoDES: multiplier on uniform baseline (2.0 = skip when max_prob < 2/128)')
    parser.add_argument('--fexc-cold-pct', type=float, default=0.10,
                        help='FE-XC: fraction of experts to demote to 2-bit codebook')
    parser.add_argument('--int2-cold-pct', type=float, default=0.05,
                        help='INT2: fraction of coldest experts to demote to 2-bit scalar')
    args = parser.parse_args()

    summary = {}
    max_seq_len = 4096  # shared by engine loading and CUDA graph capture

    # ------------------------------------------------------------------
    # Load engine
    # ------------------------------------------------------------------
    print("=" * 60)
    print("FireEcho Full-Stack Benchmark — Path B")
    print("Stacking ALL optimizations, measuring each layer")
    print("=" * 60)
    print("\nLoading Qwen3-Omni engine...")

    engine, tokenizer, config = load_engine(
        args.model_path, max_seq_len=max_seq_len, device="cuda",
    )
    engine.pack_all_experts()
    engine.kv_cache.enable_flat_decode()
    engine.eval()

    # NOTE(review): presumably this silences per-layer logging and pushes the
    # automatic tier updates far beyond the benchmark window so they only
    # happen when triggered explicitly below — confirm against the FFN class.
    for layer in engine.layers:
        if hasattr(layer, 'ffn'):
            layer.ffn._quiet = True
            layer.ffn.femx_tier_interval = 10_000_000

    vram_base = torch.cuda.max_memory_allocated() / 1e9
    print(f" Base VRAM: {vram_base:.2f} GB")

    def run_layer(key, label, extra_warmup=0):
        """Benchmark the engine in its current state and store the result under ``key``."""
        summary[key] = benchmark_generate(
            engine, tokenizer, TEST_PROMPTS,
            max_tokens=args.max_tokens, warmup=args.warmup + extra_warmup,
            label=label)

    # ------------------------------------------------------------------
    # Layer 0: baseline
    # ------------------------------------------------------------------
    run_layer('L0_baseline', "Layer 0: Baseline (FP4 + packed MoE + flat KV BF16)")

    # ------------------------------------------------------------------
    # Layer 1: FP8 KV cache
    # ------------------------------------------------------------------
    print("\n>> Enabling FP8 KV cache...")
    engine.kv_cache.enable_flat_decode(kv_dtype='fp8')
    print(" [FP8 KV] Enabled — 50% attention bandwidth reduction")
    run_layer('L1_fp8_kv', "Layer 1: + FP8 KV cache")

    # ------------------------------------------------------------------
    # Layer 2: L2 prefetch
    # ------------------------------------------------------------------
    print("\n>> Enabling L2 layer-ahead prefetch...")
    engine.enable_l2_prefetch()
    run_layer('L2_l2_prefetch', "Layer 2: + L2 prefetch")

    # ------------------------------------------------------------------
    # Layer 3: Atlas Ban & Pick
    # ------------------------------------------------------------------
    print("\n>> Enabling Atlas the Gatekeeper (Ban & Pick)...")
    engine.enable_atlas(ban_threshold=0.01, modes_threshold=args.modes_threshold)
    engine.atlas_profile(tokenizer, num_prompts=args.atlas_prompts)
    engine.atlas_ban(ban_ratio=args.ban_ratio)
    engine.atlas_stats()
    run_layer('L3_atlas_ban', "Layer 3: + Atlas Ban & Pick (8→~5 experts)")

    # ------------------------------------------------------------------
    # Layer 4: FE-XC cold expert demotion
    # ------------------------------------------------------------------
    print("\n>> Enabling FE-XC cold expert demotion...")
    engine.enable_auto_fexc_demotion(cold_threshold_pct=args.fexc_cold_pct)

    # Run untimed generations before the tier update is triggered.
    stats_tokens = 50
    print(f" Building expert usage statistics ({len(TEST_PROMPTS)} prompts × {stats_tokens} tokens)...")
    for prompt in TEST_PROMPTS:
        ids = tokenizer.encode(prompt, return_tensors='pt').cuda()
        with torch.no_grad():
            engine.generate(ids, max_new_tokens=stats_tokens, temperature=0.0,
                            top_k=0, top_p=1.0)

    print(" Triggering FE-XC demotion (learning codebooks)...")
    fexc_count = _refresh_expert_tiers(engine, '_expert_is_fexc')
    print(f" [FE-XC] {fexc_count} total experts demoted across all layers")
    run_layer('L4_fexc', "Layer 4: + FE-XC cold experts (2-bit codebook)")

    # ------------------------------------------------------------------
    # Layer 5: INT2 coldest expert demotion
    # ------------------------------------------------------------------
    print("\n>> Enabling INT2 coldest expert demotion...")
    engine.enable_auto_int2_demotion(cold_threshold_pct=args.int2_cold_pct)

    int2_count = _refresh_expert_tiers(engine, '_expert_is_int2')
    print(f" [INT2] {int2_count} coldest experts demoted across all layers")
    run_layer('L5_int2', "Layer 5: + INT2 coldest experts (2-bit scalar)")

    # ------------------------------------------------------------------
    # Layer 6: CUDA Graph decode (two extra warmup iterations)
    # ------------------------------------------------------------------
    print("\n>> Enabling CUDA Graph decode...")
    engine.enable_cuda_graph_decode(max_seq_len=max_seq_len)
    print(" [CUDA Graph] Capturing full 48-layer decode as single graph replay")
    run_layer('L6_cuda_graph', "Layer 6: + CUDA Graph (zero Python overhead)",
              extra_warmup=2)

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------
    vram_final = torch.cuda.max_memory_allocated() / 1e9
    final_key = 'L6_cuda_graph'

    print("\n" + "=" * 70)
    print("FIREECHO FULL-STACK BENCHMARK SUMMARY")
    print("=" * 70)
    print()
    print(" Components already in baseline:")
    print(" - Goliath FP4 packed MoE (Triton dequant-matmul)")
    print(" - Fused SwiGLU+Down (1 kernel launch per expert)")
    print(" - FlashDecode attention (Triton online softmax)")
    print(" - Flat KV cache (zero torch.cat)")
    print()
    print(f" {'Layer':<55s} {'tok/s':>8s} {'vs base':>8s}")
    print(f" {'-'*55} {'-'*8} {'-'*8}")

    base = summary['L0_baseline']
    display_order = [
        ('L0_baseline', 'Baseline (Goliath FP4 + packed MoE + fused SwiGLU)'),
        ('L1_fp8_kv', '+ FP8 KV cache (half attention bandwidth)'),
        ('L2_l2_prefetch', '+ L2 layer-ahead prefetch'),
        ('L3_atlas_ban', '+ Atlas Ban & Pick + MoDES (FE-AGK)'),
        ('L4_fexc', '+ FE-XC cold expert demotion (2-bit codebook)'),
        ('L5_int2', '+ INT2 coldest experts (2-bit scalar)'),
        ('L6_cuda_graph', '+ CUDA Graph decode (zero Python overhead)'),
    ]

    for key, name in display_order:
        val = summary[key]
        print(f" {name:<55s} {val:>7.1f} {_speedup(val, base):>6.2f}x")

    final = summary[final_key]
    total_speedup = _speedup(final, base)
    print(f"\n Base VRAM: {vram_base:.2f} GB")
    print(f" Peak VRAM: {vram_final:.2f} GB")
    print(f" Total speedup: {total_speedup:.2f}x over baseline")
    print(f"\n Baseline forward: ~{_ms_per_token(base):.1f}ms/token")
    print(f" Full-stack forward: ~{_ms_per_token(final):.1f}ms/token")
    # NOTE(review): the 6x / 8x multipliers are the script's own rough
    # estimates; their derivation is not shown here.
    print(f"\n With 50% speculation acceptance: ~{final * 6:.0f} tok/s (est.)")
    print(f" With 70% speculation acceptance: ~{final * 8:.0f} tok/s (est.)")
    print("=" * 70)

    # ------------------------------------------------------------------
    # Persist results next to this script
    # ------------------------------------------------------------------
    # Report the device actually used rather than a hard-coded model name.
    device_index = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(device_index)
    gpu_mem_gb = torch.cuda.get_device_properties(device_index).total_memory / 1e9

    results_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "fullstack_benchmark_results.txt")
    with open(results_path, 'w', encoding='utf-8') as f:
        f.write("FireEcho Full-Stack Benchmark Results\n")
        f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"GPU: {gpu_name} ({gpu_mem_gb:.0f} GB)\n\n")
        f.write("Components in baseline:\n")
        f.write(" Goliath FP4 packed MoE, Fused SwiGLU+Down,\n")
        f.write(" FlashDecode attention, Flat KV cache\n\n")
        for key, name in display_order:
            val = summary[key]
            f.write(f"{name}: {val:.1f} tok/s ({_speedup(val, base):.2f}x)\n")
        f.write(f"\nBaseline: {base:.1f} tok/s\n")
        f.write(f"Full-stack: {final:.1f} tok/s\n")
        f.write(f"Speedup: {total_speedup:.2f}x\n")
        f.write(f"Peak VRAM: {vram_final:.2f} GB\n")
    print(f"\n Results saved to: {results_path}")
| |
|
| |
|
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
| |
|