""" Head-to-head benchmark: TRT-LLM FP8 vs llama.cpp Q5_K_M vs INT4 W4A16 (Josiefied-Qwen3-4B) Workflow (can't run both at once — VRAM conflict): FP8 vs llama.cpp: STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then: python D:\AI\models\benchmark_compare.py --backend trtllm STEP 2 — stop FP8, start llama.cpp GPU (port 5004), then: python D:\AI\models\benchmark_compare.py --backend llamacpp STEP 3 — compare: python D:\AI\models\benchmark_compare.py --compare FP8 vs INT4: STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then: python D:\AI\models\benchmark_compare.py --backend trtllm STEP 2 — stop FP8, start INT4 server (serve_int4.ps1, port 5001), then: python D:\AI\models\benchmark_compare.py --backend int4 STEP 3 — compare: python D:\AI\models\benchmark_compare.py --compare-fp8-int4 Results saved to D:\AI\models\bench_trtllm.json / bench_llamacpp.json / bench_int4.json NOTE: if re-running after a fix, run both backends again so results are comparable. """ import argparse, json, os, statistics, sys, time import httpx import openai RESULTS_DIR = r"D:\AI\models" BACKENDS = { "trtllm": { "url": "http://localhost:5000/v1", "model": "Josiefied-Qwen3-4B-fp8", "label": "TRT-LLM FP8 16k", "file": os.path.join(RESULTS_DIR, "bench_trtllm.json"), "think_overhead": 0, # reasoning_parser handles this server-side }, "llamacpp": { "url": "http://localhost:5004/v1", "model": "Josiefied-Qwen3-4B-Q5", "label": "llama.cpp Q5_K_M", "file": os.path.join(RESULTS_DIR, "bench_llamacpp.json"), "think_overhead": 512, # thinking tokens count against budget here }, "int4": { "url": "http://localhost:5001/v1", "model": "Josiefied-Qwen3-4B-int4", "label": "TRT-LLM INT4 W4A16", "file": os.path.join(RESULTS_DIR, "bench_int4.json"), "think_overhead": 0, # reasoning_parser handles this server-side }, } PROMPTS = [ # (label, prompt, max_tokens) ("short / short", "What is 2+2?", 32), ("short / long", "Write a poem about autumn leaves.", 256), ("medium / long", "Explain how transformers work in machine learning, covering attention, positional encoding, and why they beat RNNs.", 512), ("long / medium", " ".join(["The quick brown fox jumps over the lazy dog."] * 40) + " Summarize the above in two sentences.", 128), ] RUNS = 3 # ── benchmark one backend ─────────────────────────────────────────────────── def run_backend(cfg): client = openai.OpenAI(api_key="none", base_url=cfg["url"]) # health check try: client.models.list() except Exception as e: print(f"ERROR: can't reach {cfg['label']} at {cfg['url']}") print(f" {e}") sys.exit(1) print(f"\n{'='*60}") print(f" {cfg['label']} ({cfg['url']})") print(f"{'='*60}\n") results = {} for label, prompt, max_tokens in PROMPTS: print(f"[{label}] max_tokens={max_tokens}") runs_data = [] for r in range(RUNS): t_start = time.perf_counter() t_first = None # first any token (thinking or response) t_resp = None # first response (content) token think_chars = 0 resp_chars = 0 payload = { "model": cfg["model"], "messages": [{"role": "user", "content": prompt}], "max_tokens": max_tokens + cfg.get("think_overhead", 0), "temperature": 0.0, "stream": True, } with httpx.Client(timeout=120.0) as http: with http.stream("POST", cfg["url"] + "/chat/completions", json=payload, headers={"Content-Type": "application/json"}) as resp: for line in resp.iter_lines(): if not line.startswith("data:"): continue raw = line[5:].strip() if raw == "[DONE]": break try: obj = json.loads(raw) except json.JSONDecodeError: continue choices = obj.get("choices", []) if not choices: continue delta = choices[0].get("delta", {}) rc = delta.get("reasoning_content") or "" ct = delta.get("content") or "" if rc or ct: if t_first is None: t_first = time.perf_counter() if rc: think_chars += len(rc) if ct: if t_resp is None: t_resp = time.perf_counter() resp_chars += len(ct) t_end = time.perf_counter() if t_first is None: print(f" run {r+1}: no tokens — skip") continue # if model never left thinking phase (max_tokens too small), use thinking TTFT if t_resp is None: ttft_ms = (t_first - t_start) * 1000 total_chars = think_chars note = " (all-thinking)" else: ttft_ms = (t_resp - t_start) * 1000 total_chars = resp_chars note = "" total_s = t_end - t_start anchor = t_resp if t_resp else t_first decode_s = t_end - anchor tokens = max(1, total_chars // 4) tps = tokens / decode_s if decode_s > 0 else 0 runs_data.append({ "ttft_ms": round(ttft_ms, 1), "total_s": round(total_s, 2), "tokens": tokens, "tps": round(tps, 1), }) print(f" run {r+1}: TTFT={ttft_ms:6.0f}ms tokens~{tokens:4d} {tps:.1f} tok/s{note}") if runs_data: avg_ttft = statistics.mean(d["ttft_ms"] for d in runs_data) avg_tps = statistics.mean(d["tps"] for d in runs_data) print(f" => avg TTFT={avg_ttft:.0f}ms avg {avg_tps:.1f} tok/s\n") results[label] = {"runs": runs_data, "avg_ttft_ms": round(avg_ttft, 1), "avg_tps": round(avg_tps, 1)} else: results[label] = None with open(cfg["file"], "w") as f: json.dump({"backend": cfg["label"], "results": results}, f, indent=2) print(f"Results saved -> {cfg['file']}") # ── compare both ──────────────────────────────────────────────────────────── def compare(): data = {} for key, cfg in BACKENDS.items(): if not os.path.exists(cfg["file"]): print(f"Missing: {cfg['file']} (run --backend {key} first)") sys.exit(1) with open(cfg["file"]) as f: data[key] = json.load(f) a_label = data["trtllm"]["backend"] b_label = data["llamacpp"]["backend"] print(f"\n{'='*80}") print(f" COMPARISON: {a_label} vs {b_label}") print(f"{'='*80}") print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}") print(f"{'':22} {'TRT-LLM':>10} {'llama.cpp':>10} {'TRT-LLM':>12} {'llama.cpp':>12} {'winner':>7}") print("-"*80) for label, _, _ in PROMPTS: a = data["trtllm"]["results"].get(label) b = data["llamacpp"]["results"].get(label) if not a or not b: print(f" {label:<20} (missing data)") continue a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"] a_tps, b_tps = a["avg_tps"], b["avg_tps"] ttft_winner = "TRT-LLM" if a_ttft < b_ttft else "llama.cpp" tps_winner = "TRT-LLM" if a_tps > b_tps else "llama.cpp" ttft_ratio = b_ttft / a_ttft if a_ttft > 0 else 0 tps_ratio = a_tps / b_tps if b_tps > 0 else 0 print(f" {label:<20} {a_ttft:>10.0f} {b_ttft:>10.0f} {a_tps:>12.1f} {b_tps:>12.1f} {tps_winner:>9}") print() # summary speedup a_all_ttft = [v["avg_ttft_ms"] for v in data["trtllm"]["results"].values() if v] b_all_ttft = [v["avg_ttft_ms"] for v in data["llamacpp"]["results"].values() if v] a_all_tps = [v["avg_tps"] for v in data["trtllm"]["results"].values() if v] b_all_tps = [v["avg_tps"] for v in data["llamacpp"]["results"].values() if v] if a_all_ttft and b_all_ttft: ttft_speedup = statistics.mean(b_all_ttft) / statistics.mean(a_all_ttft) tps_speedup = statistics.mean(a_all_tps) / statistics.mean(b_all_tps) print(f" Overall: TRT-LLM is {ttft_speedup:.2f}x faster TTFT, {tps_speedup:.2f}x faster throughput") # ── FP8 vs INT4 compare ────────────────────────────────────────────────────── def compare_fp8_int4(): data = {} for key in ("trtllm", "int4"): cfg = BACKENDS[key] if not os.path.exists(cfg["file"]): print(f"Missing: {cfg['file']} (run --backend {key} first)") sys.exit(1) with open(cfg["file"]) as f: data[key] = json.load(f) a_label = data["trtllm"]["backend"] b_label = data["int4"]["backend"] print(f"\n{'='*80}") print(f" COMPARISON: {a_label} vs {b_label}") print(f"{'='*80}") print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}") print(f"{'':22} {'FP8':>10} {'INT4':>10} {'FP8':>12} {'INT4':>12} {'winner':>7}") print("-"*80) for label, _, _ in PROMPTS: a = data["trtllm"]["results"].get(label) b = data["int4"]["results"].get(label) if not a or not b: print(f" {label:<20} (missing data)") continue a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"] a_tps, b_tps = a["avg_tps"], b["avg_tps"] tps_winner = "FP8" if a_tps > b_tps else "INT4" print(f" {label:<20} {a_ttft:>10.0f} {b_ttft:>10.0f} {a_tps:>12.1f} {b_tps:>12.1f} {tps_winner:>9}") print() a_all_ttft = [v["avg_ttft_ms"] for v in data["trtllm"]["results"].values() if v] b_all_ttft = [v["avg_ttft_ms"] for v in data["int4"]["results"].values() if v] a_all_tps = [v["avg_tps"] for v in data["trtllm"]["results"].values() if v] b_all_tps = [v["avg_tps"] for v in data["int4"]["results"].values() if v] if a_all_tps and b_all_tps: avg_fp8 = statistics.mean(a_all_tps) avg_int4 = statistics.mean(b_all_tps) ratio = avg_int4 / avg_fp8 if avg_fp8 > 0 else 0 winner = "INT4" if avg_int4 > avg_fp8 else "FP8" print(f" Throughput: FP8={avg_fp8:.1f} tok/s INT4={avg_int4:.1f} tok/s => {winner} is {max(ratio, 1/ratio if ratio>0 else 0):.2f}x faster") if a_all_ttft and b_all_ttft: ttft_fp8 = statistics.mean(a_all_ttft) ttft_int4 = statistics.mean(b_all_ttft) ttft_winner = "INT4" if ttft_int4 < ttft_fp8 else "FP8" print(f" TTFT: FP8={ttft_fp8:.0f}ms INT4={ttft_int4:.0f}ms => {ttft_winner} faster to first token") # ── llama.cpp launcher helper ─────────────────────────────────────────────── def start_llamacpp(): """Print the command to start llama.cpp on :5004 for benchmarking.""" exe = r"D:\AI\apps\llama.cpp\build\bin\llama-server.exe" model = r"D:\AI\models\gguf\Josiefied-Qwen3-4B-abliterated-v1.Q5_K_M.gguf" print("\nRun this in a new terminal to start llama.cpp on :5004:") print(f'\n "{exe}" --model "{model}" --host 127.0.0.1 --port 5004 --ctx-size 2048 -ngl 99 -fa --alias Josiefied-Qwen3-4B-Q5\n') print("Then in this terminal:") print(" python D:\\AI\\models\\benchmark_compare.py --backend llamacpp\n") # ── main ──────────────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="TRT-LLM FP8 vs llama.cpp Q5_K_M benchmark") group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--backend", choices=["trtllm", "llamacpp", "int4"], help="Run benchmark for this backend") group.add_argument("--compare", action="store_true", help="Compare FP8 vs llama.cpp") group.add_argument("--compare-fp8-int4", action="store_true", help="Compare FP8 vs INT4") group.add_argument("--llama-cmd", action="store_true", help="Print llama.cpp start command") args = parser.parse_args() if args.compare: compare() elif args.compare_fp8_int4: compare_fp8_int4() elif args.llama_cmd: start_llamacpp() else: run_backend(BACKENDS[args.backend]) if __name__ == "__main__": main()