Add source, patches, scripts, build notes, README, LICENSE

00db36f verified 2 days ago

13.9 kB

	"""
	Head-to-head benchmark: TRT-LLM FP8 vs llama.cpp Q5_K_M vs INT4 W4A16 (Josiefied-Qwen3-4B)

	Workflow (only one server can be loaded at a time — VRAM conflict — so benchmark the backends one after another):

	FP8 vs llama.cpp:
	STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
	python D:\AI\models\benchmark_compare.py --backend trtllm
	STEP 2 — stop FP8, start llama.cpp GPU (port 5004), then:
	python D:\AI\models\benchmark_compare.py --backend llamacpp
	STEP 3 — compare:
	python D:\AI\models\benchmark_compare.py --compare

	FP8 vs INT4:
	STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
	python D:\AI\models\benchmark_compare.py --backend trtllm
	STEP 2 — stop FP8, start INT4 server (serve_int4.ps1, port 5001), then:
	python D:\AI\models\benchmark_compare.py --backend int4
	STEP 3 — compare:
	python D:\AI\models\benchmark_compare.py --compare-fp8-int4

	Results saved to D:\AI\models\bench_trtllm.json / bench_llamacpp.json / bench_int4.json

	NOTE: if re-running after a fix, run both backends again so results are comparable.
	"""
	import argparse, json, os, statistics, sys, time
	import httpx
	import openai

	# Folder where each backend's result JSON is written and later read back.
	RESULTS_DIR = r"D:\AI\models"

	# Server definitions, keyed by the value accepted by --backend.
	# "think_overhead" is added to a prompt's max_tokens when a request is built.
	BACKENDS = dict(
	    trtllm=dict(
	        url="http://localhost:5000/v1",
	        model="Josiefied-Qwen3-4B-fp8",
	        label="TRT-LLM FP8 16k",
	        file=os.path.join(RESULTS_DIR, "bench_trtllm.json"),
	        think_overhead=0,  # reasoning_parser handles this server-side
	    ),
	    llamacpp=dict(
	        url="http://localhost:5004/v1",
	        model="Josiefied-Qwen3-4B-Q5",
	        label="llama.cpp Q5_K_M",
	        file=os.path.join(RESULTS_DIR, "bench_llamacpp.json"),
	        think_overhead=512,  # thinking tokens count against budget here
	    ),
	    int4=dict(
	        url="http://localhost:5001/v1",
	        model="Josiefied-Qwen3-4B-int4",
	        label="TRT-LLM INT4 W4A16",
	        file=os.path.join(RESULTS_DIR, "bench_int4.json"),
	        think_overhead=0,  # reasoning_parser handles this server-side
	    ),
	)

	# Benchmark cases as (label, prompt, max_tokens) tuples.
	PROMPTS = [
	    ("short / short", "What is 2+2?", 32),
	    ("short / long", "Write a poem about autumn leaves.", 256),
	    (
	        "medium / long",
	        "Explain how transformers work in machine learning, covering attention, "
	        "positional encoding, and why they beat RNNs.",
	        512,
	    ),
	    (
	        "long / medium",
	        # 40 space-separated copies of the sentence, followed by the instruction.
	        "The quick brown fox jumps over the lazy dog. " * 40
	        + "Summarize the above in two sentences.",
	        128,
	    ),
	]

	# Number of timed requests sent per prompt.
	RUNS = 3


	# ── benchmark one backend ───────────────────────────────────────────────────

	def _stream_completion(http, cfg, prompt, max_tokens):
	    """Send one streaming chat completion and record when text arrives.

	    Args:
	        http: open ``httpx.Client`` used to send the request.
	        cfg: backend entry providing ``url``, ``model`` and, optionally,
	            ``think_overhead``.
	        prompt: user message text.
	        max_tokens: response budget before ``think_overhead`` is added.

	    Returns:
	        dict with ``status`` (HTTP status code), ``t_start`` / ``t_end``
	        (``time.perf_counter`` stamps around the request), ``t_first`` (stamp
	        of the first delta carrying any text, else None), ``t_resp`` (stamp of
	        the first delta carrying ``content``, else None) and the character
	        totals ``think_chars`` / ``resp_chars``.
	    """
	    payload = {
	        "model": cfg["model"],
	        "messages": [{"role": "user", "content": prompt}],
	        "max_tokens": max_tokens + cfg.get("think_overhead", 0),
	        "temperature": 0.0,
	        "stream": True,
	    }
	    t_first = None  # first any token (thinking or response)
	    t_resp = None  # first response (content) token
	    think_chars = 0
	    resp_chars = 0

	    # Start the clock immediately before the request so that payload and
	    # client setup are not counted as time-to-first-token.
	    t_start = time.perf_counter()
	    with http.stream(
	        "POST",
	        cfg["url"] + "/chat/completions",
	        json=payload,
	        headers={"Content-Type": "application/json"},
	    ) as resp:
	        status = resp.status_code
	        for line in resp.iter_lines():
	            # Only "data:" lines of the event stream carry chunks.
	            if not line.startswith("data:"):
	                continue
	            raw = line[5:].strip()
	            if raw == "[DONE]":
	                break
	            try:
	                obj = json.loads(raw)
	            except json.JSONDecodeError:
	                continue
	            choices = obj.get("choices", [])
	            if not choices:
	                continue
	            delta = choices[0].get("delta", {})
	            rc = delta.get("reasoning_content") or ""
	            ct = delta.get("content") or ""
	            if (rc or ct) and t_first is None:
	                t_first = time.perf_counter()
	            if rc:
	                think_chars += len(rc)
	            if ct:
	                if t_resp is None:
	                    t_resp = time.perf_counter()
	                resp_chars += len(ct)
	    t_end = time.perf_counter()

	    return {
	        "status": status,
	        "t_start": t_start,
	        "t_first": t_first,
	        "t_resp": t_resp,
	        "t_end": t_end,
	        "think_chars": think_chars,
	        "resp_chars": resp_chars,
	    }


	def _run_metrics(sample):
	    """Turn one raw sample into ``(record, ttft_ms, tps, note)``.

	    Returns None when no text arrived at all. ``record`` is the rounded dict
	    that gets stored; ``ttft_ms`` and ``tps`` are the unrounded values used
	    for the progress line.
	    """
	    if sample["t_first"] is None:
	        return None

	    # if model never left thinking phase (max_tokens too small), use thinking TTFT
	    if sample["t_resp"] is None:
	        anchor = sample["t_first"]
	        total_chars = sample["think_chars"]
	        note = " (all-thinking)"
	    else:
	        anchor = sample["t_resp"]
	        total_chars = sample["resp_chars"]
	        note = ""

	    ttft_ms = (anchor - sample["t_start"]) * 1000
	    total_s = sample["t_end"] - sample["t_start"]
	    decode_s = sample["t_end"] - anchor
	    # Token count is an estimate: roughly four characters per token.
	    tokens = max(1, total_chars // 4)
	    tps = tokens / decode_s if decode_s > 0 else 0

	    record = {
	        "ttft_ms": round(ttft_ms, 1),
	        "total_s": round(total_s, 2),
	        "tokens": tokens,
	        "tps": round(tps, 1),
	    }
	    return record, ttft_ms, tps, note


	def run_backend(cfg):
	    """Benchmark one backend over every prompt and save the results as JSON.

	    Each prompt in PROMPTS is requested RUNS times with streaming enabled.
	    Per-run TTFT and estimated tokens/s are printed, averaged per prompt, and
	    written to ``cfg["file"]`` as ``{"backend": label, "results": {...}}``.
	    A prompt with no successful run is stored as None.

	    Args:
	        cfg: one entry of BACKENDS (``url``, ``model``, ``label``, ``file``,
	            optional ``think_overhead``).

	    Exits with status 1 if the server's model list cannot be fetched.
	    """
	    client = openai.OpenAI(api_key="none", base_url=cfg["url"])

	    # health check
	    try:
	        client.models.list()
	    except Exception as e:
	        print(f"ERROR: can't reach {cfg['label']} at {cfg['url']}")
	        print(f" {e}")
	        sys.exit(1)

	    print(f"\n{'='*60}")
	    print(f" {cfg['label']} ({cfg['url']})")
	    print(f"{'='*60}\n")

	    results = {}

	    # One client for the whole benchmark: building it per run would put its
	    # construction cost inside the timed window.
	    with httpx.Client(timeout=120.0) as http:
	        for label, prompt, max_tokens in PROMPTS:
	            print(f"[{label}] max_tokens={max_tokens}")
	            runs_data = []

	            for r in range(RUNS):
	                sample = _stream_completion(http, cfg, prompt, max_tokens)
	                metrics = _run_metrics(sample)

	                if metrics is None:
	                    status = sample["status"]
	                    if status >= 400:
	                        # Surface server-side failures instead of hiding
	                        # them behind the generic "no tokens" message.
	                        print(f" run {r+1}: HTTP {status} — skip")
	                    else:
	                        print(f" run {r+1}: no tokens — skip")
	                    continue

	                record, ttft_ms, tps, note = metrics
	                runs_data.append(record)
	                tokens = record["tokens"]
	                print(f" run {r+1}: TTFT={ttft_ms:6.0f}ms tokens~{tokens:4d} {tps:.1f} tok/s{note}")

	            if runs_data:
	                avg_ttft = statistics.mean(d["ttft_ms"] for d in runs_data)
	                avg_tps = statistics.mean(d["tps"] for d in runs_data)
	                print(f" => avg TTFT={avg_ttft:.0f}ms avg {avg_tps:.1f} tok/s\n")
	                results[label] = {
	                    "runs": runs_data,
	                    "avg_ttft_ms": round(avg_ttft, 1),
	                    "avg_tps": round(avg_tps, 1),
	                }
	            else:
	                results[label] = None

	    with open(cfg["file"], "w") as f:
	        json.dump({"backend": cfg["label"], "results": results}, f, indent=2)

	    print(f"Results saved -> {cfg['file']}")


	# ── compare both ────────────────────────────────────────────────────────────

	def compare():
	    """Print a TRT-LLM FP8 vs llama.cpp table built from the saved results.

	    Loads only the ``trtllm`` and ``llamacpp`` result files named in BACKENDS
	    (the INT4 file is not needed for this comparison) and exits with status 1
	    if either is missing. One row is printed per prompt in PROMPTS; prompts
	    lacking data on either side are reported as missing and left out of the
	    overall ratios, so both averages cover the same set of prompts.
	    """
	    data = {}
	    for key in ("trtllm", "llamacpp"):
	        cfg = BACKENDS[key]
	        if not os.path.exists(cfg["file"]):
	            print(f"Missing: {cfg['file']} (run --backend {key} first)")
	            sys.exit(1)
	        with open(cfg["file"]) as f:
	            data[key] = json.load(f)

	    a_label = data["trtllm"]["backend"]
	    b_label = data["llamacpp"]["backend"]

	    print(f"\n{'='*80}")
	    print(f" COMPARISON: {a_label} vs {b_label}")
	    print(f"{'='*80}")
	    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
	    print(f"{'':22} {'TRT-LLM':>10} {'llama.cpp':>10} {'TRT-LLM':>12} {'llama.cpp':>12} {'winner':>7}")
	    print("-"*80)

	    paired = []  # (trtllm, llamacpp) entries for prompts present in both files
	    for label, _, _ in PROMPTS:
	        a = data["trtllm"]["results"].get(label)
	        b = data["llamacpp"]["results"].get(label)
	        if not a or not b:
	            print(f" {label:<20} (missing data)")
	            continue
	        paired.append((a, b))

	        a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"]
	        a_tps, b_tps = a["avg_tps"], b["avg_tps"]

	        tps_winner = "TRT-LLM" if a_tps > b_tps else "llama.cpp"

	        print(f" {label:<20} {a_ttft:>10.0f} {b_ttft:>10.0f} {a_tps:>12.1f} {b_tps:>12.1f} {tps_winner:>9}")

	    print()
	    # summary speedup
	    if paired:
	        a_mean_ttft = statistics.mean(a["avg_ttft_ms"] for a, _ in paired)
	        b_mean_ttft = statistics.mean(b["avg_ttft_ms"] for _, b in paired)
	        a_mean_tps = statistics.mean(a["avg_tps"] for a, _ in paired)
	        b_mean_tps = statistics.mean(b["avg_tps"] for _, b in paired)
	        # A zero denominator yields a 0 ratio instead of a ZeroDivisionError.
	        ttft_speedup = b_mean_ttft / a_mean_ttft if a_mean_ttft > 0 else 0
	        tps_speedup = a_mean_tps / b_mean_tps if b_mean_tps > 0 else 0
	        print(f" Overall: TRT-LLM is {ttft_speedup:.2f}x faster TTFT, {tps_speedup:.2f}x faster throughput")


	# ── FP8 vs INT4 compare ──────────────────────────────────────────────────────

	def compare_fp8_int4():
	    """Print an FP8 vs INT4 table built from the saved results.

	    Loads the ``trtllm`` and ``int4`` result files named in BACKENDS and exits
	    with status 1 if either is missing. One row is printed per prompt in
	    PROMPTS; prompts lacking data on either side are reported as missing and
	    left out of the summary, so both averages cover the same set of prompts.
	    """
	    data = {}
	    for key in ("trtllm", "int4"):
	        cfg = BACKENDS[key]
	        if not os.path.exists(cfg["file"]):
	            print(f"Missing: {cfg['file']} (run --backend {key} first)")
	            sys.exit(1)
	        with open(cfg["file"]) as f:
	            data[key] = json.load(f)

	    a_label = data["trtllm"]["backend"]
	    b_label = data["int4"]["backend"]

	    print(f"\n{'='*80}")
	    print(f" COMPARISON: {a_label} vs {b_label}")
	    print(f"{'='*80}")
	    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
	    print(f"{'':22} {'FP8':>10} {'INT4':>10} {'FP8':>12} {'INT4':>12} {'winner':>7}")
	    print("-"*80)

	    paired = []  # (fp8, int4) entries for prompts present in both files
	    for label, _, _ in PROMPTS:
	        a = data["trtllm"]["results"].get(label)
	        b = data["int4"]["results"].get(label)
	        if not a or not b:
	            print(f" {label:<20} (missing data)")
	            continue
	        paired.append((a, b))

	        a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"]
	        a_tps, b_tps = a["avg_tps"], b["avg_tps"]

	        tps_winner = "FP8" if a_tps > b_tps else "INT4"
	        print(f" {label:<20} {a_ttft:>10.0f} {b_ttft:>10.0f} {a_tps:>12.1f} {b_tps:>12.1f} {tps_winner:>9}")

	    print()
	    if paired:
	        avg_fp8 = statistics.mean(a["avg_tps"] for a, _ in paired)
	        avg_int4 = statistics.mean(b["avg_tps"] for _, b in paired)
	        winner = "INT4" if avg_int4 > avg_fp8 else "FP8"
	        # Faster over slower; 0 when the slower side recorded no throughput.
	        slower = min(avg_fp8, avg_int4)
	        speedup = max(avg_fp8, avg_int4) / slower if slower > 0 else 0
	        print(f" Throughput: FP8={avg_fp8:.1f} tok/s INT4={avg_int4:.1f} tok/s => {winner} is {speedup:.2f}x faster")

	        ttft_fp8 = statistics.mean(a["avg_ttft_ms"] for a, _ in paired)
	        ttft_int4 = statistics.mean(b["avg_ttft_ms"] for _, b in paired)
	        ttft_winner = "INT4" if ttft_int4 < ttft_fp8 else "FP8"
	        print(f" TTFT: FP8={ttft_fp8:.0f}ms INT4={ttft_int4:.0f}ms => {ttft_winner} faster to first token")


	# ── llama.cpp launcher helper ───────────────────────────────────────────────

	def start_llamacpp():
	    """Show the llama-server launch command for port 5004 and the follow-up benchmark command.

	    Nothing is started here; the text is only printed for the user to copy.
	    """
	    server_exe = r"D:\AI\apps\llama.cpp\build\bin\llama-server.exe"
	    gguf_path = r"D:\AI\models\gguf\Josiefied-Qwen3-4B-abliterated-v1.Q5_K_M.gguf"
	    launch_cmd = f'\n "{server_exe}" --model "{gguf_path}" --host 127.0.0.1 --port 5004 --ctx-size 2048 -ngl 99 -fa --alias Josiefied-Qwen3-4B-Q5\n'

	    # Emitted in order, one print call per entry.
	    for text in (
	        "\nRun this in a new terminal to start llama.cpp on :5004:",
	        launch_cmd,
	        "Then in this terminal:",
	        " python D:\\AI\\models\\benchmark_compare.py --backend llamacpp\n",
	    ):
	        print(text)


	# ── main ────────────────────────────────────────────────────────────────────

	def main():
	    """Parse the command line and run the single action that was requested.

	    Exactly one of --backend, --compare, --compare-fp8-int4 or --llama-cmd
	    must be given; argparse rejects any other combination.
	    """
	    parser = argparse.ArgumentParser(description="TRT-LLM FP8 vs llama.cpp Q5_K_M benchmark")
	    modes = parser.add_mutually_exclusive_group(required=True)
	    modes.add_argument(
	        "--backend",
	        choices=["trtllm", "llamacpp", "int4"],
	        help="Run benchmark for this backend",
	    )
	    # The remaining modes are plain on/off switches.
	    for flag, help_text in (
	        ("--compare", "Compare FP8 vs llama.cpp"),
	        ("--compare-fp8-int4", "Compare FP8 vs INT4"),
	        ("--llama-cmd", "Print llama.cpp start command"),
	    ):
	        modes.add_argument(flag, action="store_true", help=help_text)
	    opts = parser.parse_args()

	    if opts.compare:
	        compare()
	        return
	    if opts.compare_fp8_int4:
	        compare_fp8_int4()
	        return
	    if opts.llama_cmd:
	        start_llamacpp()
	        return
	    # No switch was set, so --backend must have been supplied.
	    run_backend(BACKENDS[opts.backend])


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()