TensorRT-LLM-Windows-RTX40 / scripts /benchmark_compare.py
xThr45hx's picture
Add source, patches, scripts, build notes, README, LICENSE
00db36f verified
Raw
History Blame Contribute Delete
13.9 kB
"""
Head-to-head benchmark: TRT-LLM FP8 vs llama.cpp Q5_K_M vs INT4 W4A16 (Josiefied-Qwen3-4B)
Workflow (can't run both at once — VRAM conflict):
FP8 vs llama.cpp:
STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
python D:\\AI\\models\\benchmark_compare.py --backend trtllm
STEP 2 — stop FP8, start llama.cpp GPU (port 5004), then:
python D:\\AI\\models\\benchmark_compare.py --backend llamacpp
STEP 3 — compare:
python D:\\AI\\models\\benchmark_compare.py --compare
FP8 vs INT4:
STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
python D:\\AI\\models\\benchmark_compare.py --backend trtllm
STEP 2 — stop FP8, start INT4 server (serve_int4.ps1, port 5001), then:
python D:\\AI\\models\\benchmark_compare.py --backend int4
STEP 3 — compare:
python D:\\AI\\models\\benchmark_compare.py --compare-fp8-int4
Results saved to D:\\AI\\models\\bench_trtllm.json / bench_llamacpp.json / bench_int4.json
NOTE: if re-running after a fix, run both backends again so results are comparable.
"""
import argparse, json, os, statistics, sys, time
import httpx
import openai
# Directory that holds the per-backend result files.
RESULTS_DIR = r"D:\AI\models"

# One entry per benchmarkable server, keyed by the value passed to --backend.
# "think_overhead" is added to every prompt's max_tokens for that backend.
BACKENDS = dict(
    trtllm=dict(
        url="http://localhost:5000/v1",
        model="Josiefied-Qwen3-4B-fp8",
        label="TRT-LLM FP8 16k",
        file=os.path.join(RESULTS_DIR, "bench_trtllm.json"),
        think_overhead=0,  # reasoning_parser handles this server-side
    ),
    llamacpp=dict(
        url="http://localhost:5004/v1",
        model="Josiefied-Qwen3-4B-Q5",
        label="llama.cpp Q5_K_M",
        file=os.path.join(RESULTS_DIR, "bench_llamacpp.json"),
        think_overhead=512,  # thinking tokens count against budget here
    ),
    int4=dict(
        url="http://localhost:5001/v1",
        model="Josiefied-Qwen3-4B-int4",
        label="TRT-LLM INT4 W4A16",
        file=os.path.join(RESULTS_DIR, "bench_int4.json"),
        think_overhead=0,  # reasoning_parser handles this server-side
    ),
)

# Each case is (label, prompt, max_tokens).
PROMPTS = [
    ("short / short", "What is 2+2?", 32),
    ("short / long", "Write a poem about autumn leaves.", 256),
    (
        "medium / long",
        "Explain how transformers work in machine learning, covering attention, "
        "positional encoding, and why they beat RNNs.",
        512,
    ),
    (
        "long / medium",
        # 40 space-separated copies of the sentence, then the instruction.
        "The quick brown fox jumps over the lazy dog. " * 40
        + "Summarize the above in two sentences.",
        128,
    ),
]

# Number of timed requests issued per prompt.
RUNS = 3
# ── benchmark one backend ───────────────────────────────────────────────────
def _stream_once(cfg, prompt, max_tokens):
    """Stream one chat completion and collect raw timings and character counts.

    Args:
        cfg: Backend entry; ``url`` and ``model`` are read, and the optional
            ``think_overhead`` is added to the token budget.
        prompt: Text sent as the single user message.
        max_tokens: Response budget for this prompt before ``think_overhead``.

    Returns:
        ``(t_start, t_first, t_resp, t_end, think_chars, resp_chars)``.
        ``t_first`` is the arrival time of the first chunk carrying any text,
        ``t_resp`` that of the first chunk carrying ``content``; either is
        ``None`` if no such chunk arrived. The two counters are the summed
        lengths of ``reasoning_content`` and ``content`` respectively.

    Raises:
        httpx.HTTPError: on transport failures, timeouts, or a 4xx/5xx status.
    """
    payload = {
        "model": cfg["model"],
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens + cfg.get("think_overhead", 0),
        "temperature": 0.0,
        "stream": True,
    }
    t_first = None  # first token of any kind (thinking or response)
    t_resp = None  # first response (content) token
    think_chars = 0
    resp_chars = 0
    with httpx.Client(timeout=120.0) as http:
        # Start the clock only after the client exists, so that client
        # construction is not counted as server time-to-first-token.
        t_start = time.perf_counter()
        with http.stream("POST", cfg["url"] + "/chat/completions",
                         json=payload,
                         headers={"Content-Type": "application/json"}) as resp:
            # Surface HTTP errors instead of reporting them as "no tokens".
            resp.raise_for_status()
            for line in resp.iter_lines():
                if not line.startswith("data:"):
                    continue
                raw = line[5:].strip()
                if raw == "[DONE]":
                    break
                try:
                    obj = json.loads(raw)
                except json.JSONDecodeError:
                    continue
                choices = obj.get("choices", [])
                if not choices:
                    continue
                # "delta" may be absent or null; treat both as empty.
                delta = choices[0].get("delta") or {}
                rc = delta.get("reasoning_content") or ""
                ct = delta.get("content") or ""
                if not (rc or ct):
                    continue
                now = time.perf_counter()
                if t_first is None:
                    t_first = now
                think_chars += len(rc)
                if ct:
                    if t_resp is None:
                        t_resp = now
                    resp_chars += len(ct)
            t_end = time.perf_counter()
    return t_start, t_first, t_resp, t_end, think_chars, resp_chars


def run_backend(cfg):
    """Benchmark one backend over every prompt and save the results as JSON.

    Each prompt in ``PROMPTS`` is requested ``RUNS`` times. For every run the
    time to first token and an approximate decode throughput are recorded;
    the token count is estimated as one token per four characters. Per-prompt
    averages are written to ``cfg["file"]``; a prompt for which no run
    produced tokens is stored as ``None``.

    Args:
        cfg: One entry of ``BACKENDS`` (``url``, ``model``, ``label``,
            ``file`` and optional ``think_overhead``).

    Exits with status 1 if the server does not answer the initial model
    listing. A run whose request fails is reported and skipped rather than
    aborting the whole benchmark.
    """
    client = openai.OpenAI(api_key="none", base_url=cfg["url"])
    # health check
    try:
        client.models.list()
    except Exception as e:
        print(f"ERROR: can't reach {cfg['label']} at {cfg['url']}")
        print(f" {e}")
        sys.exit(1)
    print(f"\n{'='*60}")
    print(f" {cfg['label']} ({cfg['url']})")
    print(f"{'='*60}\n")
    results = {}
    for label, prompt, max_tokens in PROMPTS:
        print(f"[{label}] max_tokens={max_tokens}")
        runs_data = []
        for r in range(RUNS):
            try:
                (t_start, t_first, t_resp, t_end,
                 think_chars, resp_chars) = _stream_once(cfg, prompt, max_tokens)
            except httpx.HTTPError as exc:
                print(f" run {r+1}: request failed ({exc}) — skip")
                continue
            if t_first is None:
                print(f" run {r+1}: no tokens — skip")
                continue
            # if model never left thinking phase (max_tokens too small), use thinking TTFT
            if t_resp is None:
                anchor = t_first
                total_chars = think_chars
                note = " (all-thinking)"
            else:
                anchor = t_resp
                total_chars = resp_chars
                note = ""
            ttft_ms = (anchor - t_start) * 1000
            total_s = t_end - t_start
            decode_s = t_end - anchor
            tokens = max(1, total_chars // 4)  # rough estimate: 4 chars per token
            tps = tokens / decode_s if decode_s > 0 else 0
            runs_data.append({
                "ttft_ms": round(ttft_ms, 1),
                "total_s": round(total_s, 2),
                "tokens": tokens,
                "tps": round(tps, 1),
            })
            print(f" run {r+1}: TTFT={ttft_ms:6.0f}ms tokens~{tokens:4d} {tps:.1f} tok/s{note}")
        if runs_data:
            avg_ttft = statistics.mean(d["ttft_ms"] for d in runs_data)
            avg_tps = statistics.mean(d["tps"] for d in runs_data)
            print(f" => avg TTFT={avg_ttft:.0f}ms avg {avg_tps:.1f} tok/s\n")
            results[label] = {
                "runs": runs_data,
                "avg_ttft_ms": round(avg_ttft, 1),
                "avg_tps": round(avg_tps, 1),
            }
        else:
            results[label] = None
    with open(cfg["file"], "w", encoding="utf-8") as f:
        json.dump({"backend": cfg["label"], "results": results}, f, indent=2)
    print(f"Results saved -> {cfg['file']}")
# ── compare both ────────────────────────────────────────────────────────────
def compare():
    """Print a TTFT / throughput table for TRT-LLM FP8 versus llama.cpp.

    Loads the saved result files of the ``trtllm`` and ``llamacpp`` backends
    and prints one row per prompt in ``PROMPTS``, followed by an overall
    speedup line. Prompts lacking data for either backend are reported as
    missing and left out of the overall figures, so both averages cover the
    same prompts.

    Exits with status 1 if either result file does not exist.
    """
    data = {}
    # Only the two backends being compared are required here; the INT4
    # result file plays no part in this comparison.
    for key in ("trtllm", "llamacpp"):
        cfg = BACKENDS[key]
        if not os.path.exists(cfg["file"]):
            print(f"Missing: {cfg['file']} (run --backend {key} first)")
            sys.exit(1)
        with open(cfg["file"], encoding="utf-8") as f:
            data[key] = json.load(f)
    a_label = data["trtllm"]["backend"]
    b_label = data["llamacpp"]["backend"]
    print(f"\n{'='*80}")
    print(f" COMPARISON: {a_label} vs {b_label}")
    print(f"{'='*80}")
    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
    print(f"{'':22} {'TRT-LLM':>10} {'llama.cpp':>10} {'TRT-LLM':>12} {'llama.cpp':>12} {'winner':>7}")
    print("-"*80)
    paired = []  # (trtllm, llamacpp) results for prompts both backends completed
    for label, _, _ in PROMPTS:
        a = data["trtllm"]["results"].get(label)
        b = data["llamacpp"]["results"].get(label)
        if not a or not b:
            print(f" {label:<20} (missing data)")
            continue
        paired.append((a, b))
        a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"]
        a_tps, b_tps = a["avg_tps"], b["avg_tps"]
        tps_winner = "TRT-LLM" if a_tps > b_tps else "llama.cpp"
        print(f" {label:<20} {a_ttft:>10.0f} {b_ttft:>10.0f} {a_tps:>12.1f} {b_tps:>12.1f} {tps_winner:>9}")
    print()
    # summary speedup
    if paired:
        a_ttft_mean = statistics.mean(a["avg_ttft_ms"] for a, _ in paired)
        b_ttft_mean = statistics.mean(b["avg_ttft_ms"] for _, b in paired)
        a_tps_mean = statistics.mean(a["avg_tps"] for a, _ in paired)
        b_tps_mean = statistics.mean(b["avg_tps"] for _, b in paired)
        # Report 0 rather than dividing by a zero average.
        ttft_speedup = b_ttft_mean / a_ttft_mean if a_ttft_mean > 0 else 0
        tps_speedup = a_tps_mean / b_tps_mean if b_tps_mean > 0 else 0
        print(f" Overall: TRT-LLM is {ttft_speedup:.2f}x faster TTFT, {tps_speedup:.2f}x faster throughput")
# ── FP8 vs INT4 compare ──────────────────────────────────────────────────────
def compare_fp8_int4():
    """Print a TTFT / throughput table for TRT-LLM FP8 versus TRT-LLM INT4.

    Loads the saved result files of the ``trtllm`` and ``int4`` backends and
    prints one row per prompt in ``PROMPTS``, followed by average throughput
    and TTFT summaries. Prompts lacking data for either backend are reported
    as missing and left out of the summaries, so both averages cover the same
    prompts.

    Exits with status 1 if either result file does not exist.
    """
    data = {}
    for key in ("trtllm", "int4"):
        cfg = BACKENDS[key]
        if not os.path.exists(cfg["file"]):
            print(f"Missing: {cfg['file']} (run --backend {key} first)")
            sys.exit(1)
        with open(cfg["file"], encoding="utf-8") as f:
            data[key] = json.load(f)
    a_label = data["trtllm"]["backend"]
    b_label = data["int4"]["backend"]
    print(f"\n{'='*80}")
    print(f" COMPARISON: {a_label} vs {b_label}")
    print(f"{'='*80}")
    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
    print(f"{'':22} {'FP8':>10} {'INT4':>10} {'FP8':>12} {'INT4':>12} {'winner':>7}")
    print("-"*80)
    paired = []  # (fp8, int4) results for prompts both backends completed
    for label, _, _ in PROMPTS:
        a = data["trtllm"]["results"].get(label)
        b = data["int4"]["results"].get(label)
        if not a or not b:
            print(f" {label:<20} (missing data)")
            continue
        paired.append((a, b))
        a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"]
        a_tps, b_tps = a["avg_tps"], b["avg_tps"]
        tps_winner = "FP8" if a_tps > b_tps else "INT4"
        print(f" {label:<20} {a_ttft:>10.0f} {b_ttft:>10.0f} {a_tps:>12.1f} {b_tps:>12.1f} {tps_winner:>9}")
    print()
    if paired:
        avg_fp8 = statistics.mean(a["avg_tps"] for a, _ in paired)
        avg_int4 = statistics.mean(b["avg_tps"] for _, b in paired)
        slower, faster = sorted((avg_fp8, avg_int4))
        # Report 0 rather than dividing by a zero average.
        factor = faster / slower if slower > 0 else 0
        winner = "INT4" if avg_int4 > avg_fp8 else "FP8"
        print(f" Throughput: FP8={avg_fp8:.1f} tok/s INT4={avg_int4:.1f} tok/s => {winner} is {factor:.2f}x faster")
        ttft_fp8 = statistics.mean(a["avg_ttft_ms"] for a, _ in paired)
        ttft_int4 = statistics.mean(b["avg_ttft_ms"] for _, b in paired)
        ttft_winner = "INT4" if ttft_int4 < ttft_fp8 else "FP8"
        print(f" TTFT: FP8={ttft_fp8:.0f}ms INT4={ttft_int4:.0f}ms => {ttft_winner} faster to first token")
# ── llama.cpp launcher helper ───────────────────────────────────────────────
def start_llamacpp():
    """Print the llama-server command line for port 5004 and the follow-up step.

    Nothing is launched; the text is only written to stdout for the user to
    copy into a separate terminal.
    """
    server_exe = r"D:\AI\apps\llama.cpp\build\bin\llama-server.exe"
    gguf_path = r"D:\AI\models\gguf\Josiefied-Qwen3-4B-abliterated-v1.Q5_K_M.gguf"
    launch_cmd = (
        f'"{server_exe}" --model "{gguf_path}" --host 127.0.0.1 --port 5004 '
        "--ctx-size 2048 -ngl 99 -fa --alias Josiefied-Qwen3-4B-Q5"
    )
    print("\nRun this in a new terminal to start llama.cpp on :5004:")
    print(f"\n {launch_cmd}\n")
    print("Then in this terminal:")
    print(" python D:\\AI\\models\\benchmark_compare.py --backend llamacpp\n")
# ── main ────────────────────────────────────────────────────────────────────
def main():
    """Parse the command line and run the single action that was requested.

    Exactly one of ``--backend``, ``--compare``, ``--compare-fp8-int4`` or
    ``--llama-cmd`` must be supplied; argparse rejects anything else.
    """
    cli = argparse.ArgumentParser(description="TRT-LLM FP8 vs llama.cpp Q5_K_M benchmark")
    actions = cli.add_mutually_exclusive_group(required=True)
    actions.add_argument(
        "--backend",
        choices=["trtllm", "llamacpp", "int4"],
        help="Run benchmark for this backend",
    )
    actions.add_argument("--compare", action="store_true", help="Compare FP8 vs llama.cpp")
    actions.add_argument("--compare-fp8-int4", action="store_true", help="Compare FP8 vs INT4")
    actions.add_argument("--llama-cmd", action="store_true", help="Print llama.cpp start command")
    opts = cli.parse_args()

    if opts.compare:
        compare()
        return
    if opts.compare_fp8_int4:
        compare_fp8_int4()
        return
    if opts.llama_cmd:
        start_llamacpp()
        return
    # None of the flags was set, so --backend must have been given.
    run_backend(BACKENDS[opts.backend])


if __name__ == "__main__":
    main()