Instructions to use xThr45hx/TensorRT-LLM-Windows-RTX40 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- TensorRT
How to use xThr45hx/TensorRT-LLM-Windows-RTX40 with TensorRT:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
r"""
Head-to-head benchmark: TRT-LLM FP8 vs llama.cpp Q5_K_M vs INT4 W4A16 (Josiefied-Qwen3-4B)

Workflow (can't run both at once — VRAM conflict):

  FP8 vs llama.cpp:
    STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
        python D:\AI\models\benchmark_compare.py --backend trtllm
    STEP 2 — stop FP8, start llama.cpp GPU (port 5004), then:
        python D:\AI\models\benchmark_compare.py --backend llamacpp
    STEP 3 — compare:
        python D:\AI\models\benchmark_compare.py --compare

  FP8 vs INT4:
    STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
        python D:\AI\models\benchmark_compare.py --backend trtllm
    STEP 2 — stop FP8, start INT4 server (serve_int4.ps1, port 5001), then:
        python D:\AI\models\benchmark_compare.py --backend int4
    STEP 3 — compare:
        python D:\AI\models\benchmark_compare.py --compare-fp8-int4

Results saved to D:\AI\models\bench_trtllm.json / bench_llamacpp.json / bench_int4.json

NOTE: if re-running after a fix, run both backends again so results are comparable.
"""
| import argparse, json, os, statistics, sys, time | |
| import httpx | |
| import openai | |
# Directory that holds the per-backend result files written by run_backend().
RESULTS_DIR = r"D:\AI\models"

# One entry per benchmarkable server, keyed by the value accepted by --backend.
#   url            base URL of the OpenAI-compatible API
#   model          model name sent in each chat-completion request
#   label          human-readable name used in console output and result files
#   file           JSON file the results for this backend are saved to
#   think_overhead extra tokens added to every prompt's max_tokens
BACKENDS = {
    "trtllm": {
        "url": "http://localhost:5000/v1",
        "model": "Josiefied-Qwen3-4B-fp8",
        "label": "TRT-LLM FP8 16k",
        "file": os.path.join(RESULTS_DIR, "bench_trtllm.json"),
        # reasoning_parser handles this server-side
        "think_overhead": 0,
    },
    "llamacpp": {
        "url": "http://localhost:5004/v1",
        "model": "Josiefied-Qwen3-4B-Q5",
        "label": "llama.cpp Q5_K_M",
        "file": os.path.join(RESULTS_DIR, "bench_llamacpp.json"),
        # thinking tokens count against budget here
        "think_overhead": 512,
    },
    "int4": {
        "url": "http://localhost:5001/v1",
        "model": "Josiefied-Qwen3-4B-int4",
        "label": "TRT-LLM INT4 W4A16",
        "file": os.path.join(RESULTS_DIR, "bench_int4.json"),
        # reasoning_parser handles this server-side
        "think_overhead": 0,
    },
}

# Benchmark workload; each entry is (label, prompt, max_tokens).
# The label reads "<prompt length> / <response length>".
PROMPTS = [
    ("short / short", "What is 2+2?", 32),
    ("short / long", "Write a poem about autumn leaves.", 256),
    (
        "medium / long",
        "Explain how transformers work in machine learning, covering attention, "
        "positional encoding, and why they beat RNNs.",
        512,
    ),
    (
        "long / medium",
        " ".join(["The quick brown fox jumps over the lazy dog."] * 40)
        + " Summarize the above in two sentences.",
        128,
    ),
]

# Number of timed repetitions per prompt.
RUNS = 3
# ── benchmark one backend ────────────────────────────────────────────────────
def _stream_completion(http, url, payload):
    """Stream one chat completion and time the arrival of its output.

    Args:
        http: An open ``httpx.Client`` used to send the request.
        url: Base URL of the OpenAI-compatible API
            (e.g. ``http://localhost:5000/v1``).
        payload: JSON body for ``POST {url}/chat/completions``. It is expected
            to request streaming so the server answers with ``data:`` lines.

    Returns:
        A dict with ``time.perf_counter()`` timestamps ``t_start``, ``t_first``
        (first chunk carrying any text, or None), ``t_resp`` (first chunk
        carrying ``content``, or None) and ``t_end``, plus the character
        counts ``think_chars`` (``reasoning_content``) and ``resp_chars``
        (``content``).

    Raises:
        httpx.HTTPError: On transport failures, timeouts, or a 4xx/5xx status.
    """
    t_first = None  # first token of any kind (thinking or response)
    t_resp = None   # first response (content) token
    think_chars = 0
    resp_chars = 0

    t_start = time.perf_counter()
    with http.stream("POST", url + "/chat/completions",
                     json=payload,
                     headers={"Content-Type": "application/json"}) as resp:
        # Without this an error response has no "data:" lines and would be
        # indistinguishable from a model that produced no tokens.
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line.startswith("data:"):
                continue
            raw = line[5:].strip()
            if raw == "[DONE]":
                break
            try:
                obj = json.loads(raw)
            except json.JSONDecodeError:
                continue  # ignore malformed / keep-alive lines
            choices = obj.get("choices", [])
            if not choices:
                continue
            delta = choices[0].get("delta", {})
            rc = delta.get("reasoning_content") or ""
            ct = delta.get("content") or ""
            if not (rc or ct):
                continue
            now = time.perf_counter()
            if t_first is None:
                t_first = now
            think_chars += len(rc)
            if ct:
                if t_resp is None:
                    t_resp = now
                resp_chars += len(ct)
        # Stop the clock before the response is closed so teardown is not
        # counted as decode time.
        t_end = time.perf_counter()

    return {
        "t_start": t_start,
        "t_first": t_first,
        "t_resp": t_resp,
        "t_end": t_end,
        "think_chars": think_chars,
        "resp_chars": resp_chars,
    }


def run_backend(cfg):
    """Benchmark one backend over every entry in PROMPTS and save the results.

    Each prompt is streamed RUNS times. For every run the time to first token
    (TTFT) and an approximate decode throughput are recorded; per-prompt
    averages are printed and everything is written as JSON to ``cfg["file"]``.

    Token counts are estimates (streamed characters // 4), so throughput is
    only meaningful for comparing backends measured the same way.

    Args:
        cfg: One entry of BACKENDS (keys ``url``, ``model``, ``label``,
            ``file`` and optionally ``think_overhead``).

    Raises:
        SystemExit: With status 1 if the server's model list cannot be fetched.
    """
    client = openai.OpenAI(api_key="none", base_url=cfg["url"])

    # health check
    try:
        client.models.list()
    except Exception as e:
        print(f"ERROR: can't reach {cfg['label']} at {cfg['url']}")
        print(f" {e}")
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f" {cfg['label']} ({cfg['url']})")
    print(f"{'='*60}\n")

    results = {}
    # One client for the whole benchmark: building a client per run would put
    # client construction and teardown inside the timed region.
    with httpx.Client(timeout=120.0) as http:
        for label, prompt, max_tokens in PROMPTS:
            print(f"[{label}] max_tokens={max_tokens}")
            runs_data = []
            for r in range(RUNS):
                payload = {
                    "model": cfg["model"],
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": max_tokens + cfg.get("think_overhead", 0),
                    "temperature": 0.0,
                    "stream": True,
                }
                try:
                    m = _stream_completion(http, cfg["url"], payload)
                except httpx.HTTPError as e:
                    # A single failed request should not discard the other runs.
                    print(f" run {r+1}: request failed ({e}) — skip")
                    continue

                if m["t_first"] is None:
                    print(f" run {r+1}: no tokens — skip")
                    continue

                # If the model never left the thinking phase (max_tokens too
                # small), fall back to the thinking stream for TTFT and size.
                if m["t_resp"] is None:
                    anchor = m["t_first"]
                    total_chars = m["think_chars"]
                    note = " (all-thinking)"
                else:
                    anchor = m["t_resp"]
                    total_chars = m["resp_chars"]
                    note = ""

                ttft_ms = (anchor - m["t_start"]) * 1000
                total_s = m["t_end"] - m["t_start"]
                decode_s = m["t_end"] - anchor
                tokens = max(1, total_chars // 4)  # rough chars-per-token estimate
                tps = tokens / decode_s if decode_s > 0 else 0

                runs_data.append({
                    "ttft_ms": round(ttft_ms, 1),
                    "total_s": round(total_s, 2),
                    "tokens": tokens,
                    "tps": round(tps, 1),
                })
                print(f" run {r+1}: TTFT={ttft_ms:6.0f}ms tokens~{tokens:4d} {tps:.1f} tok/s{note}")

            if runs_data:
                avg_ttft = statistics.mean(d["ttft_ms"] for d in runs_data)
                avg_tps = statistics.mean(d["tps"] for d in runs_data)
                print(f" => avg TTFT={avg_ttft:.0f}ms avg {avg_tps:.1f} tok/s\n")
                results[label] = {"runs": runs_data, "avg_ttft_ms": round(avg_ttft, 1), "avg_tps": round(avg_tps, 1)}
            else:
                # Keep the key so the comparison can report "missing data".
                results[label] = None

    with open(cfg["file"], "w") as f:
        json.dump({"backend": cfg["label"], "results": results}, f, indent=2)
    print(f"Results saved -> {cfg['file']}")
# ── compare both ─────────────────────────────────────────────────────────────
def compare():
    """Print a TTFT / throughput comparison of TRT-LLM FP8 vs llama.cpp.

    Loads the saved result files of the ``trtllm`` and ``llamacpp`` backends
    only (the INT4 results are not needed for this comparison), prints one
    table row per entry in PROMPTS, and finishes with an overall speedup line.
    The overall figures average only the prompts for which both backends have
    data, so the two means always cover the same workload.

    Raises:
        SystemExit: With status 1 if either result file is missing.
    """
    data = {}
    for key in ("trtllm", "llamacpp"):
        cfg = BACKENDS[key]
        if not os.path.exists(cfg["file"]):
            print(f"Missing: {cfg['file']} (run --backend {key} first)")
            sys.exit(1)
        with open(cfg["file"]) as f:
            data[key] = json.load(f)

    a_label = data["trtllm"]["backend"]
    b_label = data["llamacpp"]["backend"]

    print(f"\n{'='*80}")
    print(f" COMPARISON: {a_label} vs {b_label}")
    print(f"{'='*80}")
    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
    print(f"{'':22} {'TRT-LLM':>10} {'llama.cpp':>10} {'TRT-LLM':>12} {'llama.cpp':>12} {'winner':>7}")
    print("-"*80)

    # (a_ttft, b_ttft, a_tps, b_tps) for every prompt measured on both sides.
    paired = []
    for label, _, _ in PROMPTS:
        a = data["trtllm"]["results"].get(label)
        b = data["llamacpp"]["results"].get(label)
        if not a or not b:
            print(f" {label:<20} (missing data)")
            continue
        a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"]
        a_tps, b_tps = a["avg_tps"], b["avg_tps"]
        paired.append((a_ttft, b_ttft, a_tps, b_tps))
        tps_winner = "TRT-LLM" if a_tps > b_tps else "llama.cpp"
        print(f" {label:<20} {a_ttft:>10.0f} {b_ttft:>10.0f} {a_tps:>12.1f} {b_tps:>12.1f} {tps_winner:>9}")
    print()

    # summary speedup
    if paired:
        a_mean_ttft = statistics.mean(p[0] for p in paired)
        b_mean_ttft = statistics.mean(p[1] for p in paired)
        a_mean_tps = statistics.mean(p[2] for p in paired)
        b_mean_tps = statistics.mean(p[3] for p in paired)
        # A mean of 0 is possible (throughput is stored as 0 when no decode
        # time was measured); report 0 instead of dividing by zero.
        ttft_speedup = b_mean_ttft / a_mean_ttft if a_mean_ttft > 0 else 0
        tps_speedup = a_mean_tps / b_mean_tps if b_mean_tps > 0 else 0
        print(f" Overall: TRT-LLM is {ttft_speedup:.2f}x faster TTFT, {tps_speedup:.2f}x faster throughput")
# ── FP8 vs INT4 compare ──────────────────────────────────────────────────────
def compare_fp8_int4():
    """Print a TTFT / throughput comparison of TRT-LLM FP8 vs TRT-LLM INT4.

    Reads the saved result files of the ``trtllm`` and ``int4`` backends,
    prints one table row per entry in PROMPTS, then prints the average
    throughput and average TTFT of each backend with the faster one named.

    Raises:
        SystemExit: With status 1 if either result file is missing.
    """
    loaded = {}
    for name in ("trtllm", "int4"):
        path = BACKENDS[name]["file"]
        if not os.path.exists(path):
            print(f"Missing: {path} (run --backend {name} first)")
            sys.exit(1)
        with open(path) as fh:
            loaded[name] = json.load(fh)

    fp8_label = loaded["trtllm"]["backend"]
    int4_label = loaded["int4"]["backend"]
    rule = "=" * 80

    print(f"\n{rule}")
    print(f" COMPARISON: {fp8_label} vs {int4_label}")
    print(f"{rule}")
    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
    print(f"{'':22} {'FP8':>10} {'INT4':>10} {'FP8':>12} {'INT4':>12} {'winner':>7}")
    print("-" * 80)

    for label, _prompt, _budget in PROMPTS:
        fp8_row = loaded["trtllm"]["results"].get(label)
        int4_row = loaded["int4"]["results"].get(label)
        if not (fp8_row and int4_row):
            print(f" {label:<20} (missing data)")
            continue
        fp8_ttft = fp8_row["avg_ttft_ms"]
        int4_ttft = int4_row["avg_ttft_ms"]
        fp8_tps = fp8_row["avg_tps"]
        int4_tps = int4_row["avg_tps"]
        if fp8_tps > int4_tps:
            row_winner = "FP8"
        else:
            row_winner = "INT4"
        print(f" {label:<20} {fp8_ttft:>10.0f} {int4_ttft:>10.0f} {fp8_tps:>12.1f} {int4_tps:>12.1f} {row_winner:>9}")
    print()

    def column(backend, field):
        # Collect one metric from every prompt that has recorded data.
        return [row[field] for row in loaded[backend]["results"].values() if row]

    fp8_ttfts = column("trtllm", "avg_ttft_ms")
    int4_ttfts = column("int4", "avg_ttft_ms")
    fp8_rates = column("trtllm", "avg_tps")
    int4_rates = column("int4", "avg_tps")

    if fp8_rates and int4_rates:
        avg_fp8 = statistics.mean(fp8_rates)
        avg_int4 = statistics.mean(int4_rates)
        ratio = avg_int4 / avg_fp8 if avg_fp8 > 0 else 0
        inverse = 1 / ratio if ratio > 0 else 0
        factor = max(ratio, inverse)  # always report the speedup as >= 1x
        winner = "INT4" if avg_int4 > avg_fp8 else "FP8"
        print(f" Throughput: FP8={avg_fp8:.1f} tok/s INT4={avg_int4:.1f} tok/s => {winner} is {factor:.2f}x faster")

    if fp8_ttfts and int4_ttfts:
        ttft_fp8 = statistics.mean(fp8_ttfts)
        ttft_int4 = statistics.mean(int4_ttfts)
        ttft_winner = "INT4" if ttft_int4 < ttft_fp8 else "FP8"
        print(f" TTFT: FP8={ttft_fp8:.0f}ms INT4={ttft_int4:.0f}ms => {ttft_winner} faster to first token")
# ── llama.cpp launcher helper ────────────────────────────────────────────────
| def start_llamacpp(): | |
| """Print the command to start llama.cpp on :5004 for benchmarking.""" | |
| exe = r"D:\AI\apps\llama.cpp\build\bin\llama-server.exe" | |
| model = r"D:\AI\models\gguf\Josiefied-Qwen3-4B-abliterated-v1.Q5_K_M.gguf" | |
| print("\nRun this in a new terminal to start llama.cpp on :5004:") | |
| print(f'\n "{exe}" --model "{model}" --host 127.0.0.1 --port 5004 --ctx-size 2048 -ngl 99 -fa --alias Josiefied-Qwen3-4B-Q5\n') | |
| print("Then in this terminal:") | |
| print(" python D:\\AI\\models\\benchmark_compare.py --backend llamacpp\n") | |
# ── main ─────────────────────────────────────────────────────────────────────
def main():
    """Parse the command line and dispatch to the selected action.

    Exactly one of ``--backend``, ``--compare``, ``--compare-fp8-int4`` or
    ``--llama-cmd`` must be given; argparse rejects any other combination.
    """
    parser = argparse.ArgumentParser(description="TRT-LLM FP8 vs llama.cpp Q5_K_M benchmark")
    modes = parser.add_mutually_exclusive_group(required=True)
    modes.add_argument(
        "--backend",
        choices=["trtllm", "llamacpp", "int4"],
        help="Run benchmark for this backend",
    )
    modes.add_argument("--compare", action="store_true", help="Compare FP8 vs llama.cpp")
    modes.add_argument("--compare-fp8-int4", action="store_true", help="Compare FP8 vs INT4")
    modes.add_argument("--llama-cmd", action="store_true", help="Print llama.cpp start command")
    opts = parser.parse_args()

    # The flags are mutually exclusive, so the first one set decides the action.
    if opts.compare:
        compare()
        return
    if opts.compare_fp8_int4:
        compare_fp8_int4()
        return
    if opts.llama_cmd:
        start_llamacpp()
        return
    # No flag set means --backend was given (the group is required).
    run_backend(BACKENDS[opts.backend])


if __name__ == "__main__":
    main()