"""
Head-to-head benchmark: TRT-LLM FP8 vs llama.cpp Q5_K_M vs INT4 W4A16 (Josiefied-Qwen3-4B)

Workflow (only one server can run at a time — VRAM conflict):

  FP8 vs llama.cpp:
    STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
      python D:\AI\models\benchmark_compare.py --backend trtllm
    STEP 2 — stop FP8, start llama.cpp GPU (port 5004), then:
      python D:\AI\models\benchmark_compare.py --backend llamacpp
    STEP 3 — compare:
      python D:\AI\models\benchmark_compare.py --compare

  FP8 vs INT4:
    STEP 1 — start FP8 server (serve_fp8_5000.ps1, port 5000), then:
      python D:\AI\models\benchmark_compare.py --backend trtllm
    STEP 2 — stop FP8, start INT4 server (serve_int4.ps1, port 5001), then:
      python D:\AI\models\benchmark_compare.py --backend int4
    STEP 3 — compare:
      python D:\AI\models\benchmark_compare.py --compare-fp8-int4

Results saved to D:\AI\models\bench_trtllm.json / bench_llamacpp.json / bench_int4.json

NOTE: if re-running after a fix, re-run every backend you intend to compare so results are comparable.
"""
import argparse, json, os, statistics, sys, time
import httpx
import openai

# Directory that holds one results JSON file per backend.
RESULTS_DIR = r"D:\AI\models"


def _backend(port, model, label, results_name, think_overhead=0):
    """Build one ``BACKENDS`` entry for a server listening on localhost."""
    return {
        "url":            f"http://localhost:{port}/v1",
        "model":          model,
        "label":          label,
        "file":           os.path.join(RESULTS_DIR, results_name),
        "think_overhead": think_overhead,
    }


BACKENDS = {
    # think_overhead stays 0: reasoning_parser handles this server-side
    "trtllm": _backend(
        5000, "Josiefied-Qwen3-4B-fp8", "TRT-LLM FP8 16k", "bench_trtllm.json",
    ),
    # thinking tokens count against budget here, so extra tokens are requested
    "llamacpp": _backend(
        5004, "Josiefied-Qwen3-4B-Q5", "llama.cpp Q5_K_M", "bench_llamacpp.json",
        think_overhead=512,
    ),
    # think_overhead stays 0: reasoning_parser handles this server-side
    "int4": _backend(
        5001, "Josiefied-Qwen3-4B-int4", "TRT-LLM INT4 W4A16", "bench_int4.json",
    ),
}

# Filler text repeated to build the long-input prompt.
_FOX_SENTENCE = "The quick brown fox jumps over the lazy dog."
_LONG_PROMPT = " ".join([_FOX_SENTENCE] * 40) + " Summarize the above in two sentences."

PROMPTS = [
    # (label, prompt, max_tokens)
    ("short / short", "What is 2+2?", 32),
    ("short / long", "Write a poem about autumn leaves.", 256),
    (
        "medium / long",
        "Explain how transformers work in machine learning, covering attention, "
        "positional encoding, and why they beat RNNs.",
        512,
    ),
    ("long / medium", _LONG_PROMPT, 128),
]

# Number of timed repetitions per prompt.
RUNS = 3


# ── benchmark one backend ───────────────────────────────────────────────────

def _stream_completion(http, cfg, prompt, max_tokens):
    """Stream one chat completion and record when its tokens arrive.

    Args:
        http: An open ``httpx.Client`` used to send the request.
        cfg: Backend entry (reads ``url``, ``model`` and ``think_overhead``).
        prompt: Text sent as the single user message.
        max_tokens: Response-token budget; the backend's ``think_overhead``
            is added on top of it.

    Returns:
        A tuple ``(t_start, t_first, t_resp, t_end, think_chars, resp_chars)``.
        The ``t_*`` values are ``time.perf_counter`` readings; ``t_first`` and
        ``t_resp`` are ``None`` when no chunk of that kind was received.

    Raises:
        httpx.HTTPError: on a transport failure, a timeout or a non-2xx status.
    """
    payload = {
        "model":       cfg["model"],
        "messages":    [{"role": "user", "content": prompt}],
        "max_tokens":  max_tokens + cfg.get("think_overhead", 0),
        "temperature": 0.0,
        "stream":      True,
    }
    t_first = None   # first token of any kind (thinking or response)
    t_resp = None    # first response (content) token
    think_chars = 0
    resp_chars = 0

    # Start the clock immediately before the request so that only the
    # request itself is timed.
    t_start = time.perf_counter()
    with http.stream("POST", cfg["url"] + "/chat/completions",
                     json=payload,
                     headers={"Content-Type": "application/json"}) as resp:
        # Fail loudly on 4xx/5xx rather than parsing an error body as a
        # stream that simply contains no tokens.
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line.startswith("data:"):
                continue
            raw = line[5:].strip()
            if raw == "[DONE]":
                break
            try:
                obj = json.loads(raw)
            except json.JSONDecodeError:
                continue
            choices = obj.get("choices") or []
            if not choices:
                continue
            # ``or {}`` also covers an explicit ``"delta": null``.
            delta = choices[0].get("delta") or {}
            rc = delta.get("reasoning_content") or ""
            ct = delta.get("content") or ""
            if not (rc or ct):
                continue
            now = time.perf_counter()
            if t_first is None:
                t_first = now
            think_chars += len(rc)
            if ct:
                if t_resp is None:
                    t_resp = now
                resp_chars += len(ct)
    t_end = time.perf_counter()

    return t_start, t_first, t_resp, t_end, think_chars, resp_chars


def run_backend(cfg):
    """Benchmark one backend over every prompt and save the results as JSON.

    Each prompt in ``PROMPTS`` is streamed ``RUNS`` times.  For every run the
    time to first token (TTFT) and an approximate decode throughput are
    recorded; the token count is estimated as characters // 4, not taken from
    a tokenizer.  A run that fails at the HTTP level or yields no tokens is
    reported and skipped, so one bad request does not discard the rest.

    Args:
        cfg: Backend entry from ``BACKENDS`` (reads ``url``, ``model``,
            ``label``, ``file`` and ``think_overhead``).

    Side effects:
        Prints progress to stdout and writes ``cfg["file"]``.  Calls
        ``sys.exit(1)`` if the server does not answer the initial health check.
    """
    client = openai.OpenAI(api_key="none", base_url=cfg["url"])

    # health check
    try:
        client.models.list()
    except Exception as e:
        print(f"ERROR: can't reach {cfg['label']} at {cfg['url']}")
        print(f"  {e}")
        sys.exit(1)

    print(f"\n{'='*60}")
    print(f"  {cfg['label']}  ({cfg['url']})")
    print(f"{'='*60}\n")

    results = {}

    # One client for the whole benchmark: building a client per request
    # inside the timed region would add its start-up cost to every TTFT.
    with httpx.Client(timeout=120.0) as http:
        for label, prompt, max_tokens in PROMPTS:
            print(f"[{label}]  max_tokens={max_tokens}")
            runs_data = []

            for r in range(RUNS):
                try:
                    (t_start, t_first, t_resp, t_end,
                     think_chars, resp_chars) = _stream_completion(
                        http, cfg, prompt, max_tokens)
                except httpx.HTTPError as e:
                    print(f"  run {r+1}: request failed — skip ({e})")
                    continue

                if t_first is None:
                    print(f"  run {r+1}: no tokens — skip")
                    continue

                # if model never left thinking phase (max_tokens too small),
                # fall back to the thinking stream for TTFT and size
                if t_resp is None:
                    anchor, total_chars, note = t_first, think_chars, " (all-thinking)"
                else:
                    anchor, total_chars, note = t_resp, resp_chars, ""

                ttft_ms  = (anchor - t_start) * 1000
                total_s  = t_end - t_start
                decode_s = t_end - anchor
                tokens   = max(1, total_chars // 4)   # rough chars-per-token estimate
                tps      = tokens / decode_s if decode_s > 0 else 0

                runs_data.append({
                    "ttft_ms": round(ttft_ms, 1),
                    "total_s": round(total_s, 2),
                    "tokens":  tokens,
                    "tps":     round(tps, 1),
                })
                print(f"  run {r+1}: TTFT={ttft_ms:6.0f}ms  tokens~{tokens:4d}  {tps:.1f} tok/s{note}")

            if runs_data:
                avg_ttft = statistics.mean(d["ttft_ms"] for d in runs_data)
                avg_tps  = statistics.mean(d["tps"]     for d in runs_data)
                print(f"  => avg TTFT={avg_ttft:.0f}ms  avg {avg_tps:.1f} tok/s\n")
                results[label] = {"runs": runs_data, "avg_ttft_ms": round(avg_ttft, 1), "avg_tps": round(avg_tps, 1)}
            else:
                # Keep the key so the comparison can report "missing data".
                results[label] = None

    # Make sure the target directory exists before writing, so a finished
    # benchmark is not lost to a missing folder.
    out_dir = os.path.dirname(cfg["file"])
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(cfg["file"], "w", encoding="utf-8") as f:
        json.dump({"backend": cfg["label"], "results": results}, f, indent=2)

    print(f"Results saved -> {cfg['file']}")


# ── compare both ────────────────────────────────────────────────────────────

def compare():
    """Print a TTFT / throughput table for TRT-LLM FP8 vs llama.cpp.

    Loads the ``trtllm`` and ``llamacpp`` result files named in ``BACKENDS``
    and prints one row per prompt in ``PROMPTS`` followed by an overall
    ratio.  Only those two files are required; results of other backends are
    not read.  The overall ratio is computed from the prompts for which both
    backends have data, so both averages cover the same workload.

    Exits with status 1 if a required result file is missing.
    """
    data = {}
    # Require only the two backends this comparison actually uses.
    for key in ("trtllm", "llamacpp"):
        cfg = BACKENDS[key]
        if not os.path.exists(cfg["file"]):
            print(f"Missing: {cfg['file']}  (run --backend {key} first)")
            sys.exit(1)
        with open(cfg["file"], encoding="utf-8") as f:
            data[key] = json.load(f)

    a_label = data["trtllm"]["backend"]
    b_label = data["llamacpp"]["backend"]
    a_results = data["trtllm"]["results"]
    b_results = data["llamacpp"]["results"]

    print(f"\n{'='*80}")
    print(f"  COMPARISON: {a_label}  vs  {b_label}")
    print(f"{'='*80}")
    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
    print(f"{'':22}   {'TRT-LLM':>10}  {'llama.cpp':>10}   {'TRT-LLM':>12}  {'llama.cpp':>12}  {'winner':>7}")
    print("-"*80)

    # (a_ttft, b_ttft, a_tps, b_tps) for every prompt both backends completed
    paired = []

    for label, _, _ in PROMPTS:
        a = a_results.get(label)
        b = b_results.get(label)
        if not a or not b:
            print(f"  {label:<20}   (missing data)")
            continue

        a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"]
        a_tps,  b_tps  = a["avg_tps"],     b["avg_tps"]

        # The winner column reports throughput only.
        tps_winner = "TRT-LLM" if a_tps > b_tps else "llama.cpp"

        print(f"  {label:<20}   {a_ttft:>10.0f}  {b_ttft:>10.0f}   {a_tps:>12.1f}  {b_tps:>12.1f}  {tps_winner:>9}")
        paired.append((a_ttft, b_ttft, a_tps, b_tps))

    print()
    if not paired:
        return

    # summary speedup
    a_ttft_avg, b_ttft_avg, a_tps_avg, b_tps_avg = (
        statistics.mean(column) for column in zip(*paired)
    )
    if a_ttft_avg > 0 and b_tps_avg > 0:
        ttft_speedup = b_ttft_avg / a_ttft_avg
        tps_speedup  = a_tps_avg / b_tps_avg
        print(f"  Overall: TRT-LLM is {ttft_speedup:.2f}x faster TTFT, {tps_speedup:.2f}x faster throughput")
    else:
        print("  Overall: n/a (a zero average makes the ratio undefined)")


# ── FP8 vs INT4 compare ──────────────────────────────────────────────────────

def compare_fp8_int4():
    """Print a TTFT / throughput table for TRT-LLM FP8 vs TRT-LLM INT4.

    Loads the ``trtllm`` and ``int4`` result files named in ``BACKENDS`` and
    prints one row per prompt in ``PROMPTS`` followed by throughput and TTFT
    summaries.  The summaries are computed from the prompts for which both
    backends have data, so both averages cover the same workload.

    Exits with status 1 if a required result file is missing.
    """
    data = {}
    for key in ("trtllm", "int4"):
        cfg = BACKENDS[key]
        if not os.path.exists(cfg["file"]):
            print(f"Missing: {cfg['file']}  (run --backend {key} first)")
            sys.exit(1)
        with open(cfg["file"], encoding="utf-8") as f:
            data[key] = json.load(f)

    a_label = data["trtllm"]["backend"]
    b_label = data["int4"]["backend"]
    a_results = data["trtllm"]["results"]
    b_results = data["int4"]["results"]

    print(f"\n{'='*80}")
    print(f"  COMPARISON: {a_label}  vs  {b_label}")
    print(f"{'='*80}")
    print(f"\n{'Prompt':<22} {'':^2} {'TTFT (ms)':^22} {'':^2} {'Throughput (tok/s)':^26}")
    print(f"{'':22}   {'FP8':>10}  {'INT4':>10}   {'FP8':>12}  {'INT4':>12}  {'winner':>7}")
    print("-"*80)

    # (fp8_ttft, int4_ttft, fp8_tps, int4_tps) for every prompt both completed
    paired = []

    for label, _, _ in PROMPTS:
        a = a_results.get(label)
        b = b_results.get(label)
        if not a or not b:
            print(f"  {label:<20}   (missing data)")
            continue

        a_ttft, b_ttft = a["avg_ttft_ms"], b["avg_ttft_ms"]
        a_tps,  b_tps  = a["avg_tps"],     b["avg_tps"]

        # The winner column reports throughput only.
        tps_winner = "FP8" if a_tps > b_tps else "INT4"
        print(f"  {label:<20}   {a_ttft:>10.0f}  {b_ttft:>10.0f}   {a_tps:>12.1f}  {b_tps:>12.1f}  {tps_winner:>9}")
        paired.append((a_ttft, b_ttft, a_tps, b_tps))

    print()
    if not paired:
        return

    ttft_fp8, ttft_int4, avg_fp8, avg_int4 = (
        statistics.mean(column) for column in zip(*paired)
    )

    winner = "INT4" if avg_int4 > avg_fp8 else "FP8"
    slower = min(avg_fp8, avg_int4)
    if slower > 0:
        # Ratio of the faster to the slower backend, so it is always >= 1.
        verdict = f"{winner} is {max(avg_fp8, avg_int4) / slower:.2f}x faster"
    else:
        verdict = "n/a (zero throughput recorded)"
    print(f"  Throughput: FP8={avg_fp8:.1f} tok/s  INT4={avg_int4:.1f} tok/s  => {verdict}")

    ttft_winner = "INT4" if ttft_int4 < ttft_fp8 else "FP8"
    print(f"  TTFT:       FP8={ttft_fp8:.0f}ms  INT4={ttft_int4:.0f}ms  => {ttft_winner} faster to first token")


# ── llama.cpp launcher helper ───────────────────────────────────────────────

def start_llamacpp():
    """Print instructions for launching llama.cpp on port 5004.

    Nothing is started: the function only prints the server command line and
    the follow-up benchmark command to stdout.
    """
    server_exe = r"D:\AI\apps\llama.cpp\build\bin\llama-server.exe"
    gguf_path = r"D:\AI\models\gguf\Josiefied-Qwen3-4B-abliterated-v1.Q5_K_M.gguf"

    command = " ".join([
        f'"{server_exe}"',
        f'--model "{gguf_path}"',
        "--host 127.0.0.1",
        "--port 5004",
        "--ctx-size 2048",
        "-ngl 99",
        "-fa",
        "--alias Josiefied-Qwen3-4B-Q5",
    ])

    # Empty entries become the blank lines around each section.
    output_lines = [
        "",
        "Run this in a new terminal to start llama.cpp on :5004:",
        "",
        f"  {command}",
        "",
        "Then in this terminal:",
        "  python D:\\AI\\models\\benchmark_compare.py --backend llamacpp",
        "",
    ]
    print("\n".join(output_lines))


# ── main ────────────────────────────────────────────────────────────────────

def main():
    """Parse the command line and run the single requested action.

    Exactly one of ``--backend``, ``--compare``, ``--compare-fp8-int4`` or
    ``--llama-cmd`` must be supplied; argparse rejects any other combination.
    """
    parser = argparse.ArgumentParser(description="TRT-LLM FP8 vs llama.cpp Q5_K_M benchmark")
    actions = parser.add_mutually_exclusive_group(required=True)
    actions.add_argument(
        "--backend",
        choices=["trtllm", "llamacpp", "int4"],
        help="Run benchmark for this backend",
    )
    # The remaining options are plain on/off switches.
    switches = (
        ("--compare", "Compare FP8 vs llama.cpp"),
        ("--compare-fp8-int4", "Compare FP8 vs INT4"),
        ("--llama-cmd", "Print llama.cpp start command"),
    )
    for flag, help_text in switches:
        actions.add_argument(flag, action="store_true", help=help_text)
    opts = parser.parse_args()

    if opts.compare:
        compare()
        return
    if opts.compare_fp8_int4:
        compare_fp8_int4()
        return
    if opts.llama_cmd:
        start_llamacpp()
        return

    # No switch was set, so the required group was satisfied by --backend.
    run_backend(BACKENDS[opts.backend])


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()