File size: 6,340 Bytes

#!/usr/bin/env python3
"""
serve_vllm.py — VENUE ONLY (Prime Intellect, CUDA GPU). DOES NOT RUN ON THE MAC.

This is a thin, documented wrapper that prints (and optionally execs) the exact
`vllm serve` command for three configs:

  1. baseline  — Laguna XS.2 alone (the speed floor).
  2. dflash    — Laguna XS.2 + the DFlash speculator (the speed we're claiming).
  3. quant     — a quantized Laguna checkpoint (FP8/INT4/NVFP4) + FP8 KV cache.
                 This is the FALLBACK lane (see FALLBACK_QUANT.md): if DFlash hits
                 a vLLM-version/draft-model snag at the venue, a quantized weights
                 checkpoint still tells a clean single-GPU story (smaller footprint,
                 FP8 KV cache ~doubles concurrent trajectories per the [TR]).

baseline vs dflash are IDENTICAL except for --speculative-config — flip one flag,
get faster tokens, same greedy output. quant is a different lever (shrink each
pass instead of cutting passes); the two can stack, but the fallback keeps it
simple with quant alone.

Grounding (cite at the demo):
  - DFlash config shape is from the HF model card
    huggingface.co/poolside/Laguna-XS.2-speculator.dflash:
        --speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash",
                               "num_speculative_tokens":7,"method":"dflash"}'
  - num_speculative_tokens = 7 is the card's value (this is gamma, the draft length).
  - vLLM >= 0.21.0 and VLLM_USE_DEEP_GEMM=0 per the card.
  - parsers --tool-call-parser poolside_v1 / --reasoning-parser poolside_v1 per the card.

VERIFY AT ONBOARDING: exact vLLM version on the PI image, whether
--trust-remote-code is required, and whether `method` is spelled "dflash"
in the build you get. The card is authoritative; confirm against `vllm serve --help`.

Usage (on Prime Intellect):
  python scripts/serve_vllm.py --mode baseline --print     # show the command
  python scripts/serve_vllm.py --mode dflash --run         # actually serve
  python scripts/serve_vllm.py --mode quant --quant fp8 --run   # fallback lane (fp8 | int4 | nvfp4)
"""
from __future__ import annotations

import argparse
import json
import os
import shlex
import subprocess
import sys

# Target model and its DFlash draft model. Both can be overridden from the
# environment (LAGUNA_MODEL / LAGUNA_SPECULATOR) without editing this file.
MODEL = os.getenv("LAGUNA_MODEL", "poolside/Laguna-XS.2")
SPECULATOR = os.getenv(
    "LAGUNA_SPECULATOR",
    "poolside/Laguna-XS.2-speculator.dflash",
)

# gamma: how many tokens the draft model proposes per step. Value taken from
# the DFlash model card.
NUM_SPECULATIVE_TOKENS = 7

# Fallback-lane checkpoints, keyed by quant format. Per the [TR], XS.2 ships
# FP8 (W8A8), INT4 (W4A16/AWQ) and NVFP4 quants in the HF collection. The EXACT
# repo names are NOT confirmed pre-event: the defaults below
# (poolside/Laguna-XS.2-FP8, -INT4, -NVFP4) are documented placeholders.
# VERIFY AT ONBOARDING against huggingface.co/collections/poolside/laguna-xs2,
# or override via LAGUNA_FP8_MODEL / LAGUNA_INT4_MODEL / LAGUNA_NVFP4_MODEL.
QUANT_MODELS = {
    fmt: os.getenv(
        f"LAGUNA_{fmt.upper()}_MODEL",
        f"poolside/Laguna-XS.2-{fmt.upper()}",
    )
    for fmt in ("fp8", "int4", "nvfp4")
}


def _env_flag(name: str, default: bool = False) -> bool:
    """Read a boolean switch from the environment.

    "1", "true", "yes" and "on" (case-insensitive, surrounding whitespace
    ignored) count as true. Any other value is false. *default* is returned
    only when the variable is unset.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


def build_cmd(mode: str, max_model_len: int, tp: int, quant: str) -> list[str]:
    """Build the `vllm serve` argv for one serving config.

    Args:
        mode: "baseline", "dflash" or "quant".
        max_model_len: Value passed to --max-model-len.
        tp: Value passed to --tensor-parallel-size.
        quant: Key into QUANT_MODELS; only consulted when mode == "quant".

    Returns:
        The argv list, starting with "vllm", "serve", <model>. baseline and
        dflash differ only by the trailing --speculative-config pair; quant
        swaps the model for the quantized checkpoint and adds
        --kv-cache-dtype fp8.

    Raises:
        ValueError: if *mode* is not one of the three known modes. Without
            this check a typo would silently produce the baseline command.
        KeyError: if mode == "quant" and *quant* is not in QUANT_MODELS.
    """
    known_modes = ("baseline", "dflash", "quant")
    if mode not in known_modes:
        raise ValueError(f"unknown mode {mode!r}; expected one of {', '.join(known_modes)}")

    model = QUANT_MODELS[quant] if mode == "quant" else MODEL

    # enable_thinking: the Laguna chat template defaults this FALSE. Keep it false so
    # rollouts/decode are non-thinking (fewer tokens, faster) and the greedy A/B stays clean.
    # NOTE: the hosted pinference endpoint IGNORES this flag (verified — see
    # autoresearch/findings.md); it only takes effect on a self-served vLLM like this one.
    # Override with LAGUNA_ENABLE_THINKING=true (1/yes/on are accepted too).
    chat_template_kwargs = {"enable_thinking": _env_flag("LAGUNA_ENABLE_THINKING")}

    cmd = [
        "vllm", "serve", model,
        "--tensor-parallel-size", str(tp),
        "--max-model-len", str(max_model_len),
        "--served-model-name", "laguna",
        # Poolside-specific parsers (from the model card):
        "--tool-call-parser", "poolside_v1",
        "--reasoning-parser", "poolside_v1",
        "--enable-auto-tool-choice",
        "--default-chat-template-kwargs", json.dumps(chat_template_kwargs),
    ]
    if mode == "dflash":
        spec = {
            "model": SPECULATOR,
            "num_speculative_tokens": NUM_SPECULATIVE_TOKENS,
            "method": "dflash",
        }
        cmd += ["--speculative-config", json.dumps(spec)]
    elif mode == "quant":
        # FP8 KV cache is the high-leverage single-GPU win ([TR]: ~2x concurrent
        # trajectories). Weight quant is auto-detected from the checkpoint config.
        cmd += ["--kv-cache-dtype", "fp8"]
    return cmd


def main() -> None:
    """Parse CLI flags, print the `vllm serve` command, and optionally exec it.

    The command line (prefixed with VLLM_USE_DEEP_GEMM=0) is always written to
    stdout. With --print nothing else happens, on any platform. With --run the
    current process is replaced by `vllm serve`, with VLLM_USE_DEEP_GEMM=0
    added to the inherited environment.

    Exit codes:
        2   --run was requested on macOS (vLLM needs CUDA).
        127 the `vllm` executable was not found on PATH.
        126 `vllm` was found but could not be executed.
    """
    p = argparse.ArgumentParser(description="Print/run the vLLM serve command for Laguna (baseline / dflash / quant).")
    p.add_argument("--mode", choices=["baseline", "dflash", "quant"], required=True)
    p.add_argument("--quant", choices=["fp8", "int4", "nvfp4"], default="fp8",
                   help="Quant format for --mode quant (the fallback lane). Default fp8.")
    p.add_argument("--max-model-len", type=int, default=16384,
                   help="Card example uses 16384; raise toward 131072/262144 if VRAM allows. Verify at onboarding.")
    p.add_argument("--tensor-parallel-size", type=int, default=1,
                   help="Single GPU = 1. The whole hook is one-GPU serving.")
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument("--print", action="store_true", help="Print the command only.")
    g.add_argument("--run", action="store_true", help="Actually exec vllm serve (venue only).")
    args = p.parse_args()

    cmd = build_cmd(args.mode, args.max_model_len, args.tensor_parallel_size, args.quant)

    # Single source of truth for the env override, so the printed prefix and
    # the environment handed to exec cannot drift apart.
    env_overrides = {"VLLM_USE_DEEP_GEMM": "0"}  # per the model card
    env_prefix = " ".join(f"{k}={v}" for k, v in env_overrides.items())
    print(f"{env_prefix} " + " ".join(shlex.quote(c) for c in cmd))

    if not args.run:
        return

    # Only --run is refused on a Mac; --print stays usable there for inspection.
    if sys.platform == "darwin":
        print("[serve_vllm] REFUSING TO RUN: this is a Mac. vLLM needs CUDA.\n"
              "             Run this on Prime Intellect. Use --print to inspect the command here.",
              file=sys.stderr)
        sys.exit(2)

    env = {**os.environ, **env_overrides}

    # exec replaces the process image without flushing Python's buffers, so the
    # command printed above would be lost whenever stdout is a pipe or a file.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvpe(cmd[0], cmd, env)
    except FileNotFoundError:
        print(f"[serve_vllm] `{cmd[0]}` not found on PATH; is vLLM installed in this environment?",
              file=sys.stderr)
        sys.exit(127)
    except OSError as err:
        print(f"[serve_vllm] could not exec `{cmd[0]}`: {err}", file=sys.stderr)
        sys.exit(126)


if __name__ == "__main__":
    main()