| |
| """ |
| serve_vllm.py — VENUE ONLY (Prime Intellect, CUDA GPU). DOES NOT RUN ON THE MAC. |
| |
| This is a thin, documented wrapper that prints (and optionally execs) the exact |
| `vllm serve` command for three configs: |
| |
| 1. baseline — Laguna XS.2 alone (the speed floor). |
| 2. dflash — Laguna XS.2 + the DFlash speculator (the speed we're claiming). |
| 3. quant — a quantized Laguna checkpoint (FP8/INT4/NVFP4) + FP8 KV cache. |
| This is the FALLBACK lane (see FALLBACK_QUANT.md): if DFlash hits |
| a vLLM-version/draft-model snag at the venue, a quantized weights |
| checkpoint still tells a clean single-GPU story (smaller footprint, |
| FP8 KV cache ~doubles concurrent trajectories per the [TR]). |
| |
| baseline vs dflash are IDENTICAL except for --speculative-config — flip one flag, |
| get faster tokens, same greedy output. quant is a different lever (shrink each |
| pass instead of cutting passes); the two can stack, but the fallback keeps it |
| simple with quant alone. |
| |
| Grounding (cite at the demo): |
| - DFlash config shape is from the HF model card |
| huggingface.co/poolside/Laguna-XS.2-speculator.dflash: |
| --speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash", |
| "num_speculative_tokens":7,"method":"dflash"}' |
| - num_speculative_tokens = 7 is the card's value (this is gamma, the draft length). |
| - vLLM >= 0.21.0 and VLLM_USE_DEEP_GEMM=0 per the card. |
| - parsers --tool-call-parser poolside_v1 / --reasoning-parser poolside_v1 per the card. |
| |
| VERIFY AT ONBOARDING: exact vLLM version on the PI image, whether |
| --trust-remote-code is required, and whether `method` is spelled "dflash" |
| in the build you get. The card is authoritative; confirm against `vllm serve --help`. |
| |
| Usage (on Prime Intellect): |
| python scripts/serve_vllm.py --mode baseline --print # show the command |
| python scripts/serve_vllm.py --mode dflash --run # actually serve |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import shlex |
| import subprocess |
| import sys |
|
|
| MODEL = os.environ.get("LAGUNA_MODEL", "poolside/Laguna-XS.2") |
| SPECULATOR = os.environ.get("LAGUNA_SPECULATOR", "poolside/Laguna-XS.2-speculator.dflash") |
|
|
| |
| NUM_SPECULATIVE_TOKENS = 7 |
|
|
| |
| |
| |
| |
| QUANT_MODELS = { |
| "fp8": os.environ.get("LAGUNA_FP8_MODEL", "poolside/Laguna-XS.2-FP8"), |
| "int4": os.environ.get("LAGUNA_INT4_MODEL", "poolside/Laguna-XS.2-INT4"), |
| "nvfp4": os.environ.get("LAGUNA_NVFP4_MODEL", "poolside/Laguna-XS.2-NVFP4"), |
| } |
|
|
|
|
| def build_cmd(mode: str, max_model_len: int, tp: int, quant: str) -> list[str]: |
| model = QUANT_MODELS[quant] if mode == "quant" else MODEL |
| base = [ |
| "vllm", "serve", model, |
| "--tensor-parallel-size", str(tp), |
| "--max-model-len", str(max_model_len), |
| "--served-model-name", "laguna", |
| |
| "--tool-call-parser", "poolside_v1", |
| "--reasoning-parser", "poolside_v1", |
| "--enable-auto-tool-choice", |
| |
| |
| |
| |
| |
| "--default-chat-template-kwargs", |
| json.dumps({"enable_thinking": os.environ.get("LAGUNA_ENABLE_THINKING", "false").lower() == "true"}), |
| ] |
| if mode == "dflash": |
| spec = { |
| "model": SPECULATOR, |
| "num_speculative_tokens": NUM_SPECULATIVE_TOKENS, |
| "method": "dflash", |
| } |
| base += ["--speculative-config", json.dumps(spec)] |
| if mode == "quant": |
| |
| |
| base += ["--kv-cache-dtype", "fp8"] |
| return base |
|
|
|
|
| def main() -> None: |
| if sys.platform == "darwin": |
| print("[serve_vllm] REFUSING TO RUN: this is a Mac. vLLM needs CUDA.\n" |
| " Run this on Prime Intellect. Use --print to inspect the command here.", |
| file=sys.stderr) |
| |
|
|
| p = argparse.ArgumentParser(description="Print/run the vLLM serve command for Laguna (baseline / dflash / quant).") |
| p.add_argument("--mode", choices=["baseline", "dflash", "quant"], required=True) |
| p.add_argument("--quant", choices=["fp8", "int4", "nvfp4"], default="fp8", |
| help="Quant format for --mode quant (the fallback lane). Default fp8.") |
| p.add_argument("--max-model-len", type=int, default=16384, |
| help="Card example uses 16384; raise toward 131072/262144 if VRAM allows. Verify at onboarding.") |
| p.add_argument("--tensor-parallel-size", type=int, default=1, |
| help="Single GPU = 1. The whole hook is one-GPU serving.") |
| g = p.add_mutually_exclusive_group(required=True) |
| g.add_argument("--print", action="store_true", help="Print the command only.") |
| g.add_argument("--run", action="store_true", help="Actually exec vllm serve (venue only).") |
| args = p.parse_args() |
|
|
| cmd = build_cmd(args.mode, args.max_model_len, args.tensor_parallel_size, args.quant) |
| env_prefix = "VLLM_USE_DEEP_GEMM=0" |
| printable = f"{env_prefix} " + " ".join(shlex.quote(c) for c in cmd) |
| print(printable) |
|
|
| if args.run: |
| if sys.platform == "darwin": |
| print("[serve_vllm] --run blocked on Mac.", file=sys.stderr) |
| sys.exit(2) |
| env = dict(os.environ) |
| env["VLLM_USE_DEEP_GEMM"] = "0" |
| os.execvpe(cmd[0], cmd, env) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|