lean-laguna / scripts /serve_vllm.py
art87able's picture
Upload folder using huggingface_hub
8beaa8b verified
#!/usr/bin/env python3
"""
serve_vllm.py — VENUE ONLY (Prime Intellect, CUDA GPU). DOES NOT RUN ON THE MAC.
This is a thin, documented wrapper that prints (and optionally execs) the exact
`vllm serve` command for three configs:
1. baseline — Laguna XS.2 alone (the speed floor).
2. dflash — Laguna XS.2 + the DFlash speculator (the speed we're claiming).
3. quant — a quantized Laguna checkpoint (FP8/INT4/NVFP4) + FP8 KV cache.
This is the FALLBACK lane (see FALLBACK_QUANT.md): if DFlash hits
a vLLM-version/draft-model snag at the venue, a quantized weights
checkpoint still tells a clean single-GPU story (smaller footprint,
FP8 KV cache ~doubles concurrent trajectories per the [TR]).
baseline vs dflash are IDENTICAL except for --speculative-config — flip one flag,
get faster tokens, same greedy output. quant is a different lever (shrink each
pass instead of cutting passes); the two can stack, but the fallback keeps it
simple with quant alone.
Grounding (cite at the demo):
- DFlash config shape is from the HF model card
huggingface.co/poolside/Laguna-XS.2-speculator.dflash:
--speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash",
"num_speculative_tokens":7,"method":"dflash"}'
- num_speculative_tokens = 7 is the card's value (this is gamma, the draft length).
- vLLM >= 0.21.0 and VLLM_USE_DEEP_GEMM=0 per the card.
- parsers --tool-call-parser poolside_v1 / --reasoning-parser poolside_v1 per the card.
VERIFY AT ONBOARDING: exact vLLM version on the PI image, whether
--trust-remote-code is required, and whether `method` is spelled "dflash"
in the build you get. The card is authoritative; confirm against `vllm serve --help`.
Usage (on Prime Intellect):
python scripts/serve_vllm.py --mode baseline --print # show the command
python scripts/serve_vllm.py --mode dflash --run # actually serve
"""
from __future__ import annotations
import argparse
import json
import os
import shlex
import subprocess
import sys
# Base checkpoint and its DFlash draft model; both can be overridden from the
# environment without touching this file.
MODEL = os.getenv("LAGUNA_MODEL", "poolside/Laguna-XS.2")
SPECULATOR = os.getenv("LAGUNA_SPECULATOR", "poolside/Laguna-XS.2-speculator.dflash")

# gamma (draft length) — the value given on the DFlash model card.
NUM_SPECULATIVE_TOKENS = 7

# Fallback-lane checkpoints, keyed by the value accepted by --quant.
# The [TR] says XS.2 ships FP8 (W8A8), INT4 (W4A16/AWQ) and NVFP4 quants in the
# HF collection, but the EXACT repo names are NOT confirmed pre-event. The
# defaults below are documented placeholders: VERIFY AT ONBOARDING against
# huggingface.co/collections/poolside/laguna-xs2, or override via the env var
# listed next to each entry.
QUANT_MODELS = {
    fmt: os.getenv(env_var, placeholder)
    for fmt, env_var, placeholder in (
        ("fp8", "LAGUNA_FP8_MODEL", "poolside/Laguna-XS.2-FP8"),
        ("int4", "LAGUNA_INT4_MODEL", "poolside/Laguna-XS.2-INT4"),
        ("nvfp4", "LAGUNA_NVFP4_MODEL", "poolside/Laguna-XS.2-NVFP4"),
    )
}
def build_cmd(mode: str, max_model_len: int, tp: int, quant: str) -> list[str]:
    """Assemble the `vllm serve` argv for one of the three serving configs.

    Args:
        mode: "baseline", "dflash" or "quant". Any other value yields the same
            argv as "baseline" (no speculative config, no KV-cache override).
        max_model_len: Value passed to --max-model-len.
        tp: Value passed to --tensor-parallel-size.
        quant: Key into QUANT_MODELS; only consulted when mode == "quant".

    Returns:
        The argv as a list of strings, starting with "vllm", "serve", <model>.

    Raises:
        KeyError: If mode == "quant" and `quant` is not a QUANT_MODELS key.
    """
    # The quant lane swaps in a different checkpoint; every other mode serves
    # the base model.
    if mode == "quant":
        checkpoint = QUANT_MODELS[quant]
    else:
        checkpoint = MODEL

    argv = ["vllm", "serve", checkpoint]
    argv.extend(["--tensor-parallel-size", str(tp)])
    argv.extend(["--max-model-len", str(max_model_len)])
    argv.extend(["--served-model-name", "laguna"])

    # Poolside-specific parsers, as given on the model card.
    argv.extend(["--tool-call-parser", "poolside_v1"])
    argv.extend(["--reasoning-parser", "poolside_v1"])
    argv.append("--enable-auto-tool-choice")

    # enable_thinking: the Laguna chat template defaults to FALSE. It stays
    # false here so rollouts/decode are non-thinking (fewer tokens, faster) and
    # the greedy A/B comparison stays clean.
    # NOTE: the hosted pinference endpoint IGNORES this flag (verified — see
    # autoresearch/findings.md); it only takes effect on a self-served vLLM
    # like this one. Set LAGUNA_ENABLE_THINKING=true to turn it on.
    thinking_raw = os.environ.get("LAGUNA_ENABLE_THINKING", "false")
    template_kwargs = {"enable_thinking": thinking_raw.lower() == "true"}
    argv.extend(["--default-chat-template-kwargs", json.dumps(template_kwargs)])

    if mode == "dflash":
        # The only difference from baseline: attach the draft model.
        speculative = {
            "model": SPECULATOR,
            "num_speculative_tokens": NUM_SPECULATIVE_TOKENS,
            "method": "dflash",
        }
        argv.extend(["--speculative-config", json.dumps(speculative)])

    if mode == "quant":
        # FP8 KV cache is the high-leverage single-GPU win ([TR]: ~2x
        # concurrent trajectories). No weight-quant flag is passed here; per
        # the original note it is auto-detected from the checkpoint config.
        argv.extend(["--kv-cache-dtype", "fp8"])

    return argv
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: print the `vllm serve` command and optionally exec it.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior of a plain ``main()``.

    Behavior:
        * Always prints the shell-quoted command, prefixed with
          ``VLLM_USE_DEEP_GEMM=0``, to stdout.
        * With ``--print`` nothing else happens.
        * With ``--run`` the current process is replaced by ``vllm serve``
          with ``VLLM_USE_DEEP_GEMM=0`` added to the environment.

    Exits:
        2   if ``--run`` is requested on macOS (also argparse's usage-error code).
        127 if ``--run`` is requested but the ``vllm`` executable is not on PATH.
    """
    on_mac = sys.platform == "darwin"
    if on_mac:
        # Loud up-front notice. --print is still allowed for inspection; only
        # --run is blocked further down.
        print("[serve_vllm] REFUSING TO RUN: this is a Mac. vLLM needs CUDA.\n"
              " Run this on Prime Intellect. Use --print to inspect the command here.",
              file=sys.stderr)

    p = argparse.ArgumentParser(description="Print/run the vLLM serve command for Laguna (baseline / dflash / quant).")
    p.add_argument("--mode", choices=["baseline", "dflash", "quant"], required=True)
    p.add_argument("--quant", choices=["fp8", "int4", "nvfp4"], default="fp8",
                   help="Quant format for --mode quant (the fallback lane). Default fp8.")
    p.add_argument("--max-model-len", type=int, default=16384,
                   help="Card example uses 16384; raise toward 131072/262144 if VRAM allows. Verify at onboarding.")
    p.add_argument("--tensor-parallel-size", type=int, default=1,
                   help="Single GPU = 1. The whole hook is one-GPU serving.")
    # Exactly one of --print / --run must be given.
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument("--print", action="store_true", help="Print the command only.")
    g.add_argument("--run", action="store_true", help="Actually exec vllm serve (venue only).")
    args = p.parse_args(argv)

    cmd = build_cmd(args.mode, args.max_model_len, args.tensor_parallel_size, args.quant)
    env_prefix = "VLLM_USE_DEEP_GEMM=0"
    printable = f"{env_prefix} " + " ".join(shlex.quote(c) for c in cmd)
    print(printable)

    if not args.run:
        return

    if on_mac:
        print("[serve_vllm] --run blocked on Mac.", file=sys.stderr)
        sys.exit(2)

    env = dict(os.environ)
    env["VLLM_USE_DEEP_GEMM"] = "0"  # per the model card

    # exec replaces the process image WITHOUT flushing Python's buffered
    # streams. When stdout is a pipe or a log file (block-buffered), the command
    # printed above would otherwise be lost, so flush explicitly first.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvpe(cmd[0], cmd, env)
    except FileNotFoundError:
        # vLLM is not installed / not on PATH: fail with a clear message and
        # the conventional "command not found" status instead of a traceback.
        print(f"[serve_vllm] '{cmd[0]}' not found on PATH; is vLLM installed in this environment?",
              file=sys.stderr)
        sys.exit(127)


if __name__ == "__main__":
    main()