Upload folder using huggingface_hub

8beaa8b verified about 5 hours ago

6.34 kB

	#!/usr/bin/env python3
	"""
	serve_vllm.py — VENUE ONLY (Prime Intellect, CUDA GPU). DOES NOT RUN ON THE MAC.

	This is a thin, documented wrapper that prints (and optionally execs) the exact
	`vllm serve` command for three configs:

	1. baseline — Laguna XS.2 alone (the speed floor).
	2. dflash — Laguna XS.2 + the DFlash speculator (the speed we're claiming).
	3. quant — a quantized Laguna checkpoint (FP8/INT4/NVFP4) + FP8 KV cache.
	   This is the FALLBACK lane (see FALLBACK_QUANT.md): if DFlash hits
	   a vLLM-version/draft-model snag at the venue, a quantized weights
	   checkpoint still tells a clean single-GPU story (smaller footprint,
	   FP8 KV cache ~doubles concurrent trajectories per the [TR]).

	baseline vs dflash are IDENTICAL except for --speculative-config — flip one flag,
	get faster tokens, same greedy output. quant is a different lever (shrink each
	pass instead of cutting passes); the two can stack, but the fallback keeps it
	simple with quant alone.

	Grounding (cite at the demo):
	- DFlash config shape is from the HF model card
	  huggingface.co/poolside/Laguna-XS.2-speculator.dflash:
	      --speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash",
	      "num_speculative_tokens":7,"method":"dflash"}'
	- num_speculative_tokens = 7 is the card's value (this is gamma, the draft length).
	- vLLM >= 0.21.0 and VLLM_USE_DEEP_GEMM=0 per the card.
	- parsers --tool-call-parser poolside_v1 / --reasoning-parser poolside_v1 per the card.

	VERIFY AT ONBOARDING: exact vLLM version on the PI image, whether
	--trust-remote-code is required, and whether `method` is spelled "dflash"
	in the build you get. The card is authoritative; confirm against `vllm serve --help`.

	Usage (on Prime Intellect):
	python scripts/serve_vllm.py --mode baseline --print # show the command
	python scripts/serve_vllm.py --mode dflash --run # actually serve
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import shlex
	import subprocess
	import sys

	# Target checkpoint and DFlash draft checkpoint. Each default can be replaced
	# through its environment variable (LAGUNA_MODEL / LAGUNA_SPECULATOR).
	MODEL = os.getenv("LAGUNA_MODEL", "poolside/Laguna-XS.2")
	SPECULATOR = os.getenv("LAGUNA_SPECULATOR", "poolside/Laguna-XS.2-speculator.dflash")

	# Draft length (gamma); the value given on the DFlash model card.
	NUM_SPECULATIVE_TOKENS = 7

	# Fallback-lane quantized checkpoints, keyed by quant format. Per the [TR], XS.2
	# ships FP8 (W8A8), INT4 (W4A16/AWQ) and NVFP4 quants in the HF collection. The
	# EXACT repo names are NOT confirmed pre-event — the defaults below are documented
	# placeholders; VERIFY AT ONBOARDING against
	# huggingface.co/collections/poolside/laguna-xs2, or override via env:
	#   LAGUNA_FP8_MODEL   (default poolside/Laguna-XS.2-FP8)
	#   LAGUNA_INT4_MODEL  (default poolside/Laguna-XS.2-INT4)
	#   LAGUNA_NVFP4_MODEL (default poolside/Laguna-XS.2-NVFP4)
	QUANT_MODELS = {
	    fmt: os.getenv(f"LAGUNA_{fmt.upper()}_MODEL", f"poolside/Laguna-XS.2-{fmt.upper()}")
	    for fmt in ("fp8", "int4", "nvfp4")
	}


	def build_cmd(mode: str, max_model_len: int, tp: int, quant: str) -> list[str]:
	    """Assemble the ``vllm serve`` argv for one serving configuration.

	    Args:
	        mode: "baseline", "dflash" or "quant". "dflash" appends
	            ``--speculative-config``; "quant" swaps in the quantized checkpoint
	            and appends ``--kv-cache-dtype fp8``. Any other value produces the
	            same argv as "baseline" (no extra flags are added).
	        max_model_len: Value passed to ``--max-model-len``.
	        tp: Value passed to ``--tensor-parallel-size``.
	        quant: Key into ``QUANT_MODELS``; only looked up when ``mode == "quant"``.

	    Returns:
	        The argument vector, starting with ``"vllm", "serve", <model>``.

	    Raises:
	        KeyError: If ``mode == "quant"`` and ``quant`` is not in ``QUANT_MODELS``.
	    """
	    checkpoint = MODEL if mode != "quant" else QUANT_MODELS[quant]

	    # enable_thinking: the Laguna chat template defaults this FALSE. Keeping it
	    # false makes rollouts/decode non-thinking (fewer tokens, faster) and keeps the
	    # greedy A/B clean. NOTE: the hosted pinference endpoint IGNORES this flag
	    # (verified — see autoresearch/findings.md); it only takes effect on a
	    # self-served vLLM like this one. Override with LAGUNA_ENABLE_THINKING=true.
	    thinking_flag = os.environ.get("LAGUNA_ENABLE_THINKING", "false")
	    template_kwargs = {"enable_thinking": thinking_flag.lower() == "true"}

	    argv = ["vllm", "serve", checkpoint]
	    argv.extend(["--tensor-parallel-size", str(tp)])
	    argv.extend(["--max-model-len", str(max_model_len)])
	    argv.extend(["--served-model-name", "laguna"])
	    # Poolside-specific parsers (from the model card).
	    argv.extend(["--tool-call-parser", "poolside_v1"])
	    argv.extend(["--reasoning-parser", "poolside_v1"])
	    argv.append("--enable-auto-tool-choice")
	    argv.extend(["--default-chat-template-kwargs", json.dumps(template_kwargs)])

	    if mode == "dflash":
	        # The only difference from baseline: attach the draft model.
	        speculative = {
	            "model": SPECULATOR,
	            "num_speculative_tokens": NUM_SPECULATIVE_TOKENS,
	            "method": "dflash",
	        }
	        argv.extend(["--speculative-config", json.dumps(speculative)])
	    elif mode == "quant":
	        # FP8 KV cache is the high-leverage single-GPU win ([TR]: ~2x concurrent
	        # trajectories). Weight quant is auto-detected from the checkpoint config.
	        argv.extend(["--kv-cache-dtype", "fp8"])

	    return argv


	def main() -> None:
	    """Parse CLI flags, print the ``vllm serve`` command, and optionally exec it.

	    The command is always printed to stdout, prefixed with the
	    ``VLLM_USE_DEEP_GEMM=0`` assignment so the line can be pasted into a shell.
	    With ``--run`` the current process is then replaced by ``vllm serve`` with
	    ``VLLM_USE_DEEP_GEMM=0`` added to its environment.

	    Exit status:
	        2   ``--run`` was requested on macOS (``sys.platform == "darwin"``);
	            ``--print`` is still allowed there for inspection. argparse also
	            exits with 2 on invalid arguments.
	        127 ``--run`` was requested but the ``vllm`` executable could not be
	            started (e.g. it is not on PATH).
	    """
	    p = argparse.ArgumentParser(description="Print/run the vLLM serve command for Laguna (baseline / dflash / quant).")
	    p.add_argument("--mode", choices=["baseline", "dflash", "quant"], required=True)
	    p.add_argument("--quant", choices=["fp8", "int4", "nvfp4"], default="fp8",
	                   help="Quant format for --mode quant (the fallback lane). Default fp8.")
	    p.add_argument("--max-model-len", type=int, default=16384,
	                   help="Card example uses 16384; raise toward 131072/262144 if VRAM allows. Verify at onboarding.")
	    p.add_argument("--tensor-parallel-size", type=int, default=1,
	                   help="Single GPU = 1. The whole hook is one-GPU serving.")
	    g = p.add_mutually_exclusive_group(required=True)
	    g.add_argument("--print", action="store_true", help="Print the command only.")
	    g.add_argument("--run", action="store_true", help="Actually exec vllm serve (venue only).")
	    args = p.parse_args()

	    cmd = build_cmd(args.mode, args.max_model_len, args.tensor_parallel_size, args.quant)

	    # Single source for the env override (per the model card) so the printed
	    # prefix and the environment actually passed to exec cannot drift apart.
	    env_name, env_value = "VLLM_USE_DEEP_GEMM", "0"
	    printable = f"{env_name}={env_value} " + " ".join(shlex.quote(c) for c in cmd)
	    print(printable)

	    if not args.run:
	        return

	    # Only --run is refused on a Mac; --print above stays usable for inspection,
	    # so the refusal banner is shown only when something is actually refused.
	    if sys.platform == "darwin":
	        print("[serve_vllm] REFUSING TO RUN: this is a Mac. vLLM needs CUDA.\n"
	              " Run this on Prime Intellect. Use --print to inspect the command here.",
	              file=sys.stderr)
	        print("[serve_vllm] --run blocked on Mac.", file=sys.stderr)
	        sys.exit(2)

	    env = {**os.environ, env_name: env_value}

	    # exec* replaces the process image without flushing Python's stdio buffers;
	    # flush explicitly or the command printed above is lost when stdout is piped.
	    sys.stdout.flush()
	    sys.stderr.flush()
	    try:
	        os.execvpe(cmd[0], cmd, env)
	    except OSError as exc:
	        # Reached only if the exec itself failed (e.g. vllm is not installed).
	        print(f"[serve_vllm] could not exec {cmd[0]!r}: {exc}", file=sys.stderr)
	        sys.exit(127)


	if __name__ == "__main__":
	    main()