#!/usr/bin/env python3
"""
vLLM API server launcher for Qwen3TerminatorForCausalLM.
Imports vllm_terminator BEFORE vLLM initialises, which registers
Qwen3TerminatorForCausalLM with vLLM's ModelRegistry.
NOTE: Terminator currently supports single-GPU, single-sequence inference only.
Tensor parallelism and concurrent sequences are not supported.
Environment variables:
VLLM_MODEL β€” path to terminator model directory (required)
VLLM_PORT β€” port (default 8000)
VLLM_GPU_UTIL β€” GPU memory fraction (default 0.90)
VLLM_MAX_MODEL_LEN β€” max context length
VLLM_DTYPE β€” dtype (default "auto")
VLLM_API_KEY β€” require this API key from clients
VLLM_SERVED_NAME β€” override served model name
VLLM_HOST β€” bind address (default 0.0.0.0)
NO_PREFIX_CACHING β€” set to 1 to disable prefix caching
VLLM_ENFORCE_EAGER β€” set to 1 to disable CUDA graphs (default 0)
REASONING_PARSER β€” set to "qwen3" to enable <think>/</think> parsing
(splits `reasoning` from `content` in API responses)
Example:
VLLM_MODEL=./model_dir python serve.py
"""
import os
import runpy
import sys
# -----------------------------------------------------------------------
# CRITICAL: import vllm_terminator HERE, before any vLLM code runs.
# This registers Qwen3TerminatorForCausalLM with vLLM's ModelRegistry.
# -----------------------------------------------------------------------
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import vllm_terminator # noqa: F401 (registers the model as a side effect)
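# For reference, the import-time side effect above is assumed to be the
# standard vLLM out-of-tree registration, roughly (a sketch, not the actual
# plugin source; the modeling module name is hypothetical):
#
#     from vllm import ModelRegistry
#     from vllm_terminator.modeling_terminator import Qwen3TerminatorForCausalLM
#     ModelRegistry.register_model(
#         "Qwen3TerminatorForCausalLM", Qwen3TerminatorForCausalLM
#     )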


def env(name, default=None, required=False):
    """Read an environment variable, exiting if a required one is missing."""
    v = os.environ.get(name, default)
    if required and (v is None or v == ""):
        print(f"Missing required env var: {name}", file=sys.stderr)
        sys.exit(2)
    return v


def main():
    model = env("VLLM_MODEL", required=True)
    host = env("VLLM_HOST", "0.0.0.0")
    port = env("VLLM_PORT", "8000")
    max_len = env("VLLM_MAX_MODEL_LEN", None)
    gpu_util = env("VLLM_GPU_UTIL", "0.90")
    served_name = env("VLLM_SERVED_NAME", None)
    dtype = env("VLLM_DTYPE", "auto")
    api_key = env("VLLM_API_KEY", None)
    no_prefix_caching = env("NO_PREFIX_CACHING", "0")
    enforce_eager = env("VLLM_ENFORCE_EAGER", "0")
    reasoning_parser = env("REASONING_PARSER", None)

    # Build the CLI argv that vLLM's OpenAI-compatible server expects.
    argv = [
        "vllm.entrypoints.openai.api_server",
        "--model", model,
        "--host", host,
        "--port", str(port),
        "--dtype", dtype,
        "--gpu-memory-utilization", str(gpu_util),
        # Terminator constraint: single GPU, one sequence at a time.
        "--tensor-parallel-size", "1",
        "--max-num-seqs", "1",
    ]
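    # With the defaults above this resolves to, in effect:
    #   python -m vllm.entrypoints.openai.api_server --model $VLLM_MODEL \
    #       --host 0.0.0.0 --port 8000 --dtype auto \
    #       --gpu-memory-utilization 0.90 --tensor-parallel-size 1 \
    #       --max-num-seqs 1
    # (run here in-process via runpy rather than as a subprocess, so the
    # model registration from the import above stays in effect).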
    # Optional flags, added only when the corresponding env var is set.
    if served_name:
        argv += ["--served-model-name", served_name]
    if max_len:
        argv += ["--max-model-len", str(max_len)]
    if api_key:
        argv += ["--api-key", api_key]
    if no_prefix_caching == "1":
        argv += ["--no-enable-prefix-caching"]
    if enforce_eager == "1":
        argv += ["--enforce-eager"]
    if reasoning_parser:
        argv += ["--reasoning-parser", reasoning_parser]
print(f"Launching vLLM Terminator server with:\n " + " ".join(argv[1:]), flush=True)
# Replace sys.argv so vLLM's argparse sees these arguments, then run the
# server module in-process (so vllm_terminator registration persists).
sys.argv = argv
runpy.run_module("vllm.entrypoints.openai.api_server", run_name="__main__")


if __name__ == "__main__":
    main()
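
# Example client call once the server is up, using vLLM's OpenAI-compatible
# API (a sketch; the model id below is an assumption: unless VLLM_SERVED_NAME
# is set, vLLM serves the model under the path passed via --model):
#
#     from openai import OpenAI
#
#     client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
#     resp = client.chat.completions.create(
#         model="./model_dir",
#         messages=[{"role": "user", "content": "Hello"}],
#     )
#     print(resp.choices[0].message.content)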