"""
vLLM API server launcher for Qwen3TerminatorForCausalLM.

Importing vllm_terminator BEFORE vLLM initialises registers
Qwen3TerminatorForCausalLM with vLLM's ModelRegistry.

NOTE: Terminator currently supports single-GPU, single-sequence inference only.
Tensor parallelism and concurrent sequences are not supported.

Environment variables:
    VLLM_MODEL          path to the Terminator model directory (required)
    VLLM_PORT           port (default 8000)
    VLLM_GPU_UTIL       GPU memory fraction (default 0.90)
    VLLM_MAX_MODEL_LEN  max context length
    VLLM_DTYPE          dtype (default "auto")
    VLLM_API_KEY        require this API key from clients
    VLLM_SERVED_NAME    override the served model name
    VLLM_HOST           bind address (default 0.0.0.0)
    NO_PREFIX_CACHING   set to 1 to disable prefix caching
    VLLM_ENFORCE_EAGER  set to 1 to disable CUDA graphs (default 0)
    REASONING_PARSER    set to "qwen3" to enable <think>/</think> parsing
                        (splits `reasoning` from `content` in API responses)

Example:
    VLLM_MODEL=./model_dir python serve.py
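    REASONING_PARSER=qwen3 VLLM_MODEL=./model_dir python serve.py

Once running, the server speaks vLLM's OpenAI-compatible HTTP API. A minimal
client sketch (assumes the default host/port, no VLLM_API_KEY, the `requests`
package, and that the served model name falls back to the model path because
VLLM_SERVED_NAME is unset):

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "./model_dir",
            "messages": [{"role": "user", "content": "Hello"}],
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])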
| """ |

import os
import runpy
import sys

# Make the directory containing this script importable, then import
# vllm_terminator for its side effect: registering Qwen3TerminatorForCausalLM
# with vLLM's ModelRegistry before the API server module is loaded.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import vllm_terminator  # noqa: E402,F401


def env(name, default=None, required=False):
    """Read an environment variable, exiting with an error if it is required but unset."""
    v = os.environ.get(name, default)
    if required and (v is None or v == ""):
        print(f"Missing required env var: {name}", file=sys.stderr)
        sys.exit(2)
    return v


def main():
    model = env("VLLM_MODEL", required=True)
    host = env("VLLM_HOST", "0.0.0.0")
    port = env("VLLM_PORT", "8000")
    max_len = env("VLLM_MAX_MODEL_LEN", None)
    gpu_util = env("VLLM_GPU_UTIL", "0.90")
    served_name = env("VLLM_SERVED_NAME", None)
    dtype = env("VLLM_DTYPE", "auto")
    api_key = env("VLLM_API_KEY", None)
    no_prefix_caching = env("NO_PREFIX_CACHING", "0")
    enforce_eager = env("VLLM_ENFORCE_EAGER", "0")
    reasoning_parser = env("REASONING_PARSER", None)

    # Terminator supports single-GPU, single-sequence inference only (see module
    # docstring), so tensor parallelism and the number of concurrent sequences
    # are pinned to 1.
    argv = [
        "vllm.entrypoints.openai.api_server",
        "--model", model,
        "--host", host,
        "--port", str(port),
        "--dtype", dtype,
        "--gpu-memory-utilization", str(gpu_util),
        "--tensor-parallel-size", "1",
        "--max-num-seqs", "1",
    ]

    if served_name:
        argv += ["--served-model-name", served_name]
    if max_len:
        argv += ["--max-model-len", str(max_len)]
    if api_key:
        argv += ["--api-key", api_key]
    if no_prefix_caching == "1":
        argv += ["--no-enable-prefix-caching"]
    if enforce_eager == "1":
        argv += ["--enforce-eager"]
    if reasoning_parser:
        argv += ["--reasoning-parser", reasoning_parser]

    print("Launching vLLM Terminator server with:\n " + " ".join(argv[1:]), flush=True)

    # Hand the assembled argument list to vLLM's OpenAI-compatible API server and
    # run it as if it had been launched with
    # `python -m vllm.entrypoints.openai.api_server ...`.
    sys.argv = argv
    runpy.run_module("vllm.entrypoints.openai.api_server", run_name="__main__")


if __name__ == "__main__":
    main()