# VideoAgent_V01/VideoAgent/_server/vlm_server.py
import os
import subprocess

from dotenv import load_dotenv

load_dotenv()
MODEL_PATH = os.getenv("VLM_MODEL_PATH", "")
MODEL_NAME = os.getenv("VLM_MODEL_NAME", "")
GPU_DEVICE_ID = os.getenv("CUDA_VISIBLE_DEVICES", "1")  # Default to GPU 1 if not specified
PORT = os.getenv("VLM_API_PORT", "8008")
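
# Example .env for the variables read above (illustrative values only; the
# model path and name are hypothetical and must match your local setup):
#
#   VLM_MODEL_PATH=/models/your-vlm-checkpoint
#   VLM_MODEL_NAME=my-vlm
#   VLM_API_PORT=8008
#   CUDA_VISIBLE_DEVICES=1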
def start_llm_server():
    """
    Start the vLLM server with the Qwen3 model on the specified GPU.
    """
    # Restrict the server to the chosen GPU via CUDA_VISIBLE_DEVICES.
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = GPU_DEVICE_ID

    cmd = [
        "vllm", "serve", MODEL_PATH,
        "--trust-remote-code",
        "--dtype", "half",
        "--port", str(PORT),
        "--max-model-len", "4096",
        "--served-model-name", MODEL_NAME,
        "--gpu-memory-utilization", "0.35",
        # "--max-num-batched-tokens", "1024"
    ]

    print(f"Starting vLLM server on GPU {GPU_DEVICE_ID}")
    # print("Command:", " ".join(cmd))

    try:
        # Launch the server as a subprocess with the GPU-restricted environment.
        process = subprocess.Popen(cmd, env=env)
        # Block until the process exits (it runs indefinitely since it's a server).
        process.wait()
    except KeyboardInterrupt:
        print("\nServer stopped by user.")
        process.terminate()
        process.wait()


if __name__ == "__main__":
    start_llm_server()
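
# Usage sketch: once running, the server exposes vLLM's OpenAI-compatible API
# on the configured port. The model name passed by the client must match
# --served-model-name above (VLM_MODEL_NAME); the values below are examples:
#
#   curl http://localhost:8008/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "my-vlm", "messages": [{"role": "user", "content": "Hello"}]}'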