AXERA-TECH
/

VideoAgent-AX650N

Model card Files Files and versions

VideoAgent-AX650N / VideoAgent /_server /llm_server.py

H022329's picture

Upload folder using huggingface_hub

9cf08e9 verified about 1 month ago

History Blame Contribute Delete

1.38 kB

	import subprocess
	import sys
	import os
	from dotenv import load_dotenv

	# Load variables from a .env file (python-dotenv) into the process
	# environment so the os.getenv() lookups below can see them.
	load_dotenv()
	# Model path/identifier passed as the positional argument to `vllm serve`;
	# empty string when LLM_MODEL_PATH is unset.
	MODEL_PATH = os.getenv("LLM_MODEL_PATH", "")
	# Value passed to `--served-model-name`; empty string when LLM_MODEL_NAME is unset.
	MODEL_NAME = os.getenv("LLM_MODEL_NAME", "")
	GPU_DEVICE_ID = os.getenv("CUDA_VISIBLE_DEVICES", "1") # Defaults to GPU "1" (not GPU 0) if not specified
	# NOTE(review): PORT is a str when LLM_API_PORT is set but the int 8009 when
	# it is unset; start_llm_server() normalizes it with str() before use.
	PORT = os.getenv("LLM_API_PORT", 8009)
	def start_llm_server():
	    """Launch ``vllm serve`` as a child process and block until it exits.

	    The command line is built from the module-level settings ``MODEL_PATH``,
	    ``MODEL_NAME`` and ``PORT``. The child receives a copy of the current
	    environment in which ``CUDA_VISIBLE_DEVICES`` is set to
	    ``GPU_DEVICE_ID``.

	    On Ctrl+C (``KeyboardInterrupt``) the child is asked to terminate. If it
	    has not exited within the shutdown timeout it is killed, so this call
	    cannot hang forever on a child that ignores the termination request.

	    Returns:
	        The child's exit code, or ``None`` if the interrupt arrived before
	        the child process was created.

	    Raises:
	        OSError: if the ``vllm`` executable cannot be started (for example
	            ``FileNotFoundError`` when it is not on ``PATH``).
	    """
	    # Seconds to wait after terminate() before escalating to kill().
	    shutdown_timeout = 30

	    # Pin the child to the configured GPU without mutating our own environment.
	    env = os.environ.copy()
	    env["CUDA_VISIBLE_DEVICES"] = GPU_DEVICE_ID

	    cmd = [
	        "vllm", "serve", MODEL_PATH,
	        "--trust-remote-code",
	        "--dtype", "half",
	        # PORT may be an int (default) or a str (from the environment).
	        "--port", str(PORT),
	        "--max-model-len", "4096",
	        "--served-model-name", MODEL_NAME,
	        "--gpu-memory-utilization", "0.5",
	        # "--max-num-batched-tokens", "1024",  # optional; currently disabled
	    ]

	    print(f"Starting vLLM server on GPU {GPU_DEVICE_ID}")

	    # Bind the name before the try block so the interrupt handler never sees
	    # an unbound `process` if Ctrl+C lands while Popen is still starting.
	    process = None
	    try:
	        process = subprocess.Popen(cmd, env=env)
	        # Blocks for the lifetime of the server.
	        process.wait()
	    except KeyboardInterrupt:
	        print("\nServer stopped by user.")
	        if process is not None:
	            process.terminate()
	            try:
	                process.wait(timeout=shutdown_timeout)
	            except subprocess.TimeoutExpired:
	                # The child ignored the termination request; force it down
	                # and reap it so no zombie process is left behind.
	                process.kill()
	                process.wait()

	    return process.returncode if process is not None else None

	# Start the server only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	start_llm_server()