"""
Serve the DeepSWE-Preview-FP8 model using vLLM with specific configuration:
- CUDA devices 1,2
- Max model length 32000
- Tensor parallel size 2
"""
| |
|
| | import os |
| | import subprocess |
| | import sys |
| |
|
def serve_model(
    model_path="/home/op/DeepSWE-Preview-FP8",
    host="0.0.0.0",
    port=8550,
    cuda_devices="1,2",
    max_model_len=32000,
    tensor_parallel_size=2,
    pipeline_parallel_size=1,
):
    """Launch a vLLM OpenAI-compatible API server and block until it exits.

    Calls ``sys.exit(1)`` if the server process fails (non-zero exit code)
    and ``sys.exit(0)`` when interrupted with Ctrl-C.

    Args:
        model_path: Filesystem path of the model to serve.
        host: Interface the API server binds to.
        port: TCP port for the API server.
        cuda_devices: Value assigned to ``CUDA_VISIBLE_DEVICES``
            (comma-separated GPU ids).
        max_model_len: Maximum model context length.
        tensor_parallel_size: Number of GPUs used for tensor parallelism.
        pipeline_parallel_size: Number of pipeline-parallel stages.
    """
    # Restrict the server to the requested GPUs; must be set before vLLM starts.
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_devices

    # Use the running interpreter (sys.executable) rather than whatever
    # "python" resolves to on PATH, so vLLM runs in this script's environment.
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--host", host,
        "--port", str(port),
        "--model", model_path,
        "--max-model-len", str(max_model_len),
        "--tensor-parallel-size", str(tensor_parallel_size),
        "--pipeline-parallel-size", str(pipeline_parallel_size),
    ]

    print("Starting vLLM server with the following configuration:")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")
    print(f"Model path: {model_path}")
    print(f"Max model length: {max_model_len}")
    print(f"Tensor parallel size: {tensor_parallel_size}")
    print(f"Pipeline parallel size: {pipeline_parallel_size}")
    print("\nCommand:", " ".join(cmd))
    print("\n" + "=" * 50)

    try:
        # check=True converts a non-zero server exit code into
        # CalledProcessError so it can be reported below.
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running vLLM server: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nServer stopped by user")
        sys.exit(0)
| |
|
# Start the server only when run as a script, not when imported as a module.
if __name__ == "__main__":
    serve_model()
| |
|