# DeepSWE-Preview-FP8 / serve_model.py
# Author: groxaxo
# Uploaded via huggingface_hub (commit f0d12fa, verified)
#!/usr/bin/env python3
"""
Script to serve the DeepSWE-Preview-FP8 model using vLLM with specific configurations:
- CUDA devices 1,2
- Max model length 32000
- Tensor parallel size 2
"""
import os
import subprocess
import sys
def build_command():
    """Build the argv for the vLLM OpenAI-compatible API server.

    Returns:
        list[str]: command for ``subprocess.run`` that launches vLLM with
        the fixed model path, port 8550, max model length 32000, tensor
        parallelism 2 and pipeline parallelism 1.
    """
    return [
        # sys.executable guarantees the child uses the same interpreter
        # (and virtualenv) as this script, unlike a bare "python".
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--host", "0.0.0.0",
        "--port", "8550",
        "--model", "/home/op/DeepSWE-Preview-FP8",  # absolute path to the local model weights
        "--max-model-len", "32000",
        "--tensor-parallel-size", "2",
        "--pipeline-parallel-size", "1",
    ]


def serve_model():
    """Launch the vLLM server for DeepSWE-Preview-FP8 on GPUs 1 and 2.

    Side effects:
        - Sets ``CUDA_VISIBLE_DEVICES=1,2`` in this process's environment
          (inherited by the child server process).
        - Blocks until the server exits; calls ``sys.exit(1)`` if the
          server fails and ``sys.exit(0)`` on Ctrl-C.
    """
    # Restrict the server to GPUs 1 and 2 (tensor-parallel across both).
    os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

    cmd = build_command()

    print("Starting vLLM server with the following configuration:")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")
    print("Model path: /home/op/DeepSWE-Preview-FP8")
    print("Max model length: 32000")
    print("Tensor parallel size: 2")
    print("Pipeline parallel size: 1")
    print("\nCommand:", " ".join(cmd))
    print("\n" + "="*50)

    # Run the server in the foreground; check=True turns a non-zero exit
    # status into CalledProcessError so failures are reported explicitly.
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running vLLM server: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nServer stopped by user")
        sys.exit(0)
# Script entry point: start the vLLM server when run directly.
if __name__ == "__main__":
    serve_model()