"""
Serve the DeepSWE-Preview-FP8 model using vLLM with specific configuration:
- CUDA devices 1,2
- Max model length 32000
- Tensor parallel size 2
"""
| |
|
| | import os |
| | import subprocess |
| | import sys |
| |
|
def serve_model(
    model_path="/home/op/DeepSWE-Preview-FP8",
    host="0.0.0.0",
    port=8550,
    cuda_devices="1,2",
    max_model_len=32000,
    tensor_parallel_size=2,
    pipeline_parallel_size=1,
):
    """Launch a vLLM OpenAI-compatible API server and block until it exits.

    Calls ``sys.exit(1)`` if the server process fails (non-zero exit code)
    and ``sys.exit(0)`` when interrupted with Ctrl-C.

    Args:
        model_path: Filesystem path of the model to serve.
        host: Interface the API server binds to.
        port: TCP port for the API server.
        cuda_devices: Value assigned to ``CUDA_VISIBLE_DEVICES``
            (comma-separated GPU ids).
        max_model_len: Maximum model context length.
        tensor_parallel_size: Number of GPUs used for tensor parallelism.
        pipeline_parallel_size: Number of pipeline-parallel stages.
    """
    # Restrict the server to the requested GPUs; must be set before vLLM starts.
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_devices

    # Use the running interpreter (sys.executable) rather than whatever
    # "python" resolves to on PATH, so vLLM runs in this script's environment.
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--host", host,
        "--port", str(port),
        "--model", model_path,
        "--max-model-len", str(max_model_len),
        "--tensor-parallel-size", str(tensor_parallel_size),
        "--pipeline-parallel-size", str(pipeline_parallel_size),
    ]

    print("Starting vLLM server with the following configuration:")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")
    print(f"Model path: {model_path}")
    print(f"Max model length: {max_model_len}")
    print(f"Tensor parallel size: {tensor_parallel_size}")
    print(f"Pipeline parallel size: {pipeline_parallel_size}")
    print("\nCommand:", " ".join(cmd))
    print("\n" + "=" * 50)

    try:
        # check=True converts a non-zero server exit code into
        # CalledProcessError so it can be reported below.
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running vLLM server: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nServer stopped by user")
        sys.exit(0)
| |
|
# Start the server only when run as a script, not when imported as a module.
if __name__ == "__main__":
    serve_model()
| |
|