#!/usr/bin/env bash
#
# Launch a vLLM server for the model at MODEL_PATH, served under the
# name "medguide" on 0.0.0.0:8231.
#
# Usage:
#   MODEL_PATH=/path/to/model bash <this-script>
#
# Environment:
#   MODEL_PATH  Required. Model directory or identifier passed to
#               `vllm serve`. May also be set by editing the default below.
set -euo pipefail

# Expose eight GPUs (indices 0-7); this matches --tensor-parallel-size 8 below.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# NOTE(review): presumably lets --max-model-len exceed the length derived from
# the model config -- confirm against the vLLM environment-variable docs.
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
# NOTE(review): presumably selects the legacy (V0) engine -- confirm that the
# installed vLLM version still honors this variable.
export VLLM_USE_V1=0

# Take the model path from the environment; the in-file default is empty, so
# the script refuses to start until a real path is supplied.
MODEL_PATH="${MODEL_PATH:-}"
if [[ -z "$MODEL_PATH" ]]; then
  printf 'error: MODEL_PATH is empty; set it to the model to serve\n' >&2
  exit 1
fi

# NOTE(review): the API key is hardcoded and passed on the command line, so it
# is visible in the process list; consider supplying it from the environment.
vllm serve "$MODEL_PATH" \
  --served-model-name medguide \
  --port 8231 \
  --host 0.0.0.0 \
  --dtype bfloat16 \
  --max-model-len 12000 \
  --gpu-memory-utilization 0.9 \
  --tensor-parallel-size 8 \
  --api-key medguide