vllm-llama2 / start_server.sh
binary1ne's picture
Create start_server.sh
995218c verified
raw
history blame
665 Bytes
#!/bin/bash
#
# Launch a vLLM OpenAI-compatible API server.
#
# Configuration is taken from environment variables (all optional):
#   MODEL_NAME              - HF model id or local path (default: unsloth/llama-2-7b-bnb-4bit)
#   HOST                    - bind address                (default: 0.0.0.0)
#   VLLM_PORT               - listen port                 (default: 8000)
#   TP_SIZE                 - tensor-parallel degree      (default: 1)
#   GPU_MEMORY_UTILIZATION  - fraction of GPU mem to use  (default: 0.90)
#
# Any extra command-line arguments are forwarded verbatim to the vLLM
# entrypoint, e.g.:  ./start_server.sh --max-model-len 4096
#
# -e: abort on error; -u: abort on unset variable (all vars below have
# defaults, so -u only catches genuine typos); pipefail: fail pipelines
# if any stage fails.
set -euo pipefail

# Defaults if not passed in via the environment.
MODEL_NAME="${MODEL_NAME:-unsloth/llama-2-7b-bnb-4bit}"
HOST="${HOST:-0.0.0.0}"
VLLM_PORT="${VLLM_PORT:-8000}"
TP_SIZE="${TP_SIZE:-1}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"

# Diagnostics go to stderr so stdout stays clean for any supervisor/wrapper.
printf '%s\n' "[vLLM] Starting server with:" \
  "  MODEL_NAME=$MODEL_NAME" \
  "  HOST=$HOST" \
  "  VLLM_PORT=$VLLM_PORT" \
  "  TP_SIZE=$TP_SIZE" \
  "  GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION" >&2

# exec replaces this shell with the server so signals (SIGTERM from a
# container runtime, etc.) reach the Python process directly.
# "$@" forwards any extra user-supplied flags to vLLM (empty by default,
# which preserves the original behavior exactly).
exec python3 -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --tensor-parallel-size "$TP_SIZE" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
  "$@"