# Spaces: Paused  (page-header residue captured with the script; not a command)
# Launch the vLLM API server (python module vllm.entrypoints.openai.api_server).
#
# Configuration comes from the environment. Each variable falls back to the
# default assigned below when it is unset or empty:
#   MODEL_NAME              value passed to --model
#   HOST                    value passed to --host
#   VLLM_PORT               value passed to --port
#   TP_SIZE                 value passed to --tensor-parallel-size
#   GPU_MEMORY_UTILIZATION  value passed to --gpu-memory-utilization

# Stop on the first failing command and on any reference to an unset variable.
# (-u is safe here because every variable used below is given a default.)
set -eu

# Defaults if not passed in
MODEL_NAME="${MODEL_NAME:-unsloth/llama-2-7b-bnb-4bit}"
HOST="${HOST:-0.0.0.0}"
VLLM_PORT="${VLLM_PORT:-8000}"
TP_SIZE="${TP_SIZE:-1}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"

# Print the effective configuration before starting. printf '%s\n' emits each
# argument on its own line and prints the values verbatim, whatever they hold.
printf '%s\n' \
  "[vLLM] Starting server with:" \
  " MODEL_NAME=$MODEL_NAME" \
  " HOST=$HOST" \
  " VLLM_PORT=$VLLM_PORT" \
  " TP_SIZE=$TP_SIZE" \
  " GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION"

# exec replaces this shell with the server process, so nothing after this
# line runs and the server is not left as a child of the wrapper shell.
exec python3 -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --tensor-parallel-size "$TP_SIZE" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"