Spaces:
Paused
Paused
Create start_server.sh
Browse files- start_server.sh +23 -0
start_server.sh
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#
# start_server.sh — launch a vLLM OpenAI-compatible API server.
#
# Configuration is taken from environment variables, each with a default,
# so the script works standalone or as a container entrypoint:
#   MODEL_NAME              model to serve (default: unsloth/llama-2-7b-bnb-4bit)
#   HOST                    bind address   (default: 0.0.0.0)
#   VLLM_PORT               listen port    (default: 8000)
#   TP_SIZE                 tensor-parallel size, i.e. GPU count (default: 1)
#   GPU_MEMORY_UTILIZATION  fraction of GPU memory vLLM may use (default: 0.90)

# Strict mode: exit on error, on use of an unset variable, and on any
# failure within a pipeline (plain `set -e` misses the latter two).
set -euo pipefail

# Defaults if not passed in
MODEL_NAME="${MODEL_NAME:-unsloth/llama-2-7b-bnb-4bit}"
HOST="${HOST:-0.0.0.0}"
VLLM_PORT="${VLLM_PORT:-8000}"
TP_SIZE="${TP_SIZE:-1}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"

# Fail fast with a clear message if the interpreter is missing, rather than
# letting `exec` produce a confusing "No such file or directory".
if ! command -v python3 >/dev/null 2>&1; then
  echo "[vLLM] ERROR: python3 not found in PATH" >&2
  exit 1
fi

echo "[vLLM] Starting server with:"
echo " MODEL_NAME=$MODEL_NAME"
echo " HOST=$HOST"
echo " VLLM_PORT=$VLLM_PORT"
echo " TP_SIZE=$TP_SIZE"
echo " GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION"

# exec replaces this shell with the server process so signals (SIGTERM from
# an orchestrator, Ctrl-C) reach vLLM directly instead of the wrapper.
exec python3 -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --tensor-parallel-size "$TP_SIZE" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"