#!/bin/bash
set -e
export PYTHONUNBUFFERED=1
# ================================
# Fixed configuration for your setup
# ================================
MODEL_NAME="unsloth/Llama-3.2-3B-bnb-4bit"
HOST="0.0.0.0"
VLLM_PORT="7860"
CPU_KVCACHE_SPACE="8" # in GiB
DTYPE="auto" # auto, float16, float32, etc.
VLLM_EXTRA_ARGS="" # add extra vLLM flags here if needed
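# For example (hypothetical values; adjust to your vLLM version):
#   VLLM_EXTRA_ARGS="--max-model-len 4096 --max-num-seqs 16"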
echo "[vLLM] Starting server with:"
echo " MODEL_NAME=$MODEL_NAME"
echo " HOST=$HOST"
echo " VLLM_PORT=$VLLM_PORT"
echo " CPU_KVCACHE_SPACE=${CPU_KVCACHE_SPACE}GiB"
echo " DTYPE=$DTYPE"
echo " EXTRA_ARGS=$VLLM_EXTRA_ARGS"
# Warn if /sys not mounted (lscpu detection will fail)
if [ ! -e /sys/devices/system/cpu/possible ]; then
    echo "[WARN] /sys not mounted - CPU topology detection may fail."
    echo "       Run with: docker run -v /sys:/sys:ro ..."
fi
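# For example (image name is a placeholder for your own build):
#   docker run --rm -p 7860:7860 -v /sys:/sys:ro your-vllm-cpu-image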
# vLLM's CPU backend reads its KV cache budget (in GiB) from the
# VLLM_CPU_KVCACHE_SPACE environment variable rather than from a CLI flag.
export VLLM_CPU_KVCACHE_SPACE="$CPU_KVCACHE_SPACE"

# Start the vLLM CPU server (logs will stream to console)
exec python3 -u -m vllm.entrypoints.openai.api_server \
    --model "$MODEL_NAME" \
    --host "$HOST" \
    --port "$VLLM_PORT" \
    --dtype "$DTYPE" \
    $VLLM_EXTRA_ARGS
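
# Once the server is up, it exposes the OpenAI-compatible API; a quick smoke
# test (host/port per the settings above, model name as configured):
#   curl http://localhost:7860/v1/models
#   curl http://localhost:7860/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "unsloth/Llama-3.2-3B-bnb-4bit", "prompt": "Hello", "max_tokens": 16}'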