Spaces:
Paused
Paused
File size: 1,395 Bytes
995218c dc731dd 995218c 7530fbd 995218c 7530fbd 995218c 7530fbd dc731dd 7530fbd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
#!/usr/bin/env bash
#
# Launch a vLLM OpenAI-compatible API server for a 4-bit quantized Llama model.
#
# Configuration is taken from environment variables; every default below
# reproduces the previous hard-coded invocation, so running this script with
# no environment set behaves as before.
#
# Required env: none.
# Optional env: MODEL_NAME, HOST, VLLM_PORT, TENSOR_PARALLEL_SIZE,
#               GPU_MEMORY_UTILIZATION, VLLM_EXTRA_ARGS.

set -euo pipefail

# Stream Python output immediately (no stdout buffering) so container logs
# show server progress in real time.
export PYTHONUNBUFFERED=1

MODEL_NAME="${MODEL_NAME:-unsloth/Llama-3.2-1B-bnb-4bit}"
HOST="${HOST:-0.0.0.0}"
VLLM_PORT="${VLLM_PORT:-7860}"
TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-1}"
# NOTE(review): 0.0 reserves no GPU memory at all — presumably intended for a
# CPU-only deployment, but vLLM may reject a zero utilization; confirm this
# value against the target vLLM version before relying on it.
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.0}"
# Extra vLLM flags, word-split intentionally (one flag per word).
VLLM_EXTRA_ARGS="${VLLM_EXTRA_ARGS:-}"

echo "[vLLM] Starting server:"
echo "  MODEL_NAME=$MODEL_NAME"
echo "  HOST=$HOST"
echo "  VLLM_PORT=$VLLM_PORT"
echo "  TENSOR_PARALLEL_SIZE=$TENSOR_PARALLEL_SIZE"
echo "  GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION"
echo "  VLLM_EXTRA_ARGS=$VLLM_EXTRA_ARGS"

# exec replaces this shell with the server so signals (SIGTERM from docker
# stop, Ctrl-C) reach the Python process directly. stderr is NOT suppressed:
# the previous '2>/dev/null' hid every startup failure and all server logs.
# shellcheck disable=SC2086 — VLLM_EXTRA_ARGS is deliberately unquoted so
# multiple flags split into separate arguments.
exec python3 -u -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
  $VLLM_EXTRA_ARGS
|