#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server on the CPU backend.
set -euo pipefail
export PYTHONUNBUFFERED=1

# ================================
# Fixed configuration for your setup
# ================================
# NOTE(review): bnb-4bit (bitsandbytes) quantization is generally GPU-only;
# confirm this model loads on the vLLM CPU backend.
MODEL_NAME="unsloth/Llama-3.2-3B-bnb-4bit"
HOST="0.0.0.0"
VLLM_PORT="7860"
CPU_KVCACHE_SPACE="8"  # in GiB
DTYPE="auto"           # auto, float16, float32, etc.
VLLM_EXTRA_ARGS=""     # add extra vLLM flags here if needed (space-separated)

# vLLM's CPU backend sizes its KV cache from this environment variable,
# not from a CLI flag (the original --cpu-kv-cache-space / --cpu-offload
# flags are not recognized by the OpenAI API server and abort startup).
export VLLM_CPU_KVCACHE_SPACE="$CPU_KVCACHE_SPACE"

echo "[vLLM] Starting server with:"
echo " MODEL_NAME=$MODEL_NAME"
echo " HOST=$HOST"
echo " VLLM_PORT=$VLLM_PORT"
echo " CPU_KVCACHE_SPACE=${CPU_KVCACHE_SPACE}GiB"
echo " DTYPE=$DTYPE"
echo " EXTRA_ARGS=$VLLM_EXTRA_ARGS"

# Warn if /sys not mounted (lscpu detection will fail); diagnostics go to stderr.
if [ ! -e /sys/devices/system/cpu/possible ]; then
  echo "[WARN] /sys not mounted — CPU topology detection may fail." >&2
  echo " Run with: docker run -v /sys:/sys:ro ..." >&2
fi

# Start the vLLM CPU server (logs will stream to console).
# shellcheck disable=SC2086  # VLLM_EXTRA_ARGS is intentionally word-split
exec python3 -u -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --dtype "$DTYPE" \
  $VLLM_EXTRA_ARGS