#!/usr/bin/env bash
#
# Launch a vLLM OpenAI-compatible API server on CPU.
#
# Required env: none (all configuration is hard-coded below).
# Notes:
#   - Run the container with `-v /sys:/sys:ro` so CPU topology detection works.
#   - Logs stream to the console; the script replaces itself with the server
#     process via `exec`, so signals (SIGTERM, etc.) reach vLLM directly.
set -euo pipefail
export PYTHONUNBUFFERED=1

# ================================
# Fixed configuration for your setup
# ================================
MODEL_NAME="unsloth/Llama-3.2-3B-bnb-4bit"
HOST="0.0.0.0"
VLLM_PORT="7860"
CPU_KVCACHE_SPACE="8"   # in GiB
DTYPE="auto"            # auto, float16, float32, etc.
VLLM_EXTRA_ARGS=""      # add extra vLLM flags here if needed (space-separated)

# vLLM's CPU backend sizes its KV cache from this environment variable —
# there is no `--cpu-kv-cache-space` CLI flag (and `--cpu-offload` does not
# exist either; the similarly named `--cpu-offload-gb` is for offloading GPU
# weights, which is irrelevant on a CPU-only deployment).
export VLLM_CPU_KVCACHE_SPACE="$CPU_KVCACHE_SPACE"

echo "[vLLM] Starting server with:"
echo "  MODEL_NAME=$MODEL_NAME"
echo "  HOST=$HOST"
echo "  VLLM_PORT=$VLLM_PORT"
echo "  CPU_KVCACHE_SPACE=${CPU_KVCACHE_SPACE}GiB"
echo "  DTYPE=$DTYPE"
echo "  EXTRA_ARGS=$VLLM_EXTRA_ARGS"

# Warn if /sys not mounted (lscpu detection will fail)
if [ ! -e /sys/devices/system/cpu/possible ]; then
  echo "[WARN] /sys not mounted — CPU topology detection may fail." >&2
  echo "       Run with: docker run -v /sys:/sys:ro ..." >&2
fi

# Start the vLLM CPU server (logs will stream to console).
# shellcheck disable=SC2086 — VLLM_EXTRA_ARGS is intentionally word-split
exec python3 -u -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --dtype "$DTYPE" \
  $VLLM_EXTRA_ARGS