# NOTE(review): the three lines below were Hugging Face Spaces UI residue
# ("Spaces: / Paused / Paused") pasted into the script; kept as a comment
# so the file parses as shell. The Space was in the Paused state.
#!/usr/bin/env bash
#
# Launch a vLLM OpenAI-compatible API server.
#
# All settings are overridable via the environment; the defaults below match
# the values that were previously hardcoded on the launch line.
set -euo pipefail
export PYTHONUNBUFFERED=1

# ================================
# Configuration (env-overridable)
# ================================
MODEL_NAME="${MODEL_NAME:-unsloth/Llama-3.2-1B-bnb-4bit}"
HOST="${HOST:-0.0.0.0}"
VLLM_PORT="${VLLM_PORT:-7860}"
TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-1}"
# NOTE(review): 0.0 leaves no GPU memory for the KV cache — presumably this
# deployment is CPU-only; confirm against vLLM's engine-args documentation.
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.0}"
VLLM_EXTRA_ARGS="${VLLM_EXTRA_ARGS:-}"   # extra vLLM flags, space-separated

echo "[vLLM] Starting server with:"
echo "  MODEL_NAME=$MODEL_NAME"
echo "  HOST=$HOST"
echo "  VLLM_PORT=$VLLM_PORT"
echo "  TENSOR_PARALLEL_SIZE=$TENSOR_PARALLEL_SIZE"
echo "  GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION"
echo "  EXTRA_ARGS=$VLLM_EXTRA_ARGS"

# Warn if /sys is not mounted (CPU topology detection via lscpu will fail
# inside some containers).
if [ ! -e /sys/devices/system/cpu/possible ]; then
  echo "[WARN] /sys not mounted — CPU topology detection may fail." >&2
  echo "       Run with: docker run -v /sys:/sys:ro ..." >&2
fi

# exec replaces this shell so container signals (SIGTERM) reach the server
# process directly. stderr is intentionally NOT redirected: the previous
# `2>/dev/null` hid every startup failure from the logs.
# shellcheck disable=SC2086  # VLLM_EXTRA_ARGS is intentionally word-split
exec python3 -u -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
  $VLLM_EXTRA_ARGS