binary1ne committed on
Commit
dc731dd
·
verified ·
1 Parent(s): bd77f13

Update start_server.sh

Browse files
Files changed (1) hide show
  1. start_server.sh +25 -11
start_server.sh CHANGED
@@ -1,23 +1,37 @@
1
  #!/bin/bash
2
  set -e
 
3
 
4
- # Defaults if not passed in
5
- MODEL_NAME="${MODEL_NAME:-unsloth/llama-2-7b-bnb-4bit}"
6
- HOST="${HOST:-0.0.0.0}"
7
- VLLM_PORT="${VLLM_PORT:-8000}"
8
- TP_SIZE="${TP_SIZE:-1}"
9
- GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
 
 
 
10
 
11
  echo "[vLLM] Starting server with:"
12
  echo " MODEL_NAME=$MODEL_NAME"
13
  echo " HOST=$HOST"
14
  echo " VLLM_PORT=$VLLM_PORT"
15
- echo " TP_SIZE=$TP_SIZE"
16
- echo " GPU_MEMORY_UTILIZATION=$GPU_MEMORY_UTILIZATION"
 
17
 
18
- exec python3 -m vllm.entrypoints.openai.api_server \
 
 
 
 
 
 
 
19
  --model "$MODEL_NAME" \
20
  --host "$HOST" \
21
  --port "$VLLM_PORT" \
22
- --tensor-parallel-size "$TP_SIZE" \
23
- --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
 
 
 
1
#!/bin/bash
# start_server.sh — launch the vLLM OpenAI-compatible API server (CPU backend)
# with a fixed configuration. Logs stream unbuffered to the console so
# container platforms capture them in real time.
set -euo pipefail
export PYTHONUNBUFFERED=1

# ================================
# Fixed configuration for your setup
# ================================
MODEL_NAME="unsloth/Llama-3.2-3B-bnb-4bit"
HOST="0.0.0.0"
VLLM_PORT="7860"
CPU_KVCACHE_SPACE="8"   # in GiB
DTYPE="auto"            # auto, float16, float32, etc.
VLLM_EXTRA_ARGS=""      # add extra vLLM flags here if needed

# NOTE(review): the previous `--cpu-offload` / `--cpu-kv-cache-space` flags are
# not recognized by vLLM's OpenAI API server argument parser and would abort
# startup with "unrecognized arguments". The CPU backend reads its KV-cache
# budget (in GiB) from this environment variable instead — confirm against the
# installed vLLM version.
export VLLM_CPU_KVCACHE_SPACE="$CPU_KVCACHE_SPACE"

echo "[vLLM] Starting server with:"
echo " MODEL_NAME=$MODEL_NAME"
echo " HOST=$HOST"
echo " VLLM_PORT=$VLLM_PORT"
echo " CPU_KVCACHE_SPACE=${CPU_KVCACHE_SPACE}GiB"
echo " DTYPE=$DTYPE"
echo " EXTRA_ARGS=$VLLM_EXTRA_ARGS"

# Warn if /sys is not mounted (CPU topology detection, e.g. lscpu, will fail).
# Diagnostics go to stderr so they don't mix with normal startup output.
if [ ! -e /sys/devices/system/cpu/possible ]; then
  echo "[WARN] /sys not mounted — CPU topology detection may fail." >&2
  echo "       Run with: docker run -v /sys:/sys:ro ..." >&2
fi

# Start the vLLM CPU server (exec replaces this shell so signals reach python).
# VLLM_EXTRA_ARGS is deliberately left unquoted so multiple extra flags split
# into separate arguments.
# shellcheck disable=SC2086
exec python3 -u -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_NAME" \
  --host "$HOST" \
  --port "$VLLM_PORT" \
  --dtype "$DTYPE" \
  $VLLM_EXTRA_ARGS