File size: 4,391 Bytes
c5fd9c1
 
 
 
b5e31d0
c5fd9c1
 
 
 
 
 
 
 
 
 
 
 
 
d2585c1
c5fd9c1
 
 
 
 
 
d2585c1
c5fd9c1
 
 
 
 
 
 
 
 
 
 
 
 
d2585c1
c5fd9c1
79159da
c5fd9c1
 
 
79159da
c5fd9c1
79159da
c5fd9c1
 
 
 
 
 
 
 
b5e31d0
 
 
 
 
 
c5fd9c1
a69d616
c5fd9c1
 
 
d2585c1
 
 
 
 
 
 
a69d616
d2585c1
a69d616
 
b5e31d0
a69d616
b5e31d0
a69d616
b5e31d0
 
a69d616
 
c5fd9c1
a69d616
c5fd9c1
 
 
 
 
 
d2585c1
 
 
c5fd9c1
 
 
 
 
 
 
 
 
 
 
 
b5e31d0
 
d2585c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env bash
# GovOn Runtime Entrypoint
# 1) Start the vLLM OpenAI-compatible server in the background
# 2) Wait for it to become ready via a health check
# 3) Run the FastAPI server with GPU access blocked and wait on both processes
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Runtime settings. Each value comes from the environment variable named in the
# expansion and falls back to the default shown when that variable is unset or empty.
VLLM_PORT=${VLLM_PORT:-8000}
MODEL=${MODEL_PATH:-LGAI-EXAONE/EXAONE-4.0-32B-AWQ}
GPU_UTIL=${GPU_UTILIZATION:-0.90}
MAX_LEN=${MAX_MODEL_LEN:-8192}
DTYPE=${MODEL_DTYPE:-half}
KV_DTYPE=${KV_CACHE_DTYPE:-auto}
SKIP_MODEL=${SKIP_MODEL_LOAD:-false}

# When SKIP_MODEL_LOAD is "true" or "1", run only FastAPI without a vLLM server.
# exec replaces this shell, so nothing below runs in that mode.
case "$SKIP_MODEL" in
    true | 1)
        echo "[entrypoint] SKIP_MODEL_LOAD=true: FastAPI๋งŒ ์‹คํ–‰"
        CUDA_VISIBLE_DEVICES="" exec python3.10 -m src.inference.api_server
        ;;
esac

# --- vLLM server startup ---
# Base command-line arguments for the vLLM OpenAI-compatible server.
# The server is bound to 127.0.0.1 only.
VLLM_ARGS=(
    --model "$MODEL"
    --port "$VLLM_PORT"
    --host 127.0.0.1
    --dtype "$DTYPE"
    --gpu-memory-utilization "$GPU_UTIL"
    --max-model-len "$MAX_LEN"
    --kv-cache-dtype "$KV_DTYPE"
    --trust-remote-code
    --enable-auto-tool-choice
    --tool-call-parser hermes
)

#######################################
# Parse an adapter spec into the LORA_MODULES array.
# Whitespace around each comma-separated entry is trimmed. Empty entries are
# dropped silently; entries that lack '=' or have an empty name or path are
# dropped with a warning on stderr. Everything after the first '=' is the path.
# Globals:   LORA_MODULES (written; reset on every call)
# Arguments: $1 - spec of the form "civil=repo/path,legal=repo/path"
# Returns:   0 always
#######################################
_parse_adapter_paths() {
    local spec="${1:-}"
    local entry name path
    local -a entries=()

    LORA_MODULES=()
    [[ -n "$spec" ]] || return 0

    IFS=',' read -ra entries <<< "$spec"
    # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
    # on older bash releases.
    for entry in ${entries[@]+"${entries[@]}"}; do
        # Strip leading, then trailing, whitespace.
        entry="${entry#"${entry%%[![:space:]]*}"}"
        entry="${entry%"${entry##*[![:space:]]}"}"
        [[ -n "$entry" ]] || continue

        name="${entry%%=*}"
        path="${entry#*=}"
        if [[ "$entry" != *=* || -z "$name" || -z "$path" ]]; then
            echo "[entrypoint] WARNING: ignoring malformed ADAPTER_PATHS entry: '$entry'" >&2
            continue
        fi
        LORA_MODULES+=("${name}=${path}")
    done
}

# LoRA adapter setup (parsed from the ADAPTER_PATHS environment variable).
# ADAPTER_PATHS format: "civil=repo/path,legal=repo/path"
# vLLM 0.19: --lora-modules is given only once; multiple adapters are passed as
# separate arguments through array expansion.
if [ -n "${ADAPTER_PATHS:-}" ]; then
    _parse_adapter_paths "$ADAPTER_PATHS"
    if (( ${#LORA_MODULES[@]} > 0 )); then
        VLLM_ARGS+=(--enable-lora --max-loras 4 --max-lora-rank 64)
        VLLM_ARGS+=(--lora-modules "${LORA_MODULES[@]}")
    else
        # Avoid passing --lora-modules with no values.
        echo "[entrypoint] WARNING: ADAPTER_PATHS has no valid name=path entries; LoRA disabled" >&2
    fi
fi

# Log the launch parameters, then start vLLM as a background job and keep its
# PID for the liveness checks and shutdown handling further down.
printf '%s\n' "[entrypoint] vLLM ์„œ๋ฒ„ ๊ธฐ๋™: port=$VLLM_PORT model=$MODEL"
printf '%s\n' "[entrypoint] args: ${VLLM_ARGS[*]}"

python3.10 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}" &
VLLM_PID=$!

# --- vLLM health check ---
# Notes on the probe run by _health_check:
# - CUDA_VISIBLE_DEVICES="": blocks GPU access in the health-check python process
#     -> prevents a CUDA-initialization hang when torch/vllm is imported
# - except Exception: a bare except (except:) must not be used
#     -> it would catch the SystemExit raised by sys.exit() and always report failure
# - timeout 10: process-level timeout (separate from the urllib timeout)
echo "[entrypoint] vLLM ์„œ๋ฒ„ ์ค€๋น„ ๋Œ€๊ธฐ ์ค‘..."
INTERVAL=5     # seconds slept between probes
WAITED=0       # seconds of sleep accumulated so far
MAX_WAIT=900   # give up once WAITED reaches this many seconds

# The nvidia/cuda image may not include coreutils (timeout), so start with no
# wrapper and opt in only when the command is actually available.
TIMEOUT_CMD=""
if command -v timeout >/dev/null 2>&1; then
    TIMEOUT_CMD="timeout 10"
fi

#######################################
# Probe the vLLM /health endpoint once.
# Globals:   VLLM_PORT (read), TIMEOUT_CMD (read)
# Outputs:   the probe's stderr is merged into stdout
# Returns:   0 when the endpoint answers with HTTP 200, non-zero otherwise
#            (including the status of the timeout wrapper, when one is set)
#######################################
_health_check() {
    # Inline Python probe; the port is substituted by the shell.
    local probe_src="
import urllib.request, sys
try:
    r = urllib.request.urlopen('http://localhost:${VLLM_PORT}/health', timeout=5)
    sys.exit(0 if r.status == 200 else 1)
except Exception:
    sys.exit(1)
"
    # TIMEOUT_CMD is left unquoted on purpose: it is either empty or a
    # two-word command prefix that has to be word-split.
    # shellcheck disable=SC2086
    CUDA_VISIBLE_DEVICES="" $TIMEOUT_CMD python3.10 -c "$probe_src" 2>&1
}

# Poll the health endpoint until vLLM is ready, vLLM dies, or the wait budget
# is used up. WAITED only accumulates the sleep intervals, so the wall-clock
# limit is MAX_WAIT plus the time spent inside the probes.
while [ "$WAITED" -lt "$MAX_WAIT" ]; do
    if _health_check; then
        echo "[entrypoint] vLLM ์„œ๋ฒ„ ์ค€๋น„ ์™„๋ฃŒ (${WAITED}s)"
        break
    fi
    # Check whether the vLLM process has died.
    if ! kill -0 "$VLLM_PID" 2>/dev/null; then
        echo "[entrypoint] ERROR: vLLM ํ”„๋กœ์„ธ์Šค ์ข…๋ฃŒ๋จ"
        # Capture the status through `||`: a bare `wait` returning non-zero
        # would make `set -e` abort here, before the exit code is logged.
        VLLM_EXIT=0
        wait "$VLLM_PID" || VLLM_EXIT=$?
        echo "[entrypoint] vLLM exit code=$VLLM_EXIT"
        exit "$VLLM_EXIT"
    fi
    sleep "$INTERVAL"
    WAITED=$((WAITED + INTERVAL))
done

# The loop only falls through with WAITED >= MAX_WAIT when no probe succeeded.
if [ "$WAITED" -ge "$MAX_WAIT" ]; then
    echo "[entrypoint] ERROR: vLLM ์„œ๋ฒ„ ์‹œ์ž‘ ํƒ€์ž„์•„์›ƒ (${MAX_WAIT}s)"
    # Best effort: the process may already be gone.
    kill "$VLLM_PID" 2>/dev/null || true
    exit 1
fi

# --- Run the FastAPI server ---
# CUDA_VISIBLE_DEVICES="": FastAPI only calls the vLLM API through httpx, so it needs no GPU
#   -> avoids creating a CUDA context when vLLM is imported and saves GPU memory
# The server is started in the background and waited on instead of exec'd, so
# that SIGTERM can be propagated to both vLLM and FastAPI.

#######################################
# Terminate the child servers and reap them.
# Globals:   FASTAPI_PID, VLLM_PID (read). Either may still be unset, because
#            the trap is installed before FastAPI is started; unset or empty
#            PIDs are skipped instead of tripping `set -u`.
# Outputs:   one log line on stdout
# Returns:   0 (kill/wait failures are ignored: the processes may already be gone)
#######################################
cleanup() {
    echo "[entrypoint] Shutting down..."
    local pid
    local -a pids=()
    for pid in "${FASTAPI_PID:-}" "${VLLM_PID:-}"; do
        if [[ -n "$pid" ]]; then
            pids+=("$pid")
        fi
    done
    # Signal every child first, then reap, so they shut down in parallel.
    for pid in ${pids[@]+"${pids[@]}"}; do
        kill "$pid" 2>/dev/null || true
    done
    for pid in ${pids[@]+"${pids[@]}"}; do
        wait "$pid" 2>/dev/null || true
    done
}
trap cleanup EXIT SIGTERM SIGINT

# Start FastAPI as a background job with the GPU hidden from it.
echo "[entrypoint] FastAPI ์„œ๋ฒ„ ๊ธฐ๋™: port=${PORT:-7860}"
CUDA_VISIBLE_DEVICES="" python3.10 -m src.inference.api_server &
FASTAPI_PID=$!

# Detect whichever of the two children exits first so the other one can be
# cleaned up too. The status is captured through `||`; with `|| true` followed
# by `$?` the recorded value was always 0.
# NOTE: only the logged value changes. The script itself still ends with the
# status of the final echo, and the EXIT trap then runs.
EXITED=0
wait -n "$FASTAPI_PID" "$VLLM_PID" 2>/dev/null || EXITED=$?
echo "[entrypoint] ํ”„๋กœ์„ธ์Šค ์ข…๋ฃŒ ๊ฐ์ง€ (exit=$EXITED), cleanup ์ง„ํ–‰"