#!/usr/bin/env bash
# GovOn Runtime Entrypoint
# 1) Start the vLLM OpenAI-compatible server in the background
# 2) Wait until it is ready, using a health check
# 3) Run the FastAPI server (foreground, GPU access blocked)
# Strict mode: abort on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Runtime settings. Each one is read from the named environment variable and
# falls back to the default shown when that variable is unset or empty.
SKIP_MODEL="${SKIP_MODEL_LOAD:-false}"
KV_DTYPE="${KV_CACHE_DTYPE:-auto}"
DTYPE="${MODEL_DTYPE:-half}"
MAX_LEN="${MAX_MODEL_LEN:-8192}"
GPU_UTIL="${GPU_UTILIZATION:-0.90}"
MODEL="${MODEL_PATH:-LGAI-EXAONE/EXAONE-4.0-32B-AWQ}"
VLLM_PORT="${VLLM_PORT:-8000}"

# When SKIP_MODEL_LOAD is "true" or "1", run only FastAPI without the vLLM
# server: this shell is replaced by the API server process.
case "$SKIP_MODEL" in
  true | 1)
    echo "[entrypoint] SKIP_MODEL_LOAD=true: FastAPI๋ง ์คํ"
    CUDA_VISIBLE_DEVICES="" exec python3.10 -m src.inference.api_server
    ;;
esac
# --- Assemble the vLLM server arguments ---
# Built up one option at a time; the resulting array is identical to a single
# literal assignment and is later expanded as "${VLLM_ARGS[@]}".
VLLM_ARGS=()
VLLM_ARGS+=(--model "$MODEL")
VLLM_ARGS+=(--port "$VLLM_PORT")
# Bind to the loopback address only.
VLLM_ARGS+=(--host 127.0.0.1)
VLLM_ARGS+=(--dtype "$DTYPE")
VLLM_ARGS+=(--gpu-memory-utilization "$GPU_UTIL")
VLLM_ARGS+=(--max-model-len "$MAX_LEN")
VLLM_ARGS+=(--kv-cache-dtype "$KV_DTYPE")
VLLM_ARGS+=(--trust-remote-code)
VLLM_ARGS+=(--enable-auto-tool-choice)
VLLM_ARGS+=(--tool-call-parser hermes)
# LoRA adapter configuration (parsed from the ADAPTER_PATHS environment variable).
# ADAPTER_PATHS format: "civil=repo/path,legal=repo/path"
# vLLM 0.19: --lora-modules is given only once; multiple adapters follow it as
# separate arguments via array expansion.

#######################################
# Parse a comma-separated adapter list into the LORA_MODULES array.
# Whitespace around each entry is trimmed and empty entries are skipped, so
# "a=x, b=y," is accepted. An entry without "=" is used as both name and path.
# Entries with an empty name or an empty path are skipped with a warning.
# Globals:   LORA_MODULES (written; reset on every call)
# Arguments: $1 - adapter list, e.g. "civil=repo/path,legal=repo/path"
# Outputs:   warnings for skipped malformed entries on stderr
# Returns:   0
#######################################
_parse_lora_modules() {
  local spec=$1
  local entry name path
  local -a entries=()
  LORA_MODULES=()
  IFS=',' read -ra entries <<< "$spec"
  for entry in ${entries[@]+"${entries[@]}"}; do
    # Strip leading, then trailing, whitespace.
    entry="${entry#"${entry%%[![:space:]]*}"}"
    entry="${entry%"${entry##*[![:space:]]}"}"
    if [[ -z "$entry" ]]; then
      continue
    fi
    name="${entry%%=*}"
    path="${entry#*=}"
    if [[ -z "$name" || -z "$path" ]]; then
      echo "[entrypoint] WARN: ignoring malformed ADAPTER_PATHS entry: '${entry}'" >&2
      continue
    fi
    LORA_MODULES+=("${name}=${path}")
  done
  return 0
}

if [[ -n "${ADAPTER_PATHS:-}" ]]; then
  _parse_lora_modules "$ADAPTER_PATHS"
  # Enable LoRA only when at least one usable adapter was found; a bare
  # --lora-modules with no value would be an invalid command line.
  if (( ${#LORA_MODULES[@]} > 0 )); then
    VLLM_ARGS+=(--enable-lora --max-loras 4 --max-lora-rank 64)
    VLLM_ARGS+=(--lora-modules "${LORA_MODULES[@]}")
  fi
fi
# Log the launch parameters, then start the vLLM OpenAI-compatible server as a
# background job and remember its PID in VLLM_PID.
vllm_cmd=(python3.10 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}")
echo "[entrypoint] vLLM ์๋ฒ ๊ธฐ๋: port=$VLLM_PORT model=$MODEL"
echo "[entrypoint] args: ${VLLM_ARGS[*]}"
"${vllm_cmd[@]}" &
VLLM_PID=$!
# --- vLLM health check ---
# CUDA_VISIBLE_DEVICES="": blocks GPU access from the health-check python process
#   -> prevents a CUDA-initialisation hang when torch/vllm are imported
# except Exception: a bare except (except:) must not be used
#   -> it would also catch the SystemExit raised by sys.exit() and always report failure
# timeout 10: process-level timeout (separate from the urllib timeout)
echo "[entrypoint] vLLM ์๋ฒ ์ค๋น ๋๊ธฐ ์ค..."
INTERVAL=5
WAITED=0
MAX_WAIT=900
# The nvidia/cuda image may not ship coreutils (timeout), so it is used only
# when present; otherwise the prefix stays empty.
TIMEOUT_CMD=""
if command -v timeout >/dev/null 2>&1; then
  TIMEOUT_CMD="timeout 10"
fi
#######################################
# Probe the vLLM /health endpoint once.
# Globals:   VLLM_PORT (read), TIMEOUT_CMD (read; optional command prefix
#            such as "timeout 10", may be empty or unset)
# Arguments: none
# Outputs:   whatever the probe prints (stderr is merged into stdout)
# Returns:   0 when the endpoint answers with HTTP 200, non-zero otherwise
#######################################
_health_check() {
  local probe
  # Python is indentation-sensitive: the try/except bodies must stay indented.
  # `except Exception` rather than a bare `except:` so that the SystemExit
  # raised by sys.exit() is not swallowed. The port is passed as an argument
  # (sys.argv[1]) instead of being spliced into the Python source.
  probe='
import sys
import urllib.request

try:
    r = urllib.request.urlopen("http://localhost:%s/health" % sys.argv[1], timeout=5)
    sys.exit(0 if r.status == 200 else 1)
except Exception:
    sys.exit(1)
'
  # TIMEOUT_CMD is a plain string that must word-split into a command and its
  # argument, so it is deliberately left unquoted.
  # shellcheck disable=SC2086
  CUDA_VISIBLE_DEVICES="" ${TIMEOUT_CMD:-} python3.10 -c "$probe" "$VLLM_PORT" 2>&1
}
# Poll the health check every INTERVAL seconds until vLLM is ready, the vLLM
# process dies, or MAX_WAIT seconds of sleeping have accumulated.
while (( WAITED < MAX_WAIT )); do
  if _health_check; then
    echo "[entrypoint] vLLM 서버 준비 완료 (${WAITED}s)"
    break
  fi
  # Check whether the vLLM process has died.
  if ! kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "[entrypoint] ERROR: vLLM 프로세스 종료됨"
    # Collect the status through `||`: under `set -e` a bare `wait` returning
    # non-zero would abort the script before the exit code could be logged.
    VLLM_EXIT=0
    wait "$VLLM_PID" || VLLM_EXIT=$?
    echo "[entrypoint] vLLM exit code=$VLLM_EXIT"
    # The server is gone before becoming ready; never report that as success.
    if (( VLLM_EXIT == 0 )); then
      VLLM_EXIT=1
    fi
    exit "$VLLM_EXIT"
  fi
  sleep "$INTERVAL"
  WAITED=$((WAITED + INTERVAL))
done
# The loop ended without a successful health check: give up and stop vLLM.
if (( WAITED >= MAX_WAIT )); then
  echo "[entrypoint] ERROR: vLLM 서버 시작 타임아웃 (${MAX_WAIT}s)"
  kill "$VLLM_PID" 2>/dev/null || true
  exit 1
fi
# --- Run the FastAPI server (foreground) ---
# CUDA_VISIBLE_DEVICES="": FastAPI only calls the vLLM API through httpx, so it needs no GPU
#   -> avoids creating a CUDA context when vLLM is imported, saving GPU memory
# Background launch followed by wait, instead of exec: lets SIGTERM be
# propagated to both vLLM and FastAPI.

#######################################
# Stop both child servers and reap them.
# Safe to call more than once (only the first call acts) and safe to call
# before either PID variable has been assigned.
# Globals:   FASTAPI_PID (read, optional), VLLM_PID (read, optional),
#            _CLEANUP_DONE (written; run-once guard)
# Arguments: none
# Outputs:   a shutdown notice on stdout (first call only)
# Returns:   0
#######################################
cleanup() {
  # This handler is registered for EXIT as well as for signals, so it would
  # otherwise run a second time when the script exits after a signal.
  if [[ -n "${_CLEANUP_DONE:-}" ]]; then
    return 0
  fi
  _CLEANUP_DONE=1
  echo "[entrypoint] Shutting down..."

  local pid
  local -a pids=()
  # ${VAR:-} keeps `set -u` from aborting when a PID was never assigned.
  for pid in "${FASTAPI_PID:-}" "${VLLM_PID:-}"; do
    if [[ -n "$pid" ]]; then
      pids+=("$pid")
    fi
  done

  # Signal every child first, then reap them. Failures are ignored on purpose:
  # a child may already have exited.
  for pid in ${pids[@]+"${pids[@]}"}; do
    kill "$pid" 2>/dev/null || true
  done
  for pid in ${pids[@]+"${pids[@]}"}; do
    wait "$pid" 2>/dev/null || true
  done
  return 0
}
# Run cleanup on script exit and on SIGTERM/SIGINT.
trap cleanup EXIT SIGTERM SIGINT
echo "[entrypoint] FastAPI 서버 기동: port=${PORT:-7860}"
CUDA_VISIBLE_DEVICES="" python3.10 -m src.inference.api_server &
FASTAPI_PID=$!
# Detect whichever of the two children exits first, then clean up the other.
# The status is captured through `|| EXITED=$?`; with `|| true` the following
# `$?` would always be 0 and the real exit status would be lost.
EXITED=0
wait -n "$FASTAPI_PID" "$VLLM_PID" 2>/dev/null || EXITED=$?
echo "[entrypoint] 프로세스 종료 감지 (exit=$EXITED), cleanup 진행"
# Propagate the child's status as the entrypoint's own exit status; the EXIT
# trap runs cleanup on the way out.
exit "$EXITED"