govon-runtime / scripts /entrypoint.sh
umyunsang's picture
Upload folder using huggingface_hub
d2585c1 verified
#!/usr/bin/env bash
# GovOn Runtime Entrypoint
# 1) Start the vLLM OpenAI-compatible server in the background
# 2) Wait until it is ready, using a health check
# 3) Run the FastAPI server (foreground, GPU access blocked)
set -euo pipefail
VLLM_PORT="${VLLM_PORT:-8000}"
MODEL="${MODEL_PATH:-LGAI-EXAONE/EXAONE-4.0-32B-AWQ}"
GPU_UTIL="${GPU_UTILIZATION:-0.90}"
MAX_LEN="${MAX_MODEL_LEN:-8192}"
DTYPE="${MODEL_DTYPE:-half}"
KV_DTYPE="${KV_CACHE_DTYPE:-auto}"
SKIP_MODEL="${SKIP_MODEL_LOAD:-false}"
# SKIP_MODEL_LOAD ์‹œ vLLM ์„œ๋ฒ„ ์—†์ด FastAPI๋งŒ ์‹คํ–‰
if [ "$SKIP_MODEL" = "true" ] || [ "$SKIP_MODEL" = "1" ]; then
echo "[entrypoint] SKIP_MODEL_LOAD=true: FastAPI๋งŒ ์‹คํ–‰"
CUDA_VISIBLE_DEVICES="" exec python3.10 -m src.inference.api_server
fi
# --- Start the vLLM server ---

#######################################
# Split a comma-separated ADAPTER_PATHS value into vLLM "name=path" specs.
# Whitespace around an entry is trimmed and empty entries (stray or trailing
# commas) are skipped, so no bare "=" spec is ever produced. An entry without
# "=" is emitted as "entry=entry", as the previous inline loop did.
# Arguments: $1 - list such as "civil=repo/path,legal=repo/path"
# Outputs:   one "name=path" spec per line on stdout
#######################################
_lora_module_specs() {
  local entry name path
  local -a entries=()
  IFS=',' read -ra entries <<< "$1"
  # The ${arr[@]+...} form keeps `set -u` quiet on an empty array in older bash.
  for entry in ${entries[@]+"${entries[@]}"}; do
    entry="${entry#"${entry%%[![:space:]]*}"}"   # trim leading whitespace
    entry="${entry%"${entry##*[![:space:]]}"}"   # trim trailing whitespace
    [[ -n "$entry" ]] || continue
    name="${entry%%=*}"
    path="${entry#*=}"
    printf '%s=%s\n' "$name" "$path"
  done
}

VLLM_ARGS=(
  --model "$MODEL"
  --port "$VLLM_PORT"
  --host 127.0.0.1
  --dtype "$DTYPE"
  --gpu-memory-utilization "$GPU_UTIL"
  --max-model-len "$MAX_LEN"
  --kv-cache-dtype "$KV_DTYPE"
  --trust-remote-code
  --enable-auto-tool-choice
  --tool-call-parser hermes
)

# LoRA adapter settings, parsed from the ADAPTER_PATHS environment variable.
# ADAPTER_PATHS format: "civil=repo/path,legal=repo/path"
if [[ -n "${ADAPTER_PATHS:-}" ]]; then
  LORA_MODULES=()
  mapfile -t LORA_MODULES < <(_lora_module_specs "$ADAPTER_PATHS")
  if (( ${#LORA_MODULES[@]} > 0 )); then
    VLLM_ARGS+=(--enable-lora --max-loras 4 --max-lora-rank 64)
    # Per the original note for vLLM 0.19: give --lora-modules once and pass
    # each adapter as its own argument through array expansion.
    VLLM_ARGS+=(--lora-modules "${LORA_MODULES[@]}")
  else
    echo "[entrypoint] WARNING: ADAPTER_PATHS has no usable entries; LoRA disabled" >&2
  fi
fi

# The log text was mis-encoded (mojibake); the Korean message is restored.
echo "[entrypoint] vLLM 서버 기동: port=$VLLM_PORT model=$MODEL"
echo "[entrypoint] args: ${VLLM_ARGS[*]}"
# Launch in the background and remember the PID for the later liveness checks.
python3.10 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}" &
VLLM_PID=$!
# --- vLLM health check ---
# CUDA_VISIBLE_DEVICES="": blocks GPU access from the health-check python process
#   -> avoids a CUDA initialisation hang when torch/vllm are imported
# except Exception: a bare `except:` must not be used in the probe
#   -> it would also catch the SystemExit raised by sys.exit() and always report failure
# timeout 10: process-level timeout (separate from the urllib timeout)
# The log text was mis-encoded (mojibake); the Korean message is restored.
echo "[entrypoint] vLLM 서버 준비 대기 중..."
MAX_WAIT=900  # upper bound, in seconds, for the readiness wait
WAITED=0      # seconds counted so far
INTERVAL=5    # seconds slept between two probes

# coreutils (timeout) may be missing from the nvidia/cuda image, so it is only
# used when it is available. TIMEOUT_CMD is kept as a plain string because
# _health_check expands it unquoted.
if command -v timeout >/dev/null 2>&1; then
  TIMEOUT_CMD="timeout 10"
else
  TIMEOUT_CMD=""
fi
#######################################
# Probe the vLLM /health endpoint once.
# Globals:   VLLM_PORT (read), TIMEOUT_CMD (read; optional command prefix)
# Arguments: none
# Outputs:   stderr of the probe is merged into stdout
# Returns:   exit status of the probe command; the embedded Python exits 0
#            when /health answers with HTTP 200 and 1 otherwise
#######################################
_health_check() {
  local probe_src
  # The embedded Python is indented explicitly: without indentation the
  # try/except body does not parse and every probe would fail.
  probe_src="
import urllib.request, sys
try:
    r = urllib.request.urlopen('http://localhost:${VLLM_PORT}/health', timeout=5)
    sys.exit(0 if r.status == 200 else 1)
except Exception:
    sys.exit(1)
"
  # TIMEOUT_CMD is expanded unquoted on purpose: it is either empty or the
  # two words "timeout 10". The empty CUDA_VISIBLE_DEVICES hides the GPUs
  # from the probe process.
  # shellcheck disable=SC2086
  CUDA_VISIBLE_DEVICES="" $TIMEOUT_CMD python3.10 -c "$probe_src" 2>&1
}
# Poll the health endpoint until vLLM answers, the vLLM process dies, or
# MAX_WAIT seconds have been counted.
# The log texts were mis-encoded (mojibake); the Korean messages are restored.
while (( WAITED < MAX_WAIT )); do
  if _health_check; then
    echo "[entrypoint] vLLM 서버 준비 완료 (${WAITED}s)"
    break
  fi
  # Check whether the vLLM process has died.
  if ! kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "[entrypoint] ERROR: vLLM 프로세스 종료됨"
    # Capture the status through `||`: a bare failing `wait` trips `set -e`
    # and ends the script before the exit code can be logged.
    VLLM_EXIT=0
    wait "$VLLM_PID" || VLLM_EXIT=$?
    echo "[entrypoint] vLLM exit code=$VLLM_EXIT"
    # vLLM ending before it became ready is a failure even with status 0.
    if (( VLLM_EXIT == 0 )); then
      VLLM_EXIT=1
    fi
    exit "$VLLM_EXIT"
  fi
  sleep "$INTERVAL"
  WAITED=$((WAITED + INTERVAL))
done

# The loop ended without a successful probe: give up and stop vLLM.
if (( WAITED >= MAX_WAIT )); then
  echo "[entrypoint] ERROR: vLLM 서버 시작 타임아웃 (${MAX_WAIT}s)"
  kill "$VLLM_PID" 2>/dev/null || true
  exit 1
fi
# --- Run the FastAPI server (foreground) ---
# CUDA_VISIBLE_DEVICES="": FastAPI only calls the vLLM API through httpx, so it needs no GPU
#   -> prevents a CUDA context from being created when vLLM is imported, saving GPU memory
# Run in the background and wait instead of using exec, so that SIGTERM is propagated to both vLLM and FastAPI
#######################################
# Stop the FastAPI and vLLM child processes and reap them.
# Safe to call more than once and before either PID has been assigned.
# Globals:   FASTAPI_PID (read, may be unset), VLLM_PID (read, may be unset)
# Arguments: none
# Outputs:   one status line on stdout
# Returns:   0
#######################################
cleanup() {
  local pid
  echo "[entrypoint] Shutting down..."
  # ${VAR:-} keeps `set -u` from aborting the handler when a PID was never
  # assigned; otherwise the remaining child would be left running.
  for pid in "${FASTAPI_PID:-}" "${VLLM_PID:-}"; do
    [[ -n "$pid" ]] || continue
    # Best effort: the process may already be gone.
    kill "$pid" 2>/dev/null || true
  done
  for pid in "${FASTAPI_PID:-}" "${VLLM_PID:-}"; do
    [[ -n "$pid" ]] || continue
    # Reap the child; a non-zero status (killed or already reaped) is expected.
    wait "$pid" 2>/dev/null || true
  done
  return 0
}
# Run cleanup when the script exits and when SIGTERM or SIGINT arrives.
trap cleanup EXIT SIGTERM SIGINT

# The log texts were mis-encoded (mojibake); the Korean messages are restored.
echo "[entrypoint] FastAPI 서버 기동: port=${PORT:-7860}"
# The empty CUDA_VISIBLE_DEVICES hides the GPUs from the FastAPI process.
CUDA_VISIBLE_DEVICES="" python3.10 -m src.inference.api_server &
FASTAPI_PID=$!

# Detect whichever of the two children exits first so the other one is cleaned
# up as well. The status is captured through `||`: the previous
# `wait ... || true` followed by `EXITED=$?` always recorded 0.
EXITED=0
wait -n "$FASTAPI_PID" "$VLLM_PID" 2>/dev/null || EXITED=$?
echo "[entrypoint] 프로세스 종료 감지 (exit=$EXITED), cleanup 진행"
# Report the child's status to the caller; the EXIT trap runs cleanup.
exit "$EXITED"