Spaces:
Paused
Paused
# GovOn Runtime Entrypoint
#
# Requires bash: later sections use arrays, here-strings and `wait -n`.
#
# Flow:
#   1) Start the vLLM OpenAI-compatible server in the background.
#   2) Poll its health endpoint until it is ready.
#   3) Run the FastAPI server with GPU access masked (CUDA_VISIBLE_DEVICES="").
#
# Environment (all optional, defaults shown):
#   VLLM_PORT=8000
#   MODEL_PATH=LGAI-EXAONE/EXAONE-4.0-32B-AWQ
#   GPU_UTILIZATION=0.90
#   MAX_MODEL_LEN=8192
#   MODEL_DTYPE=half
#   KV_CACHE_DTYPE=auto
#   SKIP_MODEL_LOAD=false   ("true" or "1" runs FastAPI only)
#
# NOTE(review): the non-ASCII text inside the log messages looks mis-encoded
# (probably Korean originally); it is kept verbatim here - restore it from the
# upstream file if available.
set -euo pipefail

VLLM_PORT="${VLLM_PORT:-8000}"
MODEL="${MODEL_PATH:-LGAI-EXAONE/EXAONE-4.0-32B-AWQ}"
GPU_UTIL="${GPU_UTILIZATION:-0.90}"
MAX_LEN="${MAX_MODEL_LEN:-8192}"
DTYPE="${MODEL_DTYPE:-half}"
KV_DTYPE="${KV_CACHE_DTYPE:-auto}"
SKIP_MODEL="${SKIP_MODEL_LOAD:-false}"

# With SKIP_MODEL_LOAD set to "true" or "1", skip vLLM entirely and replace
# this shell with the FastAPI server (exec never returns on success).
if [ "$SKIP_MODEL" = "true" ] || [ "$SKIP_MODEL" = "1" ]; then
  echo "[entrypoint] SKIP_MODEL_LOAD=true: FastAPI๋ง ์คํ"
  CUDA_VISIBLE_DEVICES="" exec python3.10 -m src.inference.api_server
fi
# --- vLLM server arguments ---
VLLM_ARGS=(
  --model "$MODEL"
  --port "$VLLM_PORT"
  --host 127.0.0.1
  --dtype "$DTYPE"
  --gpu-memory-utilization "$GPU_UTIL"
  --max-model-len "$MAX_LEN"
  --kv-cache-dtype "$KV_DTYPE"
  --trust-remote-code
  --enable-auto-tool-choice
  --tool-call-parser hermes
)

#######################################
# Parse an ADAPTER_PATHS-style spec into the LORA_MODULES array.
# Surrounding whitespace is trimmed from each entry. Empty entries are
# skipped silently; entries without "=", or with an empty name or path,
# are skipped with a warning instead of being handed to vLLM.
# Globals:   LORA_MODULES (written; reset on every call)
# Arguments: $1 - comma-separated "name=path" list,
#                 e.g. "civil=repo/path,legal=repo/path"
# Outputs:   warnings for malformed entries on stderr
# Returns:   0
#######################################
_parse_lora_modules() {
  local spec="${1:-}"
  local entry name path
  local -a entries=()

  LORA_MODULES=()
  IFS=',' read -ra entries <<< "$spec"

  # ${arr[@]+...} keeps the expansion safe for an empty array under `set -u`
  # on older bash releases.
  for entry in ${entries[@]+"${entries[@]}"}; do
    # Trim leading, then trailing, whitespace.
    entry="${entry#"${entry%%[![:space:]]*}"}"
    entry="${entry%"${entry##*[![:space:]]}"}"
    if [ -z "$entry" ]; then
      continue
    fi

    name="${entry%%=*}"   # text before the first "="
    path="${entry#*=}"    # text after the first "=" (may itself contain "=")
    # Without any "=", name equals the whole entry.
    if [ "$name" = "$entry" ] || [ -z "$name" ] || [ -z "$path" ]; then
      echo "[entrypoint] WARN: ignoring malformed ADAPTER_PATHS entry: '$entry'" >&2
      continue
    fi

    LORA_MODULES+=("${name}=${path}")
  done
}

# LoRA adapter setup, driven by the ADAPTER_PATHS environment variable.
if [ -n "${ADAPTER_PATHS:-}" ]; then
  VLLM_ARGS+=(--enable-lora --max-loras 4 --max-lora-rank 64)
  _parse_lora_modules "$ADAPTER_PATHS"
  # Per the original note (vLLM 0.19): --lora-modules is given once, followed
  # by every adapter as its own argument via array expansion. It is omitted
  # when no valid entry remains so the flag is never passed without a value.
  if [ "${#LORA_MODULES[@]}" -gt 0 ]; then
    VLLM_ARGS+=(--lora-modules "${LORA_MODULES[@]}")
  fi
fi
# Log the effective configuration, then start vLLM as a background child.
# Its PID is kept in VLLM_PID so later sections can poll, wait on and stop it.
echo "[entrypoint] vLLM ์๋ฒ ๊ธฐ๋: port=$VLLM_PORT model=$MODEL"
echo "[entrypoint] args: ${VLLM_ARGS[*]}"
python3.10 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}" &
VLLM_PID=$!
# --- vLLM health check ---
# Design notes for the _health_check probe:
#   * CUDA_VISIBLE_DEVICES="" hides the GPU from the probe's python process;
#     per the original note this avoids a CUDA-initialisation hang should
#     torch/vllm get imported.
#   * The probe catches `except Exception`, never a bare `except:` - a bare
#     except would also swallow the SystemExit raised by sys.exit() and make
#     the probe always report failure.
#   * `timeout 10` is a process-level limit, separate from urllib's own
#     timeout.
echo "[entrypoint] vLLM ์๋ฒ ์ค๋น ๋๊ธฐ ์ค..."
MAX_WAIT=900   # upper bound on accumulated sleep time, in seconds
WAITED=0       # seconds slept so far
INTERVAL=5     # seconds between probes

# The nvidia/cuda base image may lack coreutils' `timeout`, so use it only
# when it is available. TIMEOUT_CMD stays a plain string because the probe
# expands it unquoted (word-split into command + argument).
if command -v timeout >/dev/null 2>&1; then
  TIMEOUT_CMD="timeout 10"
else
  TIMEOUT_CMD=""
fi
#######################################
# Probe the vLLM /health endpoint once.
# Globals:   VLLM_PORT   (read) - port the vLLM server listens on
#            TIMEOUT_CMD (read) - "" or "timeout 10"; word-split on purpose
# Outputs:   anything the probe prints (stderr merged into stdout)
# Returns:   0 when the endpoint answers HTTP 200, non-zero otherwise
#            (including when the optional process-level timeout fires)
#######################################
_health_check() {
  # The port travels as argv[1] rather than being interpolated into the
  # Python source, so its value can never alter the probe code. The probe
  # targets 127.0.0.1 to match the address vLLM is bound to (--host 127.0.0.1).
  # `except Exception` (not a bare except) lets SystemExit from sys.exit()
  # propagate.
  # shellcheck disable=SC2086 # TIMEOUT_CMD must split into command + argument
  CUDA_VISIBLE_DEVICES="" $TIMEOUT_CMD python3.10 -c '
import sys
import urllib.request

try:
    resp = urllib.request.urlopen(
        "http://127.0.0.1:%s/health" % sys.argv[1], timeout=5
    )
    sys.exit(0 if resp.status == 200 else 1)
except Exception:
    sys.exit(1)
' "$VLLM_PORT" 2>&1
}
# Poll until vLLM reports healthy, the vLLM process dies, or MAX_WAIT seconds
# of sleep time have accumulated (time spent inside each probe is not counted).
while [ "$WAITED" -lt "$MAX_WAIT" ]; do
  if _health_check; then
    echo "[entrypoint] vLLM ์๋ฒ ์ค๋น ์๋ฃ (${WAITED}s)"
    break
  fi

  # Stop early if the vLLM process is no longer alive.
  if ! kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "[entrypoint] ERROR: vLLM ํ๋ก์ธ์ค ์ข ๋ฃ๋จ"
    # Collect the status with `|| VLLM_EXIT=$?`: under `set -e` a bare `wait`
    # returning non-zero would abort the script before the status is logged.
    VLLM_EXIT=0
    wait "$VLLM_PID" || VLLM_EXIT=$?
    echo "[entrypoint] vLLM exit code=$VLLM_EXIT"
    # A server that quit before becoming healthy is a failure even if it
    # exited 0, so never report success from this path.
    if [ "$VLLM_EXIT" -eq 0 ]; then
      VLLM_EXIT=1
    fi
    exit "$VLLM_EXIT"
  fi

  sleep "$INTERVAL"
  WAITED=$((WAITED + INTERVAL))
done

# The loop ended without a successful probe: give up and stop vLLM.
if [ "$WAITED" -ge "$MAX_WAIT" ]; then
  echo "[entrypoint] ERROR: vLLM ์๋ฒ ์์ ํ์์์ (${MAX_WAIT}s)"
  kill "$VLLM_PID" 2>/dev/null || true
  exit 1
fi
# --- FastAPI server ---
# CUDA_VISIBLE_DEVICES="": per the original note, FastAPI only talks to the
#   vLLM HTTP API (via httpx) and needs no GPU; masking it avoids creating a
#   CUDA context - and spending GPU memory - if vLLM gets imported.
# FastAPI is started in the background and waited on (instead of exec) so
#   this shell stays alive and can forward SIGTERM to both vLLM and FastAPI.

#######################################
# Terminate and reap the FastAPI and vLLM child processes.
# Safe to call more than once and before either PID has been assigned:
# unset PIDs are skipped, so `set -u` cannot abort the handler half-way and
# leave the other child running.
# Globals:   FASTAPI_PID (read, may be unset), VLLM_PID (read, may be unset)
# Outputs:   a shutdown notice on stdout
# Returns:   0
#######################################
cleanup() {
  local pid
  local -a pids=()

  echo "[entrypoint] Shutting down..."

  for pid in "${FASTAPI_PID:-}" "${VLLM_PID:-}"; do
    if [ -n "$pid" ]; then
      pids+=("$pid")
    fi
  done

  # Signal every child first, then reap them, so both shut down in parallel.
  # Failures are ignored on purpose: a child may already be gone.
  for pid in ${pids[@]+"${pids[@]}"}; do
    kill "$pid" 2>/dev/null || true
  done
  for pid in ${pids[@]+"${pids[@]}"}; do
    wait "$pid" 2>/dev/null || true
  done
}
# Run cleanup when the script exits and when it receives SIGTERM or SIGINT.
trap cleanup EXIT SIGTERM SIGINT

echo "[entrypoint] FastAPI ์๋ฒ ๊ธฐ๋: port=${PORT:-7860}"
CUDA_VISIBLE_DEVICES="" python3.10 -m src.inference.api_server &
FASTAPI_PID=$!

# Block until whichever child exits first; the EXIT trap then stops the other.
# The status is captured with `|| EXITED=$?` - the previous `|| true` followed
# by `EXITED=$?` always recorded 0, hiding the real exit code from the log.
EXITED=0
wait -n "$FASTAPI_PID" "$VLLM_PID" 2>/dev/null || EXITED=$?
# NOTE(review): the script itself still finishes with the status of this echo
# (0); consider `exit "$EXITED"` if the supervisor should see child failures.
echo "[entrypoint] ํ๋ก์ธ์ค ์ข ๋ฃ ๊ฐ์ง (exit=$EXITED), cleanup ์งํ"