Spaces:
Restarting on A100
Restarting on A100
| """GovOn daemon lifecycle management. | |
| Issue #144: CLI-daemon/LangGraph runtime integration and session resume. | |
| Starts the GovOn API server in the background with uvicorn and | |
| tracks the process state through a PID file. | |
| .. note:: | |
| This module is for the **local daemon only**. | |
| When connecting to a remote server, set the ``GOVON_RUNTIME_URL`` environment variable; | |
| ``main()`` in ``shell.py`` then skips this module entirely and connects | |
| directly to the specified URL. For Docker, cloud deployments, and CI environments, | |
| using that approach is recommended. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import signal | |
| import subprocess | |
| import sys | |
| import time | |
| from pathlib import Path | |
| from typing import Optional | |
| import httpx | |
| from loguru import logger | |
class DaemonManager:
    """Lifecycle manager for the local GovOn API server daemon.

    Combines a PID file with the ``/health`` endpoint to decide whether the
    daemon is running, and launches it in the background with uvicorn when
    needed.

    The port can be overridden with the ``GOVON_PORT`` environment variable
    (default: 8000).

    This class relies on POSIX process primitives (``os.kill`` with signal 0,
    ``signal.SIGKILL``, ``os.waitpid``, ``start_new_session``).
    """

    GOVON_HOME = Path.home() / ".govon"
    _HEALTH_CHECK_TIMEOUT = 120  # maximum seconds to wait for /health
    _HEALTH_CHECK_INTERVAL = 1  # seconds between health-check retries

    def __init__(self) -> None:
        """Create ``~/.govon`` if needed and resolve the port, PID and log paths."""
        self.GOVON_HOME.mkdir(parents=True, exist_ok=True)
        self.port: int = int(os.environ.get("GOVON_PORT", "8000"))
        self.pid_path: Path = self.GOVON_HOME / "daemon.pid"
        self.log_path: Path = self.GOVON_HOME / "daemon.log"

    def get_base_url(self) -> str:
        """Return the daemon base URL (loopback address plus configured port)."""
        return f"http://127.0.0.1:{self.port}"

    def is_running(self) -> bool:
        """Return True when the daemon is up.

        The daemon counts as running only if the PID file exists, the recorded
        process is alive, and ``/health`` answers with HTTP 200. A PID file
        that points at a dead process is removed as a side effect.
        """
        pid = self._read_pid()
        if pid is None:
            return False

        # Check that the recorded process still exists.
        if not self._pid_alive(pid):
            logger.debug(f"[daemon] PID {pid} ํ๋ก์ธ์ค๊ฐ ์์. PID ํ์ผ ์ ๊ฑฐ.")
            self._remove_pid()
            return False

        # Process exists; confirm it actually serves /health.
        return self._probe_health(timeout=5.0)

    def start(self) -> bool:
        """Launch uvicorn in the background and record its PID.

        Returns
        -------
        bool
            True when the daemon is (or already was) healthy, False when the
            port is occupied or the health check does not pass.
        """
        # Guard against races: re-check health right before launching.
        if self.is_running():
            logger.info("[daemon] ์ด๋ฏธ ์คํ ์ค์ ๋๋ค.")
            return True

        if self._port_in_use():
            logger.error(f"[daemon] ํฌํธ {self.port}์ด ์ด๋ฏธ ์ฌ์ฉ ์ค์ ๋๋ค.")
            return False

        cmd = [
            sys.executable,
            "-m",
            "uvicorn",
            "src.inference.api_server:app",
            "--host",
            "127.0.0.1",
            "--port",
            str(self.port),
        ]
        logger.info(f"[daemon] ๊ธฐ๋ ๋ช ๋ น: {' '.join(cmd)}")

        # The child inherits the log file descriptor, so the parent-side
        # handle can be closed as soon as Popen returns.
        with open(self.log_path, "a") as log_file:
            proc = subprocess.Popen(
                cmd,
                stdout=log_file,
                stderr=log_file,
                start_new_session=True,
            )

        self._write_pid(proc.pid)
        logger.info(f"[daemon] ํ๋ก์ธ์ค ๊ธฐ๋ ์๋ฃ. PID={proc.pid}")

        # Wait for the health check; passing proc lets the wait fail fast if
        # uvicorn exits during startup.
        if not self._wait_until_healthy(proc):
            logger.error("[daemon] health check ์คํจ. ํ๋ก์ธ์ค๋ฅผ ์ ๋ฆฌํฉ๋๋ค.")
            self.stop()
            return False
        return True

    def stop(self) -> None:
        """Stop the daemon (SIGTERM, then SIGKILL after a 10 second grace period).

        Does nothing beyond cleaning up the PID file when no live process is
        recorded. A ``PermissionError`` from ``os.kill`` (process owned by
        another user) is propagated to the caller.
        """
        pid = self._read_pid()
        if pid is None:
            logger.info("[daemon] PID ํ์ผ์ด ์์ต๋๋ค. ์คํ ์ค์ด ์๋ ๊ฒ์ผ๋ก ๊ฐ์ฃผํฉ๋๋ค.")
            return

        if not self._pid_alive(pid):
            logger.info(f"[daemon] PID {pid} ํ๋ก์ธ์ค๊ฐ ์์ต๋๋ค.")
            self._remove_pid()
            return

        logger.info(f"[daemon] SIGTERM ์ ์ก: PID={pid}")
        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # The process exited between the liveness check and the signal.
            self._remove_pid()
            return

        # Wait up to 10 seconds for a graceful exit.
        for _ in range(10):
            time.sleep(1)
            if not self._pid_alive(pid):
                logger.info(f"[daemon] PID {pid} ์ ์ ์ข ๋ฃ๋จ.")
                self._remove_pid()
                return

        logger.warning(f"[daemon] SIGKILL ์ ์ก: PID={pid}")
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            pass
        self._remove_pid()

    def ensure_running(self) -> str:
        """Make sure the daemon is running and return its base URL.

        Calls :meth:`start` when the daemon is not running.

        Returns
        -------
        str
            Daemon base URL (e.g. ``"http://127.0.0.1:8000"``).

        Raises
        ------
        RuntimeError
            If the daemon could not be started.
        """
        if not self.is_running():
            success = self.start()
            if not success:
                raise RuntimeError(
                    "GovOn daemon ๊ธฐ๋์ ์คํจํ์ต๋๋ค. " f"๋ก๊ทธ๋ฅผ ํ์ธํ์ธ์: {self.log_path}"
                )
        return self.get_base_url()

    def _port_in_use(self) -> bool:
        """Return True when something already accepts connections on the port."""
        import socket

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            # connect_ex returns 0 when the TCP connection succeeds.
            return s.connect_ex(("127.0.0.1", self.port)) == 0

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _probe_health(self, timeout: float) -> bool:
        """Return True when ``GET /health`` answers with HTTP 200.

        Any failure (connection refused, timeout, ...) counts as "not
        healthy"; the probe is deliberately best-effort and never raises.
        """
        try:
            with httpx.Client(timeout=timeout) as client:
                resp = client.get(f"{self.get_base_url()}/health")
        except Exception:  # deliberately broad: a failed probe means unhealthy
            return False
        return resp.status_code == 200

    def _read_pid(self) -> Optional[int]:
        """Read the PID from the PID file.

        Returns None when the file is missing, unreadable, empty, malformed,
        or holds a non-positive PID. Non-positive values are rejected because
        ``os.kill`` treats them as process-group / broadcast targets.
        """
        try:
            content = self.pid_path.read_text()
        except OSError:  # includes FileNotFoundError
            return None

        # File format is "<pid> <epoch>"; only the first token matters.
        tokens = content.split()
        if not tokens:
            return None
        try:
            pid = int(tokens[0])
        except ValueError:
            return None
        return pid if pid > 0 else None

    def _write_pid(self, pid: int) -> None:
        """Write the PID and the launch time (epoch seconds) to the PID file."""
        self.pid_path.write_text(f"{pid} {int(time.time())}")

    def _remove_pid(self) -> None:
        """Delete the PID file; a missing file is not an error."""
        try:
            self.pid_path.unlink()
        except FileNotFoundError:
            pass

    @staticmethod
    def _pid_alive(pid: int) -> bool:
        """Return True when a process with this PID is still running.

        If the PID belongs to an exited child of the current process, the
        child is reaped and False is returned; without this, signal 0 would
        keep succeeding on the zombie. Non-positive PIDs are never considered
        alive.
        """
        if pid <= 0:
            return False

        try:
            reaped_pid, _status = os.waitpid(pid, os.WNOHANG)
        except ChildProcessError:
            # Not a child of this process; fall back to the signal-0 probe.
            pass
        else:
            if reaped_pid == pid:
                return False

        try:
            os.kill(pid, 0)
        except ProcessLookupError:
            return False
        except PermissionError:
            # The process exists but belongs to someone else: treat as alive.
            return True
        return True

    def _wait_until_healthy(self, proc: Optional[subprocess.Popen] = None) -> bool:
        """Poll ``/health`` until it passes or ``_HEALTH_CHECK_TIMEOUT`` elapses.

        Parameters
        ----------
        proc
            Optional handle of the launched process. When given, the wait
            aborts as soon as the process has exited instead of polling until
            the timeout.

        Returns
        -------
        bool
            True when the health check passed, False on timeout or early exit.
        """
        deadline = time.monotonic() + self._HEALTH_CHECK_TIMEOUT
        while time.monotonic() < deadline:
            if proc is not None and proc.poll() is not None:
                logger.error(
                    f"[daemon] process exited before becoming healthy "
                    f"(exit code {proc.returncode})."
                )
                return False
            if self._probe_health(timeout=3.0):
                logger.info("[daemon] health check ํต๊ณผ.")
                return True
            time.sleep(self._HEALTH_CHECK_INTERVAL)

        logger.error(f"[daemon] health check timeout ({self._HEALTH_CHECK_TIMEOUT}์ด).")
        return False