"""GovOn Runtime serving profile and model configuration. Defines profiles for local development, single-server (production), and air-gapped installations based on environment variables. Standardises generation defaults and timeout settings. Usage: config = RuntimeConfig.from_env() config.log_summary() # Unified hyperparameter config (YAML + env var overrides): govon_config = GovOnConfig.load() """ import os from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional from loguru import logger # Project root path (src/inference/runtime_config.py → ../../..) _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent class ServingProfile(str, Enum): """서빙 프로필. SERVING_PROFILE 환경변수로 선택한다.""" LOCAL = "local" # 로컬 개발 환경 SINGLE = "single" # 단일 서버 프로덕션 CONTAINER = "container" # Docker / Cloud Run / 오프라인 패키지 AIRGAP = "airgap" # 폐쇄망 설치 # --------------------------------------------------------------------------- # 프로필별 기본값 정의 # --------------------------------------------------------------------------- _PROFILE_DEFAULTS: Dict[ServingProfile, Dict] = { ServingProfile.LOCAL: { "host": "127.0.0.1", "port": 8000, "workers": 1, "gpu_utilization": 0.7, "max_model_len": 4096, "log_level": "DEBUG", "reload": True, "rate_limit_enabled": False, "request_timeout_sec": 120, "cors_origins": ["http://localhost:3000", "http://127.0.0.1:3000"], }, ServingProfile.SINGLE: { "host": "0.0.0.0", "port": 8000, "workers": 1, "gpu_utilization": 0.85, "max_model_len": 8192, "log_level": "INFO", "reload": False, "rate_limit_enabled": True, "request_timeout_sec": 60, "cors_origins": [], }, ServingProfile.CONTAINER: { "host": "0.0.0.0", "port": 8000, "workers": 1, "gpu_utilization": 0.85, "max_model_len": 8192, "log_level": "INFO", "reload": False, "rate_limit_enabled": True, "request_timeout_sec": 60, "cors_origins": [], }, ServingProfile.AIRGAP: { "host": "0.0.0.0", "port": 8000, "workers": 1, "gpu_utilization": 0.8, 
"max_model_len": 8192, "log_level": "INFO", "reload": False, "rate_limit_enabled": True, "request_timeout_sec": 90, "cors_origins": [], }, } _CONTAINER_PLATFORM_ENV_MARKERS = ( "K_SERVICE", "K_REVISION", "K_CONFIGURATION", "KUBERNETES_SERVICE_HOST", "SPACE_ID", # HuggingFace Spaces ) def _resolve_serving_profile() -> ServingProfile: """환경과 명시값을 기준으로 서빙 프로필을 결정한다.""" profile_name = os.getenv("SERVING_PROFILE") if profile_name: try: return ServingProfile(profile_name.lower()) except ValueError: logger.warning(f"알 수 없는 SERVING_PROFILE '{profile_name}', 기본값 'local' 사용") return ServingProfile.LOCAL if any(os.getenv(marker) for marker in _CONTAINER_PLATFORM_ENV_MARKERS): logger.info("컨테이너 런타임 환경을 감지하여 'container' 프로필을 사용합니다.") return ServingProfile.CONTAINER return ServingProfile.LOCAL # --------------------------------------------------------------------------- # Generation Defaults # --------------------------------------------------------------------------- @dataclass(frozen=True) class GenerationDefaults: """텍스트 생성 기본 파라미터. 엔드포인트 요청에 값이 없을 때 사용된다.""" max_tokens: int = 512 temperature: float = 0.7 top_p: float = 0.9 repetition_penalty: float = 1.1 stop_sequences: List[str] = field(default_factory=lambda: ["[|endofturn|]"]) @classmethod def from_env(cls) -> "GenerationDefaults": return cls( max_tokens=int(os.getenv("GEN_MAX_TOKENS", "512")), temperature=float(os.getenv("GEN_TEMPERATURE", "0.7")), top_p=float(os.getenv("GEN_TOP_P", "0.9")), repetition_penalty=float(os.getenv("GEN_REPETITION_PENALTY", "1.1")), ) # --------------------------------------------------------------------------- # Model Configuration # --------------------------------------------------------------------------- @dataclass(frozen=True) class ModelConfig: """모델 및 어댑터 설정. 
베이스 모델: LGAI-EXAONE/EXAONE-4.0-32B-AWQ (단일 vLLM 인스턴스, ~20GB VRAM) - tool calling 네이티브 지원 (BFCL 65.2) - vLLM 서빙 옵션: --enable-auto-tool-choice --tool-call-parser hermes Multi-LoRA 어댑터: - public_admin-adapter (LoRA #1): domain_adapter 용도 학습 데이터: umyunsang/govon-civil-response-data (74K건), QLoRA on AWQ base HF Hub: umyunsang/GovOn-EXAONE-LoRA-v2 - legal-adapter (LoRA #2): domain_adapter 용도 학습 데이터: umyunsang/govon-legal-response-data (243K건), QLoRA on AWQ base HuggingFace: siwo/govon-legal-adapter - 나머지 capability (api_lookup, synthesis 등)는 LoRA 없이 base model 사용 adapter_paths: Dict[str, str] 형식의 어댑터 이름-경로 매핑. 환경변수 ADAPTER_PATHS="public_admin=/path/to/public_admin,legal=/path/to/legal" 형식으로 설정. 예: {"public_admin": "/path/to/public_admin", "legal": "/path/to/legal"} """ model_path: str = "LGAI-EXAONE/EXAONE-4.0-32B-AWQ" trust_remote_code: bool = True dtype: str = "half" enforce_eager: bool = True # Multi-LoRA 어댑터 경로: {"public_admin": "/path/to/public_admin", "legal": "/path/to/legal"} # 환경변수 ADAPTER_PATHS="public_admin=/path/to/public_admin,legal=/path/to/legal" 형식으로 설정 adapter_paths: Dict[str, str] = field(default_factory=dict) @classmethod def from_env(cls) -> "ModelConfig": adapter_paths = cls._parse_adapter_paths(os.getenv("ADAPTER_PATHS", "")) return cls( model_path=os.getenv("MODEL_PATH", "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"), trust_remote_code=os.getenv("TRUST_REMOTE_CODE", "true").lower() in ("true", "1", "yes"), dtype=os.getenv("MODEL_DTYPE", "half"), enforce_eager=os.getenv("ENFORCE_EAGER", "true").lower() in ("true", "1", "yes"), adapter_paths=adapter_paths, ) @staticmethod def _parse_adapter_paths(raw: str) -> Dict[str, str]: """ADAPTER_PATHS 환경변수를 파싱한다. 형식: "public_admin=/path/to/public_admin,legal=/path/to/legal" 반환: {"public_admin": "/path/to/public_admin", "legal": "/path/to/legal"} 잘못된 항목은 경고 후 무시한다. 
""" if not raw or not raw.strip(): return {} result: Dict[str, str] = {} for entry in raw.split(","): entry = entry.strip() if not entry: continue if "=" not in entry: logger.warning(f"ADAPTER_PATHS 항목 형식 오류 (name=path 필요): {entry!r}") continue name, path = entry.split("=", 1) name, path = name.strip(), path.strip() if not name or not path: logger.warning(f"ADAPTER_PATHS 항목에 빈 이름 또는 경로: {entry!r}") continue result[name] = path return result # --------------------------------------------------------------------------- # Path Configuration # --------------------------------------------------------------------------- @dataclass class PathConfig: """데이터·인덱스·로그 경로 설정.""" data_path: str = "" index_path: str = "" faiss_index_dir: str = "" bm25_index_dir: str = "" local_docs_root: str = "" agents_dir: str = "" log_dir: str = "" cache_dir: str = "" @classmethod def from_env(cls) -> "PathConfig": project_root = str(_PROJECT_ROOT) return cls( data_path=os.getenv("DATA_PATH", ""), index_path=os.getenv("INDEX_PATH", "models/faiss_index/complaints.index"), faiss_index_dir=os.getenv("FAISS_INDEX_DIR", "models/faiss_index"), bm25_index_dir=os.getenv("BM25_INDEX_DIR", "models/bm25_index"), local_docs_root=os.getenv("LOCAL_DOCS_ROOT", ""), agents_dir=os.getenv("AGENTS_DIR", os.path.join(project_root, "agents")), log_dir=os.getenv("LOG_DIR", os.path.join(project_root, "logs")), cache_dir=os.getenv("CACHE_DIR", os.path.join(project_root, ".cache")), ) # --------------------------------------------------------------------------- # Healthcheck Configuration # --------------------------------------------------------------------------- @dataclass(frozen=True) class HealthcheckConfig: """헬스체크 설정. 
shell client readiness probe 용도.""" endpoint: str = "/health" interval_sec: int = 30 timeout_sec: int = 10 startup_probe_path: str = "/health" readiness_probe_path: str = "/health" @classmethod def from_env(cls) -> "HealthcheckConfig": return cls( interval_sec=int(os.getenv("HEALTH_INTERVAL_SEC", "30")), timeout_sec=int(os.getenv("HEALTH_TIMEOUT_SEC", "10")), ) # --------------------------------------------------------------------------- # RuntimeConfig (통합 설정) # --------------------------------------------------------------------------- @dataclass class RuntimeConfig: """GovOn Runtime 통합 설정. SERVING_PROFILE 환경변수에 따라 프로필별 기본값을 로드하고, 개별 환경변수로 오버라이드할 수 있다. """ # 서빙 프로필 profile: ServingProfile = ServingProfile.LOCAL # 서버 설정 host: str = "127.0.0.1" port: int = 8000 workers: int = 1 log_level: str = "DEBUG" reload: bool = True # GPU / vLLM 설정 gpu_utilization: float = 0.7 max_model_len: int = 4096 skip_model_load: bool = False # 보안 api_key: Optional[str] = None cors_origins: List[str] = field(default_factory=list) rate_limit_enabled: bool = False # 타임아웃 request_timeout_sec: int = 120 # 하위 설정 객체 model: ModelConfig = field(default_factory=ModelConfig) paths: PathConfig = field(default_factory=PathConfig) generation: GenerationDefaults = field(default_factory=GenerationDefaults) healthcheck: HealthcheckConfig = field(default_factory=HealthcheckConfig) @classmethod def from_env(cls) -> "RuntimeConfig": """환경변수에서 전체 런타임 설정을 로드한다. 1. SERVING_PROFILE에 따른 프로필 기본값 적용 2. 
개별 환경변수로 오버라이드 """ profile = _resolve_serving_profile() defaults = _PROFILE_DEFAULTS[profile] skip_model_load = os.getenv("SKIP_MODEL_LOAD", "false").lower() in ( "true", "1", "yes", ) # CORS: 환경변수가 있으면 우선, 없으면 프로필 기본값 cors_env = os.getenv("CORS_ORIGINS", "") if cors_env: cors_origins = [o.strip() for o in cors_env.split(",") if o.strip()] else: cors_origins = defaults["cors_origins"] return cls( profile=profile, host=os.getenv("HOST", defaults["host"]), port=int(os.getenv("PORT", str(defaults["port"]))), workers=int(os.getenv("WORKERS", str(defaults["workers"]))), log_level=os.getenv("LOG_LEVEL", defaults["log_level"]), reload=os.getenv("RELOAD", str(defaults["reload"])).lower() in ("true", "1", "yes"), gpu_utilization=float(os.getenv("GPU_UTILIZATION", str(defaults["gpu_utilization"]))), max_model_len=int(os.getenv("MAX_MODEL_LEN", str(defaults["max_model_len"]))), skip_model_load=skip_model_load, api_key=os.getenv("API_KEY"), cors_origins=cors_origins, rate_limit_enabled=os.getenv( "RATE_LIMIT_ENABLED", str(defaults["rate_limit_enabled"]) ).lower() in ("true", "1", "yes"), request_timeout_sec=int( os.getenv("REQUEST_TIMEOUT_SEC", str(defaults["request_timeout_sec"])) ), model=ModelConfig.from_env(), paths=PathConfig.from_env(), generation=GenerationDefaults.from_env(), healthcheck=HealthcheckConfig.from_env(), ) def log_summary(self) -> None: """현재 설정 요약을 로그로 출력한다.""" logger.info("=" * 60) logger.info("GovOn Runtime Configuration") logger.info("=" * 60) logger.info(f" Profile : {self.profile.value}") logger.info(f" Host : {self.host}:{self.port}") logger.info(f" Workers : {self.workers}") logger.info(f" Log Level : {self.log_level}") logger.info(f" GPU Util : {self.gpu_utilization}") logger.info(f" Max Model Len : {self.max_model_len}") logger.info(f" Model Path : {self.model.model_path}") logger.info(f" Adapter Paths : {self.model.adapter_paths or '(none)'}") logger.info(f" Skip Model : {self.skip_model_load}") logger.info(f" Request Timeout: 
{self.request_timeout_sec}s") logger.info(f" Rate Limit : {self.rate_limit_enabled}") logger.info(f" CORS Origins : {self.cors_origins}") logger.info(f" Healthcheck : {self.healthcheck.endpoint}") logger.info(f" Data Path : {self.paths.data_path}") logger.info(f" Index Path : {self.paths.index_path}") logger.info(f" Local Docs : {self.paths.local_docs_root or '(disabled)'}") logger.info(f" Log Dir : {self.paths.log_dir}") logger.info("=" * 60) def to_uvicorn_kwargs(self) -> Dict: """uvicorn.run()에 전달할 키워드 인자를 반환한다.""" kwargs = { "host": self.host, "port": self.port, "workers": self.workers, "log_level": self.log_level.lower(), "timeout_keep_alive": self.request_timeout_sec, } if self.reload: kwargs["reload"] = True return kwargs # --------------------------------------------------------------------------- # GovOnConfig — unified hyperparameter config (YAML + env var overrides) # --------------------------------------------------------------------------- _GOVON_YAML_PATH = _PROJECT_ROOT / "config" / "govon.yaml" def _load_yaml(path: Path) -> Dict[str, Any]: """Load a YAML file and return its contents as a dict. Returns an empty dict if the file does not exist or PyYAML is not installed. """ if not path.exists(): logger.warning(f"[GovOnConfig] config file not found: {path}. Using defaults.") return {} try: import yaml # type: ignore with open(path, "r", encoding="utf-8") as f: data = yaml.safe_load(f) or {} logger.debug(f"[GovOnConfig] loaded config from {path}") return data except ImportError: logger.warning("[GovOnConfig] PyYAML not installed; falling back to defaults.") return {} except Exception as exc: logger.warning(f"[GovOnConfig] failed to load {path}: {exc}. Using defaults.") return {} def _env(key: str, default: Any, cast=None) -> Any: """Read an environment variable and cast it to the required type. Returns *default* when the variable is absent or empty. 
""" raw = os.getenv(key) if raw is None or raw == "": return default if cast is not None: try: return cast(raw) except (ValueError, TypeError) as exc: logger.warning(f"[GovOnConfig] invalid env {key}={raw!r}: {exc}. Using default.") return default return raw @dataclass(frozen=True) class _GenerationConfig: """LLM generation hyperparameters.""" max_tokens: int = 512 temperature: float = 0.7 top_p: float = 0.9 repetition_penalty: float = 1.1 stop_sequences: List[str] = field(default_factory=lambda: ["[|endofturn|]"]) agent_temperature: float = 0.0 @dataclass(frozen=True) class _ServingConfig: """vLLM / GPU serving hyperparameters.""" gpu_memory_utilization: float = 0.90 max_model_len: int = 8192 max_loras: int = 4 max_lora_rank: int = 64 kv_cache_dtype: str = "auto" vllm_request_timeout: float = 300.0 vllm_connect_timeout: float = 30.0 vllm_startup_max_wait: int = 900 health_check_timeout: int = 10 @dataclass(frozen=True) class _ContextConfig: """Context window management hyperparameters.""" agent_input_budget: int = 4500 max_message_tokens: int = 4500 keep_recent_messages: int = 6 summary_threshold_ratio: float = 0.6 max_tool_result_chars: int = 3000 system_prompt_overhead: int = 2000 max_consecutive_rejections: int = 2 tool_clear_after_iteration: int = 2 tool_keep_recent: int = 2 max_iterations: int = 10 @dataclass(frozen=True) class _ToolDefaultConfig: """Default timeout and retry settings for a single tool.""" timeout_sec: float = 10.0 max_retries: int = 0 @dataclass(frozen=True) class _ToolsConfig: """Tool execution hyperparameters.""" defaults: _ToolDefaultConfig = field(default_factory=_ToolDefaultConfig) overrides: Dict[str, _ToolDefaultConfig] = field(default_factory=dict) def for_tool(self, name: str) -> _ToolDefaultConfig: """Return per-tool config, falling back to defaults.""" return self.overrides.get(name, self.defaults) @dataclass(frozen=True) class _RateLimitConfig: """API rate limiting configuration.""" default: str = "30/minute" 
@dataclass(frozen=True)
class _ValidationConfig:
    """Request validation limits."""

    max_prompt_length: int = 4096
    max_tokens_ceiling: int = 4096


@dataclass(frozen=True)
class GovOnConfig:
    """Unified hyperparameter configuration for GovOn.

    Load via :meth:`GovOnConfig.load` which reads ``config/govon.yaml`` and
    applies environment variable overrides on top.

    Environment variables follow the ``GOVON_<SECTION>_<FIELD>`` naming
    convention (e.g. ``GOVON_GENERATION_MAX_TOKENS``). Legacy ``GEN_*``
    variables are also supported for backward compatibility.
    """

    generation: _GenerationConfig = field(default_factory=_GenerationConfig)
    serving: _ServingConfig = field(default_factory=_ServingConfig)
    context: _ContextConfig = field(default_factory=_ContextConfig)
    tools: _ToolsConfig = field(default_factory=_ToolsConfig)
    rate_limit: _RateLimitConfig = field(default_factory=_RateLimitConfig)
    validation: _ValidationConfig = field(default_factory=_ValidationConfig)

    @classmethod
    def load(cls, path: Optional[Path] = None) -> "GovOnConfig":
        """Load config from YAML and apply environment variable overrides.

        Priority (highest first):
        1. Environment variables (GOVON_* or legacy GEN_*)
        2. config/govon.yaml values
        3. Dataclass defaults (hardcoded fallbacks)
        """
        raw = _load_yaml(path or _GOVON_YAML_PATH)

        gen_raw = raw.get("generation", {})
        srv_raw = raw.get("serving", {})
        ctx_raw = raw.get("context", {})
        tls_raw = raw.get("tools", {})
        rl_raw = raw.get("rate_limit", {})
        val_raw = raw.get("validation", {})

        # --- generation ---
        gen = _GenerationConfig(
            max_tokens=_env(
                "GOVON_GENERATION_MAX_TOKENS",
                _env("GEN_MAX_TOKENS", gen_raw.get("max_tokens", 512), int),
                int,
            ),
            temperature=_env(
                "GOVON_GENERATION_TEMPERATURE",
                _env("GEN_TEMPERATURE", gen_raw.get("temperature", 0.7), float),
                float,
            ),
            top_p=_env(
                "GOVON_GENERATION_TOP_P",
                _env("GEN_TOP_P", gen_raw.get("top_p", 0.9), float),
                float,
            ),
            repetition_penalty=_env(
                "GOVON_GENERATION_REPETITION_PENALTY",
                _env(
                    "GEN_REPETITION_PENALTY",
                    gen_raw.get("repetition_penalty", 1.1),
                    float,
                ),
                float,
            ),
            stop_sequences=gen_raw.get("stop_sequences", ["[|endofturn|]"]),
            agent_temperature=_env(
                "GOVON_GENERATION_AGENT_TEMPERATURE",
                gen_raw.get("agent_temperature", 0.0),
                float,
            ),
        )

        # --- serving ---
        srv = _ServingConfig(
            gpu_memory_utilization=_env(
                "GOVON_SERVING_GPU_MEMORY_UTILIZATION",
                srv_raw.get("gpu_memory_utilization", 0.90),
                float,
            ),
            max_model_len=_env(
                "GOVON_SERVING_MAX_MODEL_LEN",
                srv_raw.get("max_model_len", 8192),
                int,
            ),
            max_loras=_env(
                "GOVON_SERVING_MAX_LORAS",
                _env("MAX_LORAS", srv_raw.get("max_loras", 4), int),
                int,
            ),
            max_lora_rank=_env(
                "GOVON_SERVING_MAX_LORA_RANK",
                _env("MAX_LORA_RANK", srv_raw.get("max_lora_rank", 64), int),
                int,
            ),
            kv_cache_dtype=_env(
                "GOVON_SERVING_KV_CACHE_DTYPE",
                srv_raw.get("kv_cache_dtype", "auto"),
            ),
            vllm_request_timeout=_env(
                "GOVON_SERVING_VLLM_REQUEST_TIMEOUT",
                srv_raw.get("vllm_request_timeout", 300.0),
                float,
            ),
            vllm_connect_timeout=_env(
                "GOVON_SERVING_VLLM_CONNECT_TIMEOUT",
                srv_raw.get("vllm_connect_timeout", 30.0),
                float,
            ),
            vllm_startup_max_wait=_env(
                "GOVON_SERVING_VLLM_STARTUP_MAX_WAIT",
                srv_raw.get("vllm_startup_max_wait", 900),
                int,
            ),
            health_check_timeout=_env(
                "GOVON_SERVING_HEALTH_CHECK_TIMEOUT",
                srv_raw.get("health_check_timeout", 10),
                int,
            ),
        )

        # --- context ---
        ctx = _ContextConfig(
            agent_input_budget=_env(
                "GOVON_CONTEXT_AGENT_INPUT_BUDGET",
                ctx_raw.get("agent_input_budget", 4500),
                int,
            ),
            max_message_tokens=_env(
                "GOVON_CONTEXT_MAX_MESSAGE_TOKENS",
                ctx_raw.get("max_message_tokens", 4500),
                int,
            ),
            keep_recent_messages=_env(
                "GOVON_CONTEXT_KEEP_RECENT_MESSAGES",
                ctx_raw.get("keep_recent_messages", 6),
                int,
            ),
            summary_threshold_ratio=_env(
                "GOVON_CONTEXT_SUMMARY_THRESHOLD_RATIO",
                ctx_raw.get("summary_threshold_ratio", 0.6),
                float,
            ),
            max_tool_result_chars=_env(
                "GOVON_CONTEXT_MAX_TOOL_RESULT_CHARS",
                ctx_raw.get("max_tool_result_chars", 3000),
                int,
            ),
            system_prompt_overhead=_env(
                "GOVON_CONTEXT_SYSTEM_PROMPT_OVERHEAD",
                ctx_raw.get("system_prompt_overhead", 2000),
                int,
            ),
            max_consecutive_rejections=_env(
                "GOVON_CONTEXT_MAX_CONSECUTIVE_REJECTIONS",
                ctx_raw.get("max_consecutive_rejections", 2),
                int,
            ),
            tool_clear_after_iteration=_env(
                "GOVON_CONTEXT_TOOL_CLEAR_AFTER_ITERATION",
                ctx_raw.get("tool_clear_after_iteration", 2),
                int,
            ),
            tool_keep_recent=_env(
                "GOVON_CONTEXT_TOOL_KEEP_RECENT",
                ctx_raw.get("tool_keep_recent", 2),
                int,
            ),
            max_iterations=_env(
                "GOVON_CONTEXT_MAX_ITERATIONS",
                ctx_raw.get("max_iterations", 10),
                int,
            ),
        )

        # --- tools ---
        tls_defaults_raw = tls_raw.get("defaults", {})
        tls_overrides_raw = tls_raw.get("overrides", {})
        tls_defaults = _ToolDefaultConfig(
            timeout_sec=_env(
                "GOVON_TOOLS_DEFAULT_TIMEOUT_SEC",
                tls_defaults_raw.get("timeout_sec", 10.0),
                float,
            ),
            max_retries=_env(
                "GOVON_TOOLS_DEFAULT_MAX_RETRIES",
                tls_defaults_raw.get("max_retries", 0),
                int,
            ),
        )
        tls_overrides: Dict[str, _ToolDefaultConfig] = {}
        for tool_name, override_raw in tls_overrides_raw.items():
            # Robustness: a malformed YAML entry (e.g. a bare string or null
            # under tools.overrides.<name>) has no .get() and would otherwise
            # raise AttributeError at import time (the module-level singleton
            # below runs load() on import). Skip it with a warning instead.
            if not isinstance(override_raw, dict):
                logger.warning(
                    f"[GovOnConfig] tools.overrides[{tool_name!r}] must be a mapping; ignoring."
                )
                continue
            tls_overrides[tool_name] = _ToolDefaultConfig(
                timeout_sec=override_raw.get("timeout_sec", tls_defaults.timeout_sec),
                max_retries=override_raw.get("max_retries", tls_defaults.max_retries),
            )
        tls = _ToolsConfig(defaults=tls_defaults, overrides=tls_overrides)

        # --- rate_limit ---
        rl = _RateLimitConfig(
            default=_env(
                "GOVON_RATE_LIMIT_DEFAULT",
                rl_raw.get("default", "30/minute"),
            ),
        )

        # --- validation ---
        val = _ValidationConfig(
            max_prompt_length=_env(
                "GOVON_VALIDATION_MAX_PROMPT_LENGTH",
                val_raw.get("max_prompt_length", 4096),
                int,
            ),
            max_tokens_ceiling=_env(
                "GOVON_VALIDATION_MAX_TOKENS_CEILING",
                val_raw.get("max_tokens_ceiling", 4096),
                int,
            ),
        )

        return cls(
            generation=gen,
            serving=srv,
            context=ctx,
            tools=tls,
            rate_limit=rl,
            validation=val,
        )


# Module-level singleton — imported by other modules.
govon_config: GovOnConfig = GovOnConfig.load()