# govon-runtime / src/inference/runtime_config.py
# Synced via github-actions (c7bf804 / 43117af).
"""GovOn Runtime serving profile and model configuration.
Defines profiles for local development, single-server (production), and
air-gapped installations based on environment variables. Standardises
generation defaults and timeout settings.
Usage:
config = RuntimeConfig.from_env()
config.log_summary()
# Unified hyperparameter config (YAML + env var overrides):
govon_config = GovOnConfig.load()
"""
import os
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
from loguru import logger
# Project root path (src/inference/runtime_config.py β†’ ../../..).
# resolve() first so the root stays correct when the module is imported
# through a symlink or relative path.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
class ServingProfile(str, Enum):
    """Serving profile, selected via the SERVING_PROFILE environment variable."""

    LOCAL = "local"          # local development environment
    SINGLE = "single"        # single-server production
    CONTAINER = "container"  # Docker / Cloud Run / offline package
    AIRGAP = "airgap"        # air-gapped installation
# ---------------------------------------------------------------------------
# Per-profile default values
# ---------------------------------------------------------------------------
# Every key below must exist for every profile: RuntimeConfig.from_env() reads
# defaults["..."] without a fallback.
_PROFILE_DEFAULTS: Dict[ServingProfile, Dict] = {
    # Local development: loopback only, auto-reload, verbose logs, no rate
    # limiting, CORS open to a local frontend dev server.
    ServingProfile.LOCAL: {
        "host": "127.0.0.1",
        "port": 8000,
        "workers": 1,
        "gpu_utilization": 0.7,
        "max_model_len": 4096,
        "log_level": "DEBUG",
        "reload": True,
        "rate_limit_enabled": False,
        "request_timeout_sec": 120,
        "cors_origins": ["http://localhost:3000", "http://127.0.0.1:3000"],
    },
    # Single-server production: bind all interfaces, tighter timeout,
    # rate limiting on, CORS closed by default.
    ServingProfile.SINGLE: {
        "host": "0.0.0.0",
        "port": 8000,
        "workers": 1,
        "gpu_utilization": 0.85,
        "max_model_len": 8192,
        "log_level": "INFO",
        "reload": False,
        "rate_limit_enabled": True,
        "request_timeout_sec": 60,
        "cors_origins": [],
    },
    # Container platforms (Docker / Cloud Run / Spaces): currently identical
    # to SINGLE; kept separate so the two can diverge independently.
    ServingProfile.CONTAINER: {
        "host": "0.0.0.0",
        "port": 8000,
        "workers": 1,
        "gpu_utilization": 0.85,
        "max_model_len": 8192,
        "log_level": "INFO",
        "reload": False,
        "rate_limit_enabled": True,
        "request_timeout_sec": 60,
        "cors_origins": [],
    },
    # Air-gapped installation: slightly lower GPU budget and a longer
    # request timeout than SINGLE.
    ServingProfile.AIRGAP: {
        "host": "0.0.0.0",
        "port": 8000,
        "workers": 1,
        "gpu_utilization": 0.8,
        "max_model_len": 8192,
        "log_level": "INFO",
        "reload": False,
        "rate_limit_enabled": True,
        "request_timeout_sec": 90,
        "cors_origins": [],
    },
}
# Environment variables whose presence indicates a managed container runtime.
# K_SERVICE / K_REVISION / K_CONFIGURATION are set by Cloud Run (Knative),
# KUBERNETES_SERVICE_HOST by Kubernetes, SPACE_ID by HuggingFace Spaces.
_CONTAINER_PLATFORM_ENV_MARKERS = (
    "K_SERVICE",
    "K_REVISION",
    "K_CONFIGURATION",
    "KUBERNETES_SERVICE_HOST",
    "SPACE_ID",  # HuggingFace Spaces
)
def _resolve_serving_profile() -> ServingProfile:
    """Decide the serving profile from explicit config or the environment.

    Priority: an explicit SERVING_PROFILE env var, then auto-detection of a
    container platform, then the 'local' fallback. An unrecognised explicit
    value is logged and treated as 'local'.
    """
    explicit = os.getenv("SERVING_PROFILE")
    if explicit:
        try:
            return ServingProfile(explicit.lower())
        except ValueError:
            logger.warning(f"μ•Œ 수 μ—†λŠ” SERVING_PROFILE '{explicit}', κΈ°λ³Έκ°’ 'local' μ‚¬μš©")
            return ServingProfile.LOCAL

    in_container = any(os.getenv(marker) for marker in _CONTAINER_PLATFORM_ENV_MARKERS)
    if in_container:
        logger.info("μ»¨ν…Œμ΄λ„ˆ λŸ°νƒ€μž„ ν™˜κ²½μ„ κ°μ§€ν•˜μ—¬ 'container' ν”„λ‘œν•„μ„ μ‚¬μš©ν•©λ‹ˆλ‹€.")
        return ServingProfile.CONTAINER
    return ServingProfile.LOCAL
# ---------------------------------------------------------------------------
# Generation Defaults
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class GenerationDefaults:
    """Default text-generation parameters.

    Applied whenever an endpoint request does not provide its own value.
    """

    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    stop_sequences: List[str] = field(default_factory=lambda: ["[|endofturn|]"])

    @classmethod
    def from_env(cls) -> "GenerationDefaults":
        """Build generation defaults from the GEN_* environment variables."""
        read = os.getenv
        return cls(
            max_tokens=int(read("GEN_MAX_TOKENS", "512")),
            temperature=float(read("GEN_TEMPERATURE", "0.7")),
            top_p=float(read("GEN_TOP_P", "0.9")),
            repetition_penalty=float(read("GEN_REPETITION_PENALTY", "1.1")),
        )
# ---------------------------------------------------------------------------
# Model Configuration
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class ModelConfig:
    """Model and adapter settings.

    Base model: LGAI-EXAONE/EXAONE-4.0-32B-AWQ (single vLLM instance,
    ~20GB VRAM), with native tool-calling support (BFCL 65.2); vLLM is
    launched with ``--enable-auto-tool-choice --tool-call-parser hermes``.

    Multi-LoRA adapters (domain_adapter role; other capabilities such as
    api_lookup and synthesis use the base model without LoRA):
      - public_admin-adapter: trained on umyunsang/govon-civil-response-data
        (74K rows), QLoRA on the AWQ base; HF Hub umyunsang/GovOn-EXAONE-LoRA-v2
      - legal-adapter: trained on umyunsang/govon-legal-response-data
        (243K rows), QLoRA on the AWQ base; HF siwo/govon-legal-adapter

    ``adapter_paths`` maps adapter name β†’ filesystem path and is populated
    from ADAPTER_PATHS="public_admin=/path/to/public_admin,legal=/path/to/legal".
    """

    model_path: str = "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"
    trust_remote_code: bool = True
    dtype: str = "half"
    enforce_eager: bool = True
    # Multi-LoRA name β†’ path mapping, parsed from the ADAPTER_PATHS env var.
    adapter_paths: Dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_env(cls) -> "ModelConfig":
        """Build the model config from MODEL_* / ADAPTER_PATHS env vars."""

        def flag(name: str, default: str) -> bool:
            # truthy spelling accepted: true / 1 / yes (case-insensitive)
            return os.getenv(name, default).lower() in ("true", "1", "yes")

        return cls(
            model_path=os.getenv("MODEL_PATH", "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"),
            trust_remote_code=flag("TRUST_REMOTE_CODE", "true"),
            dtype=os.getenv("MODEL_DTYPE", "half"),
            enforce_eager=flag("ENFORCE_EAGER", "true"),
            adapter_paths=cls._parse_adapter_paths(os.getenv("ADAPTER_PATHS", "")),
        )

    @staticmethod
    def _parse_adapter_paths(raw: str) -> Dict[str, str]:
        """Parse the ADAPTER_PATHS environment variable.

        Format:  "public_admin=/path/to/public_admin,legal=/path/to/legal"
        Returns: {"public_admin": "/path/to/public_admin", "legal": "/path/to/legal"}
        Malformed entries are logged with a warning and skipped.
        """
        parsed: Dict[str, str] = {}
        for chunk in (raw or "").split(","):
            chunk = chunk.strip()
            if not chunk:
                continue
            name, sep, path = chunk.partition("=")
            if not sep:
                logger.warning(f"ADAPTER_PATHS ν•­λͺ© ν˜•μ‹ 였λ₯˜ (name=path ν•„μš”): {chunk!r}")
                continue
            name, path = name.strip(), path.strip()
            if not name or not path:
                logger.warning(f"ADAPTER_PATHS ν•­λͺ©μ— 빈 이름 λ˜λŠ” 경둜: {chunk!r}")
                continue
            parsed[name] = path
        return parsed
# ---------------------------------------------------------------------------
# Path Configuration
# ---------------------------------------------------------------------------
@dataclass
class PathConfig:
    """Filesystem locations for data, indexes, and logs."""

    data_path: str = ""
    index_path: str = ""
    faiss_index_dir: str = ""
    bm25_index_dir: str = ""
    local_docs_root: str = ""
    agents_dir: str = ""
    log_dir: str = ""
    cache_dir: str = ""

    @classmethod
    def from_env(cls) -> "PathConfig":
        """Build the path config from environment variables.

        agents/logs/.cache default to directories under the project root;
        the index paths default to repo-relative locations.
        """
        root = str(_PROJECT_ROOT)
        env = os.getenv
        return cls(
            data_path=env("DATA_PATH", ""),
            index_path=env("INDEX_PATH", "models/faiss_index/complaints.index"),
            faiss_index_dir=env("FAISS_INDEX_DIR", "models/faiss_index"),
            bm25_index_dir=env("BM25_INDEX_DIR", "models/bm25_index"),
            local_docs_root=env("LOCAL_DOCS_ROOT", ""),
            agents_dir=env("AGENTS_DIR", os.path.join(root, "agents")),
            log_dir=env("LOG_DIR", os.path.join(root, "logs")),
            cache_dir=env("CACHE_DIR", os.path.join(root, ".cache")),
        )
# ---------------------------------------------------------------------------
# Healthcheck Configuration
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class HealthcheckConfig:
    """Healthcheck settings, used by shell-client readiness probes."""

    endpoint: str = "/health"
    interval_sec: int = 30
    timeout_sec: int = 10
    startup_probe_path: str = "/health"
    readiness_probe_path: str = "/health"

    @classmethod
    def from_env(cls) -> "HealthcheckConfig":
        """Read probe timing from the HEALTH_* environment variables."""
        interval = int(os.getenv("HEALTH_INTERVAL_SEC", "30"))
        timeout = int(os.getenv("HEALTH_TIMEOUT_SEC", "10"))
        return cls(interval_sec=interval, timeout_sec=timeout)
# ---------------------------------------------------------------------------
# RuntimeConfig (aggregate runtime settings)
# ---------------------------------------------------------------------------
@dataclass
class RuntimeConfig:
    """Aggregate runtime configuration for GovOn.

    Profile defaults are chosen by the SERVING_PROFILE environment variable
    and can then be overridden by individual environment variables.
    """

    # serving profile
    profile: ServingProfile = ServingProfile.LOCAL
    # server
    host: str = "127.0.0.1"
    port: int = 8000
    workers: int = 1
    log_level: str = "DEBUG"
    reload: bool = True
    # GPU / vLLM
    gpu_utilization: float = 0.7
    max_model_len: int = 4096
    skip_model_load: bool = False
    # security
    api_key: Optional[str] = None
    cors_origins: List[str] = field(default_factory=list)
    rate_limit_enabled: bool = False
    # timeouts
    request_timeout_sec: int = 120
    # nested config objects
    model: ModelConfig = field(default_factory=ModelConfig)
    paths: PathConfig = field(default_factory=PathConfig)
    generation: GenerationDefaults = field(default_factory=GenerationDefaults)
    healthcheck: HealthcheckConfig = field(default_factory=HealthcheckConfig)

    @classmethod
    def from_env(cls) -> "RuntimeConfig":
        """Load the full runtime configuration from the environment.

        1. Apply the profile defaults selected by SERVING_PROFILE.
        2. Override individual values from their environment variables.
        """
        profile = _resolve_serving_profile()
        defaults = _PROFILE_DEFAULTS[profile]

        def text(key, fallback):
            return os.getenv(key, fallback)

        def number(key, fallback, caster):
            return caster(os.getenv(key, str(fallback)))

        def flag(key, fallback):
            return os.getenv(key, str(fallback)).lower() in ("true", "1", "yes")

        # CORS: an explicit env var wins; otherwise use the profile default.
        cors_env = os.getenv("CORS_ORIGINS", "")
        cors_origins = (
            [origin.strip() for origin in cors_env.split(",") if origin.strip()]
            if cors_env
            else defaults["cors_origins"]
        )

        return cls(
            profile=profile,
            host=text("HOST", defaults["host"]),
            port=number("PORT", defaults["port"], int),
            workers=number("WORKERS", defaults["workers"], int),
            log_level=text("LOG_LEVEL", defaults["log_level"]),
            reload=flag("RELOAD", defaults["reload"]),
            gpu_utilization=number("GPU_UTILIZATION", defaults["gpu_utilization"], float),
            max_model_len=number("MAX_MODEL_LEN", defaults["max_model_len"], int),
            skip_model_load=flag("SKIP_MODEL_LOAD", "false"),
            api_key=os.getenv("API_KEY"),
            cors_origins=cors_origins,
            rate_limit_enabled=flag("RATE_LIMIT_ENABLED", defaults["rate_limit_enabled"]),
            request_timeout_sec=number(
                "REQUEST_TIMEOUT_SEC", defaults["request_timeout_sec"], int
            ),
            model=ModelConfig.from_env(),
            paths=PathConfig.from_env(),
            generation=GenerationDefaults.from_env(),
            healthcheck=HealthcheckConfig.from_env(),
        )

    def log_summary(self) -> None:
        """Log a one-screen summary of the active configuration."""
        separator = "=" * 60
        summary = [
            separator,
            "GovOn Runtime Configuration",
            separator,
            f" Profile : {self.profile.value}",
            f" Host : {self.host}:{self.port}",
            f" Workers : {self.workers}",
            f" Log Level : {self.log_level}",
            f" GPU Util : {self.gpu_utilization}",
            f" Max Model Len : {self.max_model_len}",
            f" Model Path : {self.model.model_path}",
            f" Adapter Paths : {self.model.adapter_paths or '(none)'}",
            f" Skip Model : {self.skip_model_load}",
            f" Request Timeout: {self.request_timeout_sec}s",
            f" Rate Limit : {self.rate_limit_enabled}",
            f" CORS Origins : {self.cors_origins}",
            f" Healthcheck : {self.healthcheck.endpoint}",
            f" Data Path : {self.paths.data_path}",
            f" Index Path : {self.paths.index_path}",
            f" Local Docs : {self.paths.local_docs_root or '(disabled)'}",
            f" Log Dir : {self.paths.log_dir}",
            separator,
        ]
        for line in summary:
            logger.info(line)

    def to_uvicorn_kwargs(self) -> Dict:
        """Return keyword arguments suitable for ``uvicorn.run()``."""
        kwargs: Dict = {
            "host": self.host,
            "port": self.port,
            "workers": self.workers,
            "log_level": self.log_level.lower(),
            "timeout_keep_alive": self.request_timeout_sec,
        }
        # uvicorn rejects reload=False together with workers in some setups;
        # the original only passes the key when reload is on, so keep that.
        if self.reload:
            kwargs["reload"] = True
        return kwargs
# ---------------------------------------------------------------------------
# GovOnConfig β€” unified hyperparameter config (YAML + env var overrides)
# ---------------------------------------------------------------------------
# Default location of the unified hyperparameter file: <project root>/config/govon.yaml
_GOVON_YAML_PATH = _PROJECT_ROOT / "config" / "govon.yaml"
def _load_yaml(path: Path) -> Dict[str, Any]:
    """Read *path* as YAML and return its contents.

    Falls back to an empty dict (with a warning) when the file is missing,
    PyYAML is not installed, or parsing fails β€” callers then rely purely on
    the dataclass defaults.
    """
    if not path.exists():
        logger.warning(f"[GovOnConfig] config file not found: {path}. Using defaults.")
        return {}
    try:
        # Imported lazily so PyYAML remains an optional dependency.
        import yaml  # type: ignore

        with open(path, "r", encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle) or {}
        logger.debug(f"[GovOnConfig] loaded config from {path}")
        return parsed
    except ImportError:
        logger.warning("[GovOnConfig] PyYAML not installed; falling back to defaults.")
        return {}
    except Exception as exc:
        logger.warning(f"[GovOnConfig] failed to load {path}: {exc}. Using defaults.")
        return {}
def _env(key: str, default: Any, cast=None) -> Any:
"""Read an environment variable and cast it to the required type.
Returns *default* when the variable is absent or empty.
"""
raw = os.getenv(key)
if raw is None or raw == "":
return default
if cast is not None:
try:
return cast(raw)
except (ValueError, TypeError) as exc:
logger.warning(f"[GovOnConfig] invalid env {key}={raw!r}: {exc}. Using default.")
return default
return raw
@dataclass(frozen=True)
class _GenerationConfig:
    """LLM generation hyperparameters (user-facing and agent generation)."""

    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    stop_sequences: List[str] = field(default_factory=lambda: ["[|endofturn|]"])
    # Separate temperature for agent turns; 0.0 default β€” presumably to make
    # tool-calling deterministic (consumer lives outside this file).
    agent_temperature: float = 0.0
@dataclass(frozen=True)
class _ServingConfig:
    """vLLM / GPU serving hyperparameters.

    Consumed by the vLLM server launcher/client elsewhere in the project;
    only the values and their env/YAML plumbing are visible here.
    """

    gpu_memory_utilization: float = 0.90
    max_model_len: int = 8192
    max_loras: int = 4
    max_lora_rank: int = 64
    kv_cache_dtype: str = "auto"
    # client-side timeouts (seconds)
    vllm_request_timeout: float = 300.0
    vllm_connect_timeout: float = 30.0
    # how long to wait for the vLLM server to come up (seconds)
    vllm_startup_max_wait: int = 900
    health_check_timeout: int = 10
@dataclass(frozen=True)
class _ContextConfig:
    """Context-window management hyperparameters.

    The exact semantics of each knob live in the agent/context modules;
    here they are only loaded from YAML / GOVON_CONTEXT_* env vars.
    """

    agent_input_budget: int = 4500
    max_message_tokens: int = 4500
    keep_recent_messages: int = 6
    summary_threshold_ratio: float = 0.6
    max_tool_result_chars: int = 3000
    system_prompt_overhead: int = 2000
    max_consecutive_rejections: int = 2
    tool_clear_after_iteration: int = 2
    tool_keep_recent: int = 2
    max_iterations: int = 10
@dataclass(frozen=True)
class _ToolDefaultConfig:
"""Default timeout and retry settings for a single tool."""
timeout_sec: float = 10.0
max_retries: int = 0
@dataclass(frozen=True)
class _ToolsConfig:
"""Tool execution hyperparameters."""
defaults: _ToolDefaultConfig = field(default_factory=_ToolDefaultConfig)
overrides: Dict[str, _ToolDefaultConfig] = field(default_factory=dict)
def for_tool(self, name: str) -> _ToolDefaultConfig:
"""Return per-tool config, falling back to defaults."""
return self.overrides.get(name, self.defaults)
@dataclass(frozen=True)
class _RateLimitConfig:
    """API rate limiting configuration."""

    # Rate string of the form "<count>/<period>", e.g. "30/minute".
    default: str = "30/minute"
@dataclass(frozen=True)
class _ValidationConfig:
    """Request validation limits."""

    # maximum accepted prompt length (units defined by the validator β€” verify)
    max_prompt_length: int = 4096
    # hard ceiling on a request's max_tokens value
    max_tokens_ceiling: int = 4096
@dataclass(frozen=True)
class GovOnConfig:
    """Unified hyperparameter configuration for GovOn.

    Load via :meth:`GovOnConfig.load`, which reads ``config/govon.yaml`` and
    applies environment variable overrides on top.

    Environment variables follow the ``GOVON_<SECTION>_<KEY>`` naming
    convention (e.g. ``GOVON_GENERATION_MAX_TOKENS``). Legacy ``GEN_*``,
    ``MAX_LORAS`` and ``MAX_LORA_RANK`` variables are also supported for
    backward compatibility.
    """

    generation: _GenerationConfig = field(default_factory=_GenerationConfig)
    serving: _ServingConfig = field(default_factory=_ServingConfig)
    context: _ContextConfig = field(default_factory=_ContextConfig)
    tools: _ToolsConfig = field(default_factory=_ToolsConfig)
    rate_limit: _RateLimitConfig = field(default_factory=_RateLimitConfig)
    validation: _ValidationConfig = field(default_factory=_ValidationConfig)

    @classmethod
    def load(cls, path: Optional[Path] = None) -> "GovOnConfig":
        """Load config from YAML and apply environment variable overrides.

        Priority (highest first):
        1. Environment variables (GOVON_* or legacy GEN_*)
        2. config/govon.yaml values
        3. Dataclass defaults (hardcoded fallbacks)

        Args:
            path: Optional explicit YAML path; defaults to
                ``<project root>/config/govon.yaml``.
        """
        raw = _load_yaml(path or _GOVON_YAML_PATH)

        def as_dict(value: Any) -> Dict[str, Any]:
            # BUGFIX: a present-but-empty YAML section ("generation:") parses
            # to None, not {}, and a malformed section may be a scalar or
            # list. The previous raw.get(name, {}) returned that None and the
            # following .get() raised AttributeError. Coerce anything that is
            # not a mapping to {} so lookups below are always safe.
            return value if isinstance(value, dict) else {}

        # A YAML document that is not a mapping at the top level (e.g. a
        # bare list) is treated the same as "no config".
        if not isinstance(raw, dict):
            raw = {}

        gen_raw = as_dict(raw.get("generation"))
        srv_raw = as_dict(raw.get("serving"))
        ctx_raw = as_dict(raw.get("context"))
        tls_raw = as_dict(raw.get("tools"))
        rl_raw = as_dict(raw.get("rate_limit"))
        val_raw = as_dict(raw.get("validation"))

        # --- generation ---
        # A null stop_sequences entry would otherwise propagate None.
        stop_sequences = gen_raw.get("stop_sequences", ["[|endofturn|]"])
        if not isinstance(stop_sequences, list):
            stop_sequences = ["[|endofturn|]"]
        gen = _GenerationConfig(
            max_tokens=_env(
                "GOVON_GENERATION_MAX_TOKENS",
                _env("GEN_MAX_TOKENS", gen_raw.get("max_tokens", 512), int),
                int,
            ),
            temperature=_env(
                "GOVON_GENERATION_TEMPERATURE",
                _env("GEN_TEMPERATURE", gen_raw.get("temperature", 0.7), float),
                float,
            ),
            top_p=_env(
                "GOVON_GENERATION_TOP_P",
                _env("GEN_TOP_P", gen_raw.get("top_p", 0.9), float),
                float,
            ),
            repetition_penalty=_env(
                "GOVON_GENERATION_REPETITION_PENALTY",
                _env(
                    "GEN_REPETITION_PENALTY",
                    gen_raw.get("repetition_penalty", 1.1),
                    float,
                ),
                float,
            ),
            stop_sequences=stop_sequences,
            agent_temperature=_env(
                "GOVON_GENERATION_AGENT_TEMPERATURE",
                gen_raw.get("agent_temperature", 0.0),
                float,
            ),
        )

        # --- serving ---
        srv = _ServingConfig(
            gpu_memory_utilization=_env(
                "GOVON_SERVING_GPU_MEMORY_UTILIZATION",
                srv_raw.get("gpu_memory_utilization", 0.90),
                float,
            ),
            max_model_len=_env(
                "GOVON_SERVING_MAX_MODEL_LEN",
                srv_raw.get("max_model_len", 8192),
                int,
            ),
            max_loras=_env(
                "GOVON_SERVING_MAX_LORAS",
                _env("MAX_LORAS", srv_raw.get("max_loras", 4), int),
                int,
            ),
            max_lora_rank=_env(
                "GOVON_SERVING_MAX_LORA_RANK",
                _env("MAX_LORA_RANK", srv_raw.get("max_lora_rank", 64), int),
                int,
            ),
            kv_cache_dtype=_env(
                "GOVON_SERVING_KV_CACHE_DTYPE",
                srv_raw.get("kv_cache_dtype", "auto"),
            ),
            vllm_request_timeout=_env(
                "GOVON_SERVING_VLLM_REQUEST_TIMEOUT",
                srv_raw.get("vllm_request_timeout", 300.0),
                float,
            ),
            vllm_connect_timeout=_env(
                "GOVON_SERVING_VLLM_CONNECT_TIMEOUT",
                srv_raw.get("vllm_connect_timeout", 30.0),
                float,
            ),
            vllm_startup_max_wait=_env(
                "GOVON_SERVING_VLLM_STARTUP_MAX_WAIT",
                srv_raw.get("vllm_startup_max_wait", 900),
                int,
            ),
            health_check_timeout=_env(
                "GOVON_SERVING_HEALTH_CHECK_TIMEOUT",
                srv_raw.get("health_check_timeout", 10),
                int,
            ),
        )

        # --- context ---
        ctx = _ContextConfig(
            agent_input_budget=_env(
                "GOVON_CONTEXT_AGENT_INPUT_BUDGET",
                ctx_raw.get("agent_input_budget", 4500),
                int,
            ),
            max_message_tokens=_env(
                "GOVON_CONTEXT_MAX_MESSAGE_TOKENS",
                ctx_raw.get("max_message_tokens", 4500),
                int,
            ),
            keep_recent_messages=_env(
                "GOVON_CONTEXT_KEEP_RECENT_MESSAGES",
                ctx_raw.get("keep_recent_messages", 6),
                int,
            ),
            summary_threshold_ratio=_env(
                "GOVON_CONTEXT_SUMMARY_THRESHOLD_RATIO",
                ctx_raw.get("summary_threshold_ratio", 0.6),
                float,
            ),
            max_tool_result_chars=_env(
                "GOVON_CONTEXT_MAX_TOOL_RESULT_CHARS",
                ctx_raw.get("max_tool_result_chars", 3000),
                int,
            ),
            system_prompt_overhead=_env(
                "GOVON_CONTEXT_SYSTEM_PROMPT_OVERHEAD",
                ctx_raw.get("system_prompt_overhead", 2000),
                int,
            ),
            max_consecutive_rejections=_env(
                "GOVON_CONTEXT_MAX_CONSECUTIVE_REJECTIONS",
                ctx_raw.get("max_consecutive_rejections", 2),
                int,
            ),
            tool_clear_after_iteration=_env(
                "GOVON_CONTEXT_TOOL_CLEAR_AFTER_ITERATION",
                ctx_raw.get("tool_clear_after_iteration", 2),
                int,
            ),
            tool_keep_recent=_env(
                "GOVON_CONTEXT_TOOL_KEEP_RECENT",
                ctx_raw.get("tool_keep_recent", 2),
                int,
            ),
            max_iterations=_env(
                "GOVON_CONTEXT_MAX_ITERATIONS",
                ctx_raw.get("max_iterations", 10),
                int,
            ),
        )

        # --- tools ---
        tls_defaults_raw = as_dict(tls_raw.get("defaults"))
        tls_overrides_raw = as_dict(tls_raw.get("overrides"))
        tls_defaults = _ToolDefaultConfig(
            timeout_sec=_env(
                "GOVON_TOOLS_DEFAULT_TIMEOUT_SEC",
                tls_defaults_raw.get("timeout_sec", 10.0),
                float,
            ),
            max_retries=_env(
                "GOVON_TOOLS_DEFAULT_MAX_RETRIES",
                tls_defaults_raw.get("max_retries", 0),
                int,
            ),
        )
        tls_overrides: Dict[str, _ToolDefaultConfig] = {}
        for tool_name, override_raw in tls_overrides_raw.items():
            # "overrides:\n  mytool:" yields None for the entry β€” treat it
            # as an empty override (tool present, all defaults).
            override = as_dict(override_raw)
            tls_overrides[tool_name] = _ToolDefaultConfig(
                timeout_sec=override.get("timeout_sec", tls_defaults.timeout_sec),
                max_retries=override.get("max_retries", tls_defaults.max_retries),
            )
        tls = _ToolsConfig(defaults=tls_defaults, overrides=tls_overrides)

        # --- rate_limit ---
        rl = _RateLimitConfig(
            default=_env(
                "GOVON_RATE_LIMIT_DEFAULT",
                rl_raw.get("default", "30/minute"),
            ),
        )

        # --- validation ---
        val = _ValidationConfig(
            max_prompt_length=_env(
                "GOVON_VALIDATION_MAX_PROMPT_LENGTH",
                val_raw.get("max_prompt_length", 4096),
                int,
            ),
            max_tokens_ceiling=_env(
                "GOVON_VALIDATION_MAX_TOKENS_CEILING",
                val_raw.get("max_tokens_ceiling", 4096),
                int,
            ),
        )

        return cls(
            generation=gen,
            serving=srv,
            context=ctx,
            tools=tls,
            rate_limit=rl,
            validation=val,
        )
# Module-level singleton β€” imported by other modules.
# NOTE: this reads config/govon.yaml at import time; a missing or broken file
# only logs a warning and falls back to the dataclass defaults.
govon_config: GovOnConfig = GovOnConfig.load()