| """GovOn Runtime serving profile and model configuration. | |
| Defines profiles for local development, single-server (production), and | |
| air-gapped installations based on environment variables. Standardises | |
| generation defaults and timeout settings. | |
| Usage: | |
| config = RuntimeConfig.from_env() | |
| config.log_summary() | |
| # Unified hyperparameter config (YAML + env var overrides): | |
| govon_config = GovOnConfig.load() | |
| """ | |
| import os | |
| from dataclasses import dataclass, field | |
| from enum import Enum | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
| from loguru import logger | |
| # Project root path (src/inference/runtime_config.py β ../../..) | |
| _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent | |
class ServingProfile(str, Enum):
    """Serving profile, selected via the ``SERVING_PROFILE`` environment variable."""

    LOCAL = "local"          # local development environment
    SINGLE = "single"        # single-server production
    CONTAINER = "container"  # Docker / Cloud Run / packaged deployment
    AIRGAP = "airgap"        # air-gapped (offline) installation
# ---------------------------------------------------------------------------
# Per-profile default values
# ---------------------------------------------------------------------------
_PROFILE_DEFAULTS: Dict[ServingProfile, Dict] = {
    # Local dev: bind to loopback, verbose logs, hot reload, permissive CORS.
    ServingProfile.LOCAL: dict(
        host="127.0.0.1",
        port=8000,
        workers=1,
        gpu_utilization=0.7,
        max_model_len=4096,
        log_level="DEBUG",
        reload=True,
        rate_limit_enabled=False,
        request_timeout_sec=120,
        cors_origins=["http://localhost:3000", "http://127.0.0.1:3000"],
    ),
    # Single-server production: public bind, rate limiting on, no reload.
    ServingProfile.SINGLE: dict(
        host="0.0.0.0",
        port=8000,
        workers=1,
        gpu_utilization=0.85,
        max_model_len=8192,
        log_level="INFO",
        reload=False,
        rate_limit_enabled=True,
        request_timeout_sec=60,
        cors_origins=[],
    ),
    # Containerized deployment (identical to SINGLE today; kept separate so the
    # two can diverge without touching callers).
    ServingProfile.CONTAINER: dict(
        host="0.0.0.0",
        port=8000,
        workers=1,
        gpu_utilization=0.85,
        max_model_len=8192,
        log_level="INFO",
        reload=False,
        rate_limit_enabled=True,
        request_timeout_sec=60,
        cors_origins=[],
    ),
    # Air-gapped install: slightly lower GPU headroom, longer request timeout.
    ServingProfile.AIRGAP: dict(
        host="0.0.0.0",
        port=8000,
        workers=1,
        gpu_utilization=0.8,
        max_model_len=8192,
        log_level="INFO",
        reload=False,
        rate_limit_enabled=True,
        request_timeout_sec=90,
        cors_origins=[],
    ),
}
# Environment variables whose presence indicates a managed container platform;
# used for auto-detecting the CONTAINER profile.
_CONTAINER_PLATFORM_ENV_MARKERS = (
    "K_SERVICE",                # Cloud Run / Knative
    "K_REVISION",
    "K_CONFIGURATION",
    "KUBERNETES_SERVICE_HOST",  # Kubernetes
    "SPACE_ID",                 # HuggingFace Spaces
)
def _resolve_serving_profile() -> ServingProfile:
    """Resolve the serving profile from explicit settings and the environment.

    Precedence:
    1. ``SERVING_PROFILE`` env var — unknown values are logged and fall back
       to ``LOCAL`` rather than raising.
    2. Auto-detection of managed container platforms via marker env vars.
    3. ``LOCAL`` as the final default.
    """
    profile_name = os.getenv("SERVING_PROFILE")
    if profile_name:
        try:
            return ServingProfile(profile_name.lower())
        except ValueError:
            # Original message was encoding-corrupted; rewritten legibly.
            logger.warning(
                f"Unknown SERVING_PROFILE '{profile_name}', falling back to 'local'"
            )
            return ServingProfile.LOCAL
    if any(os.getenv(marker) for marker in _CONTAINER_PLATFORM_ENV_MARKERS):
        logger.info("Container runtime detected; using the 'container' profile.")
        return ServingProfile.CONTAINER
    return ServingProfile.LOCAL
# ---------------------------------------------------------------------------
# Generation Defaults
# ---------------------------------------------------------------------------
@dataclass
class GenerationDefaults:
    """Default text-generation parameters, used when an endpoint request omits values."""

    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    # Sequences that terminate generation (EXAONE end-of-turn marker).
    stop_sequences: List[str] = field(default_factory=lambda: ["[|endofturn|]"])

    @classmethod
    def from_env(cls) -> "GenerationDefaults":
        """Build defaults from ``GEN_*`` environment variables.

        ``stop_sequences`` is not env-configurable and keeps its default.
        Raises ``ValueError`` if a variable holds a non-numeric value.
        """
        return cls(
            max_tokens=int(os.getenv("GEN_MAX_TOKENS", "512")),
            temperature=float(os.getenv("GEN_TEMPERATURE", "0.7")),
            top_p=float(os.getenv("GEN_TOP_P", "0.9")),
            repetition_penalty=float(os.getenv("GEN_REPETITION_PENALTY", "1.1")),
        )
# ---------------------------------------------------------------------------
# Model Configuration
# ---------------------------------------------------------------------------
@dataclass
class ModelConfig:
    """Model and LoRA-adapter settings.

    Base model: LGAI-EXAONE/EXAONE-4.0-32B-AWQ (single vLLM instance, ~20GB VRAM)
      - native tool-calling support (BFCL 65.2)
      - vLLM serving options: --enable-auto-tool-choice --tool-call-parser hermes

    Multi-LoRA adapters:
      - public_admin-adapter (LoRA #1): domain adapter.
        Training data: umyunsang/govon-civil-response-data (74K), QLoRA on AWQ base.
        HF Hub: umyunsang/GovOn-EXAONE-LoRA-v2
      - legal-adapter (LoRA #2): domain adapter.
        Training data: umyunsang/govon-legal-response-data (243K), QLoRA on AWQ base.
        HuggingFace: siwo/govon-legal-adapter
      - other capabilities (api_lookup, synthesis, ...) use the base model without LoRA.

    ``adapter_paths`` maps adapter name -> path, configured via the env var
    ADAPTER_PATHS="public_admin=/path/to/public_admin,legal=/path/to/legal".
    """

    model_path: str = "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"
    trust_remote_code: bool = True
    dtype: str = "half"
    enforce_eager: bool = True
    # Multi-LoRA adapter paths, e.g. {"public_admin": "/p1", "legal": "/p2"}.
    adapter_paths: Dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_env(cls) -> "ModelConfig":
        """Build the model config from MODEL_* / ADAPTER_PATHS env vars."""
        adapter_paths = cls._parse_adapter_paths(os.getenv("ADAPTER_PATHS", ""))
        return cls(
            model_path=os.getenv("MODEL_PATH", "LGAI-EXAONE/EXAONE-4.0-32B-AWQ"),
            trust_remote_code=os.getenv("TRUST_REMOTE_CODE", "true").lower()
            in ("true", "1", "yes"),
            dtype=os.getenv("MODEL_DTYPE", "half"),
            enforce_eager=os.getenv("ENFORCE_EAGER", "true").lower() in ("true", "1", "yes"),
            adapter_paths=adapter_paths,
        )

    @staticmethod
    def _parse_adapter_paths(raw: str) -> Dict[str, str]:
        """Parse the ADAPTER_PATHS environment variable.

        Format: "public_admin=/path/to/public_admin,legal=/path/to/legal"
        Returns: {"public_admin": "/path/to/public_admin", "legal": "/path/to/legal"}
        Malformed entries are logged with a warning and skipped.
        """
        if not raw or not raw.strip():
            return {}
        result: Dict[str, str] = {}
        for entry in raw.split(","):
            entry = entry.strip()
            if not entry:
                continue
            if "=" not in entry:
                # Original message was encoding-corrupted; rewritten legibly.
                logger.warning(f"ADAPTER_PATHS entry is not in 'name=path' form: {entry!r}")
                continue
            name, path = entry.split("=", 1)
            name, path = name.strip(), path.strip()
            if not name or not path:
                logger.warning(f"ADAPTER_PATHS entry has an empty name or path: {entry!r}")
                continue
            result[name] = path
        return result
# ---------------------------------------------------------------------------
# Path Configuration
# ---------------------------------------------------------------------------
@dataclass
class PathConfig:
    """Data, index, and log path settings."""

    data_path: str = ""
    index_path: str = ""
    faiss_index_dir: str = ""
    bm25_index_dir: str = ""
    local_docs_root: str = ""
    agents_dir: str = ""
    log_dir: str = ""
    cache_dir: str = ""

    @classmethod
    def from_env(cls) -> "PathConfig":
        """Build path settings from env vars; agents/logs/cache default under the project root."""
        project_root = str(_PROJECT_ROOT)
        return cls(
            data_path=os.getenv("DATA_PATH", ""),
            index_path=os.getenv("INDEX_PATH", "models/faiss_index/complaints.index"),
            faiss_index_dir=os.getenv("FAISS_INDEX_DIR", "models/faiss_index"),
            bm25_index_dir=os.getenv("BM25_INDEX_DIR", "models/bm25_index"),
            local_docs_root=os.getenv("LOCAL_DOCS_ROOT", ""),
            agents_dir=os.getenv("AGENTS_DIR", os.path.join(project_root, "agents")),
            log_dir=os.getenv("LOG_DIR", os.path.join(project_root, "logs")),
            cache_dir=os.getenv("CACHE_DIR", os.path.join(project_root, ".cache")),
        )
# ---------------------------------------------------------------------------
# Healthcheck Configuration
# ---------------------------------------------------------------------------
@dataclass
class HealthcheckConfig:
    """Healthcheck settings, used by the shell client's readiness probe."""

    endpoint: str = "/health"
    interval_sec: int = 30
    timeout_sec: int = 10
    startup_probe_path: str = "/health"
    readiness_probe_path: str = "/health"

    @classmethod
    def from_env(cls) -> "HealthcheckConfig":
        """Build healthcheck settings from HEALTH_* env vars (paths are fixed)."""
        return cls(
            interval_sec=int(os.getenv("HEALTH_INTERVAL_SEC", "30")),
            timeout_sec=int(os.getenv("HEALTH_TIMEOUT_SEC", "10")),
        )
# ---------------------------------------------------------------------------
# RuntimeConfig (unified runtime settings)
# ---------------------------------------------------------------------------
@dataclass
class RuntimeConfig:
    """GovOn Runtime unified configuration.

    Loads per-profile defaults according to the SERVING_PROFILE env var;
    each value can then be overridden by its own environment variable.
    """

    # Serving profile
    profile: ServingProfile = ServingProfile.LOCAL
    # Server settings
    host: str = "127.0.0.1"
    port: int = 8000
    workers: int = 1
    log_level: str = "DEBUG"
    reload: bool = True
    # GPU / vLLM settings
    gpu_utilization: float = 0.7
    max_model_len: int = 4096
    skip_model_load: bool = False
    # Security
    api_key: Optional[str] = None
    cors_origins: List[str] = field(default_factory=list)
    rate_limit_enabled: bool = False
    # Timeouts
    request_timeout_sec: int = 120
    # Nested config objects
    model: ModelConfig = field(default_factory=ModelConfig)
    paths: PathConfig = field(default_factory=PathConfig)
    generation: GenerationDefaults = field(default_factory=GenerationDefaults)
    healthcheck: HealthcheckConfig = field(default_factory=HealthcheckConfig)

    @classmethod
    def from_env(cls) -> "RuntimeConfig":
        """Load the full runtime configuration from environment variables.

        1. Apply the profile defaults selected by SERVING_PROFILE.
        2. Override individual values from dedicated env vars.
        """
        profile = _resolve_serving_profile()
        defaults = _PROFILE_DEFAULTS[profile]
        skip_model_load = os.getenv("SKIP_MODEL_LOAD", "false").lower() in (
            "true",
            "1",
            "yes",
        )
        # CORS: env var wins when set; otherwise the profile default.
        cors_env = os.getenv("CORS_ORIGINS", "")
        if cors_env:
            cors_origins = [o.strip() for o in cors_env.split(",") if o.strip()]
        else:
            # Copy so instance-level mutation can't corrupt the shared
            # _PROFILE_DEFAULTS list.
            cors_origins = list(defaults["cors_origins"])
        return cls(
            profile=profile,
            host=os.getenv("HOST", defaults["host"]),
            port=int(os.getenv("PORT", str(defaults["port"]))),
            workers=int(os.getenv("WORKERS", str(defaults["workers"]))),
            log_level=os.getenv("LOG_LEVEL", defaults["log_level"]),
            reload=os.getenv("RELOAD", str(defaults["reload"])).lower() in ("true", "1", "yes"),
            gpu_utilization=float(os.getenv("GPU_UTILIZATION", str(defaults["gpu_utilization"]))),
            max_model_len=int(os.getenv("MAX_MODEL_LEN", str(defaults["max_model_len"]))),
            skip_model_load=skip_model_load,
            api_key=os.getenv("API_KEY"),
            cors_origins=cors_origins,
            rate_limit_enabled=os.getenv(
                "RATE_LIMIT_ENABLED", str(defaults["rate_limit_enabled"])
            ).lower()
            in ("true", "1", "yes"),
            request_timeout_sec=int(
                os.getenv("REQUEST_TIMEOUT_SEC", str(defaults["request_timeout_sec"]))
            ),
            model=ModelConfig.from_env(),
            paths=PathConfig.from_env(),
            generation=GenerationDefaults.from_env(),
            healthcheck=HealthcheckConfig.from_env(),
        )

    def log_summary(self) -> None:
        """Log a human-readable summary of the active configuration."""
        logger.info("=" * 60)
        logger.info("GovOn Runtime Configuration")
        logger.info("=" * 60)
        logger.info(f"  Profile        : {self.profile.value}")
        logger.info(f"  Host           : {self.host}:{self.port}")
        logger.info(f"  Workers        : {self.workers}")
        logger.info(f"  Log Level      : {self.log_level}")
        logger.info(f"  GPU Util       : {self.gpu_utilization}")
        logger.info(f"  Max Model Len  : {self.max_model_len}")
        logger.info(f"  Model Path     : {self.model.model_path}")
        logger.info(f"  Adapter Paths  : {self.model.adapter_paths or '(none)'}")
        logger.info(f"  Skip Model     : {self.skip_model_load}")
        logger.info(f"  Request Timeout: {self.request_timeout_sec}s")
        logger.info(f"  Rate Limit     : {self.rate_limit_enabled}")
        logger.info(f"  CORS Origins   : {self.cors_origins}")
        logger.info(f"  Healthcheck    : {self.healthcheck.endpoint}")
        logger.info(f"  Data Path      : {self.paths.data_path}")
        logger.info(f"  Index Path     : {self.paths.index_path}")
        logger.info(f"  Local Docs     : {self.paths.local_docs_root or '(disabled)'}")
        logger.info(f"  Log Dir        : {self.paths.log_dir}")
        logger.info("=" * 60)

    def to_uvicorn_kwargs(self) -> Dict:
        """Return keyword arguments for ``uvicorn.run()``."""
        kwargs = {
            "host": self.host,
            "port": self.port,
            "workers": self.workers,
            "log_level": self.log_level.lower(),
            "timeout_keep_alive": self.request_timeout_sec,
        }
        # Only pass reload when enabled (dev profiles).
        if self.reload:
            kwargs["reload"] = True
        return kwargs
# ---------------------------------------------------------------------------
# GovOnConfig -- unified hyperparameter config (YAML + env var overrides)
# ---------------------------------------------------------------------------
# Default location of the unified hyperparameter YAML file.
_GOVON_YAML_PATH = _PROJECT_ROOT.joinpath("config", "govon.yaml")
def _load_yaml(path: Path) -> Dict[str, Any]:
    """Load a YAML file and return its contents as a dict.

    Returns an empty dict if the file does not exist, PyYAML is not
    installed, the file fails to parse, or the top-level YAML document is
    not a mapping (callers do ``raw.get(...)`` and would otherwise crash).
    """
    if not path.exists():
        logger.warning(f"[GovOnConfig] config file not found: {path}. Using defaults.")
        return {}
    try:
        import yaml  # type: ignore

        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
        if not isinstance(data, dict):
            # safe_load can return a list/scalar for non-mapping documents.
            logger.warning(f"[GovOnConfig] {path} is not a mapping. Using defaults.")
            return {}
        logger.debug(f"[GovOnConfig] loaded config from {path}")
        return data
    except ImportError:
        logger.warning("[GovOnConfig] PyYAML not installed; falling back to defaults.")
        return {}
    except Exception as exc:
        logger.warning(f"[GovOnConfig] failed to load {path}: {exc}. Using defaults.")
        return {}
def _env(key: str, default: Any, cast=None) -> Any:
    """Read an environment variable and cast it to the required type.

    Returns *default* when the variable is absent or empty, and also when
    the cast fails (with a warning).
    """
    raw = os.getenv(key)
    if not raw:  # None or "" both mean "unset"
        return default
    if cast is None:
        return raw
    try:
        return cast(raw)
    except (ValueError, TypeError) as exc:
        logger.warning(f"[GovOnConfig] invalid env {key}={raw!r}: {exc}. Using default.")
        return default
@dataclass
class _GenerationConfig:
    """LLM generation hyperparameters."""

    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    # Sequences that terminate generation (EXAONE end-of-turn marker).
    stop_sequences: List[str] = field(default_factory=lambda: ["[|endofturn|]"])
    # Deterministic decoding for agent turns.
    agent_temperature: float = 0.0
@dataclass
class _ServingConfig:
    """vLLM / GPU serving hyperparameters."""

    gpu_memory_utilization: float = 0.90
    max_model_len: int = 8192
    max_loras: int = 4
    max_lora_rank: int = 64
    kv_cache_dtype: str = "auto"
    # Client-side timeouts (seconds) for talking to the vLLM server.
    vllm_request_timeout: float = 300.0
    vllm_connect_timeout: float = 30.0
    # Max seconds to wait for the vLLM server to come up.
    vllm_startup_max_wait: int = 900
    health_check_timeout: int = 10
@dataclass
class _ContextConfig:
    """Context window management hyperparameters."""

    agent_input_budget: int = 4500
    max_message_tokens: int = 4500
    keep_recent_messages: int = 6
    summary_threshold_ratio: float = 0.6
    max_tool_result_chars: int = 3000
    system_prompt_overhead: int = 2000
    max_consecutive_rejections: int = 2
    tool_clear_after_iteration: int = 2
    tool_keep_recent: int = 2
    max_iterations: int = 10
@dataclass
class _ToolDefaultConfig:
    """Default timeout and retry settings for a single tool."""

    timeout_sec: float = 10.0
    max_retries: int = 0
@dataclass
class _ToolsConfig:
    """Tool execution hyperparameters."""

    defaults: _ToolDefaultConfig = field(default_factory=_ToolDefaultConfig)
    overrides: Dict[str, _ToolDefaultConfig] = field(default_factory=dict)

    def for_tool(self, name: str) -> _ToolDefaultConfig:
        """Return per-tool config, falling back to defaults."""
        return self.overrides.get(name, self.defaults)
@dataclass
class _RateLimitConfig:
    """API rate limiting configuration."""

    # slowapi-style limit string: "<count>/<period>".
    default: str = "30/minute"
@dataclass
class _ValidationConfig:
    """Request validation limits."""

    max_prompt_length: int = 4096
    max_tokens_ceiling: int = 4096
@dataclass
class GovOnConfig:
    """Unified hyperparameter configuration for GovOn.

    Load via :meth:`GovOnConfig.load` which reads ``config/govon.yaml`` and
    applies environment variable overrides on top.

    Environment variables follow the ``GOVON_<SECTION>_<KEY>`` naming convention
    (e.g. ``GOVON_GENERATION_MAX_TOKENS``). Legacy ``GEN_*`` variables are also
    supported for backward compatibility.
    """

    generation: _GenerationConfig = field(default_factory=_GenerationConfig)
    serving: _ServingConfig = field(default_factory=_ServingConfig)
    context: _ContextConfig = field(default_factory=_ContextConfig)
    tools: _ToolsConfig = field(default_factory=_ToolsConfig)
    rate_limit: _RateLimitConfig = field(default_factory=_RateLimitConfig)
    validation: _ValidationConfig = field(default_factory=_ValidationConfig)

    @classmethod
    def load(cls, path: Optional[Path] = None) -> "GovOnConfig":
        """Load config from YAML and apply environment variable overrides.

        Priority (highest first):
        1. Environment variables (GOVON_* or legacy GEN_*)
        2. config/govon.yaml values
        3. Dataclass defaults (hardcoded fallbacks)
        """
        raw = _load_yaml(path or _GOVON_YAML_PATH)
        gen_raw = raw.get("generation", {})
        srv_raw = raw.get("serving", {})
        ctx_raw = raw.get("context", {})
        tls_raw = raw.get("tools", {})
        rl_raw = raw.get("rate_limit", {})
        val_raw = raw.get("validation", {})

        # --- generation ---
        # Each inner _env resolves the legacy var against the YAML value; the
        # outer _env lets the GOVON_* var win over both.
        gen = _GenerationConfig(
            max_tokens=_env(
                "GOVON_GENERATION_MAX_TOKENS",
                _env("GEN_MAX_TOKENS", gen_raw.get("max_tokens", 512), int),
                int,
            ),
            temperature=_env(
                "GOVON_GENERATION_TEMPERATURE",
                _env("GEN_TEMPERATURE", gen_raw.get("temperature", 0.7), float),
                float,
            ),
            top_p=_env(
                "GOVON_GENERATION_TOP_P",
                _env("GEN_TOP_P", gen_raw.get("top_p", 0.9), float),
                float,
            ),
            repetition_penalty=_env(
                "GOVON_GENERATION_REPETITION_PENALTY",
                _env(
                    "GEN_REPETITION_PENALTY",
                    gen_raw.get("repetition_penalty", 1.1),
                    float,
                ),
                float,
            ),
            # stop_sequences is YAML-only (no env override).
            stop_sequences=gen_raw.get("stop_sequences", ["[|endofturn|]"]),
            agent_temperature=_env(
                "GOVON_GENERATION_AGENT_TEMPERATURE",
                gen_raw.get("agent_temperature", 0.0),
                float,
            ),
        )

        # --- serving ---
        srv = _ServingConfig(
            gpu_memory_utilization=_env(
                "GOVON_SERVING_GPU_MEMORY_UTILIZATION",
                srv_raw.get("gpu_memory_utilization", 0.90),
                float,
            ),
            max_model_len=_env(
                "GOVON_SERVING_MAX_MODEL_LEN",
                srv_raw.get("max_model_len", 8192),
                int,
            ),
            max_loras=_env(
                "GOVON_SERVING_MAX_LORAS",
                _env("MAX_LORAS", srv_raw.get("max_loras", 4), int),
                int,
            ),
            max_lora_rank=_env(
                "GOVON_SERVING_MAX_LORA_RANK",
                _env("MAX_LORA_RANK", srv_raw.get("max_lora_rank", 64), int),
                int,
            ),
            kv_cache_dtype=_env(
                "GOVON_SERVING_KV_CACHE_DTYPE",
                srv_raw.get("kv_cache_dtype", "auto"),
            ),
            vllm_request_timeout=_env(
                "GOVON_SERVING_VLLM_REQUEST_TIMEOUT",
                srv_raw.get("vllm_request_timeout", 300.0),
                float,
            ),
            vllm_connect_timeout=_env(
                "GOVON_SERVING_VLLM_CONNECT_TIMEOUT",
                srv_raw.get("vllm_connect_timeout", 30.0),
                float,
            ),
            vllm_startup_max_wait=_env(
                "GOVON_SERVING_VLLM_STARTUP_MAX_WAIT",
                srv_raw.get("vllm_startup_max_wait", 900),
                int,
            ),
            health_check_timeout=_env(
                "GOVON_SERVING_HEALTH_CHECK_TIMEOUT",
                srv_raw.get("health_check_timeout", 10),
                int,
            ),
        )

        # --- context ---
        ctx = _ContextConfig(
            agent_input_budget=_env(
                "GOVON_CONTEXT_AGENT_INPUT_BUDGET",
                ctx_raw.get("agent_input_budget", 4500),
                int,
            ),
            max_message_tokens=_env(
                "GOVON_CONTEXT_MAX_MESSAGE_TOKENS",
                ctx_raw.get("max_message_tokens", 4500),
                int,
            ),
            keep_recent_messages=_env(
                "GOVON_CONTEXT_KEEP_RECENT_MESSAGES",
                ctx_raw.get("keep_recent_messages", 6),
                int,
            ),
            summary_threshold_ratio=_env(
                "GOVON_CONTEXT_SUMMARY_THRESHOLD_RATIO",
                ctx_raw.get("summary_threshold_ratio", 0.6),
                float,
            ),
            max_tool_result_chars=_env(
                "GOVON_CONTEXT_MAX_TOOL_RESULT_CHARS",
                ctx_raw.get("max_tool_result_chars", 3000),
                int,
            ),
            system_prompt_overhead=_env(
                "GOVON_CONTEXT_SYSTEM_PROMPT_OVERHEAD",
                ctx_raw.get("system_prompt_overhead", 2000),
                int,
            ),
            max_consecutive_rejections=_env(
                "GOVON_CONTEXT_MAX_CONSECUTIVE_REJECTIONS",
                ctx_raw.get("max_consecutive_rejections", 2),
                int,
            ),
            tool_clear_after_iteration=_env(
                "GOVON_CONTEXT_TOOL_CLEAR_AFTER_ITERATION",
                ctx_raw.get("tool_clear_after_iteration", 2),
                int,
            ),
            tool_keep_recent=_env(
                "GOVON_CONTEXT_TOOL_KEEP_RECENT",
                ctx_raw.get("tool_keep_recent", 2),
                int,
            ),
            max_iterations=_env(
                "GOVON_CONTEXT_MAX_ITERATIONS",
                ctx_raw.get("max_iterations", 10),
                int,
            ),
        )

        # --- tools ---
        tls_defaults_raw = tls_raw.get("defaults", {})
        tls_overrides_raw = tls_raw.get("overrides", {})
        tls_defaults = _ToolDefaultConfig(
            timeout_sec=_env(
                "GOVON_TOOLS_DEFAULT_TIMEOUT_SEC",
                tls_defaults_raw.get("timeout_sec", 10.0),
                float,
            ),
            max_retries=_env(
                "GOVON_TOOLS_DEFAULT_MAX_RETRIES",
                tls_defaults_raw.get("max_retries", 0),
                int,
            ),
        )
        # Per-tool overrides are YAML-only; missing keys inherit the defaults.
        tls_overrides: Dict[str, _ToolDefaultConfig] = {}
        for tool_name, override_raw in tls_overrides_raw.items():
            tls_overrides[tool_name] = _ToolDefaultConfig(
                timeout_sec=override_raw.get("timeout_sec", tls_defaults.timeout_sec),
                max_retries=override_raw.get("max_retries", tls_defaults.max_retries),
            )
        tls = _ToolsConfig(defaults=tls_defaults, overrides=tls_overrides)

        # --- rate_limit ---
        rl = _RateLimitConfig(
            default=_env(
                "GOVON_RATE_LIMIT_DEFAULT",
                rl_raw.get("default", "30/minute"),
            ),
        )

        # --- validation ---
        val = _ValidationConfig(
            max_prompt_length=_env(
                "GOVON_VALIDATION_MAX_PROMPT_LENGTH",
                val_raw.get("max_prompt_length", 4096),
                int,
            ),
            max_tokens_ceiling=_env(
                "GOVON_VALIDATION_MAX_TOKENS_CEILING",
                val_raw.get("max_tokens_ceiling", 4096),
                int,
            ),
        )

        return cls(
            generation=gen,
            serving=srv,
            context=ctx,
            tools=tls,
            rate_limit=rl,
            validation=val,
        )
# Module-level singleton -- imported by other modules.
# NOTE: loaded once at import time, so YAML and env overrides are read here.
govon_config: GovOnConfig = GovOnConfig.load()