"""Runtime configuration via pydantic-settings. All models are open-weights and self-run. Settings are read once at startup and are immutable thereafter (``frozen=True``), overridable via ``CASE0_*`` env vars / ``.env``. """ from __future__ import annotations import os from enum import StrEnum from functools import lru_cache from pathlib import Path from pydantic import Field, field_validator from pydantic_settings import BaseSettings, SettingsConfigDict from .constants import MODELS_DIR def effective_cpus() -> int: """The number of CPUs this process can ACTUALLY use. ``os.cpu_count()`` reports the host machine's cores, not the cgroup CPU quota a container is limited to - so on a 2-vCPU Hugging Face Space it can return 8 or 16. Trusting it makes llama.cpp spawn far too many threads for the real quota, which pins the CPU on context-switching and slows every turn down. We read the cgroup quota (v2 then v1), fall back to the CPU affinity mask, then to ``os.cpu_count()``. """ # cgroup v2: " " (or "max " when unlimited). try: raw = Path("/sys/fs/cgroup/cpu.max").read_text().split() if raw and raw[0] != "max": quota, period = int(raw[0]), int(raw[1]) if quota > 0 and period > 0: return max(1, round(quota / period)) except (OSError, ValueError, IndexError): pass # cgroup v1. try: quota = int(Path("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read_text()) period = int(Path("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read_text()) if quota > 0 and period > 0: return max(1, round(quota / period)) except (OSError, ValueError): pass # Affinity mask (respects taskset / some container setups). Not on Windows/macOS. try: return max(1, len(os.sched_getaffinity(0))) # type: ignore[attr-defined] except (AttributeError, OSError): pass return os.cpu_count() or 4 class TTSEngine(StrEnum): SUPERTONIC = "supertonic" NULL = "null" class Settings(BaseSettings): """Immutable application settings. Read once at startup via ``get_settings``.""" model_config = SettingsConfigDict( env_prefix="CASE0_", env_file=".env", env_file_encoding="utf-8", frozen=True, extra="ignore", ) # Small + fast (1.5B -> Tiny Titan). The whole game runs on this single model. llm_model_path: Path = MODELS_DIR / "llm" / "qwen2.5-1.5b-instruct-q4_k_m.gguf" llm_n_ctx: int = Field(default=6144, ge=1024, le=32768) # 0 means auto: the validator picks a physical-core estimate (big speed win on # many-core hosts, where a fixed default would leave most of the CPU idle). llm_n_threads: int = Field(default=0, ge=0) seed: int | None = None tts_engine: TTSEngine = TTSEngine.SUPERTONIC @field_validator("llm_n_threads") @classmethod def _cap_threads(cls, value: int) -> int: # Use the REAL usable-core count (cgroup quota), never the host's core count - # over-threading a 2-vCPU Space pins the CPU on context-switching and slows it down. cpu = effective_cpus() if value <= 0: # Auto: above 4 cores assume hyperthreading and use physical cores; at or below # 4 (e.g. a 2-vCPU Space) use them all - that is the CPU llama.cpp sweet spot. return max(1, cpu // 2) if cpu > 4 else cpu return max(1, min(value, cpu)) @lru_cache(maxsize=1) def get_settings() -> Settings: """Return the process-wide settings singleton.""" return Settings()