Spaces:
Running
Running
File size: 3,621 Bytes
"""Runtime configuration via pydantic-settings.
All models are open-weights and self-run. Settings are read once at startup and are
immutable thereafter (``frozen=True``), overridable via ``CASE0_*`` env vars / ``.env``.
"""
from __future__ import annotations
import os
from enum import StrEnum
from functools import lru_cache
from pathlib import Path
from pydantic import Field, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from .constants import MODELS_DIR
def effective_cpus() -> int:
"""The number of CPUs this process can ACTUALLY use.
``os.cpu_count()`` reports the host machine's cores, not the cgroup CPU quota a
container is limited to - so on a 2-vCPU Hugging Face Space it can return 8 or 16.
Trusting it makes llama.cpp spawn far too many threads for the real quota, which
pins the CPU on context-switching and slows every turn down. We read the cgroup
quota (v2 then v1), fall back to the CPU affinity mask, then to ``os.cpu_count()``.
"""
# cgroup v2: "<quota> <period>" (or "max <period>" when unlimited).
try:
raw = Path("/sys/fs/cgroup/cpu.max").read_text().split()
if raw and raw[0] != "max":
quota, period = int(raw[0]), int(raw[1])
if quota > 0 and period > 0:
return max(1, round(quota / period))
except (OSError, ValueError, IndexError):
pass
# cgroup v1.
try:
quota = int(Path("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read_text())
period = int(Path("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read_text())
if quota > 0 and period > 0:
return max(1, round(quota / period))
except (OSError, ValueError):
pass
# Affinity mask (respects taskset / some container setups). Not on Windows/macOS.
try:
return max(1, len(os.sched_getaffinity(0))) # type: ignore[attr-defined]
except (AttributeError, OSError):
pass
return os.cpu_count() or 4
class TTSEngine(StrEnum):
SUPERTONIC = "supertonic"
NULL = "null"
class Settings(BaseSettings):
    """Frozen application settings, loaded from ``CASE0_*`` env vars or ``.env``.

    Instances are immutable (``frozen=True``) and unknown keys are ignored.
    Use ``get_settings`` to obtain the shared instance.
    """

    model_config = SettingsConfigDict(
        env_prefix="CASE0_",
        env_file=".env",
        env_file_encoding="utf-8",
        frozen=True,
        extra="ignore",
    )

    # Path to the GGUF weights of the single small (1.5B) model the game runs on.
    llm_model_path: Path = MODELS_DIR / "llm" / "qwen2.5-1.5b-instruct-q4_k_m.gguf"
    # Context window size, bounded to 1024..32768.
    llm_n_ctx: int = Field(default=6144, ge=1024, le=32768)
    # Thread count; 0 requests auto-selection, which the validator below resolves
    # from the usable CPU count instead of a fixed default.
    llm_n_threads: int = Field(default=0, ge=0)
    # Optional seed; None leaves it unset.
    seed: int | None = None
    tts_engine: TTSEngine = TTSEngine.SUPERTONIC

    @field_validator("llm_n_threads")
    @classmethod
    def _cap_threads(cls, value: int) -> int:
        """Resolve the auto value and clamp explicit values to the usable CPUs."""
        # Always size against the usable-core count (cgroup quota), not the host's
        # core count: over-threading a small container slows inference down.
        usable = effective_cpus()
        if value > 0:
            # Explicit request: never exceed the usable CPUs, never drop below 1.
            return max(1, min(value, usable))
        if usable <= 4:
            # Auto on a small machine (e.g. a 2-vCPU Space): use every core.
            return usable
        # Auto on a larger machine: assume hyperthreading and use physical cores.
        return max(1, usable // 2)
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the settings on first call and return that same cached instance afterwards."""
    # lru_cache(maxsize=1) on a zero-argument function makes this a lazy singleton.
    settings = Settings()
    return settings
|