Spaces:
Running
Running
Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)
414dc55 | """Runtime configuration via pydantic-settings. | |
| All models are open-weights and self-run. Settings are read once at startup and are | |
| immutable thereafter (``frozen=True``), overridable via ``CASE0_*`` env vars / ``.env``. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| from enum import StrEnum | |
| from functools import lru_cache | |
| from pathlib import Path | |
| from pydantic import Field, field_validator | |
| from pydantic_settings import BaseSettings, SettingsConfigDict | |
| from .constants import MODELS_DIR | |
def effective_cpus() -> int:
    """Return how many CPUs this process may really use (always >= 1).

    ``os.cpu_count()`` describes the host, not the cgroup CPU quota of the container
    we run in, so a 2-vCPU Hugging Face Space can report 8 or 16 cores. Sizing the
    llama.cpp thread pool from that number over-subscribes the real quota and burns
    time on context switches. Sources are therefore consulted in this order, and the
    first one that yields a usable answer wins:

    1. cgroup v2 ``cpu.max``
    2. cgroup v1 ``cpu.cfs_quota_us`` / ``cpu.cfs_period_us``
    3. the scheduler affinity mask
    4. ``os.cpu_count()`` (or 4 when even that is unknown)
    """
    cgroup_root = Path("/sys/fs/cgroup")

    # cgroup v2 stores "<quota> <period>"; the quota reads "max" when unlimited.
    try:
        fields = (cgroup_root / "cpu.max").read_text().split()
        unlimited = not fields or fields[0] == "max"
        if not unlimited:
            quota_us = int(fields[0])
            period_us = int(fields[1])
            if quota_us > 0 and period_us > 0:
                return max(1, round(quota_us / period_us))
    except (OSError, ValueError, IndexError):
        pass  # file missing or malformed: try the next source

    # cgroup v1 keeps quota and period in separate files; a non-positive quota
    # (e.g. -1) means no limit is set, so we fall through.
    try:
        v1_dir = cgroup_root / "cpu"
        quota_us = int((v1_dir / "cpu.cfs_quota_us").read_text())
        period_us = int((v1_dir / "cpu.cfs_period_us").read_text())
        if quota_us > 0 and period_us > 0:
            return max(1, round(quota_us / period_us))
    except (OSError, ValueError):
        pass  # file missing or malformed: try the next source

    # Affinity mask (honours taskset / some container setups). The call does not
    # exist on Windows/macOS, hence the AttributeError guard.
    try:
        usable = os.sched_getaffinity(0)  # type: ignore[attr-defined]
    except (AttributeError, OSError):
        usable = None
    if usable is not None:
        return max(1, len(usable))

    # Last resort: the host core count, or a fixed guess of 4 if it is unknown.
    return os.cpu_count() or 4
| class TTSEngine(StrEnum): | |
| SUPERTONIC = "supertonic" | |
| NULL = "null" | |
class Settings(BaseSettings):
    """Immutable application settings. Read once at startup via ``get_settings``.

    Values are taken from ``CASE0_*`` environment variables or a UTF-8 ``.env`` file.
    Unknown keys are ignored and instances are frozen after construction.
    """

    model_config = SettingsConfigDict(
        env_prefix="CASE0_",
        env_file=".env",
        env_file_encoding="utf-8",
        frozen=True,
        extra="ignore",
    )

    # Small + fast (1.5B -> Tiny Titan). The whole game runs on this single model.
    llm_model_path: Path = MODELS_DIR / "llm" / "qwen2.5-1.5b-instruct-q4_k_m.gguf"
    llm_n_ctx: int = Field(default=6144, ge=1024, le=32768)
    # 0 means auto: the validator picks a physical-core estimate (big speed win on
    # many-core hosts, where a fixed default would leave most of the CPU idle).
    # ``validate_default=True`` ensures the validator also runs for the default 0,
    # so "auto" is always resolved to a concrete thread count.
    llm_n_threads: int = Field(default=0, ge=0, validate_default=True)
    seed: int | None = None
    tts_engine: TTSEngine = TTSEngine.SUPERTONIC

    @field_validator("llm_n_threads")
    @classmethod
    def _cap_threads(cls, value: int) -> int:
        """Resolve ``llm_n_threads`` to a concrete count within the usable CPUs.

        Args:
            value: The configured thread count; ``0`` requests auto-selection.

        Returns:
            For auto: half of the usable CPUs when there are more than 4, otherwise
            all of them. For an explicit value: that value clamped to
            ``[1, usable CPUs]``.
        """
        # Use the REAL usable-core count (cgroup quota), never the host's core count -
        # over-threading a 2-vCPU Space pins the CPU on context-switching and slows it down.
        cpu = effective_cpus()
        if value <= 0:
            # Auto: above 4 cores assume hyperthreading and use physical cores; at or below
            # 4 (e.g. a 2-vCPU Space) use them all - that is the CPU llama.cpp sweet spot.
            return max(1, cpu // 2) if cpu > 4 else cpu
        return max(1, min(value, cpu))
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the process-wide settings singleton.

    The first call builds ``Settings`` (reading the environment / ``.env``); the
    cached instance is returned on every later call, so settings really are read
    once per process. Call ``get_settings.cache_clear()`` to force a re-read.
    """
    return Settings()