Spaces:

build-small-hackathon
/

case0

Running

Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)

414dc55 3 days ago

raw

history blame

3.62 kB

	"""Runtime configuration via pydantic-settings.

	All models are open-weights and self-run. Settings are read once at startup and are
	immutable thereafter (``frozen=True``), overridable via ``CASE0_*`` env vars / ``.env``.
	"""

	from __future__ import annotations

	import os
	from enum import StrEnum
	from functools import lru_cache
	from pathlib import Path

	from pydantic import Field, field_validator
	from pydantic_settings import BaseSettings, SettingsConfigDict

	from .constants import MODELS_DIR


	def effective_cpus() -> int:
	"""The number of CPUs this process can ACTUALLY use.

	``os.cpu_count()`` reports the host machine's cores, not the cgroup CPU quota a
	container is limited to - so on a 2-vCPU Hugging Face Space it can return 8 or 16.
	Trusting it makes llama.cpp spawn far too many threads for the real quota, which
	pins the CPU on context-switching and slows every turn down. We read the cgroup
	quota (v2 then v1), fall back to the CPU affinity mask, then to ``os.cpu_count()``.
	"""
	# cgroup v2: "<quota> <period>" (or "max <period>" when unlimited).
	try:
	raw = Path("/sys/fs/cgroup/cpu.max").read_text().split()
	if raw and raw[0] != "max":
	quota, period = int(raw[0]), int(raw[1])
	if quota > 0 and period > 0:
	return max(1, round(quota / period))
	except (OSError, ValueError, IndexError):
	pass
	# cgroup v1.
	try:
	quota = int(Path("/sys/fs/cgroup/cpu/cpu.cfs_quota_us").read_text())
	period = int(Path("/sys/fs/cgroup/cpu/cpu.cfs_period_us").read_text())
	if quota > 0 and period > 0:
	return max(1, round(quota / period))
	except (OSError, ValueError):
	pass
	# Affinity mask (respects taskset / some container setups). Not on Windows/macOS.
	try:
	return max(1, len(os.sched_getaffinity(0))) # type: ignore[attr-defined]
	except (AttributeError, OSError):
	pass
	return os.cpu_count() or 4


	class TTSEngine(StrEnum):
	SUPERTONIC = "supertonic"
	NULL = "null"


	class Settings(BaseSettings):
	    """Frozen application configuration, loaded once through ``get_settings``.

	    Every field can be overridden with a ``CASE0_``-prefixed environment variable
	    or an entry in a UTF-8 ``.env`` file; unrecognised keys are ignored.
	    """

	    model_config = SettingsConfigDict(
	        env_prefix="CASE0_",
	        env_file=".env",
	        env_file_encoding="utf-8",
	        frozen=True,
	        extra="ignore",
	    )

	    # The one LLM the whole game runs on: a small, fast 1.5B quantised model.
	    llm_model_path: Path = MODELS_DIR / "llm" / "qwen2.5-1.5b-instruct-q4_k_m.gguf"
	    # Context window size, bounded to [1024, 32768].
	    llm_n_ctx: int = Field(default=6144, ge=1024, le=32768)
	    # Thread count; 0 requests auto-sizing, which ``_cap_threads`` resolves from the
	    # usable-CPU count so many-core hosts are not left mostly idle by a fixed default.
	    # NOTE(review): this relies on the validator also running for the default value
	    # (pydantic-settings enables ``validate_default`` on ``BaseSettings``) - confirm.
	    llm_n_threads: int = Field(default=0, ge=0)

	    # Optional seed; ``None`` leaves it unset.
	    seed: int | None = None
	    tts_engine: TTSEngine = TTSEngine.SUPERTONIC

	    @field_validator("llm_n_threads")
	    @classmethod
	    def _cap_threads(cls, value: int) -> int:
	        """Resolve ``llm_n_threads``: clamp explicit values, auto-size when 0."""
	        # Size against the REAL usable-core count (cgroup quota), never the host's
	        # core count: over-threading a 2-vCPU Space burns CPU on context switches.
	        usable = effective_cpus()

	        # Explicit request: honour it, but never exceed the usable cores.
	        if value > 0:
	            return max(1, min(value, usable))

	        # Auto, small machine (<= 4 cores, e.g. a 2-vCPU Space): use every core.
	        if usable <= 4:
	            return usable

	        # Auto, larger machine: assume hyperthreading and use the physical half.
	        return max(1, usable // 2)


	@lru_cache(maxsize=1)
	def get_settings() -> Settings:
	    """Build the application ``Settings`` on first call and reuse it afterwards.

	    The ``lru_cache`` wrapper memoises the single instance, so every caller in
	    the process shares the same settings object.
	    """
	    settings = Settings()
	    return settings