case0 / app.py
HusseinEid's picture
Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)
414dc55
raw
history blame
2.35 kB
"""Case Zero entrypoint - one ``gradio.Server`` for a Hugging Face (CPU) Space.
The pixel-art frontend (built Preact bundle in ``web/dist``) and the game's JSON/SSE API
are both served by a single ``gradio.Server`` (a FastAPI subclass). The LLM and TTS run
in-process on the CPU via llama.cpp / Supertonic - no GPU, no inference API, fully local.
Gradio's own frontend and Node SSR proxy are disabled (``_frontend=False``,
``ssr_mode=False``) so our SPA owns ``/`` and we don't pay the node-proxy CPU cost on the
2-vCPU Space.
"""
from __future__ import annotations
import os
import sys
from pathlib import Path
# Make src importable whether run from the repo root or as a Space: the ``src``
# directory next to this file goes at the front of the import search path, so the
# ``case_zero`` imports inside the functions below resolve against it first.
sys.path.insert(0, str(Path(__file__).resolve().parent / "src"))
# Default the Gradio analytics flag to "False"; ``setdefault`` leaves a value that is
# already present in the environment untouched.
os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")
def _ensure_weights() -> None:
    """Download the LLM GGUF once if it is not already on disk.

    Spaces have no baked weights, so the file is fetched from the Hugging Face Hub
    and copied to ``settings.llm_model_path``. When that path already exists this
    is a no-op. ``main()`` calls this at startup, before the server is built.

    The copy goes to a sibling ``<name>.part`` file first and is then renamed onto
    the final path. Copying straight to the destination would leave a truncated
    file behind if the process were interrupted mid-copy, and the ``exists()``
    guard above would then accept that broken file on every later start.

    Download/copy failures are best-effort: they are reported on stderr and
    swallowed, so the function returns ``None`` either way and the caller cannot
    tell from the return value whether the weights are present.
    """
    from case_zero.config import get_settings

    dest = get_settings().llm_model_path
    if dest.exists():
        return

    try:
        import shutil

        from huggingface_hub import hf_hub_download

        dest.parent.mkdir(parents=True, exist_ok=True)
        cached = hf_hub_download(
            repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
            filename="qwen2.5-1.5b-instruct-q4_k_m.gguf",
        )

        # Stage next to the destination so the final rename stays on one filesystem.
        partial = dest.with_name(dest.name + ".part")
        try:
            shutil.copy(cached, partial)
            partial.replace(dest)
        finally:
            # After a successful rename the staging file is gone and this is a no-op;
            # after a failed copy it removes the incomplete file.
            partial.unlink(missing_ok=True)

        print(f"[startup] fetched LLM weights -> {dest}")
    except Exception as exc:  # pragma: no cover
        print(f"[startup] weight fetch failed: {exc}", file=sys.stderr)
def main() -> None:
    """Boot Case Zero: ensure weights, start the case buffer, then serve.

    The steps run in a fixed order: ``_ensure_weights()``, ``RUNTIME.start_buffer()``,
    ``build_server()``, and finally ``launch`` on the server that was built. The
    listening port is read from the ``CASE0_PORT`` environment variable and falls
    back to 7860 when it is unset.
    """
    from case_zero.api import build_server
    from case_zero.api.runtime import RUNTIME

    # Weights come first; the buffer is started right after so that a case is being
    # prepared in the background before the first "New Case" request arrives.
    _ensure_weights()
    RUNTIME.start_buffer()

    app = build_server()
    port = int(os.getenv("CASE0_PORT", "7860"))

    # Gradio's own frontend and SSR proxy are switched off so the SPA owns "/".
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": port,
        "share": False,
        "ssr_mode": False,
        "_frontend": False,
    }
    app.launch(**launch_options)
# Start the server only when this file is executed as a script; importing the module
# runs the setup at the top of the file but does not call main().
if __name__ == "__main__":
    main()