File size: 2,346 Bytes
414dc55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""Case Zero entrypoint - one ``gradio.Server`` for a Hugging Face (CPU) Space.

The pixel-art frontend (built Preact bundle in ``web/dist``) and the game's JSON/SSE API
are both served by a single ``gradio.Server`` (a FastAPI subclass). The LLM and TTS run
in-process on the CPU via llama.cpp / Supertonic - no GPU, no inference API, fully local.

Gradio's own frontend and Node SSR proxy are disabled (``_frontend=False``,
``ssr_mode=False``) so our SPA owns ``/`` and we don't pay the node-proxy CPU cost on the
2-vCPU Space.
"""

from __future__ import annotations

import os
import sys
from pathlib import Path

# Put the sibling ``src`` directory at the front of the import path so the package
# resolves both when launched from the repo root and when running as a Space.
sys.path.insert(0, os.fspath(Path(__file__).resolve().parent.joinpath("src")))

# Set GRADIO_ANALYTICS_ENABLED to "False" unless the environment already defines it.
if "GRADIO_ANALYTICS_ENABLED" not in os.environ:
    os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"


def _ensure_weights() -> None:
    """Download the LLM GGUF once if it is not already on disk.

    Spaces have no baked weights, so the file is fetched from the Hugging Face Hub and
    copied to ``settings.llm_model_path``. A no-op when that file already exists (e.g.
    locally). ``main`` calls this once before the server is launched.

    The copy goes to a sibling ``*.part`` file first and is then renamed onto the final
    path, so an interrupted copy cannot leave a truncated GGUF that the ``exists()``
    check would mistake for complete weights on the next start.

    Best-effort: any failure is reported on stderr and swallowed, so the caller carries
    on without the weights file.
    """
    from case_zero.config import get_settings

    settings = get_settings()
    # NOTE(review): assumes ``llm_model_path`` is a ``pathlib.Path`` (the original code
    # already relies on ``.exists()`` and ``.parent.mkdir``).
    dest = settings.llm_model_path
    if dest.exists():
        return
    try:
        import shutil

        from huggingface_hub import hf_hub_download

        dest.parent.mkdir(parents=True, exist_ok=True)
        cached = hf_hub_download(
            repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
            filename="qwen2.5-1.5b-instruct-q4_k_m.gguf",
        )
        # Stage next to the destination so the final rename stays on one filesystem.
        partial = dest.with_name(dest.name + ".part")
        try:
            shutil.copy(cached, partial)
            partial.replace(dest)
        finally:
            # After a successful rename the staging file is gone and this is a no-op;
            # after a failed copy it removes the incomplete data.
            partial.unlink(missing_ok=True)
        print(f"[startup] fetched LLM weights -> {dest}")
    except Exception as exc:  # pragma: no cover
        print(f"[startup] weight fetch failed: {exc}", file=sys.stderr)


def main() -> None:
    """Prepare the runtime, then build and launch the single ``gradio.Server``.

    Listens on all interfaces; the port is read from ``CASE0_PORT`` and falls back to
    7860. Gradio's own frontend, SSR mode and share link are all switched off.
    """
    from case_zero.api import build_server
    from case_zero.api.runtime import RUNTIME

    # Weights first (if missing), then start the background buffer that prebuilds one
    # case, so the first "New Case" is ready (or nearly) when a detective connects.
    _ensure_weights()
    RUNTIME.start_buffer()

    app = build_server()
    port = int(os.environ.get("CASE0_PORT", "7860"))
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": port,
        "share": False,
        "ssr_mode": False,
        "_frontend": False,
    }
    app.launch(**launch_options)


# Start the server only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()