Spaces:

build-small-hackathon
/

case0

Running

case0 / app.py

Case Zero - initial public release (fully local: Qwen2.5-1.5B via llama.cpp + Supertonic, custom pixel-noir SPA via gradio.Server)

414dc55 3 days ago

raw

history blame

2.35 kB

	"""Case Zero entrypoint - one ``gradio.Server`` for a Hugging Face (CPU) Space.

	The pixel-art frontend (built Preact bundle in ``web/dist``) and the game's JSON/SSE API
	are both served by a single ``gradio.Server`` (a FastAPI subclass). The LLM and TTS run
	in-process on the CPU via llama.cpp / Supertonic - no GPU, no inference API, fully local.

	Gradio's own frontend and Node SSR proxy are disabled (``_frontend=False``,
	``ssr_mode=False``) so our SPA owns ``/`` and we don't pay the node-proxy CPU cost on the
	2-vCPU Space.
	"""

	from __future__ import annotations

	import os
	import sys
	from pathlib import Path

	# Make src importable whether run from the repo root or as a Space: the ``src`` directory
	# that sits next to this file is placed at the front of ``sys.path``.
	sys.path.insert(0, str(Path(__file__).resolve().parent / "src"))

	# Default GRADIO_ANALYTICS_ENABLED to "False" unless the environment already defines it
	# (``setdefault`` never overrides an existing value). Presumably this switches off
	# Gradio's usage analytics - confirm against the Gradio environment-variable docs.
	os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")


	def _ensure_weights() -> None:
	    """Download the LLM GGUF once if it is not already on disk.

	    Spaces have no baked weights, so the file is fetched from the Hugging Face Hub
	    when ``llm_model_path`` is missing. Locally, where the file already exists, this
	    is a no-op. ``main`` calls it at startup, before the server is launched.

	    The weights are first copied to a sibling ``*.part`` file and only then renamed
	    to ``llm_model_path``. An interrupted copy therefore never leaves a truncated
	    file at the final path, which the ``exists()`` check would otherwise accept on
	    every later run.

	    Best-effort: any failure is reported on stderr and swallowed, so the caller
	    keeps going without the weights.
	    """
	    from case_zero.config import get_settings

	    dest = get_settings().llm_model_path
	    if dest.exists():
	        return
	    try:
	        import shutil

	        from huggingface_hub import hf_hub_download

	        dest.parent.mkdir(parents=True, exist_ok=True)
	        cached = hf_hub_download(
	            repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
	            filename="qwen2.5-1.5b-instruct-q4_k_m.gguf",
	        )
	        # Stage next to the destination so the final rename stays on one filesystem.
	        partial = dest.with_name(dest.name + ".part")
	        try:
	            shutil.copy(cached, partial)
	            partial.replace(dest)
	        finally:
	            # After a successful rename the staging file is gone; this only cleans
	            # up the leftovers of a failed or interrupted copy.
	            partial.unlink(missing_ok=True)
	        print(f"[startup] fetched LLM weights -> {dest}")
	    except Exception as exc:  # pragma: no cover
	        print(f"[startup] weight fetch failed: {exc}", file=sys.stderr)


	def main() -> None:
	    """Start Case Zero: ensure weights, start the runtime buffer, serve the app.

	    The server listens on all interfaces at ``CASE0_PORT`` (7860 when the variable
	    is unset), with sharing, SSR mode and Gradio's own frontend switched off.
	    """
	    from case_zero.api import build_server
	    from case_zero.api.runtime import RUNTIME

	    # Weights first; then the buffer starts prebuilding a case in the background so
	    # the first "New Case" is ready (or nearly) by the time a detective connects.
	    _ensure_weights()
	    RUNTIME.start_buffer()

	    app = build_server()
	    listen_port = int(os.environ.get("CASE0_PORT", "7860"))
	    app.launch(
	        server_name="0.0.0.0",
	        server_port=listen_port,
	        share=False,
	        ssr_mode=False,
	        _frontend=False,
	    )


	# Script entrypoint: start the server only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()