"""Hugging Face Space entrypoint for Small Cuts. Local dev keeps the lazy/mock defaults. On a Space this module refuses unsafe CPU local inference and never lets startup failures crash-loop the container: - ``import spaces`` happens before anything touches torch (ZeroGPU hijack). - The narrator loads lazily inside the ``@spaces.GPU`` event handler. - TTS runs inside @spaces.GPU workers too (kokoro's torch use poisons worker forks if it ever runs in the main process). """ import os import sys import warnings from pathlib import Path import gradio as gr from starlette.exceptions import StarletteDeprecationWarning ROOT = Path(__file__).resolve().parent SRC = ROOT / "src" if str(SRC) not in sys.path: sys.path.insert(0, str(SRC)) warnings.filterwarnings( "ignore", message=r".*HTTP_422_UNPROCESSABLE_ENTITY.*HTTP_422_UNPROCESSABLE_CONTENT.*", category=StarletteDeprecationWarning, ) ON_SPACE = bool(os.environ.get("SPACE_ID")) ENGINE_MODE = bool(os.environ.get("SMALL_CUTS_ENGINE_URL", "").strip()) from small_cuts.hf_relay import RELAY_BUCKET_ENV # noqa: E402 RELAY_MODE = bool(os.environ.get(RELAY_BUCKET_ENV, "").strip()) MODAL_UPLOAD_MODE = bool(os.environ.get("SMALL_CUTS_MODAL_API_URL", "").strip()) VIEWER_ONLY_MODE = ENGINE_MODE or RELAY_MODE or MODAL_UPLOAD_MODE NEEDS_LOCAL_INFERENCE = not VIEWER_ONLY_MODE try: import spaces # noqa: F401 (must precede torch imports for ZeroGPU) except ImportError: # local dev / CI: no ZeroGPU spaces = None if NEEDS_LOCAL_INFERENCE: if ON_SPACE: os.environ.setdefault("SMALL_CUTS_BACKEND", "transformers") else: os.environ.setdefault("SMALL_CUTS_BACKEND", "llama_cpp") os.environ.setdefault("SMALL_CUTS_TTS_BACKEND", "kokoro") from small_cuts.observability import capture_exception, init_sentry # noqa: E402 from small_cuts.space_hooks import install_relay_hooks # noqa: E402 from small_cuts.viewer import THEME, build_viewer_app # noqa: E402 init_sentry() STARTUP_ERROR: str | None = None def _allow_cpu_inference() -> bool: return os.environ.get("SMALL_CUTS_ALLOW_CPU_INFERENCE", "").strip().lower() in ( "1", "true", "yes", ) def _validate_startup_mode() -> None: if ON_SPACE and NEEDS_LOCAL_INFERENCE and spaces is None and not _allow_cpu_inference(): raise RuntimeError( "refusing local inference on a Space without ZeroGPU; configure relay, engine, " "or Modal upload mode, or set SMALL_CUTS_ALLOW_CPU_INFERENCE=1 explicitly" ) def _degraded_app(message: str) -> gr.Blocks: with gr.Blocks(title="Small Cuts") as degraded: gr.Markdown( f"# Small Cuts is temporarily unavailable\n\nStartup configuration failed: `{message}`" ) return degraded def _build_demo() -> gr.Blocks: _validate_startup_mode() # In engine/relay/upload modes the Space is a public reader and upload front door, so it must # not warm local model weights. In local-inference mode, ZeroGPU loads lazily inside the # Gradio handler decorated with @spaces.GPU. app = build_viewer_app() install_relay_hooks(app.app) return app try: demo = _build_demo() except Exception as exc: capture_exception(exc) STARTUP_ERROR = str(exc) demo = _degraded_app(STARTUP_ERROR) if __name__ == "__main__": demo.launch(theme=THEME)