# Spaces: Rishabh12j / DepthLens (Sleeping)
# Rishabh Jain: Pre-load models at import; wrap inference with longer-duration @spaces.GPU
# 9b33910, 30 days ago, 2.88 kB

	"""
	Hugging Face Spaces entry point (ZeroGPU compatible).

	On ZeroGPU, CUDA is available at import time AND inside @spaces.GPU
	decorated functions. Models are loaded once at import so the first
	request does not spend minutes downloading weights inside a short GPU
	slice (which caused every call to fall back to CPU).
	"""

	import os
	import sys

	# Put this file's own directory first on the import path; presumably this
	# is what lets the `src.*` imports further down resolve (verify layout).
	sys.path.insert(0, os.path.dirname(__file__))

	# ---------------------------------------------------------------------------
	# Defensive patch: gradio_client.utils.get_type crashes on bool schemas.
	# ---------------------------------------------------------------------------
	try:
	    # Best-effort monkey patch: a failure here must never prevent the app
	    # from starting, so everything lives inside a single try block.
	    from gradio_client import utils as _gc_utils

	    _orig_get_type = _gc_utils.get_type

	    def _safe_get_type(schema):
	        """Return ``"Any"`` for non-dict schemas, else defer to the original.

	        A bare boolean (or any other non-mapping value) is not a dict, so
	        it is answered here and never reaches the original ``get_type``.
	        """
	        if not isinstance(schema, dict):
	            return "Any"
	        return _orig_get_type(schema)

	    _gc_utils.get_type = _safe_get_type

	    _orig_json_to_py = _gc_utils._json_schema_to_python_type

	    def _safe_json_to_py(schema, defs=None):
	        """Convert *schema* to a type string, falling back to ``"Any"``.

	        Non-dict schemas short-circuit to ``"Any"``. For dict schemas the
	        original converter is called; a ``TypeError`` or ``KeyError``
	        raised by it is also mapped to ``"Any"``.
	        """
	        if not isinstance(schema, dict):
	            return "Any"
	        try:
	            return _orig_json_to_py(schema, defs)
	        except (TypeError, KeyError):
	            return "Any"

	    _gc_utils._json_schema_to_python_type = _safe_json_to_py
	except Exception as _patch_err:
	    # Still non-fatal, but say why: the second half of the patch touches a
	    # private attribute, so the patch can end up missing or half applied.
	    print(f"gradio_client schema patch not (fully) applied: {_patch_err!r}")

	# ---------------------------------------------------------------------------
	# ZeroGPU import + pipeline warm-up
	# ---------------------------------------------------------------------------
	try:
	    import spaces
	except ImportError:
	    # The `spaces` package is not importable: remember that so the GPU
	    # wrapping step further down is skipped.
	    _HAS_SPACES = False
	else:
	    _HAS_SPACES = True

	from src.ui import gradio_app as _ga
	from src.pipeline import Pipeline

	# Warm-up: build the pipeline and trigger each model getter right now, at
	# import, instead of on the first request. The module docstring explains
	# the motivation (weights should not be fetched inside a short GPU slice).
	print("Pre-loading pipeline (VLM + Depth + YOLO)...")
	_pipeline = Pipeline()
	# Same order as before: VLM, then depth, then detector. Each getter is
	# looked up and called before the next one is touched.
	for _getter_name in ("_get_vlm", "_get_depth", "_get_detector"):
	    getattr(_pipeline, _getter_name)()
	# Publish the warmed instance to the UI module only after all getters ran.
	_ga._PIPELINE = _pipeline
	print("Pipeline ready.")

	# ---------------------------------------------------------------------------
	# Wrap inference entry points so ZeroGPU keeps a GPU slice for the call.
	# 300 s covers first-call weight migration + heaviest Stage 3 inference.
	# ---------------------------------------------------------------------------
	if _HAS_SPACES:
	    # Keep references to the undecorated callables so the wrappers can
	    # delegate to them after the module attributes are rebound below.
	    _orig_run = _ga.run_inference
	    _orig_live = _ga.live_run_inference

	    @spaces.GPU(duration=300)
	    def _gpu_run_inference(*args, **kwargs):
	        """Call the original ``run_inference`` with all arguments unchanged.

	        Positional and keyword arguments are both forwarded, so the wrapper
	        accepts exactly what the wrapped callable accepts.
	        """
	        return _orig_run(*args, **kwargs)

	    @spaces.GPU(duration=300)
	    def _gpu_live_inference(*args, **kwargs):
	        """Call the original ``live_run_inference`` with all arguments unchanged.

	        Positional and keyword arguments are both forwarded, so the wrapper
	        accepts exactly what the wrapped callable accepts.
	        """
	        return _orig_live(*args, **kwargs)

	    # Rebind on the UI module so later lookups of these names get the
	    # GPU-decorated versions (presumably build_ui() resolves them by name
	    # when it runs; confirm against src.ui.gradio_app).
	    _ga.run_inference = _gpu_run_inference
	    _ga.live_run_inference = _gpu_live_inference

	# Build the UI object at import time and expose it as the module-level
	# name `demo`; this runs after the optional GPU wrapping above.
	demo = _ga.build_ui()

	# Start serving only when this file is executed directly; a plain import
	# builds `demo` but does not launch it.
	if __name__ == "__main__":
	    demo.launch()