Spaces:

bellmake
/

SAM3-video-segmentation-tracking

Sleeping

App Files Files Community

SAM3-video-segmentation-tracking / parallel_segment_worker.py

bellmake

SAM3 Video Segmentation - Clean deployment

ae50268 29 days ago

raw

history blame contribute delete

12 kB

	"""
	Worker entry for the parallel multi-GPU Auto-Mode dispatcher.

	This module is intentionally lightweight at top level — it must NOT import
	torch (or anything that imports torch) before `worker_main()` has a chance to
	narrow ``CUDA_VISIBLE_DEVICES`` to a single device. ``mp.spawn`` re-imports the
	target module in each child process; importing torch at top level here would
	cause CUDA to initialize against all visible devices before we can pin the
	worker to one GPU.

	Flow:
	1. Parent dispatcher in app.py spawns one of these per concurrent video.
	2. ``worker_main(gpu_index, args, progress_queue)`` is the spawn target.
	3. It sets ``CUDA_VISIBLE_DEVICES`` to ``str(gpu_index)`` BEFORE importing
	torch, so the child sees only one device (always referenced as cuda:0).
	4. It then imports ``app._segment_video_core`` and runs the segmentation,
	streaming progress / status / result / error messages back to the parent
	via ``progress_queue``. Each message carries ``gpu_index`` so the
	dispatcher can route it to the right per-video UI slot.
	"""

	import os
	import pathlib
	import shutil
	import sys
	import tempfile
	import traceback
	import uuid


	_DISTRIBUTED_ENV_KEYS = (
	"RANK",
	"WORLD_SIZE",
	"LOCAL_RANK",
	"LOCAL_WORLD_SIZE",
	"GROUP_RANK",
	"GROUP_WORLD_SIZE",
	"ROLE_RANK",
	"ROLE_WORLD_SIZE",
	"MASTER_ADDR",
	"MASTER_PORT",
	"TORCHELASTIC_RUN_ID",
	"TORCHELASTIC_RESTART_COUNT",
	"TORCHELASTIC_MAX_RESTARTS",
	)


	def _truthy(value):
	return str(value).strip().lower() not in {"0", "false", "no", "off"}


	def _force_single_rank_env(gpu_index=None, cpu_only=False):
	os.environ.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID")
	if cpu_only:
	os.environ["CUDA_VISIBLE_DEVICES"] = ""
	elif gpu_index is not None:
	os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_index)

	for key in _DISTRIBUTED_ENV_KEYS:
	os.environ.pop(key, None)
	os.environ["RANK"] = "0"
	os.environ["WORLD_SIZE"] = "1"
	os.environ["LOCAL_RANK"] = "0"
	os.environ["LOCAL_WORLD_SIZE"] = "1"

	os.environ["SAM3_WORKER_MODE"] = "1"
	os.environ.setdefault("SAM3_CACHE_FRAME_OUTPUTS", "0")
	os.environ.setdefault("SAM3_OFFLOAD_TRACKER_STATE_TO_CPU", "1")
	os.environ.setdefault("PYTHONDONTWRITEBYTECODE", "1")
	sys.dont_write_bytecode = True


	def _copy_runtime_item(source_dir, runtime_dir, name):
	src = pathlib.Path(source_dir) / name
	if not src.exists():
	fallback = pathlib.Path(__file__).resolve().parent / name
	src = fallback if fallback.exists() else src
	if not src.exists():
	return

	dst = pathlib.Path(runtime_dir) / name
	if src.is_dir():
	shutil.copytree(
	src,
	dst,
	symlinks=False,
	ignore=shutil.ignore_patterns("__pycache__", "*.pyc", ".git"),
	)
	else:
	shutil.copy2(src, dst)


	def _prepare_runtime(worker_options, tag):
	"""
	Optionally copy app.py + SAM3 source into a private temp runtime.

	The model weights still come from the HF cache, but Python module globals,
	__pycache__, and source imports are per-worker. Result temp files stay in
	the normal system temp area so the parent process can persist them after
	the worker exits. Disable with
	SAM3_PARALLEL_COPY_RUNTIME=0 if startup latency matters more than isolation.
	"""
	worker_options = worker_options or {}
	source_dir = pathlib.Path(
	worker_options.get("source_app_dir") or pathlib.Path(__file__).resolve().parent
	).resolve()
	os.environ["SAM3_OUTPUT_ROOT"] = str(source_dir)

	if not _truthy(worker_options.get("isolate_runtime", "1")):
	return str(source_dir), None

	app_src = source_dir / "app.py"
	if not app_src.exists():
	return str(source_dir), None

	runtime_dir = pathlib.Path(
	tempfile.mkdtemp(prefix=f"sam3_{tag}_{uuid.uuid4().hex[:8]}_")
	).resolve()
	try:
	_copy_runtime_item(source_dir, runtime_dir, "app.py")
	_copy_runtime_item(source_dir, runtime_dir, "parallel_segment_worker.py")
	_copy_runtime_item(source_dir, runtime_dir, "sam3")
	_copy_runtime_item(source_dir, runtime_dir, "assets")
	os.environ["SAM3_ISOLATED_RUNTIME_DIR"] = str(runtime_dir)
	return str(runtime_dir), str(runtime_dir)
	except Exception:
	shutil.rmtree(runtime_dir, ignore_errors=True)
	raise


	def _prepend_import_paths(*paths):
	for path in reversed([p for p in paths if p]):
	if path in sys.path:
	sys.path.remove(path)
	sys.path.insert(0, path)


	def _cleanup_runtime(runtime_dir):
	if runtime_dir:
	shutil.rmtree(runtime_dir, ignore_errors=True)


	def worker_main(gpu_index, args, progress_queue, worker_options=None):
	# Pin BEFORE any torch import — this is the whole point of the separate file.
	_force_single_rank_env(gpu_index=gpu_index)

	runtime_dir = None
	try:
	app_root, runtime_dir = _prepare_runtime(worker_options, f"gpu{gpu_index}")
	except Exception as exc: # noqa: BLE001
	progress_queue.put({
	"type": "error",
	"message": f"runtime isolation setup failed on GPU {gpu_index}: {exc}",
	"traceback": traceback.format_exc(),
	"gpu_index": gpu_index,
	})
	return

	repo_root = os.path.dirname(os.path.abspath(__file__))
	_prepend_import_paths(app_root, repo_root)
	sys.modules.pop("app", None)

	try:
	import torch # safe now: only one device visible
	if torch.cuda.is_available():
	torch.cuda.set_device(0)
	except Exception as exc: # noqa: BLE001
	progress_queue.put({
	"type": "error",
	"message": f"torch init failed on GPU {gpu_index}: {exc}",
	"traceback": traceback.format_exc(),
	"gpu_index": gpu_index,
	})
	_cleanup_runtime(runtime_dir)
	return

	try:
	progress_queue.put({
	"type": "status",
	"message": (
	f"🔒 GPU {gpu_index}: isolated worker ready "
	f"(CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES')}, "
	f"WORLD_SIZE={os.environ.get('WORLD_SIZE')}, runtime={app_root})"
	),
	"gpu_index": gpu_index,
	})
	from app import _segment_video_core # imports torch but env is already pinned
	except Exception as exc: # noqa: BLE001
	progress_queue.put({
	"type": "error",
	"message": f"import _segment_video_core failed: {exc}",
	"traceback": traceback.format_exc(),
	"gpu_index": gpu_index,
	})
	_cleanup_runtime(runtime_dir)
	return

	(
	video_path,
	text_prompt,
	duration_limit,
	id_corrections_text,
	id_drop_text,
	id_override_start_sec,
	show_trails,
	view_mode,
	) = args

	def _progress_cb(val, desc):
	progress_queue.put({
	"type": "progress",
	"value": val,
	"desc": desc,
	"gpu_index": gpu_index,
	})

	def _status_cb(msg):
	progress_queue.put({
	"type": "status",
	"message": msg,
	"gpu_index": gpu_index,
	})

	try:
	progress_queue.put({
	"type": "progress",
	"value": 0.0,
	"desc": f"GPU {gpu_index}: starting...",
	"gpu_index": gpu_index,
	})
	out_path, status, loc_path = _segment_video_core(
	video_path,
	text_prompt,
	duration_limit,
	id_corrections_text=id_corrections_text,
	id_drop_text=id_drop_text,
	id_override_start_sec=id_override_start_sec,
	show_trails=show_trails,
	view_mode=view_mode,
	progress_callback=_progress_cb,
	status_callback=_status_cb,
	)
	progress_queue.put({
	"type": "result",
	"data": (out_path, status, loc_path),
	"gpu_index": gpu_index,
	})
	except Exception as exc: # noqa: BLE001
	progress_queue.put({
	"type": "error",
	"message": str(exc),
	"traceback": traceback.format_exc(),
	"gpu_index": gpu_index,
	})
	finally:
	try:
	import torch
	if torch.cuda.is_available():
	torch.cuda.empty_cache()
	torch.cuda.ipc_collect()
	except Exception:
	pass
	_cleanup_runtime(runtime_dir)


	def overlay_worker_main(task_id, args, event_queue, worker_options=None):
	"""
	Render segmentation + trails overlays in an isolated process so that
	multiple videos' CPU-bound overlay work can run on different vCPUs in
	parallel after their GPU segmentation completes.

	Pushes a single ``overlay_done`` (or ``overlay_error``) message onto
	``event_queue`` carrying ``task_id`` so the parent dispatcher can route
	it back to the correct video slot.
	"""
	# Don't pin CUDA — overlay rendering is pure CPU. Avoid importing torch
	# at all if we can help it; just route to ffmpeg/cv2.
	_force_single_rank_env(cpu_only=True)

	runtime_dir = None
	try:
	app_root, runtime_dir = _prepare_runtime(worker_options, f"overlay_{task_id}")
	except Exception as exc: # noqa: BLE001
	event_queue.put({
	"type": "overlay_error",
	"task_id": task_id,
	"error": f"runtime isolation setup failed: {exc}",
	"traceback": traceback.format_exc(),
	})
	return

	repo_root = os.path.dirname(os.path.abspath(__file__))
	_prepend_import_paths(app_root, repo_root)
	sys.modules.pop("app", None)

	try:
	from app import (
	_render_segmentation_overlay_video,
	_render_trails_overlay_video,
	_build_trail_filter_options,
	_persist_for_download,
	)
	except Exception as exc: # noqa: BLE001
	event_queue.put({
	"type": "overlay_error",
	"task_id": task_id,
	"error": f"import failed: {exc}",
	"traceback": traceback.format_exc(),
	})
	_cleanup_runtime(runtime_dir)
	return

	(output_video, location_path, text_prompt) = args

	try:
	seg_overlay = _render_segmentation_overlay_video(
	output_video, location_path, text_prompt
	)
	except Exception as exc: # noqa: BLE001
	event_queue.put({
	"type": "overlay_error",
	"task_id": task_id,
	"error": f"seg overlay failed: {exc}",
	"traceback": traceback.format_exc(),
	})
	_cleanup_runtime(runtime_dir)
	return

	seg_display_path = seg_overlay or output_video

	try:
	trails_overlay = _render_trails_overlay_video(
	seg_display_path, location_path, text_prompt, force_unique=True
	)
	except Exception as exc: # noqa: BLE001
	# Trails overlay is a "nice to have" — fall back to seg overlay
	trails_overlay = None
	trails_error = f"trails overlay failed: {exc}"
	else:
	trails_error = None

	try:
	choices, defaults, _legend = _build_trail_filter_options(
	location_path, text_prompt
	)
	except Exception:
	choices, defaults = [], []

	download_path = trails_overlay or seg_display_path
	try:
	persisted = _persist_for_download(download_path, subdir="downloads")
	if persisted:
	download_path = persisted
	except Exception:
	pass

	event_queue.put({
	"type": "overlay_done",
	"task_id": task_id,
	"seg_overlay": seg_display_path,
	"trails_overlay": trails_overlay,
	"download_path": download_path,
	"trail_choices": choices,
	"trail_selected": defaults,
	"warning": trails_error,
	})
	_cleanup_runtime(runtime_dir)