Spaces:

bellmake
/

SAM3-video-segmentation-tracking

Sleeping

App Files Files Community

SAM3-video-segmentation-tracking / sam3 /model /utils /sam2_utils.py

bellmake

SAM3 Video Segmentation - Clean deployment

14114e8 about 2 months ago

raw

history blame contribute delete

8.28 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
	# All rights reserved.

	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	import os
	from threading import Thread

	import numpy as np
	import torch
	from PIL import Image
	from tqdm import tqdm


	def _load_img_as_tensor(img_path, image_size):
	    """
	    Load a single image file as a channel-first tensor scaled to [0, 1].

	    The image is converted to RGB and resized to `image_size` x `image_size`.

	    Returns a tuple `(img, video_height, video_width)` where `img` has shape
	    (3, image_size, image_size) and `video_height` / `video_width` are the
	    dimensions of the image before resizing.

	    Raises `RuntimeError` if the decoded pixel array is not of dtype `np.uint8`.
	    """
	    # `Image.open` keeps the file open until it is closed explicitly; the context
	    # manager releases the handle even when decoding or resizing fails
	    with Image.open(img_path) as img_pil:
	        video_width, video_height = img_pil.size  # the original video size
	        img_np = np.array(img_pil.convert("RGB").resize((image_size, image_size)))
	    if img_np.dtype == np.uint8:  # np.uint8 is expected for JPEG images
	        # true division promotes the uint8 array to float64
	        img_np = img_np / 255.0
	    else:
	        raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}")
	    # (H, W, C) -> (C, H, W)
	    img = torch.from_numpy(img_np).permute(2, 0, 1)
	    return img, video_height, video_width


	class AsyncVideoFrameLoader:
	    """
	    A list of video frames to be loaded asynchronously without blocking session start.

	    The first frame is loaded synchronously in the constructor; the remaining
	    frames are loaded by a daemon thread started from the constructor. Indexing a
	    frame that has not been loaded yet loads it on the spot. An exception raised
	    in the loading thread is stored and re-raised (wrapped in `RuntimeError`) on
	    the next `__getitem__` call.
	    """

	    def __init__(
	        self,
	        img_paths,
	        image_size,
	        offload_video_to_cpu,
	        img_mean,
	        img_std,
	        compute_device,
	    ):
	        """
	        Args:
	            img_paths: sequence of image file paths, one per frame.
	            image_size: side length every frame is resized to.
	            offload_video_to_cpu: if true, loaded frames are not moved to
	                `compute_device`.
	            img_mean: value subtracted from every loaded frame.
	            img_std: value every loaded frame is divided by.
	            compute_device: device frames are moved to unless offloaded.
	        """
	        self.img_paths = img_paths
	        self.image_size = image_size
	        self.offload_video_to_cpu = offload_video_to_cpu
	        self.img_mean = img_mean
	        self.img_std = img_std
	        # items in `self.images` will be loaded asynchronously
	        self.images = [None] * len(img_paths)
	        # catch and raise any exceptions in the async loading thread
	        self.exception = None
	        # video_height and video_width are filled when loading the first image
	        self.video_height = None
	        self.video_width = None
	        self.compute_device = compute_device

	        # load the first frame to fill video_height and video_width and also
	        # to cache it (since it's most likely where the user will click)
	        self[0]

	        # load the rest of frames asynchronously without blocking the session start
	        def _load_frames():
	            try:
	                for n in tqdm(range(len(self.images)), desc="frame loading (JPEG)"):
	                    self[n]
	            except Exception as e:
	                # hand the failure over to the consumer; it is re-raised in __getitem__
	                self.exception = e

	        self.thread = Thread(target=_load_frames, daemon=True)
	        self.thread.start()

	    def __getitem__(self, index):
	        """
	        Return the normalized frame at `index`, loading and caching it if needed.

	        Raises `RuntimeError` (chained from the original error) if the background
	        loading thread has failed.
	        """
	        if self.exception is not None:
	            raise RuntimeError("Failure in frame loading thread") from self.exception

	        img = self.images[index]
	        if img is not None:
	            return img

	        img, video_height, video_width = _load_img_as_tensor(
	            self.img_paths[index], self.image_size
	        )
	        self.video_height = video_height
	        self.video_width = video_width
	        # `_load_img_as_tensor` yields float64 (uint8 array divided by 255.0); cast
	        # to float32 so lazily loaded frames have the same dtype as the eagerly
	        # loaded ones and take half the memory
	        img = img.to(torch.float32)
	        # normalize by mean and std
	        img -= self.img_mean
	        img /= self.img_std
	        if not self.offload_video_to_cpu:
	            img = img.to(self.compute_device, non_blocking=True)
	        self.images[index] = img
	        return img

	    def __len__(self):
	        """Return the total number of frames (loaded or not)."""
	        return len(self.images)


	def load_video_frames(
	    video_path,
	    image_size,
	    offload_video_to_cpu,
	    img_mean=(0.5, 0.5, 0.5),
	    img_std=(0.5, 0.5, 0.5),
	    async_loading_frames=False,
	    compute_device=torch.device("cuda"),
	):
	    """
	    Load the video frames from video_path. The frames are resized to image_size as in
	    the model and are loaded to GPU if offload_video_to_cpu=False. This is used by the demo.

	    `video_path` may be a `bytes` object or a path ending in ".mp4" (any letter
	    case), both handled by `load_video_frames_from_video_file`, or a path to an
	    existing directory, handled by `load_video_frames_from_jpg_images`. Paths may
	    be given as `str` or as `os.PathLike` objects such as `pathlib.Path`.

	    `async_loading_frames` is only forwarded to the JPEG-folder loader; it has no
	    effect for video files.

	    Returns whatever the selected loader returns.

	    Raises `NotImplementedError` for any other kind of input.
	    """
	    is_bytes = isinstance(video_path, bytes)
	    # normalize pathlib.Path and other path-like objects so the `str` checks below
	    # (and the loaders, which expect `str`) accept them
	    if isinstance(video_path, os.PathLike):
	        video_path = os.fspath(video_path)
	    is_str = isinstance(video_path, str)
	    # compare the extension case-insensitively (".mp4", ".MP4", ".Mp4", ...)
	    is_mp4_path = is_str and os.path.splitext(video_path)[-1].lower() == ".mp4"
	    if is_bytes or is_mp4_path:
	        return load_video_frames_from_video_file(
	            video_path=video_path,
	            image_size=image_size,
	            offload_video_to_cpu=offload_video_to_cpu,
	            img_mean=img_mean,
	            img_std=img_std,
	            compute_device=compute_device,
	        )
	    elif is_str and os.path.isdir(video_path):
	        return load_video_frames_from_jpg_images(
	            video_path=video_path,
	            image_size=image_size,
	            offload_video_to_cpu=offload_video_to_cpu,
	            img_mean=img_mean,
	            img_std=img_std,
	            async_loading_frames=async_loading_frames,
	            compute_device=compute_device,
	        )
	    else:
	        raise NotImplementedError(
	            "Only MP4 video and JPEG folder are supported at this moment"
	        )


	def load_video_frames_from_jpg_images(
	    video_path,
	    image_size,
	    offload_video_to_cpu,
	    img_mean=(0.5, 0.5, 0.5),
	    img_std=(0.5, 0.5, 0.5),
	    async_loading_frames=False,
	    compute_device=torch.device("cuda"),
	):
	    """
	    Read a folder of JPEG frames (named "<frame_index>.jpg") as video frames.

	    Every frame is resized to image_size x image_size and normalized with
	    `img_mean` / `img_std`. Frames end up on `compute_device` when
	    `offload_video_to_cpu` is `False` and stay on CPU when it is `True`.

	    With `async_loading_frames=True` an `AsyncVideoFrameLoader` is returned in
	    place of a stacked tensor, and frames are loaded in the background.

	    Returns `(frames, video_height, video_width)`.

	    Raises `NotImplementedError` if `video_path` is not a `str` naming an existing
	    directory, and `RuntimeError` if that directory holds no JPEG files.
	    """
	    # guard clause: only a directory given as `str` is accepted
	    if not (isinstance(video_path, str) and os.path.isdir(video_path)):
	        raise NotImplementedError(
	            "Only JPEG frames are supported at this moment. For video files, you may use "
	            "ffmpeg (https://ffmpeg.org/) to extract frames into a folder of JPEG files, such as \n"
	            "```\n"
	            "ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'\n"
	            "```\n"
	            "where `-q:v` generates high-quality JPEG frames and `-start_number 0` asks "
	            "ffmpeg to start the JPEG file from 00000.jpg."
	        )
	    jpg_folder = video_path

	    jpeg_suffixes = (".jpg", ".jpeg", ".JPG", ".JPEG")
	    # order frames by the integer encoded in the file name (without extension)
	    frame_names = sorted(
	        (
	            entry
	            for entry in os.listdir(jpg_folder)
	            if os.path.splitext(entry)[-1] in jpeg_suffixes
	        ),
	        key=lambda entry: int(os.path.splitext(entry)[0]),
	    )
	    num_frames = len(frame_names)
	    if num_frames == 0:
	        raise RuntimeError(f"no images found in {jpg_folder}")
	    img_paths = [os.path.join(jpg_folder, entry) for entry in frame_names]

	    # shape (3, 1, 1) so the statistics broadcast over height and width
	    mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
	    std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]

	    if async_loading_frames:
	        loader = AsyncVideoFrameLoader(
	            img_paths,
	            image_size,
	            offload_video_to_cpu,
	            mean,
	            std,
	            compute_device,
	        )
	        return loader, loader.video_height, loader.video_width

	    # eager path: fill one preallocated float32 tensor frame by frame
	    images = torch.zeros(num_frames, 3, image_size, image_size, dtype=torch.float32)
	    progress = tqdm(img_paths, desc="frame loading (JPEG)")
	    for frame_idx, frame_path in enumerate(progress):
	        frame, video_height, video_width = _load_img_as_tensor(frame_path, image_size)
	        images[frame_idx] = frame
	    if not offload_video_to_cpu:
	        images = images.to(compute_device)
	        mean = mean.to(compute_device)
	        std = std.to(compute_device)
	    # normalize by mean and std
	    images -= mean
	    images /= std
	    return images, video_height, video_width


	def load_video_frames_from_video_file(
	    video_path,
	    image_size,
	    offload_video_to_cpu,
	    img_mean=(0.5, 0.5, 0.5),
	    img_std=(0.5, 0.5, 0.5),
	    compute_device=torch.device("cuda"),
	):
	    """
	    Decode all frames of a video with `decord` into one normalized tensor.

	    Frames are decoded at image_size x image_size, scaled to [0, 1], moved to
	    `compute_device` unless `offload_video_to_cpu` is true, and then normalized
	    with `img_mean` / `img_std`.

	    Returns `(images, video_height, video_width)`, where the height and width are
	    read from the first frame of the video decoded without resizing.
	    """
	    import decord

	    # shape (3, 1, 1) so the statistics broadcast over height and width
	    mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
	    std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]

	    # ask decord to return torch tensors
	    decord.bridge.set_bridge("torch")
	    # a reader without target width/height reports the original frame size
	    video_height, video_width, _channels = decord.VideoReader(video_path).next().shape

	    # a second reader decodes every frame at the model resolution; each frame is
	    # rearranged from (H, W, C) to (C, H, W)
	    resized_reader = decord.VideoReader(
	        video_path, width=image_size, height=image_size
	    )
	    frames = [frame.permute(2, 0, 1) for frame in resized_reader]

	    images = torch.stack(frames, dim=0).float() / 255.0
	    if not offload_video_to_cpu:
	        images = images.to(compute_device)
	        mean = mean.to(compute_device)
	        std = std.to(compute_device)
	    # normalize by mean and std
	    images -= mean
	    images /= std
	    return images, video_height, video_width