# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Adapted from https://github.com/facebookresearch/sam2/blob/main/sam2/benchmark.py

import os
import time

import numpy as np
import torch
from tqdm import tqdm

from efficient_track_anything.build_efficienttam import (
    build_efficienttam_video_predictor,
)

# Select the compute device; bfloat16 autocast and TF32 are enabled on CUDA,
# matching the SAM 2 benchmark this script is adapted from.
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    raise RuntimeError("No CUDA or MPS device found")

# Config and checkpoint
# model_cfg = "configs/efficienttam/efficienttam_s.yaml"
# model_cfg = "configs/efficienttam/efficienttam_s_1.yaml"
# model_cfg = "configs/efficienttam/efficienttam_s_2.yaml"
model_cfg = "configs/efficienttam/efficienttam_s_512x512.yaml"
# model_cfg = "configs/efficienttam/efficienttam_ti.yaml"
# model_cfg = "configs/efficienttam/efficienttam_ti_1.yaml"
# model_cfg = "configs/efficienttam/efficienttam_ti_2.yaml"
# model_cfg = "configs/efficienttam/efficienttam_ti_512x512.yaml"
# With no checkpoint, the model is built with randomly initialized weights,
# which is sufficient for measuring FPS; set a .pt path to load trained weights.
efficienttam_checkpoint = None

# Build video predictor with vos_optimized=True setting
predictor = build_efficienttam_video_predictor(
    model_cfg, efficienttam_checkpoint, device=device, vos_optimized=True
)
model_total_params = sum(p.numel() for p in predictor.parameters())
print("Model size (number of parameters): ", model_total_params)

# Initialize with video
video_dir = "notebooks/videos/bedroom"

# scan all the JPEG frame names in this directory
frame_names = [
    p
    for p in os.listdir(video_dir)
    if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
]
frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))

inference_state = predictor.init_state(video_path=video_dir)

# Number of runs, warmup etc.
warm_up, runs = 5, 25
verbose = True
num_frames = len(frame_names)
total, count = 0, 0
if device.type == "cuda":
    torch.cuda.empty_cache()

# We will select an object with a click.
# See video_predictor_example.ipynb for more detailed explanation
ann_frame_idx, ann_obj_id = 0, 1
# Add a positive click at (x, y) = (210, 350)
# For labels, `1` means positive click
points = np.array([[210, 350]], dtype=np.float32)
labels = np.array([1], np.int32)

_, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
    inference_state=inference_state,
    frame_idx=ann_frame_idx,
    obj_id=ann_obj_id,
    points=points,
    labels=labels,
)

# Warmup and then average FPS over several runs
with torch.inference_mode():
    for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"):
        start = time.time()
        # Start tracking: propagate the mask through every frame of the video
        for (
            out_frame_idx,
            out_obj_ids,
            out_mask_logits,
        ) in predictor.propagate_in_video(inference_state):
            pass

        end = time.time()
        total += end - start
        count += 1
        if i == warm_up - 1:
            # Report FPS over the warmup runs, then reset the counters so the
            # final number covers only the post-warmup runs.
            print("Warmup FPS: ", count * num_frames / total)
            total = 0
            count = 0

print("FPS: ", count * num_frames / total)
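
# ---------------------------------------------------------------------------
# Optional extension (a sketch, not part of the adapted benchmark): report
# per-run latency statistics in addition to the aggregate FPS above. The
# `run_times` list is hypothetical; to use this, append `end - start` to it
# inside the benchmark loop for each post-warmup run.
#
# run_times = []  # would be filled inside the loop above, after warmup
# if run_times:
#     per_run_fps = num_frames / np.array(run_times)
#     print("Per-run FPS: mean {:.2f}, std {:.2f}, min {:.2f}".format(
#         per_run_fps.mean(), per_run_fps.std(), per_run_fps.min()))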