ATCTrack-VLM / lib /test /tracker /atctrack_utils.py
SunXiang2025's picture
Upload ATCTrack-VLM code and selected checkpoints
25986db verified
import torch
import math
import numpy as np
import cv2 as cv
import torch.nn.functional as F
from lib.utils.misc import NestedTensor
class Preprocessor(object):
def __init__(self):
self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)).cuda()
self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)).cuda()
self.mm_mean = torch.tensor([0.485, 0.456, 0.406, 0.485, 0.456, 0.406]).view((1, 6, 1, 1)).cuda()
self.mm_std = torch.tensor([0.229, 0.224, 0.225, 0.229, 0.224, 0.225]).view((1, 6, 1, 1)).cuda()
def process(self, img_arr: np.ndarray):
if img_arr.shape[-1] == 6:
mean = self.mm_mean
std = self.mm_std
else:
mean = self.mean
std = self.std
# Deal with the image patch
img_tensor = torch.tensor(img_arr).cuda().float().permute((2,0,1)).unsqueeze(dim=0)
# img_tensor = torch.tensor(img_arr).float().permute((2,0,1)).unsqueeze(dim=0)
img_tensor_norm = ((img_tensor / 255.0) - mean) / std # (1,3,H,W)
return img_tensor_norm
def sample_target(im, target_bb, search_area_factor, output_sz=None):
""" Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area
args:
im - cv image
target_bb - target box [x, y, w, h]
search_area_factor - Ratio of crop size to target size
output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done.
returns:
cv image - extracted crop
float - the factor by which the crop has been resized to make the crop size equal output_size
"""
if not isinstance(target_bb, list):
x, y, w, h = target_bb.tolist()
else:
x, y, w, h = target_bb
# Crop image
crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)
if crop_sz < 1:
raise Exception('Too small bounding box.')
x1 = round(x + 0.5 * w - crop_sz * 0.5)
x2 = x1 + crop_sz
y1 = round(y + 0.5 * h - crop_sz * 0.5)
y2 = y1 + crop_sz
x1_pad = max(0, -x1)
x2_pad = max(x2 - im.shape[1] + 1, 0)
y1_pad = max(0, -y1)
y2_pad = max(y2 - im.shape[0] + 1, 0)
# Crop target
im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]
# Pad
im_crop_padded = cv.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, x2_pad, cv.BORDER_CONSTANT)
# deal with attention mask
H, W, _ = im_crop_padded.shape
if output_sz is not None:
resize_factor = output_sz / crop_sz
im_crop_padded = cv.resize(im_crop_padded, (output_sz, output_sz))
return im_crop_padded, resize_factor
else:
return im_crop_padded, 1.0
def resize_sample_target(im, output_sz=None):
""" Resize the image
args:
im - cv image
output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done.
returns:
cv image - extracted crop
float - the factor by which the crop has been resized to make the crop size equal output_size
"""
# Resize image
# deal with attention mask
H, W, _ = im.shape
if output_sz is not None:
resize_factor = (output_sz / W, output_sz / H) # (w,h) rather than (h,w)
im_resized = cv.resize(im, (output_sz, output_sz))
return im_resized, resize_factor
else:
return im, 1.0
def transform_image_to_crop(box_in: torch.Tensor, box_extract: torch.Tensor, resize_factor: float,
crop_sz: torch.Tensor, normalize=False) -> torch.Tensor:
""" Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image
args:
box_in - the box for which the co-ordinates are to be transformed
box_extract - the box about which the image crop has been extracted.
resize_factor - the ratio between the original image scale and the scale of the image crop
crop_sz - size of the cropped image
returns:
torch.Tensor - transformed co-ordinates of box_in
"""
box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4]
box_in_center = box_in[0:2] + 0.5 * box_in[2:4]
box_out_center = (crop_sz - 1) / 2 + (box_in_center - box_extract_center) * resize_factor
box_out_wh = box_in[2:4] * resize_factor
box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh))
if normalize:
return box_out / (crop_sz[0]-1)
else:
return box_out