| import torch |
| import math |
| import cv2 as cv |
| import torch.nn.functional as F |
| import numpy as np |
|
|
| '''modified from the original test implementation |
| Replace cv.BORDER_REPLICATE with cv.BORDER_CONSTANT |
| Add a variable called att_mask for computing attention and positional encoding later''' |
|
|
|
|
def sample_target(im, target_bb, search_area_factor, output_sz=None, mask=None):
    """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area.
    Parts of the crop that fall outside the image are filled with constant (zero) padding, and an
    attention mask marking those padded pixels is returned alongside the crop.

    args:
        im - cv image, indexed as im[row, col, channel]
        target_bb - target box [x, y, w, h]; a list/tuple, or any object exposing .tolist()
        search_area_factor - Ratio of crop size to target size
        output_sz - Side length to which the extracted crop is resized (always square). If None, no
                    resizing is done.
        mask - optional mask aligned with im, indexed as mask[row, col]. It is padded with F.pad and
               resized with F.interpolate, so presumably a 2-D float torch tensor - TODO confirm.

    returns:
        cv image - extracted (and, if output_sz is given, resized) crop
        float - the factor by which the crop has been resized to make the crop size equal output_sz
                (1.0 when output_sz is None)
        bool array - attention mask of the crop: True on padded pixels, False on pixels taken from im
        mask crop - cropped/padded/resized mask; only returned when mask is not None

    raises:
        Exception - if the computed crop size is smaller than one pixel
    """
    if isinstance(target_bb, (list, tuple)):
        x, y, w, h = target_bb
    else:
        x, y, w, h = target_bb.tolist()

    # Side of the square crop, in original-image pixels.
    crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)

    if crop_sz < 1:
        raise Exception('Too small bounding box.')

    # Crop corners, centered on the target box (may lie outside the image).
    x1 = round(x + 0.5 * w - crop_sz * 0.5)
    x2 = x1 + crop_sz

    y1 = round(y + 0.5 * h - crop_sz * 0.5)
    y2 = y1 + crop_sz

    # Amount of padding needed on each side. Note the "+ 1": right/bottom padding
    # already starts when x2/y2 reaches the image width/height.
    x1_pad = max(0, -x1)
    x2_pad = max(x2 - im.shape[1] + 1, 0)

    y1_pad = max(0, -y1)
    y2_pad = max(y2 - im.shape[0] + 1, 0)

    # Part of the crop that actually lies inside the image.
    im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]
    if mask is not None:
        mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad]

    # Fill the out-of-image border with a constant value (cv default: zeros).
    im_crop_padded = cv.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, x2_pad, cv.BORDER_CONSTANT)

    # Attention mask: 1 on padding, 0 on real image content.
    H, W, _ = im_crop_padded.shape
    att_mask = np.ones((H, W))
    # A slice end of "-0" would select nothing, so use None when there is no padding.
    end_y = -y2_pad if y2_pad != 0 else None
    end_x = -x2_pad if x2_pad != 0 else None
    att_mask[y1_pad:end_y, x1_pad:end_x] = 0
    if mask is not None:
        mask_crop_padded = F.pad(mask_crop, pad=(x1_pad, x2_pad, y1_pad, y2_pad), mode='constant', value=0)

    if output_sz is None:
        # No resizing requested. The return order matches the resized branch:
        # (crop, resize_factor, att_mask[, mask]).
        att_mask = att_mask.astype(np.bool_)
        if mask is None:
            return im_crop_padded, 1.0, att_mask
        return im_crop_padded, 1.0, att_mask, mask_crop_padded

    resize_factor = output_sz / crop_sz
    im_crop_padded = cv.resize(im_crop_padded, (output_sz, output_sz))
    # After interpolation any nonzero value is treated as padding.
    att_mask = cv.resize(att_mask, (output_sz, output_sz)).astype(np.bool_)
    if mask is None:
        return im_crop_padded, resize_factor, att_mask
    mask_crop_padded = \
        F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz), mode='bilinear', align_corners=False)[0, 0]
    return im_crop_padded, resize_factor, att_mask, mask_crop_padded
|
|
def resize_sample_target(im, target_bb, output_sz=None, mask=None):
    """ Resize the whole image to a square of side output_sz (no cropping, no padding).

    args:
        im - cv image, indexed as im[row, col, channel]
        target_bb - target box [x, y, w, h]. Not used by this function; kept so the signature
                    parallels sample_target.
        output_sz - Side length to which the image is resized (always square). If None, no resizing
                    is done.
        mask - optional mask aligned with im. It is resized with F.interpolate, so presumably a 2-D
               float torch tensor - TODO confirm.

    returns:
        cv image - resized image (im itself when output_sz is None)
        resize factor - tuple (output_sz / W, output_sz / H) when resized, 1.0 otherwise
        bool array - attention mask of the output image; all False since nothing is padded
        mask - resized mask (or the input mask when output_sz is None); only returned when mask
               is not None
    """
    H, W, _ = im.shape

    if output_sz is None:
        # No resizing requested. The return order matches the resized branch:
        # (image, resize_factor, att_mask[, mask]).
        att_mask = np.zeros((H, W), dtype=np.bool_)
        if mask is None:
            return im, 1.0, att_mask
        return im, 1.0, att_mask, mask

    # Width and height are scaled independently, so the factor is per-axis (x, y).
    resize_factor = (output_sz / W, output_sz / H)
    im_resized = cv.resize(im, (output_sz, output_sz))
    # Resizing an all-zero mask yields an all-zero mask; build it directly at the target size.
    att_mask = np.zeros((output_sz, output_sz), dtype=np.bool_)
    if mask is None:
        return im_resized, resize_factor, att_mask
    mask_resized = \
        F.interpolate(mask[None, None], (output_sz, output_sz), mode='bilinear', align_corners=False)[0, 0]
    return im_resized, resize_factor, att_mask, mask_resized
|
|
def transform_image_to_crop(box_in: torch.Tensor, box_extract: torch.Tensor, resize_factor: float,
                            crop_sz: torch.Tensor, normalize=False) -> torch.Tensor:
    """ Map a box from original-image co-ordinates into the co-ordinates of a resized crop whose
    centre corresponds to the centre of box_extract.

    args:
        box_in - the box [x, y, w, h] to be transformed
        box_extract - the box [x, y, w, h] about which the image crop has been extracted
        resize_factor - factor multiplying image-space lengths to obtain crop-space lengths
        crop_sz - size of the cropped image
        normalize - if True, divide the result by (crop_sz[0] - 1)

    returns:
        torch.Tensor - transformed box [x, y, w, h]
    """
    center_in = box_in[0:2] + 0.5 * box_in[2:4]
    center_ref = box_extract[0:2] + 0.5 * box_extract[2:4]

    # Width/height only scale; the centre is re-expressed relative to the crop centre.
    scaled_wh = box_in[2:4] * resize_factor
    scaled_center = (crop_sz - 1) / 2 + (center_in - center_ref) * resize_factor

    result = torch.cat((scaled_center - 0.5 * scaled_wh, scaled_wh))
    return result / (crop_sz[0] - 1) if normalize else result
|
|
def transform_image_to_resize(box_in: torch.Tensor, resize_factor: float,
                              crop_sz: torch.Tensor, normalize=False) -> torch.Tensor:
    """ Scale a box from original-image co-ordinates to the co-ordinates of a resized image.

    args:
        box_in - the box [x, y, w, h] to be transformed
        resize_factor - scale applied to both the position and the size of the box; converted with
                        torch.tensor, so a scalar or a per-axis pair both work
        crop_sz - size of the resized image
        normalize - if True, divide the result by (crop_sz[0] - 1)

    returns:
        torch.Tensor - transformed box [x, y, w, h]
    """
    scale = torch.tensor(resize_factor)

    box_out = torch.cat((box_in[:2] * scale, box_in[2:4] * scale))
    if not normalize:
        return box_out
    return box_out / (crop_sz[0] - 1)
|
|
def jittered_center_crop(frames, box_extract, box_gt, search_area_factor, output_sz, masks=None):
    """ Crop every frame around its box_extract entry with sample_target, resize the crops to
    output_sz, and express each box_gt entry in normalized crop co-ordinates.

    args:
        frames - list of frames
        box_extract - list of boxes of same length as frames. The crops are extracted around these boxes
        box_gt - list of boxes of same length as frames. The co-ordinates of these boxes are transformed
                 from image co-ordinates to the crop co-ordinates
        search_area_factor - The area of the extracted crop is search_area_factor^2 times box_extract area
        output_sz - The size to which the extracted crops are resized
        masks - optional list of masks, one per frame, cropped together with the frames

    returns:
        tuple - image crops
        list - box_gt locations in normalized crop co-ordinates
        tuple - attention masks of the crops
        tuple - mask crops, or None when masks is None
    """
    if masks is not None:
        samples = [sample_target(frame, ex_box, search_area_factor, output_sz, frame_mask)
                   for frame, ex_box, frame_mask in zip(frames, box_extract, masks)]
        frames_crop, resize_factors, att_mask, masks_crop = zip(*samples)
    else:
        samples = [sample_target(frame, ex_box, search_area_factor, output_sz)
                   for frame, ex_box in zip(frames, box_extract)]
        frames_crop, resize_factors, att_mask = zip(*samples)
        masks_crop = None

    crop_sz = torch.Tensor([output_sz, output_sz])

    # Note that the boxes are returned in normalized co-ordinates.
    box_crop = []
    for gt_box, ex_box, factor in zip(box_gt, box_extract, resize_factors):
        box_crop.append(transform_image_to_crop(gt_box, ex_box, factor, crop_sz, normalize=True))

    return frames_crop, box_crop, att_mask, masks_crop
|
|
def pstb_jittered_center_crop(frames, box_extract, box_gt, box_frame, search_area_factor, output_sz, masks=None):
    """ Crop every frame around its box_extract entry with sample_target, resize the crops to
    output_sz, and express the box_gt and box_frame boxes in normalized crop co-ordinates.

    args:
        frames - list of frames
        box_extract - list of boxes of same length as frames. The crops are extracted around these boxes
        box_gt - list of boxes of same length as frames. The co-ordinates of these boxes are transformed
                 from image co-ordinates to the co-ordinates of the matching crop
        box_frame - list of boxes that are all transformed into the co-ordinates of the LAST crop
                    (using box_extract[-1] and the last resize factor)
        search_area_factor - The area of the extracted crop is search_area_factor^2 times box_extract area
        output_sz - The size to which the extracted crops are resized
        masks - optional list of masks, one per frame, cropped together with the frames

    returns:
        tuple - image crops
        list - box_gt locations in normalized crop co-ordinates
        list - box_frame locations in normalized co-ordinates of the last crop
        tuple - attention masks of the crops
        tuple - mask crops, or None when masks is None
    """
    if masks is not None:
        samples = [sample_target(frame, ex_box, search_area_factor, output_sz, frame_mask)
                   for frame, ex_box, frame_mask in zip(frames, box_extract, masks)]
        frames_crop, resize_factors, att_mask, masks_crop = zip(*samples)
    else:
        samples = [sample_target(frame, ex_box, search_area_factor, output_sz)
                   for frame, ex_box in zip(frames, box_extract)]
        frames_crop, resize_factors, att_mask = zip(*samples)
        masks_crop = None

    crop_sz = torch.Tensor([output_sz, output_sz])

    # Note that the boxes are returned in normalized co-ordinates.
    box_crop = []
    for gt_box, ex_box, factor in zip(box_gt, box_extract, resize_factors):
        box_crop.append(transform_image_to_crop(gt_box, ex_box, factor, crop_sz, normalize=True))

    # Every box_frame entry is mapped relative to the last extraction box.
    box_frame_crop = []
    for frame_box in box_frame:
        box_frame_crop.append(
            transform_image_to_crop(frame_box, box_extract[-1], resize_factors[-1], crop_sz, normalize=True))

    return frames_crop, box_crop, box_frame_crop, att_mask, masks_crop
|
|
def resize(frames, box, output_sz, masks=None):
    """ Resize every frame to a square of side output_sz with resize_sample_target and express
    each box in the normalized co-ordinates of its resized frame.

    args:
        frames - list of frames
        box - list of boxes of same length as frames. The co-ordinates of these boxes are transformed
              from image co-ordinates to the resized-image co-ordinates
        output_sz - The size to which the frames are resized
        masks - optional list of masks, one per frame, resized together with the frames

    returns:
        tuple - resized frames
        list - box locations in normalized resized-image co-ordinates
        tuple - attention masks of the resized frames
        tuple - resized masks, or None when masks is None
    """
    if masks is not None:
        samples = [resize_sample_target(frame, frame_box, output_sz, frame_mask)
                   for frame, frame_box, frame_mask in zip(frames, box, masks)]
        frames_crop, resize_factors, att_mask, masks_crop = zip(*samples)
    else:
        samples = [resize_sample_target(frame, frame_box, output_sz)
                   for frame, frame_box in zip(frames, box)]
        frames_crop, resize_factors, att_mask = zip(*samples)
        masks_crop = None

    crop_sz = torch.Tensor([output_sz, output_sz])

    # Note that the boxes are returned in normalized co-ordinates.
    box_crop = []
    for frame_box, factor in zip(box, resize_factors):
        box_crop.append(transform_image_to_resize(frame_box, factor, crop_sz, normalize=True))

    return frames_crop, box_crop, att_mask, masks_crop
|
|
|
|
def transform_box_to_crop(box: torch.Tensor, crop_box: torch.Tensor, crop_sz: torch.Tensor, normalize=False) -> torch.Tensor:
    """ Map a box from original-image co-ordinates into the co-ordinates of a crop defined by crop_box
    and resized to crop_sz. The input box is left unmodified.

    args:
        box - the box [x, y, w, h] to be transformed
        crop_box - bounding box [x, y, w, h] defining the crop in the original image
        crop_sz - size of the cropped image
        normalize - if True, divide the result by crop_sz[0]

    returns:
        torch.Tensor - transformed co-ordinates of box
    """
    # Per-axis scale from crop_box size to the output crop size.
    scale = crop_sz / crop_box[2:]

    # Work on a copy; the slices below are views, so in-place ops update box_out.
    box_out = box.clone()
    box_out[:2].sub_(crop_box[:2]).mul_(scale)
    box_out[2:].mul_(scale)

    if not normalize:
        return box_out
    return box_out / crop_sz[0]
|
|
|
|