SparseBev / loaders /pipelines /transforms.py
Alfred Liu
Code release
d19bd3e
import mmcv
import torch
import numpy as np
from PIL import Image
from numpy import random
from mmdet.datasets.builder import PIPELINES
@PIPELINES.register_module()
class PadMultiViewImage(object):
"""Pad the multi-view image.
There are two padding modes: (1) pad to a fixed size and (2) pad to the
minimum size that is divisible by some number.
Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
Args:
size (tuple, optional): Fixed padding size.
size_divisor (int, optional): The divisor of padded size.
pad_val (float, optional): Padding value, 0 by default.
"""
def __init__(self, size=None, size_divisor=None, pad_val=0):
self.size = size
self.size_divisor = size_divisor
self.pad_val = pad_val
# only one of size and size_divisor should be valid
assert size is not None or size_divisor is not None
assert size is None or size_divisor is None
def _pad_img(self, img):
if self.size_divisor is not None:
pad_h = int(np.ceil(img.shape[0] / self.size_divisor)) * self.size_divisor
pad_w = int(np.ceil(img.shape[1] / self.size_divisor)) * self.size_divisor
else:
pad_h, pad_w = self.size
pad_width = ((0, pad_h - img.shape[0]), (0, pad_w - img.shape[1]), (0, 0))
img = np.pad(img, pad_width, constant_values=self.pad_val)
return img
def _pad_imgs(self, results):
padded_img = [self._pad_img(img) for img in results['img']]
results['ori_shape'] = [img.shape for img in results['img']]
results['img'] = padded_img
results['img_shape'] = [img.shape for img in padded_img]
results['pad_shape'] = [img.shape for img in padded_img]
results['pad_fixed_size'] = self.size
results['pad_size_divisor'] = self.size_divisor
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
self._pad_imgs(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(size={self.size}, '
repr_str += f'size_divisor={self.size_divisor}, '
repr_str += f'pad_val={self.pad_val})'
return repr_str
@PIPELINES.register_module()
class NormalizeMultiviewImage(object):
"""Normalize the image.
Added key is "img_norm_cfg".
Args:
mean (sequence): Mean values of 3 channels.
std (sequence): Std values of 3 channels.
to_rgb (bool): Whether to convert the image from BGR to RGB,
default is true.
"""
def __init__(self, mean, std, to_rgb=True):
self.mean = np.array(mean, dtype=np.float32).reshape(-1)
self.std = 1 / np.array(std, dtype=np.float32).reshape(-1)
self.to_rgb = to_rgb
def __call__(self, results):
"""Call function to normalize images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Normalized results, 'img_norm_cfg' key is added into
result dict.
"""
normalized_imgs = []
for img in results['img']:
img = img.astype(np.float32)
if self.to_rgb:
img = img[..., ::-1]
img = img - self.mean
img = img * self.std
normalized_imgs.append(img)
results['img'] = normalized_imgs
results['img_norm_cfg'] = dict(
mean=self.mean,
std=self.std,
to_rgb=self.to_rgb
)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
return repr_str
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, results):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
imgs = results['img']
new_imgs = []
for img in imgs:
ori_dtype = img.dtype
img = img.astype(np.float32)
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
new_imgs.append(img.astype(ori_dtype))
results['img'] = new_imgs
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
repr_str += 'contrast_range='
repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
repr_str += 'saturation_range='
repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
repr_str += f'hue_delta={self.hue_delta})'
return repr_str
@PIPELINES.register_module()
class RandomTransformImage(object):
def __init__(self, ida_aug_conf=None, training=True):
self.ida_aug_conf = ida_aug_conf
self.training = training
def __call__(self, results):
resize, resize_dims, crop, flip, rotate = self.sample_augmentation()
if len(results['lidar2img']) == len(results['img']):
for i in range(len(results['img'])):
img = Image.fromarray(np.uint8(results['img'][i]))
# resize, resize_dims, crop, flip, rotate = self._sample_augmentation()
img, ida_mat = self.img_transform(
img,
resize=resize,
resize_dims=resize_dims,
crop=crop,
flip=flip,
rotate=rotate,
)
results['img'][i] = np.array(img).astype(np.uint8)
results['lidar2img'][i] = ida_mat @ results['lidar2img'][i]
elif len(results['img']) == 6:
for i in range(len(results['img'])):
img = Image.fromarray(np.uint8(results['img'][i]))
# resize, resize_dims, crop, flip, rotate = self._sample_augmentation()
img, ida_mat = self.img_transform(
img,
resize=resize,
resize_dims=resize_dims,
crop=crop,
flip=flip,
rotate=rotate,
)
results['img'][i] = np.array(img).astype(np.uint8)
for i in range(len(results['lidar2img'])):
results['lidar2img'][i] = ida_mat @ results['lidar2img'][i]
else:
raise ValueError()
results['ori_shape'] = [img.shape for img in results['img']]
results['img_shape'] = [img.shape for img in results['img']]
results['pad_shape'] = [img.shape for img in results['img']]
return results
def img_transform(self, img, resize, resize_dims, crop, flip, rotate):
"""
https://github.com/Megvii-BaseDetection/BEVStereo/blob/master/dataset/nusc_mv_det_dataset.py#L48
"""
def get_rot(h):
return torch.Tensor([
[np.cos(h), np.sin(h)],
[-np.sin(h), np.cos(h)],
])
ida_rot = torch.eye(2)
ida_tran = torch.zeros(2)
# adjust image
img = img.resize(resize_dims)
img = img.crop(crop)
if flip:
img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
img = img.rotate(rotate)
# post-homography transformation
ida_rot *= resize
ida_tran -= torch.Tensor(crop[:2])
if flip:
A = torch.Tensor([[-1, 0], [0, 1]])
b = torch.Tensor([crop[2] - crop[0], 0])
ida_rot = A.matmul(ida_rot)
ida_tran = A.matmul(ida_tran) + b
A = get_rot(rotate / 180 * np.pi)
b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
b = A.matmul(-b) + b
ida_rot = A.matmul(ida_rot)
ida_tran = A.matmul(ida_tran) + b
ida_mat = torch.eye(4)
ida_mat[:2, :2] = ida_rot
ida_mat[:2, 2] = ida_tran
return img, ida_mat.numpy()
def sample_augmentation(self):
"""
https://github.com/Megvii-BaseDetection/BEVStereo/blob/master/dataset/nusc_mv_det_dataset.py#L247
"""
H, W = self.ida_aug_conf['H'], self.ida_aug_conf['W']
fH, fW = self.ida_aug_conf['final_dim']
if self.training:
resize = np.random.uniform(*self.ida_aug_conf['resize_lim'])
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int((1 - np.random.uniform(*self.ida_aug_conf['bot_pct_lim'])) * newH) - fH
crop_w = int(np.random.uniform(0, max(0, newW - fW)))
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
if self.ida_aug_conf['rand_flip'] and np.random.choice([0, 1]):
flip = True
rotate = np.random.uniform(*self.ida_aug_conf['rot_lim'])
else:
resize = max(fH / H, fW / W)
resize_dims = (int(W * resize), int(H * resize))
newW, newH = resize_dims
crop_h = int((1 - np.mean(self.ida_aug_conf['bot_pct_lim'])) * newH) - fH
crop_w = int(max(0, newW - fW) / 2)
crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
flip = False
rotate = 0
return resize, resize_dims, crop, flip, rotate
@PIPELINES.register_module()
class GlobalRotScaleTransImage(object):
def __init__(self,
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]):
self.rot_range = rot_range
self.scale_ratio_range = scale_ratio_range
self.translation_std = translation_std
def __call__(self, results):
# random rotate
rot_angle = np.random.uniform(*self.rot_range)
self.rotate_z(results, rot_angle)
results["gt_bboxes_3d"].rotate(np.array(rot_angle))
# random scale
scale_ratio = np.random.uniform(*self.scale_ratio_range)
self.scale_xyz(results, scale_ratio)
results["gt_bboxes_3d"].scale(scale_ratio)
# TODO: support translation
return results
def rotate_z(self, results, rot_angle):
rot_cos = torch.cos(torch.tensor(rot_angle))
rot_sin = torch.sin(torch.tensor(rot_angle))
rot_mat = torch.tensor([
[rot_cos, -rot_sin, 0, 0],
[rot_sin, rot_cos, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1],
])
rot_mat_inv = torch.inverse(rot_mat)
for view in range(len(results['lidar2img'])):
results['lidar2img'][view] = (torch.tensor(results['lidar2img'][view]).float() @ rot_mat_inv).numpy()
def scale_xyz(self, results, scale_ratio):
scale_mat = torch.tensor([
[scale_ratio, 0, 0, 0],
[0, scale_ratio, 0, 0],
[0, 0, scale_ratio, 0],
[0, 0, 0, 1],
])
scale_mat_inv = torch.inverse(scale_mat)
for view in range(len(results['lidar2img'])):
results['lidar2img'][view] = (torch.tensor(results['lidar2img'][view]).float() @ scale_mat_inv).numpy()