|
|
import random |
|
|
import numpy as np |
|
|
import torch |
|
|
import torchvision.transforms.functional as F |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
|
|
|
class Augmentation(object):
    """Random training-time augmentation for a video clip and its boxes.

    Applies, in order: a shared random jitter-crop, a square resize, a random
    horizontal flip, and a random HSV color distortion. Box targets are
    remapped into the cropped/flipped frame and returned normalized to [0, 1).

    Args:
        img_size: output side length; every frame is resized to (img_size, img_size).
        jitter: max crop offset per border, as a fraction of width/height.
        hue: max absolute hue shift, as a fraction of the 255-valued H channel.
        saturation: upper bound of the random saturation scale (see rand_scale).
        exposure: upper bound of the random value/exposure scale (see rand_scale).
    """

    def __init__(self, img_size=224, jitter=0.2, hue=0.1, saturation=1.5, exposure=1.5):
        self.img_size = img_size
        self.jitter = jitter
        self.hue = hue
        self.saturation = saturation
        self.exposure = exposure

    def rand_scale(self, s):
        """Return a random factor in [1, s], inverted to [1/s, 1] half the time.

        This gives a multiplicative jitter that is symmetric in log-space
        around 1 (the Darknet-style color scale).
        """
        scale = random.uniform(1, s)

        # Coin flip: scale up or scale down by the same factor.
        if random.randint(0, 1):
            return scale

        return 1./scale

    def random_distort_image(self, video_clip):
        """Apply one random HSV distortion consistently to every frame.

        A single (dhue, dsat, dexp) triple is drawn per clip so that all
        frames receive identical color jitter.

        Args:
            video_clip: list of PIL.Image frames (RGB).

        Returns:
            New list of distorted PIL.Image frames (RGB).
        """
        dhue = random.uniform(-self.hue, self.hue)
        dsat = self.rand_scale(self.saturation)
        dexp = self.rand_scale(self.exposure)

        video_clip_ = []
        for image in video_clip:
            # Work in HSV so hue/saturation/value can be adjusted per channel.
            image = image.convert('HSV')
            cs = list(image.split())

            # Scale saturation (cs[1]) and value/exposure (cs[2]).
            # NOTE(review): point() with a function builds a 0..255 LUT;
            # results > 255 are presumably clamped by PIL — confirm.
            cs[1] = cs[1].point(lambda i: i * dsat)
            cs[2] = cs[2].point(lambda i: i * dexp)

            def change_hue(x):
                # Shift hue and wrap around the circular 0..255 range.
                # A single wrap step suffices while self.hue <= 1
                # (|dhue * 255| <= 255).
                x += dhue * 255
                if x > 255:
                    x -= 255
                if x < 0:
                    x += 255
                return x

            cs[0] = cs[0].point(change_hue)
            image = Image.merge(image.mode, tuple(cs))

            # Back to RGB for the rest of the pipeline.
            image = image.convert('RGB')

            video_clip_.append(image)

        return video_clip_

    def random_crop(self, video_clip, width, height):
        """Randomly jitter-crop all frames with one shared crop window.

        Border offsets may be negative, i.e. the crop box can extend past the
        image border; PIL pads such out-of-bounds regions with black.

        Args:
            video_clip: list of PIL.Image frames.
            width, height: original frame size in pixels.

        Returns:
            (cropped_clip, dx, dy, sx, sy): the cropped frames, plus the
            crop-to-original size ratios (sx, sy) and the crop origin
            (dx, dy) normalized to the cropped frame — the exact parameters
            consumed by apply_bbox.
        """
        # Maximum jitter in pixels per border.
        dw =int(width * self.jitter)
        dh =int(height * self.jitter)

        # Independent random offsets for each of the four borders.
        pleft = random.randint(-dw, dw)
        pright = random.randint(-dw, dw)
        ptop = random.randint(-dh, dh)
        pbot = random.randint(-dh, dh)

        # Resulting crop-window size.
        swidth = width - pleft - pright
        sheight = height - ptop - pbot

        # Crop size relative to the original size.
        sx = float(swidth) / width
        sy = float(sheight) / height

        # Crop origin normalized to the *cropped* frame's coordinates.
        dx = (float(pleft) / width)/sx
        dy = (float(ptop) / height)/sy

        # NOTE(review): the "- 1" makes the actual crop (swidth-1, sheight-1)
        # pixels while sx/sy are computed from swidth/sheight — a one-pixel
        # inconsistency kept for parity with the original Darknet recipe.
        cropped_clip = [img.crop((pleft, ptop, pleft + swidth - 1, ptop + sheight - 1)) for img in video_clip]

        return cropped_clip, dx, dy, sx, sy

    def apply_bbox(self, target, ow, oh, dx, dy, sx, sy):
        """Remap boxes into the cropped frame and drop degenerate ones.

        Args:
            target: numpy array of shape (N, D) with D >= 4; columns 0..3 are
                [x1, y1, x2, y2] in pixels of the original frame. Mutated
                in place.
            ow, oh: original frame width/height in pixels.
            dx, dy, sx, sy: crop parameters returned by random_crop.

        Returns:
            numpy array of shape (M, D) with coordinates normalized and
            clamped to [0, 0.999]; boxes smaller than one original pixel
            after remapping are removed (M <= N).
        """
        # Invert the scales: pixel coords -> cropped-frame fractions.
        sx, sy = 1./sx, 1./sy

        # Normalize, shift into the crop window, clamp into [0, 0.999].
        target[..., 0] = np.minimum(0.999, np.maximum(0, target[..., 0] / ow * sx - dx))
        target[..., 1] = np.minimum(0.999, np.maximum(0, target[..., 1] / oh * sy - dy))
        target[..., 2] = np.minimum(0.999, np.maximum(0, target[..., 2] / ow * sx - dx))
        target[..., 3] = np.minimum(0.999, np.maximum(0, target[..., 3] / oh * sy - dy))

        # Keep only boxes that remain at least ~1 pixel wide and tall.
        refine_target = []
        for i in range(target.shape[0]):
            tgt = target[i]
            bw = (tgt[2] - tgt[0]) * ow
            bh = (tgt[3] - tgt[1]) * oh

            if bw < 1. or bh < 1.:
                continue

            refine_target.append(tgt)

        # reshape(-1, D) keeps a consistent 2-D shape even when empty.
        refine_target = np.array(refine_target).reshape(-1, target.shape[-1])

        return refine_target

    def to_tensor(self, video_clip):
        """Convert PIL frames to (C, H, W) float tensors scaled to [0, 255]."""
        # F.to_tensor maps pixels to [0, 1]; rescale to raw pixel magnitudes.
        return [F.to_tensor(image) * 255. for image in video_clip]

    def __call__(self, video_clip, target):
        """Augment one clip and its key-frame targets.

        Args:
            video_clip: list of PIL.Image frames, all the same size.
            target: numpy array (N, >=4) of pixel-space [x1, y1, x2, y2, ...]
                boxes, or None.

        Returns:
            (frames, boxes): list of (C, H, W) float tensors in [0, 255],
            and a float tensor of normalized boxes (empty when target is None
            or no box survives the crop).
        """
        # Original clip size (all frames assumed to share it).
        oh = video_clip[0].height
        ow = video_clip[0].width

        # 1) Shared random jitter-crop across the whole clip.
        video_clip, dx, dy, sx, sy = self.random_crop(video_clip, ow, oh)

        # 2) Resize every frame to the square network input size.
        video_clip = [img.resize([self.img_size, self.img_size]) for img in video_clip]

        # 3) Random horizontal flip, applied to all frames together.
        flip = random.randint(0, 1)
        if flip:
            video_clip = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in video_clip]

        # 4) Random color distortion (same jitter for the whole clip).
        video_clip = self.random_distort_image(video_clip)

        # Remap boxes to the cropped frame; mirror x-coords when flipped.
        # (Normalized coords are unaffected by the square resize.)
        if target is not None:
            target = self.apply_bbox(target, ow, oh, dx, dy, sx, sy)
            if flip:
                target[..., [0, 2]] = 1.0 - target[..., [2, 0]]
        else:
            target = np.array([])

        # Convert to tensors for the model.
        video_clip = self.to_tensor(video_clip)
        target = torch.as_tensor(target).float()

        return video_clip, target
|
|
|
|
|
|
|
|
|
|
|
class BaseTransform(object):
    """Deterministic (test-time) transform for a video clip.

    Resizes every frame to a square (img_size x img_size) and converts the
    frames to float tensors in [0, 255]. Unlike `Augmentation`, no random
    crop, flip, or color distortion is applied, and box targets are not
    transformed.
    """

    def __init__(self, img_size=224):
        # Target square side length for every output frame.
        self.img_size = img_size

    def to_tensor(self, video_clip):
        """Convert PIL frames to (C, H, W) float tensors scaled to [0, 255]."""
        # F.to_tensor maps pixels to [0, 1]; rescale to raw pixel magnitudes
        # for consistency with `Augmentation.to_tensor`.
        return [F.to_tensor(image) * 255. for image in video_clip]

    def __call__(self, video_clip, target=None, normalize=True):
        """Resize and tensorize one clip.

        Args:
            video_clip: list of PIL.Image frames.
            target: accepted for interface compatibility with `Augmentation`;
                currently ignored (boxes are left to the caller at inference).
            normalize: accepted for interface compatibility; currently unused.

        Returns:
            list of (C, H, W) float tensors in [0, 255].
        """
        # Resize every frame to a square; aspect ratio is not preserved.
        video_clip = [img.resize([self.img_size, self.img_size]) for img in video_clip]

        return self.to_tensor(video_clip)
|
|
|
|
|
|