# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from LW-DETR (https://github.com/Atten4Vis/LW-DETR)
# Copyright (c) 2024 Baidu. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
# Copyright (c) 2021 Microsoft. All Rights Reserved.
# ------------------------------------------------------------------------
# Copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------
| """ | |
| Transforms and data augmentation for both image + bbox. | |
| """ | |
| import random | |
| import PIL | |
| import numpy as np | |
| try: | |
| from collections.abc import Sequence | |
| except Exception: | |
| from collections import Sequence | |
| from numbers import Number | |
| import torch | |
| import torchvision.transforms as T | |
| # from detectron2.data import transforms as DT | |
| import torchvision.transforms.functional as F | |
| from rfdetr.util.box_ops import box_xyxy_to_cxcywh | |
| from rfdetr.util.misc import interpolate | |
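
# All transforms below operate on an (image, target) pair rather than on the
# image alone: `target` is a dict of annotation tensors ("boxes" in absolute
# xyxy format, "labels", "area", "iscrowd", and optionally "masks") that is
# kept consistent with the image as it is cropped, flipped, resized, or padded.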


def crop(image, target, region):
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area", "iscrowd"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    # remove elements whose boxes or masks have zero area
    if "boxes" in target or "masks" in target:
        # favor box-based selection when deciding which elements to keep;
        # this is compatible with the previous implementation
        if "boxes" in target:
            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for field in fields:
            target[field] = target[field][keep]

    return cropped_image, target
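
# Illustration of the crop arithmetic above (added for clarity): region follows
# torchvision's F.crop convention (i, j, h, w) = (top, left, height, width).
# With region (10, 20, 100, 100), a box [15, 5, 60, 50] in xyxy is shifted by
# [j, i, j, i] = [20, 10, 20, 10] to [-5, -5, 40, 40], then clamped to the
# crop window to give [0, 0, 40, 40] with area 1600.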


def hflip(image, target):
    flipped_image = F.hflip(image)

    w, h = image.size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target
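
# Worked example for the box flip above: with image width w = 100, a box
# [10, 20, 30, 40] (xyxy) is reindexed by [2, 1, 0, 3] to [30, 20, 10, 40],
# x-negated to [-30, 20, -10, 40], then shifted by [w, 0, w, 0] to
# [70, 20, 90, 40]; the x-range [10, 30] is mirrored to [70, 90] while
# x0 <= x1 is preserved.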


def resize(image, target, size, max_size=None):
    # size can be min_size (scalar) or (w, h) tuple

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratios = tuple(
        float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor(
            [ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        target['masks'] = interpolate(
            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5

    return rescaled_image, target
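
# Worked example for get_size_with_aspect_ratio above: for a 640x480 image
# with size=800 and max_size=1333, the implied long side is
# 800 * 640 / 480 = 1066.7 <= 1333, so the short side stays 800 and the
# result is (oh, ow) = (800, 1066). With max_size=1000 instead, size is first
# reduced to round(1000 * 480 / 640) = 750, giving (750, 1000).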


def pad(image, target, padding):
    # assumes that we only pad on the right and bottom edges
    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(
            target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: PIL.Image.Image, target: dict):
        w = random.randint(self.min_size, min(img.width, self.max_size))
        h = random.randint(self.min_size, min(img.height, self.max_size))
        region = T.RandomCrop.get_params(img, [h, w])
        return crop(img, target, region)
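
# Note on RandomSizeCrop: width and height are sampled independently from
# [min_size, min(image side, max_size)]. For example, RandomSizeCrop(384, 600)
# on a 640x480 image draws w from [384, 600] and h from [384, 480] before
# delegating to crop().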


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return hflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class SquareResize(object):
    def __init__(self, sizes):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        rescaled_img = F.resize(img, (size, size))
        w, h = rescaled_img.size

        if target is None:
            return rescaled_img, None

        ratios = tuple(
            float(s) / float(s_orig) for s, s_orig in zip(rescaled_img.size, img.size))
        ratio_width, ratio_height = ratios

        target = target.copy()
        if "boxes" in target:
            boxes = target["boxes"]
            scaled_boxes = boxes * torch.as_tensor(
                [ratio_width, ratio_height, ratio_width, ratio_height])
            target["boxes"] = scaled_boxes

        if "area" in target:
            area = target["area"]
            scaled_area = area * (ratio_width * ratio_height)
            target["area"] = scaled_area

        target["size"] = torch.tensor([h, w])

        return rescaled_img, target
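
# Unlike resize()/RandomResize, SquareResize maps both sides to the same
# length and therefore does not preserve aspect ratio: with sizes=[640], a
# 1280x720 image becomes 640x640 and boxes are scaled by 640/1280 = 0.5 in x
# and 640/720 in y.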


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class PILtoNdArray(object):
    def __call__(self, img, target):
        return np.asarray(img), target


class NdArraytoPIL(object):
    def __call__(self, img, target):
        return F.to_pil_image(img.astype('uint8')), target


class Pad(object):
    def __init__(self,
                 size=None,
                 size_divisor=32,
                 pad_mode=0,
                 offsets=None,
                 fill_value=(127.5, 127.5, 127.5)):
        """
        Pad the image to a specified size or to a multiple of size_divisor.
        Args:
            size (int, Sequence): target image size; if None, pad to a multiple of
                size_divisor. Default: None
            size_divisor (int): size divisor, default 32
            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2].
                If -1, use the specified offsets; if 0, only pad to the right and bottom;
                if 1, pad around the center; if 2, only pad the left and top
            offsets (list): [offset_x, offset_y], the offsets used while padding;
                only supported for pad_mode=-1
            fill_value (tuple): RGB value of the padded area, default (127.5, 127.5, 127.5)
        """
        if size is not None and not isinstance(size, (int, Sequence)):
            raise TypeError(
                "Type of size is invalid. Must be int or Sequence, now is {}".format(
                    type(size)))
        if isinstance(size, int):
            size = [size, size]
        assert pad_mode in [
            -1, 0, 1, 2
        ], 'currently only supports four modes [-1, 0, 1, 2]'
        if pad_mode == -1:
            assert offsets, 'if pad_mode is -1, offsets should not be None'

        self.size = size
        self.size_divisor = size_divisor
        self.pad_mode = pad_mode
        self.fill_value = fill_value
        self.offsets = offsets

    def apply_bbox(self, bbox, offsets):
        return bbox + np.array(offsets * 2, dtype=np.float32)

    def apply_image(self, image, offsets, im_size, size):
        x, y = offsets
        im_h, im_w = im_size
        h, w = size
        canvas = np.ones((h, w, 3), dtype=np.float32)
        canvas *= np.array(self.fill_value, dtype=np.float32)
        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
        return canvas

    def __call__(self, im, target):
        im_h, im_w = im.shape[:2]
        if self.size:
            h, w = self.size
            assert (
                im_h <= h and im_w <= w
            ), 'target size (h, w) must be no smaller than image size (im_h, im_w)'
        else:
            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)

        if h == im_h and w == im_w:
            return im.astype(np.float32), target

        if self.pad_mode == -1:
            offset_x, offset_y = self.offsets
        elif self.pad_mode == 0:
            offset_y, offset_x = 0, 0
        elif self.pad_mode == 1:
            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
        else:
            offset_y, offset_x = h - im_h, w - im_w

        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]

        im = self.apply_image(im, offsets, im_size, size)

        if self.pad_mode == 0:
            target["size"] = torch.tensor([h, w])
            return im, target

        if 'boxes' in target and len(target['boxes']) > 0:
            boxes = np.asarray(target["boxes"])
            target["boxes"] = torch.from_numpy(self.apply_bbox(boxes, offsets))

        target["size"] = torch.tensor([h, w])

        return im, target
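
# Worked example for Pad: with size=None and size_divisor=32, an image of
# shape (500, 400) is padded to (512, 416), since ceil(500/32) * 32 = 512 and
# ceil(400/32) * 32 = 416. With pad_mode=1 the image is centered:
# (offset_x, offset_y) = ((416 - 400) // 2, (512 - 500) // 2) = (8, 6), and
# boxes are shifted by [8, 6, 8, 6] in apply_bbox.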


class RandomExpand(object):
    """Randomly expand the canvas.
    Args:
        ratio (float): maximum expansion ratio.
        prob (float): probability of keeping the image unexpanded; the
            expansion is applied with probability (1 - prob).
        fill_value (list): color value used to fill the canvas, in RGB order.
    """

    def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
        assert ratio > 1.01, "expand ratio must be larger than 1.01"
        self.ratio = ratio
        self.prob = prob
        assert isinstance(fill_value, (Number, Sequence)), \
            "fill value must be either float or sequence"
        if isinstance(fill_value, Number):
            fill_value = (fill_value, ) * 3
        if not isinstance(fill_value, tuple):
            fill_value = tuple(fill_value)
        self.fill_value = fill_value

    def __call__(self, img, target):
        if np.random.uniform(0., 1.) < self.prob:
            return img, target

        height, width = img.shape[:2]
        ratio = np.random.uniform(1., self.ratio)
        h = int(height * ratio)
        w = int(width * ratio)
        if h <= height or w <= width:
            return img, target

        y = np.random.randint(0, h - height)
        x = np.random.randint(0, w - width)
        offsets, size = [x, y], [h, w]

        pad = Pad(size,
                  pad_mode=-1,
                  offsets=offsets,
                  fill_value=self.fill_value)

        return pad(img, target)
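
# Sketch of RandomExpand's effect: with ratio=4, an image of shape (300, 400)
# may be placed at a random (x, y) offset inside a canvas of up to
# (1200, 1600), with the remainder filled by fill_value; the padding itself
# is delegated to Pad with pad_mode=-1.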


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, img, target):
        return F.to_tensor(img), target


class RandomErasing(object):
    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target
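
# Worked example for the box conversion in Normalize: on an image of height
# 100 and width 200, a box [20, 10, 60, 30] (absolute xyxy) becomes cxcywh
# [40, 20, 40, 20], and division by [w, h, w, h] = [200, 100, 200, 100]
# yields the normalized [0.2, 0.2, 0.2, 0.2] that DETR-style targets use.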


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
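
# Usage sketch (illustrative only; the recipe and scale values below are
# assumptions, not part of this module): a DETR-style training pipeline
# composes these transforms and ends with ToTensor + Normalize, which also
# converts boxes to normalized cxcywh.
#
#   scales = [480, 512, 544, 576, 608]
#   train_transforms = Compose([
#       RandomHorizontalFlip(),
#       RandomSelect(
#           RandomResize(scales, max_size=900),
#           Compose([
#               RandomResize([400, 500, 600]),
#               RandomSizeCrop(384, 600),
#               RandomResize(scales, max_size=900),
#           ]),
#       ),
#       ToTensor(),
#       Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
#   ])
#
#   image, target = train_transforms(pil_image, target)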