Spaces:
Running
on
Zero
Running
on
Zero
| # Copyright (c) OpenMMLab. All rights reserved. | |
| import copy | |
| from abc import ABCMeta | |
| from collections import defaultdict | |
| from typing import Optional, Sequence, Tuple | |
| import mmcv | |
| import numpy as np | |
| from mmcv.transforms import BaseTransform | |
| from mmengine.dataset.base_dataset import Compose | |
| from numpy import random | |
| from mmpose.registry import TRANSFORMS | |
| from mmpose.structures import (bbox_clip_border, flip_bbox, flip_keypoints, | |
| keypoint_clip_border) | |
class MixImageTransform(BaseTransform, metaclass=ABCMeta):
    """Common scaffolding for augmentations that blend several images.

    The base class draws auxiliary samples from the dataset attached to the
    input ``results`` and hands them to ``self.apply_mix``. It reads
    ``self.num_aux_image`` (number of auxiliary samples to draw) and calls
    ``self.apply_mix(results)``; neither is defined here, so concrete
    subclasses have to supply both.

    Args:
        pre_transform (Optional[Sequence[str]]): A sequence of transforms
            wrapped in a ``Compose`` and applied to every auxiliary sample
            before mixing. Defaults to None.
        prob (float): Probability of applying the mix transformation.
            Defaults to 1.0.
    """

    def __init__(self,
                 pre_transform: Optional[Sequence[str]] = None,
                 prob: float = 1.0):
        self.prob = prob
        self.pre_transform = (
            None if pre_transform is None else Compose(pre_transform))

    def transform(self, results: dict) -> dict:
        """Randomly apply the mix-style augmentation to one sample.

        Args:
            results (dict): A dictionary containing input data. Its
                ``'dataset'`` entry (if any) is used to draw the auxiliary
                samples.

        Returns:
            dict: ``results`` untouched when the random draw skips the
            augmentation, otherwise the output of ``apply_mix`` with the
            ``'dataset'`` entry put back.
        """
        # Guard clause: skip the augmentation unless the draw falls
        # below ``prob``.
        if not random.uniform(0, 1) < self.prob:
            return results

        # The dataset handle is detached while mixing and restored afterwards.
        dataset = results.pop('dataset', None)
        results['mixed_data_list'] = self._get_mixed_data_list(dataset)
        results = self.apply_mix(results)
        results.pop('mixed_data_list', None)
        results['dataset'] = dataset
        return results

    def _get_mixed_data_list(self, dataset):
        """Draw ``self.num_aux_image`` random samples from ``dataset``.

        Args:
            dataset: The dataset from which to sample the auxiliary data. It
                must support ``len()`` and ``get_data_info(index)``.

        Returns:
            List[dict]: Deep copies of the sampled data infos, run through
            ``self.pre_transform`` when one is configured.
        """
        # Draw every index first so the random stream is consumed before any
        # sample is loaded or transformed.
        sampled_indexes = []
        for _ in range(self.num_aux_image):
            sampled_indexes.append(random.randint(0, len(dataset)))

        raw_samples = [
            copy.deepcopy(dataset.get_data_info(idx))
            for idx in sampled_indexes
        ]
        if self.pre_transform is None:
            return raw_samples

        prepared_samples = []
        for sample in raw_samples:
            # The pre-transform pipeline gets access to the dataset; the
            # handle is stripped again from its output.
            sample['dataset'] = dataset
            transformed = self.pre_transform(sample)
            transformed.pop('dataset')
            prepared_samples.append(transformed)
        return prepared_samples
class Mosaic(MixImageTransform):
    """Mosaic augmentation.

    This transformation takes four input images and combines them into a
    single output image using the mosaic technique. The resulting image is
    composed of parts from each of the four sub-images. The mosaic transform
    steps are as follows:

    1. Choose the mosaic center as the intersection of the four images.
    2. Use the current sample as the top-left image and randomly sample
       three more images from the dataset.
    3. If an image is larger than the mosaic patch, it will be cropped.

    .. code:: text

                        mosaic transform
                           center_x
                +------------------------------+
                |       pad        |           |
                |      +-----------+    pad    |
                |      |           |           |
                |      |  image1   +-----------+
                |      |           |           |
                |      |           |   image2  |
     center_y   |----+-+-----------+-----------+
                |    |   cropped   |           |
                |pad |   image3    |   image4  |
                |    |             |           |
                +----|-------------+-----------+
                     |             |
                     +-------------+

    Required Keys:

        - img
        - bbox (optional)
        - bbox_score (optional)
        - category_id (optional)
        - keypoints (optional)
        - keypoints_visible (optional)
        - area (optional)

    Modified Keys:

        - img
        - bbox (optional)
        - bbox_score (optional)
        - category_id (optional)
        - keypoints (optional)
        - keypoints_visible (optional)
        - area (optional)

    Args:
        img_scale (Sequence[int]): Image size after mosaic pipeline of single
            image. The shape order should be (width, height). The output
            canvas is twice this size along each axis.
            Defaults to (640, 640).
        center_range (Sequence[float]): Center ratio range of mosaic
            output. Defaults to (0.5, 1.5).
        pad_val (float): Pad value. Defaults to 114.0.
        pre_transform (Optional[Sequence[dict]]): A sequence of transforms
            to be applied to the auxiliary images before the mosaic is built.
            Defaults to None.
        prob (float): Probability of applying the mosaic transformation.
            Defaults to 1.0.
    """

    # Three auxiliary images are drawn in addition to the current sample.
    num_aux_image = 3

    def __init__(
        self,
        img_scale: Tuple[int, int] = (640, 640),
        center_range: Tuple[float, float] = (0.5, 1.5),
        pad_val: float = 114.0,
        pre_transform: Sequence[dict] = None,
        prob: float = 1.0,
    ):
        super().__init__(pre_transform=pre_transform, prob=prob)

        self.img_scale = img_scale
        self.center_range = center_range
        self.pad_val = pad_val

    def apply_mix(self, results: dict) -> dict:
        """Apply mosaic augmentation to the input data.

        Args:
            results (dict): Data of the current sample; must contain
                ``'mixed_data_list'`` with ``num_aux_image`` entries.

        Returns:
            dict: ``results`` with the image and annotation entries replaced
            by their mosaic counterparts.
        """
        assert 'mixed_data_list' in results
        mixed_data_list = results.pop('mixed_data_list')
        assert len(mixed_data_list) == self.num_aux_image

        img, annos = self._create_mosaic_image(results, mixed_data_list)
        bboxes = annos['bboxes']
        kpts = annos['keypoints']
        kpts_vis = annos['keypoints_visible']

        # The mosaic canvas is (2 * width, 2 * height) of ``img_scale``.
        canvas_size = (2 * self.img_scale[0], 2 * self.img_scale[1])
        bboxes = bbox_clip_border(bboxes, canvas_size)
        kpts, kpts_vis = keypoint_clip_border(kpts, kpts_vis, canvas_size)

        results['img'] = img
        results['img_shape'] = img.shape
        results['bbox'] = bboxes
        results['category_id'] = annos['category_id']
        results['bbox_score'] = annos['bbox_scores']
        results['keypoints'] = kpts
        results['keypoints_visible'] = kpts_vis
        results['area'] = annos['area']

        return results

    def _create_mosaic_image(self, results, mixed_data_list):
        """Create the mosaic image and corresponding annotations by combining
        four input images.

        Args:
            results (dict): Data of the current sample (top-left tile).
            mixed_data_list (List[dict]): Data of the auxiliary samples, used
                for the top-right, bottom-left and bottom-right tiles.

        Returns:
            tuple: The mosaic image and a dict of concatenated annotations
            (``bboxes``, ``bbox_scores``, ``category_id``, ``keypoints``,
            ``keypoints_visible``, ``area``) for the keys that were present
            in the inputs.
        """
        # init mosaic image
        img_scale_w, img_scale_h = self.img_scale
        mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2), 3),
                             self.pad_val,
                             dtype=results['img'].dtype)

        # calculate mosaic center
        center = (int(random.uniform(*self.center_range) * img_scale_w),
                  int(random.uniform(*self.center_range) * img_scale_h))

        annos = defaultdict(list)
        locs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
        for loc, data in zip(locs, (results, *mixed_data_list)):

            # process image: shrink/enlarge it so that it fits inside one
            # ``img_scale`` tile while keeping its aspect ratio
            img = data['img']
            h, w = img.shape[:2]
            scale_ratio = min(img_scale_h / h, img_scale_w / w)
            img = mmcv.imresize(img,
                                (int(w * scale_ratio), int(h * scale_ratio)))

            # paste
            paste_coord, crop_coord = self._mosaic_combine(
                loc, center, img.shape[:2][::-1])
            x1_p, y1_p, x2_p, y2_p = paste_coord
            x1_c, y1_c, x2_c, y2_c = crop_coord

            # crop and paste image
            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img[y1_c:y2_c, x1_c:x2_c]
            # offset from resized-image coordinates to canvas coordinates
            padw = x1_p - x1_c
            padh = y1_p - y1_c

            # merge annotations
            if 'bbox' in data:
                # Work on a copy: rescaling in place would silently modify
                # the annotation array owned by the caller.
                bboxes = data['bbox'].copy()

                # rescale & translate
                bboxes *= scale_ratio
                bboxes[..., ::2] += padw
                bboxes[..., 1::2] += padh

                annos['bboxes'].append(bboxes)
                annos['bbox_scores'].append(data['bbox_score'])
                annos['category_id'].append(data['category_id'])

            if 'keypoints' in data:
                # Same as for the boxes: never touch the input array.
                kpts = data['keypoints'].copy()

                # rescale & translate
                kpts *= scale_ratio
                kpts[..., 0] += padw
                kpts[..., 1] += padh

                annos['keypoints'].append(kpts)
                annos['keypoints_visible'].append(data['keypoints_visible'])

            if 'area' in data:
                # areas scale with the square of the linear resize factor
                annos['area'].append(data['area'] * scale_ratio**2)

        for key in annos:
            annos[key] = np.concatenate(annos[key])
        return mosaic_img, annos

    def _mosaic_combine(
        self, loc: str, center: Tuple[float, float], img_shape: Tuple[int, int]
    ) -> Tuple[Tuple[int, int, int, int], Tuple[int, int, int, int]]:
        """Determine the overall coordinates of the mosaic image and the
        specific coordinates of the cropped sub-image.

        Args:
            loc (str): Tile position, one of ``'top_left'``, ``'top_right'``,
                ``'bottom_left'`` or ``'bottom_right'``.
            center (Tuple[float, float]): Mosaic center as (x, y).
            img_shape (Tuple[int, int]): Size of the sub-image as
                (width, height).

        Returns:
            tuple: ``(x1, y1, x2, y2)`` of the paste region on the mosaic
            canvas and ``(x1, y1, x2, y2)`` of the region cropped from the
            sub-image.
        """
        assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')

        x1, y1, x2, y2 = 0, 0, 0, 0
        cx, cy = center
        w, h = img_shape

        if loc == 'top_left':
            # anchored at the center with its bottom-right corner
            x1, y1, x2, y2 = max(cx - w, 0), max(cy - h, 0), cx, cy
            crop_coord = w - (x2 - x1), h - (y2 - y1), w, h
        elif loc == 'top_right':
            # anchored at the center with its bottom-left corner
            x1, y1, x2, y2 = cx, max(cy - h, 0), min(cx + w,
                                                     self.img_scale[0] * 2), cy
            crop_coord = 0, h - (y2 - y1), min(w, x2 - x1), h
        elif loc == 'bottom_left':
            # anchored at the center with its top-right corner
            x1, y1, x2, y2 = max(cx - w,
                                 0), cy, cx, min(self.img_scale[1] * 2, cy + h)
            crop_coord = w - (x2 - x1), 0, w, min(y2 - y1, h)
        else:
            # bottom_right: anchored at the center with its top-left corner
            x1, y1, x2, y2 = cx, cy, min(cx + w, self.img_scale[0] *
                                         2), min(self.img_scale[1] * 2, cy + h)
            crop_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h)

        return (x1, y1, x2, y2), crop_coord

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'center_range={self.center_range}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'prob={self.prob})'
        return repr_str
class YOLOXMixUp(MixImageTransform):
    """MixUp data augmentation for YOLOX.

    This transform combines two images through mixup to enhance the
    dataset's diversity.

    Mixup Transform Steps:

        1. A random image is chosen from the dataset and placed in the
           top-left corner of the target image (after padding and resizing).
        2. The target of the mixup transform is obtained by taking the
           weighted average of the mixup image and the original image.

    .. code:: text

                         mixup transform
                +---------------+--------------+
                | mixup image   |              |
                |      +--------|--------+     |
                |      |        |        |     |
                +---------------+        |     |
                |      |                 |     |
                |      |      image      |     |
                |      |                 |     |
                |      |                 |     |
                |      +-----------------+     |
                |             pad              |
                +------------------------------+

    Required Keys:

        - img
        - bbox (optional)
        - bbox_score (optional)
        - category_id (optional)
        - keypoints (optional)
        - keypoints_visible (optional)
        - area (optional)

    Modified Keys:

        - img
        - bbox (optional)
        - bbox_score (optional)
        - category_id (optional)
        - keypoints (optional)
        - keypoints_visible (optional)
        - area (optional)

    Args:
        img_scale (Sequence[int]): Image output size after mixup pipeline.
            The shape order should be (width, height). Defaults to (640, 640).
        ratio_range (Sequence[float]): Scale ratio of mixup image.
            Defaults to (0.5, 1.5).
        flip_ratio (float): Probability of horizontally flipping the mixup
            image. Defaults to 0.5.
        pad_val (float): Pad value. Defaults to 114.0.
        bbox_clip_border (bool): Stored on the instance as
            ``self.bbox_clip_border``. The transform itself does not read the
            flag; boxes and keypoints are always clipped to the image border.
            Defaults to True.
        pre_transform (Optional[Sequence[dict]]): A sequence of transforms
            to be applied to the auxiliary image before mixup.
            Defaults to None.
        prob (float): Probability of applying the mixup transformation.
            Defaults to 1.0.
    """

    # A single auxiliary image is blended with the current sample.
    num_aux_image = 1

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 ratio_range: Tuple[float, float] = (0.5, 1.5),
                 flip_ratio: float = 0.5,
                 pad_val: float = 114.0,
                 bbox_clip_border: bool = True,
                 pre_transform: Sequence[dict] = None,
                 prob: float = 1.0):
        assert isinstance(img_scale, tuple)
        super().__init__(pre_transform=pre_transform, prob=prob)
        self.img_scale = img_scale
        self.ratio_range = ratio_range
        self.flip_ratio = flip_ratio
        self.pad_val = pad_val
        self.bbox_clip_border = bbox_clip_border

    def apply_mix(self, results: dict) -> dict:
        """YOLOX MixUp transform function.

        Args:
            results (dict): Data of the current sample; must contain
                ``'mixed_data_list'`` with ``num_aux_image`` entries.

        Returns:
            dict: ``results`` with the image and annotation entries replaced
            by their mixed counterparts, or ``results`` without any mixing
            when the auxiliary sample has no keypoint instances.
        """
        assert 'mixed_data_list' in results
        mixed_data_list = results.pop('mixed_data_list')
        assert len(mixed_data_list) == self.num_aux_image

        # Nothing to blend in when the auxiliary sample carries no instances.
        if mixed_data_list[0]['keypoints'].shape[0] == 0:
            return results

        img, annos = self._create_mixup_image(results, mixed_data_list)
        bboxes = annos['bboxes']
        kpts = annos['keypoints']
        kpts_vis = annos['keypoints_visible']

        h, w = img.shape[:2]
        bboxes = bbox_clip_border(bboxes, (w, h))
        kpts, kpts_vis = keypoint_clip_border(kpts, kpts_vis, (w, h))

        # The blended image is floating point; store it back as uint8.
        results['img'] = img.astype(np.uint8)
        results['img_shape'] = img.shape
        results['bbox'] = bboxes
        results['category_id'] = annos['category_id']
        results['bbox_score'] = annos['bbox_scores']
        results['keypoints'] = kpts
        results['keypoints_visible'] = kpts_vis
        results['area'] = annos['area']

        return results

    def _create_mixup_image(self, results, mixed_data_list):
        """Create the mixup image and corresponding annotations by combining
        two input images.

        Args:
            results (dict): Data of the current sample.
            mixed_data_list (List[dict]): One-element list holding the
                auxiliary sample.

        Returns:
            tuple: The blended image (same height/width as
            ``results['img']``) and a dict with the concatenated annotations
            of both samples.
        """
        aux_results = mixed_data_list[0]
        aux_img = aux_results['img']

        # init mixup image
        out_img = np.ones((self.img_scale[1], self.img_scale[0], 3),
                          dtype=aux_img.dtype) * self.pad_val
        annos = defaultdict(list)

        # Calculate scale ratio and resize aux_img
        scale_ratio = min(self.img_scale[1] / aux_img.shape[0],
                          self.img_scale[0] / aux_img.shape[1])
        aux_img = mmcv.imresize(aux_img, (int(aux_img.shape[1] * scale_ratio),
                                          int(aux_img.shape[0] * scale_ratio)))

        # Set the resized aux_img in the top-left of out_img
        out_img[:aux_img.shape[0], :aux_img.shape[1]] = aux_img

        # random rescale
        jit_factor = random.uniform(*self.ratio_range)
        scale_ratio *= jit_factor
        out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
                                          int(out_img.shape[0] * jit_factor)))

        # random flip: ``flip_ratio`` is the probability of flipping, so the
        # flip happens when the draw falls *below* it (0.0 -> never,
        # 1.0 -> always).
        is_flip = random.uniform(0, 1) < self.flip_ratio
        if is_flip:
            out_img = out_img[:, ::-1, :]

        # random crop
        ori_img = results['img']
        aux_h, aux_w = out_img.shape[:2]
        h, w = ori_img.shape[:2]
        padded_img = np.ones((max(aux_h, h), max(aux_w, w), 3)) * self.pad_val
        padded_img = padded_img.astype(np.uint8)
        padded_img[:aux_h, :aux_w] = out_img

        # top-left corner of the crop window inside the padded aux image
        dy = random.randint(0, max(0, padded_img.shape[0] - h) + 1)
        dx = random.randint(0, max(0, padded_img.shape[1] - w) + 1)
        padded_cropped_img = padded_img[dy:dy + h, dx:dx + w]

        # mix up
        mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img

        # merge annotations
        # bboxes
        bboxes = aux_results['bbox'].copy()
        bboxes *= scale_ratio
        bboxes = bbox_clip_border(bboxes, (aux_w, aux_h))
        if is_flip:
            bboxes = flip_bbox(bboxes, [aux_w, aux_h], 'xyxy')
        # shift into the coordinate frame of the crop window
        bboxes[..., ::2] -= dx
        bboxes[..., 1::2] -= dy
        annos['bboxes'] = [results['bbox'], bboxes]
        annos['bbox_scores'] = [
            results['bbox_score'], aux_results['bbox_score']
        ]
        annos['category_id'] = [
            results['category_id'], aux_results['category_id']
        ]

        # keypoints
        kpts = aux_results['keypoints'] * scale_ratio
        kpts, kpts_vis = keypoint_clip_border(kpts,
                                              aux_results['keypoints_visible'],
                                              (aux_w, aux_h))
        if is_flip:
            kpts, kpts_vis = flip_keypoints(kpts, kpts_vis, (aux_w, aux_h),
                                            aux_results['flip_indices'])
        kpts[..., 0] -= dx
        kpts[..., 1] -= dy
        annos['keypoints'] = [results['keypoints'], kpts]
        annos['keypoints_visible'] = [results['keypoints_visible'], kpts_vis]
        # areas scale with the square of the linear resize factor
        annos['area'] = [results['area'], aux_results['area'] * scale_ratio**2]

        for key in annos:
            annos[key] = np.concatenate(annos[key])

        return mixup_img, annos

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'flip_ratio={self.flip_ratio}, '
        repr_str += f'pad_val={self.pad_val})'
        return repr_str