| |
| |
|
|
| """ |
| Common data processing utilities that are used in a |
| typical object detection data pipeline. |
| """ |
| import logging |
| import numpy as np |
| from typing import List, Union |
| import pycocotools.mask as mask_util |
| import torch |
| from PIL import Image |
|
|
| from detectron2.structures import ( |
| BitMasks, |
| Boxes, |
| BoxMode, |
| Instances, |
| Keypoints, |
| PolygonMasks, |
| RotatedBoxes, |
| polygons_to_bitmask, |
| ) |
| from detectron2.utils.file_io import PathManager |
|
|
| from . import transforms as T |
| from .catalog import MetadataCatalog |
|
|
# Names exported by `from <this module> import *`.
# NOTE(review): several public helpers defined in this file (e.g. `get_bbox`,
# `transform_keypoint_annotations`, `check_metadata_consistency`) are not
# listed here -- presumably on purpose; confirm before adding them.
__all__ = [
    "SizeMismatchError",
    "convert_image_to_rgb",
    "check_image_size",
    "transform_proposals",
    "transform_instance_annotations",
    "annotations_to_instances",
    "annotations_to_instances_rotated",
    "build_augmentation",
    "build_transform_gen",
    "create_keypoint_hflip_indices",
    "filter_empty_instances",
    "read_image",
]
|
|
|
|
class SizeMismatchError(ValueError):
    """
    Raised when a loaded image has a different width/height than the one
    recorded in its annotation (see `check_image_size`).
    """
|
|
|
|
| |
# 3x3 color-conversion matrices used for the "YUV-BT.601" format.
# They are applied to HWC pixel data as `np.dot(image, np.array(M).T)`:
# `_M_RGB2YUV` is applied to RGB values already scaled by 1/255, and
# `_M_YUV2RGB` is applied to YUV values whose result is then scaled by 255.
_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]]
_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]]

# Key looked up in the mapping returned by `image.getexif()` to read the
# orientation value (presumably the standard EXIF "Orientation" tag id --
# TODO confirm against the EXIF tag table).
_EXIF_ORIENT = 274
|
|
|
|
def convert_PIL_to_numpy(image, format):
    """
    Turn a PIL image into a numpy array laid out in the requested format.

    Args:
        image (PIL.Image): the image to convert
        format (str or None): the format of the output image. "BGR" and
            "YUV-BT.601" are derived from an RGB conversion; any other
            non-None value is handed to ``image.convert`` unchanged.
            ``None`` skips the mode conversion entirely.

    Returns:
        (np.ndarray): also see `read_image`
    """
    if format is not None:
        # Formats that `image.convert` is not asked for directly are produced
        # from RGB in the post-processing below.
        pil_mode = "RGB" if format in ("BGR", "YUV-BT.601") else format
        image = image.convert(pil_mode)
    array = np.asarray(image)

    if format == "L":
        # Add a trailing channel axis so the result is HWC.
        return np.expand_dims(array, -1)
    if format == "BGR":
        # Reverse the channel axis: RGB -> BGR.
        return array[:, :, ::-1]
    if format == "YUV-BT.601":
        unit_range = array / 255.0
        return np.dot(unit_range, np.array(_M_RGB2YUV).T)
    return array
|
|
|
|
def convert_image_to_rgb(image, format):
    """
    Bring an image stored in `format` back to RGB.

    Args:
        image (np.ndarray or Tensor): an HWC image
        format (str): the format of the input image, also see `read_image`

    Returns:
        (np.ndarray): (H,W,3) RGB image in 0-255 range, can be either float or uint8
    """
    if isinstance(image, torch.Tensor):
        image = image.cpu().numpy()

    if format == "BGR":
        # Swap the first and last channel.
        return image[:, :, [2, 1, 0]]

    if format == "YUV-BT.601":
        rgb_unit_range = np.dot(image, np.array(_M_YUV2RGB).T)
        return rgb_unit_range * 255.0

    # Any other format is treated as a PIL mode and converted by PIL itself.
    if format == "L":
        # Drop the channel axis before handing the array to PIL.
        image = image[:, :, 0]
    as_uint8 = image.astype(np.uint8)
    pil_image = Image.fromarray(as_uint8, mode=format)
    return np.asarray(pil_image.convert("RGB"))
|
|
|
|
def _apply_exif_orientation(image):
    """
    Rotate/flip a PIL image according to its exif orientation value.

    This exists because of the bug
        https://github.com/python-pillow/Pillow/issues/3973
    in `ImageOps.exif_transpose`: the Pillow implementation raises errors with
    various methods, especially `tobytes`.

    Function based on:
        https://github.com/wkentaro/labelme/blob/v4.5.4/labelme/utils/image.py#L59
        https://github.com/python-pillow/Pillow/blob/7.1.2/src/PIL/ImageOps.py#L527

    Args:
        image (PIL.Image): a PIL image

    Returns:
        (PIL.Image): the PIL image with exif orientation applied, if applicable;
        otherwise the input image itself.
    """
    if not hasattr(image, "getexif"):
        return image

    try:
        exif = image.getexif()
    except Exception:
        # Best effort: if the exif data cannot be read, leave the image alone.
        return image

    if exif is None:
        return image

    orientation = exif.get(_EXIF_ORIENT)

    # Orientation value -> transpose operation; values not listed here
    # (including a missing orientation) need no change.
    transpose_for_orientation = {
        2: Image.FLIP_LEFT_RIGHT,
        3: Image.ROTATE_180,
        4: Image.FLIP_TOP_BOTTOM,
        5: Image.TRANSPOSE,
        6: Image.ROTATE_270,
        7: Image.TRANSVERSE,
        8: Image.ROTATE_90,
    }
    method = transpose_for_orientation.get(orientation)
    return image if method is None else image.transpose(method)
|
|
|
|
def read_image(file_name, format=None):
    """
    Load an image file and return it as an array in the given format.
    Rotation and flipping are applied if the image carries such exif information.

    Args:
        file_name (str): image file path
        format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601".

    Returns:
        image (np.ndarray):
            an HWC image in the given format, which is 0-255, uint8 for
            supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601.
    """
    with PathManager.open(file_name, "rb") as stream:
        # Use our own orientation handling to work around
        # https://github.com/python-pillow/Pillow/issues/3973 .
        # The conversion stays inside the `with` block so the file handle is
        # still open while the image is turned into an array.
        oriented = _apply_exif_orientation(Image.open(stream))
        return convert_PIL_to_numpy(oriented, format)
|
|
|
|
def check_image_size(dataset_dict, image):
    """
    Raise an error if the image does not match the size specified in the dict.

    Only the dimensions actually present in `dataset_dict` are checked, so a
    dict that carries just "width" (or just "height") is handled instead of
    failing with a `KeyError`. Missing dimensions are then filled in from the
    image.

    Args:
        dataset_dict (dict): a dataset dict, optionally containing "width",
            "height" and "file_name". Modified in-place: "width"/"height" are
            added when absent.
        image (np.ndarray): the loaded image; `image.shape[0]` is used as the
            height and `image.shape[1]` as the width.

    Raises:
        SizeMismatchError: if a size given in `dataset_dict` differs from the image.
    """
    image_wh = (image.shape[1], image.shape[0])

    if "width" in dataset_dict or "height" in dataset_dict:
        # A dimension the annotation does not specify falls back to the
        # image's own value, so it can never be the cause of a mismatch.
        expected_wh = (
            dataset_dict.get("width", image_wh[0]),
            dataset_dict.get("height", image_wh[1]),
        )
        if image_wh != expected_wh:
            raise SizeMismatchError(
                "Mismatched image shape{}, got {}, expect {}.".format(
                    " for image " + dataset_dict["file_name"]
                    if "file_name" in dataset_dict
                    else "",
                    image_wh,
                    expected_wh,
                )
                + " Please check the width/height in your annotation."
            )

    # Record the actual image size for whichever dimension was not given.
    if "width" not in dataset_dict:
        dataset_dict["width"] = image_wh[0]
    if "height" not in dataset_dict:
        dataset_dict["height"] = image_wh[1]
|
|
|
|
def transform_proposals(dataset_dict, image_shape, transforms, *, proposal_topk, min_box_size=0):
    """
    Transform the precomputed proposals stored in dataset_dict, if there are any.

    Args:
        dataset_dict (dict): a dict read from the dataset, possibly
            contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
        image_shape (tuple): height, width
        transforms (TransformList):
        proposal_topk (int): only keep top-K scoring proposals
        min_box_size (int): proposals with either side smaller than this
            threshold are removed

    The input dict is modified in-place, with abovementioned keys removed. A new
    key "proposals" will be added. Its value is an `Instances`
    object which contains the transformed proposals in its field
    "proposal_boxes" and "objectness_logits".
    If "proposal_boxes" is absent the dict is left untouched.
    """
    if "proposal_boxes" not in dataset_dict:
        return

    # Bring the boxes to XYXY_ABS, then run them through the transforms.
    raw_boxes = dataset_dict.pop("proposal_boxes")
    raw_mode = dataset_dict.pop("proposal_bbox_mode")
    xyxy_boxes = BoxMode.convert(raw_boxes, raw_mode, BoxMode.XYXY_ABS)
    boxes = Boxes(transforms.apply_box(xyxy_boxes))

    raw_logits = dataset_dict.pop("proposal_objectness_logits")
    objectness_logits = torch.as_tensor(raw_logits.astype("float32"))

    # Clip to the image and drop boxes that are too small afterwards.
    boxes.clip(image_shape)
    keep = boxes.nonempty(threshold=min_box_size)
    boxes, objectness_logits = boxes[keep], objectness_logits[keep]

    proposals = Instances(image_shape)
    proposals.proposal_boxes = boxes[:proposal_topk]
    proposals.objectness_logits = objectness_logits[:proposal_topk]
    dataset_dict["proposals"] = proposals
|
|
|
|
def get_bbox(annotation):
    """
    Return the box of one instance annotation in XYXY_ABS mode.

    Args:
        annotation (dict): dict of instance annotations for a single instance;
            its "bbox" and "bbox_mode" entries are read.
    Returns:
        bbox: x1, y1, x2, y2 coordinates, as produced by `BoxMode.convert`
    """
    raw_box = annotation["bbox"]
    source_mode = annotation["bbox_mode"]
    return BoxMode.convert(raw_box, source_mode, BoxMode.XYXY_ABS)
|
|
|
|
def transform_instance_annotations(
    annotation, transforms, image_size, *, keypoint_hflip_indices=None
):
    """
    Transform the box, segmentation and keypoints of a single instance annotation.

    `transforms.apply_box` is used for the box, `transforms.apply_polygons` /
    `transforms.apply_segmentation` for polygon / RLE segmentations, and
    `transforms.apply_coords` (via `transform_keypoint_annotations`) for keypoints.
    If you need anything more specially designed for each data structure,
    you'll need to implement your own version of this function or the transforms.

    Args:
        annotation (dict): dict of instance annotations for a single instance.
            It will be modified in-place.
        transforms (TransformList or list[Transform]):
        image_size (tuple): the height, width of the transformed image
        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.

    Returns:
        dict:
            the same input dict with fields "bbox", "segmentation", "keypoints"
            transformed according to `transforms`.
            The "bbox_mode" field will be set to XYXY_ABS.

    Raises:
        ValueError: if "segmentation" is neither a list nor a dict.
    """
    if isinstance(transforms, (tuple, list)):
        transforms = T.TransformList(transforms)

    # The per-instance box is 1-D; wrap it into a batch of one for apply_box.
    xyxy = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
    transformed_box = transforms.apply_box(np.array([xyxy]))[0]
    # Clip to the image: lower bound 0, upper bound (w, h, w, h).
    clipped_low = transformed_box.clip(min=0)
    upper_bound = list(image_size + image_size)[::-1]
    annotation["bbox"] = np.minimum(clipped_low, upper_bound)
    annotation["bbox_mode"] = BoxMode.XYXY_ABS

    if "segmentation" in annotation:
        segm = annotation["segmentation"]
        if not isinstance(segm, (list, dict)):
            raise ValueError(
                "Cannot transform segmentation of type '{}'!"
                "Supported types are: polygons as list[list[float] or ndarray],"
                " COCO-style RLE as a dict.".format(type(segm))
            )
        if isinstance(segm, list):
            # Polygons: flat coordinate lists -> (K, 2) points -> back to flat.
            as_points = [np.asarray(poly).reshape(-1, 2) for poly in segm]
            transformed_polys = transforms.apply_polygons(as_points)
            annotation["segmentation"] = [poly.reshape(-1) for poly in transformed_polys]
        else:
            # RLE dict: decode to a mask, transform it, and store the mask.
            decoded = mask_util.decode(segm)
            transformed_mask = transforms.apply_segmentation(decoded)
            assert tuple(transformed_mask.shape[:2]) == image_size
            annotation["segmentation"] = transformed_mask

    if "keypoints" in annotation:
        annotation["keypoints"] = transform_keypoint_annotations(
            annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
        )

    return annotation
|
|
|
|
def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
    """
    Apply `transforms` to the keypoint annotations of one instance.
    A keypoint that ends up outside the image boundary is marked "unlabeled" (visibility=0).

    Args:
        keypoints (list[float]): Nx3 float in Detectron2's Dataset format.
            Each point is represented by (x, y, visibility).
        transforms (TransformList):
        image_size (tuple): the height, width of the transformed image
        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
            When `transforms` includes horizontal flip, will use the index
            mapping to flip keypoints.

    Returns:
        np.ndarray: (N, 3) float64 array of transformed (x, y, visibility) rows.

    Raises:
        ValueError: if a horizontal flip must be applied but
            `keypoint_hflip_indices` is missing or has the wrong length.
    """
    # (N*3,) -> (N, 3)
    kpts = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
    xy = transforms.apply_coords(kpts[:, :2])

    # A point is in bounds when 0 <= x <= width and 0 <= y <= height.
    lower = np.array([0, 0])
    upper = np.array(image_size[::-1])
    in_bounds = ((xy >= lower) & (xy <= upper)).all(axis=1)
    kpts[:, :2] = xy
    kpts[:, 2][~in_bounds] = 0

    # An odd number of HFlipTransform means the result is mirrored.
    # This assumes HFlipTransform is the only transform that flips horizontally.
    num_hflips = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms)
    if num_hflips % 2 == 1:
        # Swap each keypoint with its opposite-handed counterpart.
        if keypoint_hflip_indices is None:
            raise ValueError("Cannot flip keypoints without providing flip indices!")
        if len(kpts) != len(keypoint_hflip_indices):
            raise ValueError(
                "Keypoint data has {} points, but metadata "
                "contains {} points!".format(len(kpts), len(keypoint_hflip_indices))
            )
        flip_order = np.asarray(keypoint_hflip_indices, dtype=np.int32)
        kpts = kpts[flip_order, :]

    # Keep the convention that an unlabeled point (visibility == 0) has x = y = 0.
    unlabeled = kpts[:, 2] == 0
    kpts[unlabeled] = 0
    return kpts
|
|
|
|
def annotations_to_instances(annos, image_size, mask_format="polygon"):
    """
    Build the :class:`Instances` object consumed by the models
    from the instance annotations of one image.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width
        mask_format (str): "polygon" to store masks as `PolygonMasks`,
            or "bitmask" to store them as `BitMasks`.

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.

    Raises:
        ValueError: if the segmentations cannot be used with the requested
            `mask_format`.
    """

    def _segm_to_bitmask(segm):
        # Convert one segmentation (polygons / RLE dict / 2D array) to a mask array.
        if isinstance(segm, list):
            # polygon
            return polygons_to_bitmask(segm, *image_size)
        if isinstance(segm, dict):
            # COCO RLE
            return mask_util.decode(segm)
        if isinstance(segm, np.ndarray):
            assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                segm.ndim
            )
            # mask array
            return segm
        raise ValueError(
            "Cannot convert segmentation of type '{}' to BitMasks!"
            "Supported types are: polygons as list[list[float] or ndarray],"
            " COCO-style RLE as a dict, or a binary segmentation mask "
            " in a 2D numpy array of shape HxW.".format(type(segm))
        )

    if len(annos):
        boxes = np.stack(
            [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
        )
    else:
        boxes = np.zeros((0, 4))

    target = Instances(image_size)
    target.gt_boxes = Boxes(boxes)

    category_ids = [int(obj["category_id"]) for obj in annos]
    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)

    # Whether masks / keypoints are built is decided by the first annotation only.
    if len(annos) and "segmentation" in annos[0]:
        segms = [obj["segmentation"] for obj in annos]
        if mask_format == "polygon":
            try:
                masks = PolygonMasks(segms)
            except ValueError as e:
                raise ValueError(
                    "Failed to use mask_format=='polygon' from the given annotations!"
                ) from e
        else:
            assert mask_format == "bitmask", mask_format
            mask_arrays = [_segm_to_bitmask(segm) for segm in segms]
            # torch.from_numpy needs contiguous memory, hence ascontiguousarray.
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in mask_arrays])
            )
        target.gt_masks = masks

    if len(annos) and "keypoints" in annos[0]:
        target.gt_keypoints = Keypoints([obj.get("keypoints", []) for obj in annos])

    return target
|
|
|
|
def annotations_to_instances_rotated(annos, image_size):
    """
    Build the :class:`Instances` object consumed by the models
    from the instance annotations of one image.
    Compared to `annotations_to_instances`, this function is for rotated boxes only

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            Containing fields "gt_boxes", "gt_classes",
            if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    raw_boxes = [obj["bbox"] for obj in annos]
    target = Instances(image_size)

    # The boxes stored on `target` are the same object that gets clipped.
    rotated = RotatedBoxes(raw_boxes)
    target.gt_boxes = rotated
    rotated.clip(image_size)

    category_ids = [obj["category_id"] for obj in annos]
    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)

    return target
|
|
|
|
def filter_empty_instances(
    instances, by_box=True, by_mask=True, box_threshold=1e-5, return_mask=False
):
    """
    Filter out empty instances in an `Instances` object.

    Args:
        instances (Instances):
        by_box (bool): whether to filter out instances with empty boxes
        by_mask (bool): whether to filter out instances with empty masks
        box_threshold (float): minimum width and height to be considered non-empty
        return_mask (bool): whether to return boolean mask of filtered instances

    Returns:
        Instances: the filtered instances.
        tensor[bool], optional: boolean mask of filtered instances. Returned
            whenever `return_mask` is True, including when no filtering
            criterion applies (the mask is then all True).
    """
    assert by_box or by_mask
    criteria = []
    if by_box:
        criteria.append(instances.gt_boxes.nonempty(threshold=box_threshold))
    if instances.has("gt_masks") and by_mask:
        criteria.append(instances.gt_masks.nonempty())

    # TODO: can also filter visible keypoints

    if not criteria:
        # Nothing to filter on (by_box=False and no "gt_masks"): keep everything.
        # Still honor `return_mask` so callers can always unpack a pair.
        if return_mask:
            return instances, torch.ones(len(instances), dtype=torch.bool)
        return instances

    # An instance is kept only if every collected criterion marks it non-empty.
    keep = criteria[0]
    for criterion in criteria[1:]:
        keep = keep & criterion
    if return_mask:
        return instances[keep], keep
    return instances[keep]
|
|
|
|
def create_keypoint_hflip_indices(dataset_names: Union[str, List[str]]) -> List[int]:
    """
    Compute, for every keypoint, the index of its horizontally-flipped counterpart.

    The "keypoint_names" and "keypoint_flip_map" metadata must be consistent
    across all given datasets; the metadata of the first one is used.

    Args:
        dataset_names: list of dataset names

    Returns:
        list[int]: a list of size=#keypoints, storing the
        horizontally-flipped keypoint indices.
    """
    if isinstance(dataset_names, str):
        dataset_names = [dataset_names]

    for key in ("keypoint_names", "keypoint_flip_map"):
        check_metadata_consistency(key, dataset_names)

    meta = MetadataCatalog.get(dataset_names[0])
    names = meta.keypoint_names

    # Make the flip map symmetric by adding the reverse of every pair.
    forward = dict(meta.keypoint_flip_map)
    backward = {right: left for left, right in forward.items()}
    flip_map = {**forward, **backward}

    # Keypoints without an entry in the flip map map to themselves.
    flipped_names = [flip_map.get(name, name) for name in names]
    return [names.index(name) for name in flipped_names]
|
|
|
|
def get_fed_loss_cls_weights(dataset_names: Union[str, List[str]], freq_weight_power=1.0):
    """
    Get the frequency weight for each class, sorted by class id.
    The weight is calculated as image_count raised to the power freq_weight_power.

    The "class_image_count" metadata must be consistent across all given
    datasets; the metadata of the first one is used.

    Args:
        dataset_names: list of dataset names
        freq_weight_power: power value

    Returns:
        Tensor: float tensor with one weight per class.
    """
    if isinstance(dataset_names, str):
        dataset_names = [dataset_names]

    check_metadata_consistency("class_image_count", dataset_names)

    meta = MetadataCatalog.get(dataset_names[0])
    by_class_id = sorted(meta.class_image_count, key=lambda entry: entry["id"])
    image_counts = torch.tensor([entry["image_count"] for entry in by_class_id])
    return image_counts.float() ** freq_weight_power
|
|
|
|
def gen_crop_transform_with_instance(crop_size, image_size, instance):
    """
    Pick a random CropTransform whose cropping region contains
    the center of the given instance.

    Args:
        crop_size (tuple): h, w in pixels
        image_size (tuple): h, w
        instance (dict): an annotation dict of one instance, in Detectron2's
            dataset format.

    Returns:
        CropTransform: a crop of size `crop_size` with a randomly chosen origin.
    """
    crop_hw = np.asarray(crop_size, dtype=np.int32)
    box = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
    center_y = (box[1] + box[3]) * 0.5
    center_x = (box[0] + box[2]) * 0.5
    center_yx = center_y, center_x
    assert (
        image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1]
    ), "The annotation bounding box is outside of the image!"
    assert (
        image_size[0] >= crop_hw[0] and image_size[1] >= crop_hw[1]
    ), "Crop size is larger than image size!"

    # Valid range for the crop origin (y, x): it must start early enough to
    # include the center, and late enough that the crop stays inside the image.
    center_floor = np.floor(center_yx).astype(np.int32)
    center_ceil = np.ceil(center_yx).astype(np.int32)
    origin_min = np.maximum(center_floor - crop_hw, 0)
    origin_max = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_hw, 0)
    origin_max = np.minimum(origin_max, center_ceil)

    # Draw y before x to keep the random-number sequence unchanged.
    y0 = np.random.randint(origin_min[0], origin_max[0] + 1)
    x0 = np.random.randint(origin_min[1], origin_max[1] + 1)
    return T.CropTransform(x0, y0, crop_hw[1], crop_hw[0])
|
|
|
|
def check_metadata_consistency(key, dataset_names):
    """
    Verify that all given datasets agree on one metadata entry.

    Args:
        key (str): a metadata key
        dataset_names (list[str]): a list of dataset names

    Raises:
        AttributeError: if the key does not exist in the metadata
        ValueError: if the given datasets do not have the same metadata values defined by key
    """
    if len(dataset_names) == 0:
        return

    logger = logging.getLogger(__name__)
    entries = [getattr(MetadataCatalog.get(name), key) for name in dataset_names]
    reference = entries[0]
    for name, entry in zip(dataset_names, entries):
        if entry != reference:
            # Log both the offending value and the reference value before failing.
            logger.error(
                "Metadata '{}' for dataset '{}' is '{}'".format(key, name, str(entry))
            )
            logger.error(
                "Metadata '{}' for dataset '{}' is '{}'".format(
                    key, dataset_names[0], str(reference)
                )
            )
            raise ValueError("Datasets have different metadata '{}'!".format(key))
|
|
|
|
def build_augmentation(cfg, is_train):
    """
    Build the default list of :class:`Augmentation` from config.
    Now it includes resizing and flipping.

    Args:
        cfg: a config object; the `cfg.INPUT.*` size and flip options are read.
        is_train (bool): whether to use the training options (which also
            enables random flipping) or the test options.

    Returns:
        list[Augmentation]
    """
    input_cfg = cfg.INPUT
    if is_train:
        min_size = input_cfg.MIN_SIZE_TRAIN
        max_size = input_cfg.MAX_SIZE_TRAIN
        sample_style = input_cfg.MIN_SIZE_TRAIN_SAMPLING
    else:
        min_size = input_cfg.MIN_SIZE_TEST
        max_size = input_cfg.MAX_SIZE_TEST
        sample_style = "choice"

    augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
    if not is_train:
        return augmentation

    # Random flipping is a training-only augmentation.
    flip_mode = input_cfg.RANDOM_FLIP
    if flip_mode != "none":
        augmentation.append(
            T.RandomFlip(
                horizontal=flip_mode == "horizontal",
                vertical=flip_mode == "vertical",
            )
        )
    return augmentation
|
|
|
|
# Old name kept for backward compatibility: `build_transform_gen` is bound to
# the very same function object as `build_augmentation`.
build_transform_gen = build_augmentation
"""
Alias for backward-compatibility.
"""
|
|