# File size: 23,614 Bytes

# 7b7527a

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import math
import random
import numpy as np
from copy import deepcopy
from typing import List, Tuple
from collections import defaultdict

from .chip_box_utils import nms, transform_chip_boxes2image_boxes
from .chip_box_utils import find_chips_to_cover_overlaped_boxes
from .chip_box_utils import transform_chip_box
from .chip_box_utils import intersection_over_box


class AnnoCropper(object):
    def __init__(self,
                 image_target_sizes: List[int],
                 valid_box_ratio_ranges: List[List[float]],
                 chip_target_size: int,
                 chip_target_stride: int,
                 use_neg_chip: bool=False,
                 max_neg_num_per_im: int=8,
                 max_per_img: int=-1,
                 nms_thresh: int=0.5):
        """
        Generate chips by chip_target_size and chip_target_stride.
        These two parameters just like kernel_size and stride in cnn.

        Each image has its raw size. After resizing, then get its target size.
        The resizing scale = target_size / raw_size.
        So are chips of the image.
        box_ratio = box_raw_size / image_raw_size = box_target_size / image_target_size
        The 'size' above mentioned is the size of long-side of image, box or chip.

        :param image_target_sizes: [2000, 1000]
        :param valid_box_ratio_ranges:  [[-1, 0.1],[0.08, -1]]
        :param chip_target_size: 500
        :param chip_target_stride: 200
        """
        self.target_sizes = image_target_sizes
        self.valid_box_ratio_ranges = valid_box_ratio_ranges
        assert len(self.target_sizes) == len(self.valid_box_ratio_ranges)
        self.scale_num = len(self.target_sizes)
        self.chip_target_size = chip_target_size  # is target size
        self.chip_target_stride = chip_target_stride  # is target stride
        self.use_neg_chip = use_neg_chip
        self.max_neg_num_per_im = max_neg_num_per_im
        self.max_per_img = max_per_img
        self.nms_thresh = nms_thresh

    def crop_anno_records(self, records: List[dict]):
        """
        The main logic:
        # foreach record(image):
        #   foreach scale:
        #     1 generate chips by chip size and stride for each scale
        #     2 get pos chips
        #     - validate boxes: current scale; h,w >= 1
        #     - find pos chips greedily by valid gt boxes in each scale
        #     - for every valid gt box, find its corresponding pos chips in each scale
        #     3 get neg chips
        #     - If given proposals, find neg boxes in them which are not in pos chips
        #     - If got neg boxes in last step, we find neg chips and assign neg boxes to neg chips such as 2.
        # 4 sample neg chips if too much each image
        #   transform this image-scale annotations to chips(pos chips&neg chips) annotations

        :param records, standard coco_record but with extra key `proposals`(Px4), which are predicted by stage1
                        model and maybe have neg boxes in them.
        :return: new_records, list of dict like
        {
            'im_file': 'fake_image1.jpg',
            'im_id': np.array([1]),  # new _global_chip_id as im_id
            'h': h,  # chip height
            'w': w,  # chip width
            'is_crowd': is_crowd,  # Nx1 -> Mx1
            'gt_class': gt_class,  # Nx1 -> Mx1
            'gt_bbox': gt_bbox,  # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2]
            'gt_poly': gt_poly,  # [None]xN -> [None]xM
            'chip': [x1, y1, x2, y2]  # added
        }

        Attention:
        ------------------------------>x
        |
        |    (x1,y1)------
        |       |        |
        |       |        |
        |       |        |
        |       |        |
        |       |        |
        |       ----------
        |                 (x2,y2)
        |
        ↓
        y

        If we use [x1, y1, x2, y2] to represent boxes or chips,
        (x1,y1) is the left-top point which is in the box,
        but (x2,y2) is the right-bottom point which is not in the box.
        So x1 in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1,h].
        And you can use x2-x1 to get width, and you can use image[y1:y2, x1:x2] to get the box area.
        """

        self.chip_records = []
        self._global_chip_id = 1
        for r in records:
            self._cur_im_pos_chips = [
            ]  # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int]
            self._cur_im_neg_chips = []  # element: (chip, neg_box_num)
            for scale_i in range(self.scale_num):
                self._get_current_scale_parameters(scale_i, r)

                # Cx4
                chips = self._create_chips(r['h'], r['w'], self._cur_scale)

                # # dict: chipid->[box_id, ...]
                pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(
                    r['gt_bbox'], chips)

                # dict: chipid->neg_box_num
                neg_chip2box_num = self._get_neg_boxes_and_chips(
                    chips,
                    list(pos_chip2boxes_idx.keys()), r.get('proposals', None))

                self._add_to_cur_im_chips(chips, pos_chip2boxes_idx,
                                          neg_chip2box_num)

            cur_image_records = self._trans_all_chips2annotations(r)
            self.chip_records.extend(cur_image_records)
        return self.chip_records

    def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num):
        for pos_chipid, boxes_idx in pos_chip2boxes_idx.items():
            chip = np.array(chips[pos_chipid])  # copy chips slice
            self._cur_im_pos_chips.append((chip, boxes_idx))

        if neg_chip2box_num is None:
            return

        for neg_chipid, neg_box_num in neg_chip2box_num.items():
            chip = np.array(chips[neg_chipid])
            self._cur_im_neg_chips.append((chip, neg_box_num))

    def _trans_all_chips2annotations(self, r):
        gt_bbox = r['gt_bbox']
        im_file = r['im_file']
        is_crowd = r['is_crowd']
        gt_class = r['gt_class']
        # gt_poly = r['gt_poly']   # [None]xN
        # remaining keys: im_id, h, w
        chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox,
                                                         is_crowd, gt_class)

        if not self.use_neg_chip:
            return chip_records

        sampled_neg_chips = self._sample_neg_chips()
        neg_chip_records = self._trans_neg_chips2annotations(im_file,
                                                             sampled_neg_chips)
        chip_records.extend(neg_chip_records)
        return chip_records

    def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd,
                                     gt_class):
        chip_records = []
        for chip, boxes_idx in self._cur_im_pos_chips:
            chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx,
                                                            chip)
            x1, y1, x2, y2 = chip
            chip_h = y2 - y1
            chip_w = x2 - x1
            rec = {
                'im_file': im_file,
                'im_id': np.array([self._global_chip_id]),
                'h': chip_h,
                'w': chip_w,
                'gt_bbox': chip_bbox,
                'is_crowd': is_crowd[final_boxes_idx].copy(),
                'gt_class': gt_class[final_boxes_idx].copy(),
                # 'gt_poly': [None] * len(final_boxes_idx),
                'chip': chip
            }
            self._global_chip_id += 1
            chip_records.append(rec)
        return chip_records

    def _sample_neg_chips(self):
        pos_num = len(self._cur_im_pos_chips)
        neg_num = len(self._cur_im_neg_chips)
        sample_num = min(pos_num + 2, self.max_neg_num_per_im)
        assert sample_num >= 1
        if neg_num <= sample_num:
            return self._cur_im_neg_chips

        candidate_num = int(sample_num * 1.5)
        candidate_neg_chips = sorted(
            self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]
        random.shuffle(candidate_neg_chips)
        sampled_neg_chips = candidate_neg_chips[:sample_num]
        return sampled_neg_chips

    def _trans_neg_chips2annotations(self,
                                     im_file: str,
                                     sampled_neg_chips: List[Tuple]):
        chip_records = []
        for chip, neg_box_num in sampled_neg_chips:
            x1, y1, x2, y2 = chip
            chip_h = y2 - y1
            chip_w = x2 - x1
            rec = {
                'im_file': im_file,
                'im_id': np.array([self._global_chip_id]),
                'h': chip_h,
                'w': chip_w,
                'gt_bbox': np.zeros(
                    (0, 4), dtype=np.float32),
                'is_crowd': np.zeros(
                    (0, 1), dtype=np.int32),
                'gt_class': np.zeros(
                    (0, 1), dtype=np.int32),
                # 'gt_poly': [],
                'chip': chip
            }
            self._global_chip_id += 1
            chip_records.append(rec)
        return chip_records

    def _get_current_scale_parameters(self, scale_i, r):
        im_size = max(r['h'], r['w'])
        im_target_size = self.target_sizes[scale_i]
        self._cur_im_size, self._cur_im_target_size = im_size, im_target_size
        self._cur_scale = self._get_current_scale(im_target_size, im_size)
        self._cur_valid_ratio_range = self.valid_box_ratio_ranges[scale_i]

    def _get_current_scale(self, im_target_size, im_size):
        return im_target_size / im_size

    def _create_chips(self, h: int, w: int, scale: float):
        """
        Generate chips by chip_target_size and chip_target_stride.
        These two parameters just like kernel_size and stride in cnn.
        :return: chips, Cx4, xy in raw size dimension
        """
        chip_size = self.chip_target_size  # omit target for simplicity
        stride = self.chip_target_stride
        width = int(scale * w)
        height = int(scale * h)
        min_chip_location_diff = 20  # in target size

        assert chip_size >= stride
        chip_overlap = chip_size - stride
        if (width - chip_overlap
            ) % stride > min_chip_location_diff:  # 不能被stride整除的部分比较大，则保留
            w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
        else:  # 不能被stride整除的部分比较小，则丢弃
            w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
        if (height - chip_overlap) % stride > min_chip_location_diff:
            h_steps = max(1, int(math.ceil((height - chip_overlap) / stride)))
        else:
            h_steps = max(1, int(math.floor((height - chip_overlap) / stride)))

        chips = list()
        for j in range(h_steps):
            for i in range(w_steps):
                x1 = i * stride
                y1 = j * stride
                x2 = min(x1 + chip_size, width)
                y2 = min(y1 + chip_size, height)
                chips.append([x1, y1, x2, y2])

        # check  chip size
        for item in chips:
            if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
                    1] > chip_size * 1.1:
                raise ValueError(item)
        chips = np.array(chips, dtype=np.float32)

        raw_size_chips = chips / scale
        return raw_size_chips

    def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips):
        """
        Find the positive chips of the current scale.

        The gt boxes are first filtered with the current scale parameters,
        then positive chips are searched for the remaining boxes.

        :param gt_bbox: gt boxes of the image, Nx4
        :param chips: chips of the current scale, Cx4
        :return: pos_chip2boxes_idx, dict: chipid -> [box_id, ...]
        """
        #  Nx4      N
        boxes, boxes_idx = self._validate_boxes(self._cur_valid_ratio_range,
                                                self._cur_im_size, gt_bbox,
                                                self._cur_scale)
        return self._find_pos_chips(chips, boxes, boxes_idx)

    def _validate_boxes(self,
                        valid_ratio_range: List[float],
                        im_size: int,
                        gt_boxes: 'np.array of Nx4',
                        scale: float):
        """
        :return: valid_boxes: Nx4, valid_boxes_idx: N
        """
        ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32)
        hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32)
        maxs = np.maximum(ws, hs)
        box_ratio = maxs / im_size
        mins = np.minimum(ws, hs)
        target_mins = mins * scale

        low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
        high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
            np.float32).max

        valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (
            target_mins >= 2))[0]
        valid_boxes = gt_boxes[valid_boxes_idx]
        return valid_boxes, valid_boxes_idx

    def _find_pos_chips(self,
                        chips: 'Cx4',
                        valid_boxes: 'Bx4',
                        valid_boxes_idx: 'B'):
        """
        Choose positive chips and assign the valid boxes to them.

        Chips are chosen with an intersection-over-box threshold of 1.0;
        boxes are then assigned to the chosen chips with a threshold of 0.5.

        :param chips: chips of the current scale, Cx4
        :param valid_boxes: valid gt boxes, Bx4
        :param valid_boxes_idx: index of each valid box in the raw gt boxes
        :return: pos_chip2boxes_idx, dict: chipid -> [box_id, ...]
        """
        cover_threshold = 1.
        assign_threshold = 0.5

        overlap = intersection_over_box(chips, valid_boxes)  # CxB
        chosen_ids, _ = self._find_chips_to_cover_overlaped_boxes(
            overlap, cover_threshold)
        return self._assign_boxes_to_pos_chips(overlap, assign_threshold,
                                               set(chosen_ids),
                                               valid_boxes_idx)

    def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold):
        """
        Delegate to `find_chips_to_cover_overlaped_boxes` from chip_box_utils.

        :param iob: intersection-over-box matrix between chips and boxes
        :param overlap_threshold: threshold forwarded to the helper
        :return: whatever the helper returns; callers in this class unpack it
            as (chosen chip ids, mapping chip id -> overlapped box number)
        """
        chosen = find_chips_to_cover_overlaped_boxes(iob, overlap_threshold)
        return chosen

    def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids,
                                   valid_boxes_idx):
        chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
        pos_chip2boxes_idx = defaultdict(list)
        for chip_id, box_id in zip(chip_ids, box_ids):
            if chip_id not in pos_chip_ids:
                continue
            raw_gt_box_idx = valid_boxes_idx[box_id]
            pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx)
        return pos_chip2boxes_idx

    def _get_neg_boxes_and_chips(self,
                                 chips: 'Cx4',
                                 pos_chip_ids: 'D',
                                 proposals: 'Px4'):
        """
        :param chips:
        :param pos_chip_ids:
        :param proposals:
        :return: neg_chip2box_num, None or dict: chipid->neg_box_num
        """
        if not self.use_neg_chip:
            return None

        # train proposals maybe None
        if proposals is None or len(proposals) < 1:
            return None

        valid_ratio_range = self._cur_valid_ratio_range
        im_size = self._cur_im_size
        scale = self._cur_scale

        valid_props, _ = self._validate_boxes(valid_ratio_range, im_size,
                                              proposals, scale)
        neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props)
        neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes)
        return neg_chip2box_num

    def _find_neg_boxes(self,
                        chips: 'Cx4',
                        pos_chip_ids: 'D',
                        valid_props: 'Px4'):
        """
        :return: neg_boxes: Nx4
        """
        if len(pos_chip_ids) == 0:
            return valid_props

        pos_chips = chips[pos_chip_ids]
        iob = intersection_over_box(pos_chips, valid_props)
        overlap_per_prop = np.max(iob, axis=0)
        non_overlap_props_idx = overlap_per_prop < 0.5
        neg_boxes = valid_props[non_overlap_props_idx]
        return neg_boxes

    def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D',
                        neg_boxes: 'Nx4'):
        """
        Choose negative chips among the chips that are not positive.

        Candidate chips are all chips except the positive ones; chips are
        chosen with an intersection-over-box threshold of 0.7 against the
        negative boxes.

        :param chips: chips of the current scale, Cx4
        :param pos_chip_ids: ids of the positive chips
        :param neg_boxes: negative boxes, Nx4
        :return: neg_chip2box_num, dict: chipid -> neg_box_num
        """
        candidate_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids)
        iob = intersection_over_box(chips[candidate_ids], neg_boxes)

        cover_threshold = 0.7
        picked, box_counts = self._find_chips_to_cover_overlaped_boxes(
            iob, cover_threshold)

        # `picked` indexes into the candidates; map back to ids in `chips`.
        return {candidate_ids[cid]: box_counts[cid] for cid in picked}

    def crop_infer_anno_records(self, records: List[dict]):
        """
        transform image record to chips record
        :param records:
        :return: new_records, list of dict like
        {
            'im_file': 'fake_image1.jpg',
            'im_id': np.array([1]),  # new _global_chip_id as im_id
            'h': h,  # chip height
            'w': w,  # chip width
            'chip': [x1, y1, x2, y2]  # added
            'ori_im_h': ori_im_h  # added, origin image height
            'ori_im_w': ori_im_w  # added, origin image width
            'scale_i': 0  # added,
        }
        """
        self.chip_records = []
        self._global_chip_id = 1  # im_id start from 1
        self._global_chip_id2img_id = {}

        for r in records:
            for scale_i in range(self.scale_num):
                self._get_current_scale_parameters(scale_i, r)
                # Cx4
                chips = self._create_chips(r['h'], r['w'], self._cur_scale)
                cur_img_chip_record = self._get_chips_records(r, chips, scale_i)
                self.chip_records.extend(cur_img_chip_record)

        return self.chip_records

    def _get_chips_records(self, rec, chips, scale_i):
        cur_img_chip_records = []
        ori_im_h = rec["h"]
        ori_im_w = rec["w"]
        im_file = rec["im_file"]
        ori_im_id = rec["im_id"]
        for id, chip in enumerate(chips):
            chip_rec = {}
            x1, y1, x2, y2 = chip
            chip_h = y2 - y1
            chip_w = x2 - x1
            chip_rec["im_file"] = im_file
            chip_rec["im_id"] = self._global_chip_id
            chip_rec["h"] = chip_h
            chip_rec["w"] = chip_w
            chip_rec["chip"] = chip
            chip_rec["ori_im_h"] = ori_im_h
            chip_rec["ori_im_w"] = ori_im_w
            chip_rec["scale_i"] = scale_i

            self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id)
            self._global_chip_id += 1
            cur_img_chip_records.append(chip_rec)

        return cur_img_chip_records

    def aggregate_chips_detections(self, results, records=None):
        """
        # 1. transform chip dets to image dets
        # 2. nms boxes per image;
        # 3. format output results
        :param results:
        :param roidb:
        :return:
        """
        results = deepcopy(results)
        records = records if records else self.chip_records
        img_id2bbox = self._transform_chip2image_bboxes(results, records)
        nms_img_id2bbox = self._nms_dets(img_id2bbox)
        aggregate_results = self._reformat_results(nms_img_id2bbox)
        return aggregate_results

    def _transform_chip2image_bboxes(self, results, records):
        # 1. Transform chip dets to image dets;
        # 2. Filter valid range;
        # 3. Reformat and Aggregate chip dets to Get scale_cls_dets
        img_id2bbox = defaultdict(list)
        for result in results:
            bbox_locs = result['bbox']
            bbox_nums = result['bbox_num']
            if len(bbox_locs) == 1 and bbox_locs[0][
                    0] == -1:  # current batch has no detections
                # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]]
                # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1.
                continue
            im_ids = result['im_id']  # replace with range(len(bbox_nums))

            last_bbox_num = 0
            for idx, im_id in enumerate(im_ids):

                cur_bbox_len = bbox_nums[idx]
                bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len]
                last_bbox_num += cur_bbox_len
                # box: [num_id, score, xmin, ymin, xmax, ymax]
                if len(bboxes) == 0:  # current image has no detections
                    continue

                chip_rec = records[int(im_id) -
                                   1]  # im_id starts from 1, type is np.int64
                image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"])

                bboxes = transform_chip_boxes2image_boxes(
                    bboxes, chip_rec["chip"], chip_rec["ori_im_h"],
                    chip_rec["ori_im_w"])

                scale_i = chip_rec["scale_i"]
                cur_scale = self._get_current_scale(self.target_sizes[scale_i],
                                                    image_size)
                _, valid_boxes_idx = self._validate_boxes(
                    self.valid_box_ratio_ranges[scale_i], image_size,
                    bboxes[:, 2:], cur_scale)
                ori_img_id = self._global_chip_id2img_id[int(im_id)]

                img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx])

        return img_id2bbox

    def _nms_dets(self, img_id2bbox):
        # 1. NMS on each image-class
        # 2. Limit number of detections to MAX_PER_IMAGE if requested
        max_per_img = self.max_per_img
        nms_thresh = self.nms_thresh

        for img_id in img_id2bbox:
            box = img_id2bbox[
                img_id]  # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
            box = np.concatenate(box, axis=0)
            nms_dets = nms(box, nms_thresh)
            if max_per_img > 0:
                if len(nms_dets) > max_per_img:
                    keep = np.argsort(-nms_dets[:, 1])[:max_per_img]
                    nms_dets = nms_dets[keep]

            img_id2bbox[img_id] = nms_dets

        return img_id2bbox

    def _reformat_results(self, img_id2bbox):
        """reformat results"""
        im_ids = img_id2bbox.keys()
        results = []
        for img_id in im_ids:  # output by original im_id order
            if len(img_id2bbox[img_id]) == 0:
                bbox = np.array(
                    [[-1., 0., 0., 0., 0., 0.]])  # edge case: no detections
                bbox_num = np.array([0])
            else:
                # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
                bbox = img_id2bbox[img_id]
                bbox_num = np.array([len(bbox)])
            res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num)
            results.append(res)
        return results