"""
This code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np  # used by bbox_kpt2result below
import paddle

from ppdet.core.workspace import register
from .meta_arch import BaseArch
from .. import layers as L


__all__ = ['PETR']


@register
class PETR(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['backbone', 'neck', 'bbox_head']

    def __init__(self,
                 backbone='ResNet',
                 neck='ChannelMapper',
                 bbox_head='PETRHead'):
        """
        PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf

        Args:
            backbone (nn.Layer): backbone instance
            neck (nn.Layer): neck between backbone and head
            bbox_head (nn.Layer): model output and loss
        """
        super(PETR, self).__init__()
        self.backbone = backbone
        # The original `if neck is not None` branch only set `with_neck` when
        # a neck was given, leaving it undefined otherwise and breaking
        # `extract_feat`; set it unconditionally instead.
        self.with_neck = neck is not None
        self.neck = neck
        self.bbox_head = bbox_head
        self.deploy = False
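
    # A minimal sketch of how this architecture is typically wired up in a
    # PaddleDetection YAML config (the component settings below are
    # illustrative; see the repo's shipped PETR configs for real values):
    #
    #   architecture: PETR
    #   PETR:
    #     backbone: ResNet
    #     neck: ChannelMapper
    #     bbox_head: PETRHead
    #
    # The names are resolved to registered layers through __inject__ above.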

    def extract_feat(self, img):
        """Directly extract features from the backbone+neck."""
        x = self.backbone(img)
        if self.with_neck:
            x = self.neck(x)
        return x
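
    # Note: x is typically a tuple of multi-level feature maps; with a
    # ChannelMapper-style neck each level is projected to a common channel
    # dimension before being consumed by the transformer head.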

    def get_inputs(self):
        img_metas = []
        gt_bboxes = []
        gt_labels = []
        gt_keypoints = []
        gt_areas = []
        pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1)
        for idx, im_shape in enumerate(self.inputs['im_shape']):
            img_meta = {
                # (h, w) plus a trailing 1, mimicking mmdet's 3-element
                # img_shape convention.
                'img_shape': im_shape.astype("int32").tolist() + [1],
                'batch_input_shape': self.inputs['image'].shape[-2:],
                'image_name': self.inputs['image_file'][idx]
            }
            img_metas.append(img_meta)
            if not pad_gt_mask[idx].any():
                # No real instances in this image: keep a single padded row
                # so downstream losses still receive non-empty tensors.
                gt_keypoints.append(self.inputs['gt_joints'][idx][:1])
                gt_labels.append(self.inputs['gt_class'][idx][:1])
                gt_bboxes.append(self.inputs['gt_bbox'][idx][:1])
                gt_areas.append(self.inputs['gt_areas'][idx][:1])
                continue

            # Drop padded rows, keeping only real ground-truth instances.
            gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]])
            gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]])
            gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]])
            gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]])

        return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas
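
    # For reference, one img_meta entry assembled above looks roughly like
    # this (values are illustrative):
    #
    #   {
    #       'img_shape': [800, 1216, 1],       # image (h, w) plus trailing 1
    #       'batch_input_shape': [800, 1216],  # padded (h, w) of the batch
    #       'image_name': 'path/to/image.jpg',
    #   }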

    def get_loss(self):
        """Compute training losses.

        The following are gathered from ``self.inputs`` (naming kept from
        the original mmdet-style interface):
            img (Tensor): Input images of shape (N, C, H, W).
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): A list of image info dicts where each
                dict has: 'img_shape', 'scale_factor', 'flip', and may also
                contain 'filename', 'ori_shape', 'pad_shape', and
                'img_norm_cfg'. For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
            gt_bboxes (list[Tensor]): Ground-truth boxes for each image in
                [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each
                box.
            gt_keypoints (list[Tensor]): Ground-truth keypoints for each
                image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y,
                p^{K}_v] format.
            gt_areas (list[Tensor]): Mask areas corresponding to each box.
            gt_bboxes_ignore (None | list[Tensor]): Bounding boxes that can
                be ignored when computing the loss.

        Returns:
            dict[str, Tensor]: A dictionary of loss components, plus their
            sum under the 'loss' key.
        """
        img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = \
            self.get_inputs()
        # `self.inputs` is a dict; the original getattr() call on it would
        # always fall through to None.
        gt_bboxes_ignore = self.inputs.get('gt_bboxes_ignore', None)

        x = self.extract_feat(self.inputs)
        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
                                              gt_labels, gt_keypoints,
                                              gt_areas, gt_bboxes_ignore)
        # Sum every individual term into the total loss the trainer expects.
        loss = 0
        for v in losses.values():
            loss += v
        losses['loss'] = loss

        return losses

    def get_pred_numpy(self):
        """Used for computing network FLOPs."""
        img = self.inputs['image']
        batch_size, _, height, width = img.shape
        dummy_img_metas = [
            dict(
                batch_input_shape=(height, width),
                img_shape=(height, width, 3),
                scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size)
        ]
        x = self.extract_feat(img)
        outs = self.bbox_head(x, img_metas=dummy_img_metas)
        bbox_list = self.bbox_head.get_bboxes(
            *outs, dummy_img_metas, rescale=True)
        return bbox_list

    def get_pred(self):
        """Run inference and pack bbox/keypoint results for evaluation."""
        img = self.inputs['image']
        batch_size, _, height, width = img.shape
        img_metas = [
            dict(
                batch_input_shape=(height, width),
                img_shape=(height, width, 3),
                scale_factor=self.inputs['scale_factor'][i])
            for i in range(batch_size)
        ]
        kptpred = self.simple_test(
            self.inputs, img_metas=img_metas, rescale=True)
        # Single image, single (person) class: index [image][branch][class].
        keypoints = kptpred[0][1][0]
        bboxs = kptpred[0][0][0]
        # Replace the keypoint visibility channel with the detection score.
        keypoints[..., 2] = bboxs[:, None, 4]
        res_lst = [[keypoints, bboxs[:, 4]]]
        outputs = {'keypoint': res_lst}
        return outputs
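
    # Output layout, for reference: outputs['keypoint'] is [[keypoints,
    # scores]] for the single image, where keypoints has shape
    # (num_person, K, 3) with the last channel holding the detection score,
    # and scores has shape (num_person,).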

    def simple_test(self, inputs, img_metas, rescale=False):
        """Test function without test-time augmentation.

        Args:
            inputs (dict): Network inputs, including the batched 'image'
                tensor (this is ``self.inputs`` at the call site).
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[list[np.ndarray]]: BBox and keypoint results of each image
            and classes. The outer list corresponds to each image.
            The inner list corresponds to each class.
        """
        batch_size = len(img_metas)
        assert batch_size == 1, 'Currently only batch_size 1 for inference ' \
            f'mode is supported. Found batch_size {batch_size}.'
        feat = self.extract_feat(inputs)
        results_list = self.bbox_head.simple_test(
            feat, img_metas, rescale=rescale)

        # Regroup per-image (bboxes, labels, kpts) into per-class lists.
        bbox_kpt_results = [
            self.bbox_kpt2result(det_bboxes, det_labels, det_kpts,
                                 self.bbox_head.num_classes)
            for det_bboxes, det_labels, det_kpts in results_list
        ]
        return bbox_kpt_results

    def bbox_kpt2result(self, bboxes, labels, kpts, num_classes):
        """Convert detection results to a list of numpy arrays.

        Args:
            bboxes (paddle.Tensor | np.ndarray): shape (n, 5).
            labels (paddle.Tensor | np.ndarray): shape (n, ).
            kpts (paddle.Tensor | np.ndarray): shape (n, K, 3).
            num_classes (int): class number, including background class.

        Returns:
            list(ndarray): bbox and keypoint results of each class.
        """
        if bboxes.shape[0] == 0:
            # No detections: return one empty array per class. Note
            # kpts.shape[1] (== K) replaces the original torch-style
            # kpts.size(1), which is not valid for paddle tensors.
            return [np.zeros((0, 5), dtype=np.float32) for _ in range(num_classes)], \
                [np.zeros((0, kpts.shape[1], 3), dtype=np.float32)
                 for _ in range(num_classes)]
        else:
            if isinstance(bboxes, paddle.Tensor):
                bboxes = bboxes.numpy()
                labels = labels.numpy()
                kpts = kpts.numpy()
            return [bboxes[labels == i, :] for i in range(num_classes)], \
                [kpts[labels == i, :, :] for i in range(num_classes)]
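
    # For reference, with num_classes == 1 (person only) this returns a pair
    # of single-element lists ([bboxes_cls0], [kpts_cls0]), which is why
    # get_pred above indexes kptpred[0][0][0] and kptpred[0][1][0].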