| |
| import logging |
| from typing import Callable, Dict, List, Optional, Tuple, Union |
| import torch |
| from torch import nn |
| from torch.nn import functional as F |
|
|
| from detectron2.config import configurable |
| from detectron2.data.detection_utils import get_fed_loss_cls_weights |
| from detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple |
| from detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss |
| from detectron2.structures import Boxes, Instances |
| from detectron2.utils.events import get_event_storage |
|
|
| __all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"] |
|
|
|
|
| logger = logging.getLogger(__name__) |
|
|
| """ |
| Shape shorthand in this module: |
| |
| N: number of images in the minibatch |
| R: number of ROIs, combined over all images, in the minibatch |
| Ri: number of ROIs in image i |
| K: number of foreground classes. E.g., there are 80 foreground classes in COCO. |
| |
| Naming convention: |
| |
| deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box |
| transform (see :class:`box_regression.Box2BoxTransform`). |
| |
| pred_class_logits: predicted class scores in [-inf, +inf]; use |
| softmax(pred_class_logits) to estimate P(class). |
| |
| gt_classes: ground-truth classification labels in [0, K], where [0, K) represent |
| foreground object classes and K represents the background class. |
| |
| pred_proposal_deltas: predicted box2box transform deltas for transforming proposals |
| to detection box predictions. |
| |
| gt_proposal_deltas: ground-truth box2box transform deltas |
| """ |
|
|
|
|
def fast_rcnn_inference(
    boxes: List[torch.Tensor],
    scores: List[torch.Tensor],
    image_shapes: List[Tuple[int, int]],
    score_thresh: float,
    nms_thresh: float,
    topk_per_image: int,
):
    """
    Run `fast_rcnn_inference_single_image` on every image of a batch and
    collect the per-image results.

    Args:
        boxes (list[Tensor]): one tensor of predicted boxes per image. Element i
            has shape (Ri, K * 4) for class-specific regression or (Ri, 4) for
            class-agnostic regression, where Ri is the number of predictions
            for image i. Compatible with the output of
            :meth:`FastRCNNOutputLayers.predict_boxes`.
        scores (list[Tensor]): one tensor of predicted class scores per image.
            Element i has shape (Ri, K + 1). Compatible with the output of
            :meth:`FastRCNNOutputLayers.predict_probs`.
        image_shapes (list[tuple]): one image-size tuple per image; each is
            forwarded unchanged to the per-image routine.
            NOTE(review): detectron2 documents image sizes as (height, width);
            confirm the expected ordering against the callers.
        score_thresh (float): only detections whose score exceeds this value
            are returned.
        nms_thresh (float): threshold used for box non-maximum suppression.
            Value in [0, 1].
        topk_per_image (int): maximum number of top-scoring detections to keep
            per image. A negative value keeps all detections.

    Returns:
        instances (list[Instances]): one `Instances` per image holding the
            retained detections.
        kept_indices (list[Tensor]): one 1D tensor per image; each entry is the
            row index in [0, Ri) of the input boxes/scores that the
            corresponding detection came from.
    """
    instances_per_image = []
    kept_indices_per_image = []
    # The three lists are walked in lockstep, one entry per image.
    for img_scores, img_boxes, img_shape in zip(scores, boxes, image_shapes):
        detections, kept = fast_rcnn_inference_single_image(
            img_boxes, img_scores, img_shape, score_thresh, nms_thresh, topk_per_image
        )
        instances_per_image.append(detections)
        kept_indices_per_image.append(kept)
    return instances_per_image, kept_indices_per_image
|
|
|
|
| def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"): |
| """ |
| Log the classification metrics to EventStorage. |
| |
| Args: |
| pred_logits: Rx(K+1) logits. The last column is for background class. |
| gt_classes: R labels |
| """ |
| num_instances = gt_classes.numel() |
| if num_instances == 0: |
| return |
| pred_classes = pred_logits.argmax(dim=1) |
| bg_class_ind = pred_logits.shape[1] - 1 |
|
|
| fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind) |
| num_fg = fg_inds.nonzero().numel() |
| fg_gt_classes = gt_classes[fg_inds] |
| fg_pred_classes = pred_classes[fg_inds] |
|
|
| num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel() |
| num_accurate = (pred_classes == gt_classes).nonzero().numel() |
| fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel() |
|
|
| storage = get_event_storage() |
| storage.put_scalar(f"{prefix}/cls_accuracy", num_accurate / num_instances) |
| if num_fg > 0: |
| storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_num_accurate / num_fg) |
| storage.put_scalar(f"{prefix}/false_negative", num_false_negative / num_fg) |
|
|
|
|
def fast_rcnn_inference_single_image(
    boxes,
    scores,
    image_shape: Tuple[int, int],
    score_thresh: float,
    nms_thresh: float,
    topk_per_image: int,
):
    """
    Single-image inference: threshold the scores, run per-class non-maximum
    suppression (NMS) and keep at most `topk_per_image` detections.

    Args:
        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
        per image.

    Returns:
        Same as `fast_rcnn_inference`, but for only one image: an `Instances`
        with `pred_boxes`, `scores` and `pred_classes`, and a 1D tensor giving
        the input row index of each kept detection.
    """
    # Discard every row that has a non-finite value in its boxes or scores.
    finite_rows = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
    if not finite_rows.all():
        boxes, scores = boxes[finite_rows], scores[finite_rows]

    # Drop the last score column (background).
    fg_scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4

    # Clip in the flat (-1, 4) layout, then regroup as (R, num_bbox_reg_classes, 4).
    clipped = Boxes(boxes.reshape(-1, 4))
    clipped.clip(image_shape)
    per_class_boxes = clipped.tensor.view(-1, num_bbox_reg_classes, 4)

    # One candidate per (row, class) pair whose score exceeds the threshold.
    above_thresh = fg_scores > score_thresh
    # Each row of candidate_inds is (input row index, class index).
    candidate_inds = above_thresh.nonzero()
    if num_bbox_reg_classes == 1:
        # Class-agnostic regression: every class of a row shares its single box.
        candidate_boxes = per_class_boxes[candidate_inds[:, 0], 0]
    else:
        candidate_boxes = per_class_boxes[above_thresh]
    candidate_scores = fg_scores[above_thresh]

    # NMS is batched by class index, then optionally truncated to the top-k.
    keep = batched_nms(candidate_boxes, candidate_scores, candidate_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]
    kept_inds = candidate_inds[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(candidate_boxes[keep])
    result.scores = candidate_scores[keep]
    result.pred_classes = kept_inds[:, 1]
    return result, kept_inds[:, 0]
|
|
|
|
class FastRCNNOutputLayers(nn.Module):
    """
    Two linear layers for predicting Fast R-CNN outputs:

    1. proposal-to-detection box regression deltas
    2. classification scores
    """

    @configurable
    def __init__(
        self,
        input_shape: ShapeSpec,
        *,
        box2box_transform,
        num_classes: int,
        test_score_thresh: float = 0.0,
        test_nms_thresh: float = 0.5,
        test_topk_per_image: int = 100,
        cls_agnostic_bbox_reg: bool = False,
        smooth_l1_beta: float = 0.0,
        box_reg_loss_type: str = "smooth_l1",
        loss_weight: Union[float, Dict[str, float]] = 1.0,
        use_fed_loss: bool = False,
        use_sigmoid_ce: bool = False,
        get_fed_loss_cls_weights: Optional[Callable] = None,
        fed_loss_num_classes: int = 50,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            input_shape (ShapeSpec): shape of the input feature to this module.
                A plain int is also accepted and treated as the channel count.
            box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
                transform used to encode/decode box deltas. The length of its
                `weights` determines the box dimension.
            num_classes (int): number of foreground classes
            test_score_thresh (float): threshold to filter predictions results.
            test_nms_thresh (float): NMS threshold for prediction results.
            test_topk_per_image (int): number of top predictions to produce per image.
            cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
            smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
                `box_reg_loss_type` is "smooth_l1"
            box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou",
                "diou", "ciou"
            loss_weight (float|dict): weights to use for losses. Can be a single number for
                weighting all losses, or a dict of individual weightings. Valid dict keys are:
                    * "loss_cls": applied to classification loss
                    * "loss_box_reg": applied to box regression loss
            use_fed_loss (bool): whether to use federated loss which samples additional negative
                classes to calculate the loss. Requires `use_sigmoid_ce`.
            use_sigmoid_ce (bool): whether to calculate the loss using weighted average of binary
                cross entropy with logits. This could be used together with federated loss
            get_fed_loss_cls_weights (Callable): a zero-argument callable returning the
                per-class weights used to sample negative classes for federated loss. Its
                result must have length `num_classes` and is registered as a buffer. Only
                called when `use_fed_loss` is True.
            fed_loss_num_classes (int): number of federated classes to keep in total
        """
        super().__init__()
        # Backward compatibility: a bare channel count is accepted in place of a ShapeSpec.
        if isinstance(input_shape, int):
            input_shape = ShapeSpec(channels=input_shape)
        self.num_classes = num_classes
        input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)

        # One logit per foreground class plus one for the background class.
        self.cls_score = nn.Linear(input_size, num_classes + 1)
        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
        box_dim = len(box2box_transform.weights)
        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)

        nn.init.normal_(self.cls_score.weight, std=0.01)
        nn.init.normal_(self.bbox_pred.weight, std=0.001)
        for layer in (self.cls_score, self.bbox_pred):
            nn.init.constant_(layer.bias, 0)

        self.box2box_transform = box2box_transform
        self.smooth_l1_beta = smooth_l1_beta
        self.test_score_thresh = test_score_thresh
        self.test_nms_thresh = test_nms_thresh
        self.test_topk_per_image = test_topk_per_image
        self.box_reg_loss_type = box_reg_loss_type
        # Accept ints as well as floats: `losses` calls `self.loss_weight.get(...)`,
        # so any scalar weight must be expanded into a dict here.
        if isinstance(loss_weight, (int, float)):
            loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
        self.loss_weight = loss_weight
        self.use_fed_loss = use_fed_loss
        self.use_sigmoid_ce = use_sigmoid_ce
        self.fed_loss_num_classes = fed_loss_num_classes

        if self.use_fed_loss:
            assert self.use_sigmoid_ce, "Please use sigmoid cross entropy loss with federated loss"
            fed_loss_cls_weights = get_fed_loss_cls_weights()
            assert (
                len(fed_loss_cls_weights) == self.num_classes
            ), "Please check the provided fed_loss_cls_weights. Their size should match num_classes"
            self.register_buffer("fed_loss_cls_weights", fed_loss_cls_weights)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """
        Translate a config node into the keyword arguments of `__init__`.

        Args:
            cfg: config object providing the `MODEL.ROI_BOX_HEAD`, `MODEL.ROI_HEADS`,
                `TEST` and `DATASETS` entries read below.
            input_shape: forwarded unchanged as `input_shape`.

        Returns:
            dict: keyword arguments for `__init__`.
        """
        return {
            "input_shape": input_shape,
            "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
            "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES,
            "cls_agnostic_bbox_reg": cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
            "smooth_l1_beta": cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
            "test_score_thresh": cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
            "test_nms_thresh": cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
            "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
            "box_reg_loss_type": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE,
            # Only the box regression weight is configured; `losses` falls back
            # to 1.0 for keys missing from this dict.
            "loss_weight": {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT},
            "use_fed_loss": cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS,
            "use_sigmoid_ce": cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE,
            # Deferred behind a lambda so the weights are only computed when
            # `__init__` actually needs them (i.e. when federated loss is on).
            "get_fed_loss_cls_weights": lambda: get_fed_loss_cls_weights(
                dataset_names=cfg.DATASETS.TRAIN,
                freq_weight_power=cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER,
            ),
            "fed_loss_num_classes": cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES,
        }

    def forward(self, x):
        """
        Args:
            x: per-region features of shape (N, ...) for N bounding boxes to predict.

        Returns:
            (Tensor, Tensor):
            First tensor: shape (N,K+1), scores for each of the N box. Each row contains the
            scores for K object categories and 1 background class.

            Second tensor: bounding box regression deltas for each box. Shape is (N,Kx4),
            or (N,4) for class-agnostic regression.
        """
        # The linear layers expect one flat feature vector per region.
        if x.dim() > 2:
            x = torch.flatten(x, start_dim=1)
        scores = self.cls_score(x)
        proposal_deltas = self.bbox_pred(x)
        return scores, proposal_deltas

    def losses(self, predictions, proposals):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were used
                to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
                ``gt_classes`` are expected.

        Returns:
            Dict[str, Tensor]: dict of losses, each already multiplied by its
            entry in `self.loss_weight` (1.0 when the key is absent).
        """
        scores, proposal_deltas = predictions

        # Parse classification targets. For an empty proposal list, build an
        # empty *long* tensor on the prediction device so it stays a valid
        # label/index tensor in the loss functions below.
        if len(proposals):
            gt_classes = cat([p.gt_classes for p in proposals], dim=0)
        else:
            gt_classes = torch.empty(0, dtype=torch.long, device=scores.device)
        _log_classification_stats(scores, gt_classes)

        # Parse box regression targets.
        if len(proposals):
            proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
            # Proposals without a `gt_boxes` field fall back to their own
            # proposal boxes so the concatenated tensors stay aligned.
            gt_boxes = cat(
                [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
                dim=0,
            )
        else:
            proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)

        if self.use_sigmoid_ce:
            loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes)
        else:
            loss_cls = cross_entropy(scores, gt_classes, reduction="mean")

        losses = {
            "loss_cls": loss_cls,
            "loss_box_reg": self.box_reg_loss(
                proposal_boxes, gt_boxes, proposal_deltas, gt_classes
            ),
        }
        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}

    def get_fed_loss_classes(self, gt_classes, num_fed_loss_classes, num_classes, weight):
        """
        Args:
            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
            num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
                Will sample negative classes if number of unique gt_classes is smaller than this
                value.
            num_classes: number of foreground classes
            weight: probabilities used to sample negative classes

        Returns:
            Tensor:
                classes to keep when calculating the federated loss, including both unique gt
                classes and sampled negative classes.
        """
        unique_gt_classes = torch.unique(gt_classes)
        # Sampling distribution over the num_classes foreground classes plus background.
        prob = unique_gt_classes.new_ones(num_classes + 1).float()
        # The background class (last entry) is never sampled.
        prob[-1] = 0
        if len(unique_gt_classes) < num_fed_loss_classes:
            prob[:num_classes] = weight.float().clone()
            # Classes already present in the ground truth must not be drawn again.
            prob[unique_gt_classes] = 0
            sampled_negative_classes = torch.multinomial(
                prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
            )
            fed_loss_classes = torch.cat([unique_gt_classes, sampled_negative_classes])
        else:
            fed_loss_classes = unique_gt_classes
        return fed_loss_classes

    def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
        """
        Args:
            pred_class_logits: shape (N, K+1), scores for each of the N box. Each row contains the
                scores for K object categories and 1 background class
            gt_classes: a long tensor of shape R that contains the gt class label of each proposal.

        Returns:
            Tensor: scalar loss; the (optionally federated-masked) per-class binary
            cross entropy summed over all entries and divided by N.
        """
        if pred_class_logits.numel() == 0:
            # Scalar zero with the dtype/device of the logits.
            return pred_class_logits.new_zeros([1])[0]

        N = pred_class_logits.shape[0]
        K = pred_class_logits.shape[1] - 1

        # One-hot targets over K+1 columns; slicing off the last (background)
        # column leaves an all-zero target row for background proposals.
        target = pred_class_logits.new_zeros(N, K + 1)
        target[range(len(gt_classes)), gt_classes] = 1
        target = target[:, :K]

        cls_loss = F.binary_cross_entropy_with_logits(
            pred_class_logits[:, :-1], target, reduction="none"
        )

        if self.use_fed_loss:
            fed_loss_classes = self.get_fed_loss_classes(
                gt_classes,
                num_fed_loss_classes=self.fed_loss_num_classes,
                num_classes=K,
                weight=self.fed_loss_cls_weights,
            )
            # 0/1 mask over classes: only the selected classes contribute to the loss.
            fed_loss_classes_mask = fed_loss_classes.new_zeros(K + 1)
            fed_loss_classes_mask[fed_loss_classes] = 1
            fed_loss_classes_mask = fed_loss_classes_mask[:K]
            weight = fed_loss_classes_mask.view(1, K).expand(N, K).float()
        else:
            weight = 1

        loss = torch.sum(cls_loss * weight) / N
        return loss

    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
        """
        Args:
            proposal_boxes/gt_boxes are tensors with the same shape (R, 4 or 5).
            pred_deltas has shape (R, 4 or 5), or (R, num_classes * (4 or 5)).
            gt_classes is a long tensor of shape R, the gt class label of each proposal.
            R shall be the number of proposals.

        Returns:
            Tensor: box regression loss over the foreground proposals, divided by
            the total number of proposals R (at least 1).
        """
        box_dim = proposal_boxes.shape[1]
        # Regression loss is only computed for foreground proposals (those matched to a GT).
        fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
        if pred_deltas.shape[1] == box_dim:
            # Class-agnostic regression: a single set of deltas per proposal.
            fg_pred_deltas = pred_deltas[fg_inds]
        else:
            # Class-specific regression: pick the deltas of each proposal's gt class.
            fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
                fg_inds, gt_classes[fg_inds]
            ]

        # NOTE(review): Ellipsis is passed as the fifth positional argument,
        # presumably an index/mask that selects every entry; confirm against
        # `_dense_box_regression_loss`.
        loss_box_reg = _dense_box_regression_loss(
            [proposal_boxes[fg_inds]],
            self.box2box_transform,
            [fg_pred_deltas.unsqueeze(0)],
            [gt_boxes[fg_inds]],
            ...,
            self.box_reg_loss_type,
            self.smooth_l1_beta,
        )

        # Normalize by the total number of proposals (foreground and background),
        # not by the number of foreground proposals; max(...) guards the empty case.
        return loss_box_reg / max(gt_classes.numel(), 1.0)

    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were
                used to compute predictions. The ``proposal_boxes`` field is expected.

        Returns:
            list[Instances]: same as `fast_rcnn_inference`.
            list[Tensor]: same as `fast_rcnn_inference`.
        """
        boxes = self.predict_boxes(predictions, proposals)
        scores = self.predict_probs(predictions, proposals)
        image_shapes = [x.image_size for x in proposals]
        return fast_rcnn_inference(
            boxes,
            scores,
            image_shapes,
            self.test_score_thresh,
            self.test_nms_thresh,
            self.test_topk_per_image,
        )

    def predict_boxes_for_gt_classes(self, predictions, proposals):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were used
                to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.

        Returns:
            list[Tensor]:
                A list of Tensors of predicted boxes for GT classes in case of
                class-specific box head. Element i of the list has shape (Ri, B), where Ri is
                the number of proposals for image i and B is the box dimension (4 or 5)
        """
        if not len(proposals):
            return []
        _, proposal_deltas = predictions
        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
        N, B = proposal_boxes.shape
        predict_boxes = self.box2box_transform.apply_deltas(
            proposal_deltas, proposal_boxes
        )  # Nx(KxB)

        K = predict_boxes.shape[1] // B
        if K > 1:
            gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
            # Labels outside [0, K - 1] (e.g. the background label K) cannot be
            # used to index the K per-class boxes, so clamp them into range.
            gt_classes = gt_classes.clamp_(0, K - 1)

            predict_boxes = predict_boxes.view(N, K, B)[
                torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
            ]
        num_prop_per_image = [len(p) for p in proposals]
        return predict_boxes.split(num_prop_per_image)

    def predict_boxes(
        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
    ):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were
                used to compute predictions. The ``proposal_boxes`` field is expected.

        Returns:
            list[Tensor]:
                A list of Tensors of predicted class-specific or class-agnostic boxes
                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
                the number of proposals for image i and B is the box dimension (4 or 5)
        """
        if not len(proposals):
            return []
        _, proposal_deltas = predictions
        num_prop_per_image = [len(p) for p in proposals]
        proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
        predict_boxes = self.box2box_transform.apply_deltas(
            proposal_deltas,
            proposal_boxes,
        )  # Nx(KxB)
        # Split the batch-concatenated result back into one tensor per image.
        return predict_boxes.split(num_prop_per_image)

    def predict_probs(
        self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
    ):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals that match the features that were
                used to compute predictions.

        Returns:
            list[Tensor]:
                A list of Tensors of predicted class probabilities for each image.
                Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
        """
        scores, _ = predictions
        num_inst_per_image = [len(p) for p in proposals]
        # Match the activation to the loss used in training: independent
        # sigmoids for sigmoid cross entropy, softmax otherwise.
        if self.use_sigmoid_ce:
            probs = scores.sigmoid()
        else:
            probs = F.softmax(scores, dim=-1)
        return probs.split(num_inst_per_image, dim=0)
|
|