|
|
|
|
| from typing import Any, List
|
| import torch
|
| from torch.nn import functional as F
|
|
|
| from detectron2.config import CfgNode
|
| from detectron2.structures import Instances
|
|
|
| from .mask_or_segm import MaskOrSegmentationLoss
|
| from .registry import DENSEPOSE_LOSS_REGISTRY
|
| from .utils import (
|
| BilinearInterpolationHelper,
|
| ChartBasedAnnotationsAccumulator,
|
| LossDict,
|
| extract_packed_annotations_from_matches,
|
| )
|
|
|
|
|
@DENSEPOSE_LOSS_REGISTRY.register()
class DensePoseChartLoss:
    """
    Chart-based DensePose training loss.

    In chart-based training a mesh is partitioned into charts. Every chart
    carries a label (I) and is parametrized by two coordinates, U and V.
    Ground truth is a set of points annotated with I, U and V values plus a
    coarse segmentation S defined for all pixels of the object bounding box.
    Depending on configuration (see `COARSE_SEGM_TRAINED_BY_MASKS`), semantic
    segmentation annotations can serve as ground truth inputs as well.

    Estimates are expected as tensors:
     * U coordinates, shape [N, C, S, S]
     * V coordinates, shape [N, C, S, S]
     * fine segmentation, shape [N, C, S, S], raw unnormalized scores for
       each fine segmentation label at each location
     * coarse segmentation, shape [N, D, S, S], raw unnormalized scores for
       each coarse segmentation label at each location
    with N the number of detections, C the number of fine segmentation labels,
    S the estimate size ( = width = height) and D the number of coarse
    segmentation channels.

    Produced losses, each scaled by its own weight:
     * smooth L1 regression losses for the U and V coordinates
     * cross entropy losses for fine (I) and coarse (S) segmentations
    """

    def __init__(self, cfg: CfgNode):
        """
        Set up the chart-based loss from configuration options.

        Args:
            cfg (CfgNode): configuration options; the values are read from
                the `MODEL.ROI_DENSEPOSE_HEAD` node
        """
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD

        self.heatmap_size = head_cfg.HEATMAP_SIZE
        self.w_points = head_cfg.POINT_REGRESSION_WEIGHTS
        self.w_part = head_cfg.PART_WEIGHTS
        self.w_segm = head_cfg.INDEX_WEIGHTS
        self.n_segm_chan = head_cfg.NUM_COARSE_SEGM_CHANNELS

        self.segm_trained_by_masks = head_cfg.COARSE_SEGM_TRAINED_BY_MASKS
        # Coarse segmentation loss is delegated to a dedicated loss object.
        self.segm_loss = MaskOrSegmentationLoss(cfg)

    def __call__(
        self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any, **kwargs
    ) -> LossDict:
        """
        Compute the chart-based DensePose losses.

        Args:
            proposals_with_gt (list of Instances): detections with associated
                ground truth data
            densepose_predictor_outputs: an object of a dataclass holding the
                predictor outputs; assumed to provide the attributes:
                * coarse_segm - coarse segmentation estimates, [N, D, S, S]
                * fine_segm - fine segmentation estimates, [N, C, S, S]
                * u - U coordinate estimates per fine labels, [N, C, S, S]
                * v - V coordinate estimates per fine labels, [N, C, S, S]
                with N the number of detections, C the number of fine
                segmentation labels, S the estimate size ( = width = height)
                and D the number of coarse segmentation channels.

        Return:
            dict: str -> tensor: losses with the entries:
             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
             * `loss_densepose_I`: cross entropy for raw unnormalized fine
                 segmentation scores given ground truth labels
             * `loss_densepose_S`: coarse segmentation loss as computed by
                 `self.segm_loss`
            Fake losses (see `produce_fake_densepose_losses`) are returned
            instead whenever there is nothing to supervise: no proposals, no
            packed annotations, or no valid foreground points.
        """
        outputs = densepose_predictor_outputs

        if len(proposals_with_gt) == 0:
            return self.produce_fake_densepose_losses(outputs)

        packed_annotations = extract_packed_annotations_from_matches(
            proposals_with_gt, ChartBasedAnnotationsAccumulator()
        )
        # Even without annotations a loss that touches the outputs must be
        # returned, so that the computation graph still gets constructed.
        if packed_annotations is None:
            return self.produce_fake_densepose_losses(outputs)

        height, width = outputs.u.shape[2:]
        interpolator = BilinearInterpolationHelper.from_matches(
            packed_annotations, (height, width)
        )

        # Keep only points flagged valid by the interpolator whose ground
        # truth fine segmentation label is positive.
        has_positive_label = packed_annotations.fine_segm_labels_gt > 0
        j_valid_fg = interpolator.j_valid * has_positive_label
        if not torch.any(j_valid_fg):
            return self.produce_fake_densepose_losses(outputs)

        # Both loss producers take the same positional arguments.
        loss_inputs = (
            proposals_with_gt,
            outputs,
            packed_annotations,
            interpolator,
            j_valid_fg,
        )
        losses_uv = self.produce_densepose_losses_uv(*loss_inputs)
        losses_segm = self.produce_densepose_losses_segm(*loss_inputs)

        combined = dict(losses_uv)
        combined.update(losses_segm)
        return combined

    def produce_fake_densepose_losses(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Build placeholder losses for segmentation and U/V coordinates.

        Used when a batch contains no suitable ground truth data. The losses
        exist primarily to construct the computation graph, so that
        `DistributedDataParallel` sees similar graphs on all GPUs and can
        perform reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the attributes:
             * fine_segm - fine segmentation estimates, [N, C, S, S]
             * u - U coordinate estimates per fine labels, [N, C, S, S]
             * v - V coordinate estimates per fine labels, [N, C, S, S]
        Return:
            dict: str -> tensor: the union of the entries produced by
            `produce_fake_densepose_losses_uv` and
            `produce_fake_densepose_losses_segm`:
             * `loss_densepose_U`
             * `loss_densepose_V`
             * `loss_densepose_I`
             * `loss_densepose_S`
        """
        fake_losses = dict(self.produce_fake_densepose_losses_uv(densepose_predictor_outputs))
        fake_losses.update(self.produce_fake_densepose_losses_segm(densepose_predictor_outputs))
        return fake_losses

    def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Build placeholder losses for the U/V coordinates.

        Used when a batch contains no suitable ground truth data. Each loss is
        the sum of the corresponding estimates multiplied by 0; it serves to
        construct the computation graph, so that `DistributedDataParallel`
        sees similar graphs on all GPUs and can perform reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the attributes:
             * u - U coordinate estimates per fine labels, [N, C, S, S]
             * v - V coordinate estimates per fine labels, [N, C, S, S]
        Return:
            dict: str -> tensor: losses with the entries:
             * `loss_densepose_U`: sum of U estimates times 0
             * `loss_densepose_V`: sum of V estimates times 0
        """
        zero_u = densepose_predictor_outputs.u.sum() * 0
        zero_v = densepose_predictor_outputs.v.sum() * 0
        return {
            "loss_densepose_U": zero_u,
            "loss_densepose_V": zero_v,
        }

    def produce_fake_densepose_losses_segm(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Build placeholder losses for fine / coarse segmentation.

        Used when a batch contains no suitable ground truth data. The losses
        serve to construct the computation graph, so that
        `DistributedDataParallel` sees similar graphs on all GPUs and can
        perform reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the attributes:
             * fine_segm - fine segmentation estimates, [N, C, S, S]
             * coarse_segm - coarse segmentation estimates, [N, D, S, S]
        Return:
            dict: str -> tensor: losses with the entries (both always present):
             * `loss_densepose_I`: sum of fine segmentation estimates times 0
             * `loss_densepose_S`: the value returned by
                 `self.segm_loss.fake_value` for the given outputs
        """
        zero_fine_segm = densepose_predictor_outputs.fine_segm.sum() * 0
        fake_coarse_segm = self.segm_loss.fake_value(densepose_predictor_outputs)
        return {
            "loss_densepose_I": zero_fine_segm,
            "loss_densepose_S": fake_coarse_segm,
        }

    def produce_densepose_losses_uv(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: Any,
        interpolator: BilinearInterpolationHelper,
        j_valid_fg: torch.Tensor,
    ) -> LossDict:
        """
        Compute U/V coordinate losses: summed smooth L1 loss between the
        estimates extracted at the annotated points and the ground truth,
        restricted to the points selected by `j_valid_fg` and scaled by
        `self.w_points`.

        Args:
            proposals_with_gt (list of Instances): detections with associated
                ground truth data (not read by this method)
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the attributes:
             * u - U coordinate estimates per fine labels, [N, C, S, S]
             * v - V coordinate estimates per fine labels, [N, C, S, S]
            packed_annotations: packed ground truth; its `u_gt` and `v_gt`
                attributes are read
            interpolator (BilinearInterpolationHelper): extracts estimate
                values at the annotated points
            j_valid_fg (torch.Tensor): selector of the points that contribute
                to the loss
        Return:
            dict: str -> tensor: losses with the entries:
             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
        """
        u_targets = packed_annotations.u_gt[j_valid_fg]
        u_at_points = interpolator.extract_at_points(densepose_predictor_outputs.u)
        u_estimates = u_at_points[j_valid_fg]

        v_targets = packed_annotations.v_gt[j_valid_fg]
        v_at_points = interpolator.extract_at_points(densepose_predictor_outputs.v)
        v_estimates = v_at_points[j_valid_fg]

        loss_u = F.smooth_l1_loss(u_estimates, u_targets, reduction="sum") * self.w_points
        loss_v = F.smooth_l1_loss(v_estimates, v_targets, reduction="sum") * self.w_points
        return {
            "loss_densepose_U": loss_u,
            "loss_densepose_V": loss_v,
        }

    def produce_densepose_losses_segm(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: Any,
        interpolator: BilinearInterpolationHelper,
        j_valid_fg: torch.Tensor,
    ) -> LossDict:
        """
        Compute fine / coarse segmentation losses.

        The fine segmentation loss is the cross entropy between the raw
        unnormalized scores extracted at the annotated points and the ground
        truth labels, taken over all points flagged by `interpolator.j_valid`
        and scaled by `self.w_part`. The coarse segmentation loss is computed
        by `self.segm_loss` and scaled by `self.w_segm`.

        Args:
            proposals_with_gt (list of Instances): detections with associated
                ground truth data
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the attributes:
             * fine_segm - fine segmentation estimates, [N, C, S, S]
             * coarse_segm - coarse segmentation estimates, [N, D, S, S]
            packed_annotations: packed ground truth; its `fine_segm_labels_gt`
                attribute is read here and the object is forwarded to
                `self.segm_loss`
            interpolator (BilinearInterpolationHelper): extracts estimate
                values at the annotated points
            j_valid_fg (torch.Tensor): foreground point selector (not read by
                this method; the fine loss uses `interpolator.j_valid`)
        Return:
            dict: str -> tensor: losses with the entries:
             * `loss_densepose_I`: cross entropy for raw unnormalized scores
                 for fine segmentation estimates given ground truth labels
             * `loss_densepose_S`: coarse segmentation loss returned by
                 `self.segm_loss`, multiplied by `self.w_segm`
        """
        valid_points = interpolator.j_valid
        fine_segm_targets = packed_annotations.fine_segm_labels_gt[valid_points]

        # The interpolation weights get an extra axis so that they broadcast
        # against the full set of fine segmentation channels selected by
        # `slice(None)`.
        corner_weights = {
            "w_ylo_xlo": interpolator.w_ylo_xlo[:, None],
            "w_ylo_xhi": interpolator.w_ylo_xhi[:, None],
            "w_yhi_xlo": interpolator.w_yhi_xlo[:, None],
            "w_yhi_xhi": interpolator.w_yhi_xhi[:, None],
        }
        fine_segm_at_points = interpolator.extract_at_points(
            densepose_predictor_outputs.fine_segm,
            slice_fine_segm=slice(None),
            **corner_weights,
        )
        fine_segm_scores = fine_segm_at_points[valid_points, :]

        loss_fine = F.cross_entropy(fine_segm_scores, fine_segm_targets.long()) * self.w_part
        loss_coarse = (
            self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations)
            * self.w_segm
        )
        return {
            "loss_densepose_I": loss_fine,
            "loss_densepose_S": loss_coarse,
        }
|
|
|