import copy
import math
from typing import List
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from mmdet.core import build_assigner, reduce_mean
from mmdet.models import build_loss
from mmdet.models.builder import LOSSES
from .structures import Instances
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
if target.numel() == 0:
return [torch.zeros([], device=output.device)]
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
res.append(correct_k.mul_(100.0 / batch_size))
return res
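# A minimal usage sketch (hypothetical values): for logits of shape [N, C] and
# integer targets of shape [N],
#     top1 = accuracy(torch.randn(4, 10), torch.tensor([1, 0, 9, 3]), topk=(1,))[0]
# returns the top-1 precision as a percentage tensor on the logits' device.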
@LOSSES.register_module()
class ClipMatcherOLD(nn.Module):
# modified from https://github.com/megvii-model/MOTR/blob/main/models/motr.py#L38
def __init__(
self,
num_classes,
weight_dict,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2],
assigner=dict(
type="HungarianAssigner3D",
cls_cost=dict(type="FocalLossCost", weight=2.0),
reg_cost=dict(type="BBox3DL1Cost", weight=0.25),
pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
),
loss_cls=dict(
type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0
),
loss_bbox=dict(type="L1Loss", loss_weight=0.25),
):
"""Create the criterion.
Parameters:
num_classes: number of object categories, omitting the special no-object category
weight_dict: dict containing as key the names of the losses and as values their relative weight.
eos_coef: relative classification weight applied to the no-object category
"""
super().__init__()
self.num_classes = num_classes
self.matcher = build_assigner(assigner)
self.loss_cls = build_loss(loss_cls)
self.loss_bboxes = build_loss(loss_bbox)
self.loss_predictions = nn.SmoothL1Loss(reduction="none", beta=1.0)
self.register_buffer(
"code_weights", torch.tensor(code_weights, requires_grad=False)
)
self.weight_dict = weight_dict
# self.losses = ['labels', 'boxes', 'cardinality']
self.losses = ["labels", "boxes"]
self.focal_loss = True
self.losses_dict = {}
self._current_frame_idx = 0
def _get_src_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat(
[torch.full_like(src, i) for i, (src, _) in enumerate(indices)]
)
src_idx = torch.cat([src for (src, _) in indices])
return batch_idx, src_idx
def _get_tgt_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat(
[torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]
)
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
return batch_idx, tgt_idx
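    # Example (hypothetical values): for indices = [(tensor([2, 0]), tensor([1, 0]))],
    # _get_src_permutation_idx returns (tensor([0, 0]), tensor([2, 0])), a
    # (batch_idx, query_idx) pair that indexes predictions of shape [bs, num_query, ...].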
def initialize_for_single_clip(self, gt_instances: List[Instances]):
self.gt_instances = gt_instances
self.num_samples = 0
self.sample_device = None
self._current_frame_idx = 0
self.losses_dict = {}
def _step(self):
self._current_frame_idx += 1
def calc_loss_for_track_scores(self, track_instances: Instances):
frame_id = self._current_frame_idx - 1
gt_instances = self.gt_instances[frame_id]
outputs = {
"pred_logits": track_instances.track_scores[None],
}
device = track_instances.track_scores.device
num_tracks = len(track_instances)
src_idx = torch.arange(num_tracks, dtype=torch.long, device=device)
tgt_idx = (
track_instances.matched_gt_idxes
) # -1 for FP tracks and disappeared tracks
track_losses = self.get_loss(
"labels",
outputs=outputs,
gt_instances=[gt_instances],
indices=[(src_idx, tgt_idx)],
)
self.losses_dict.update(
{
"frame_{}_track_{}".format(frame_id, key): value
for key, value in track_losses.items()
}
)
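    # Note: frame_id above is _current_frame_idx - 1, assuming _step() has already
    # advanced the frame counter by the time track scores are evaluated; slots with
    # matched_gt_idxes == -1 (FP or disappeared tracks) fall back to the no-object label.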
def get_num_boxes(self, num_samples):
num_boxes = torch.as_tensor(
num_samples, dtype=torch.float, device=self.sample_device
)
if is_dist_avail_and_initialized():
torch.distributed.all_reduce(num_boxes)
num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
return num_boxes
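    # E.g., with 2 GPUs reporting num_samples of 6 and 10, all_reduce sums them to 16
    # and dividing by the world size gives an average of 8 boxes per process, keeping
    # the loss normalization consistent across GPUs.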
@torch.no_grad()
def loss_cardinality(self, outputs, targets, indices):
"""Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
"""
pred_logits = outputs["pred_logits"]
device = pred_logits.device
tgt_lengths = torch.as_tensor([len(v.labels) for v in targets], device=device)
# Count the number of predictions that are NOT "no-object" (which is the last class)
card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
losses = {"cardinality_error": card_err}
return losses
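    # E.g., if the queries predict 12 non-empty boxes while the frame has 9 GT boxes,
    # cardinality_error is |12 - 9| = 3.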
def get_loss(self, loss, outputs, gt_instances, indices, **kwargs):
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
}
assert loss in loss_map, f"do you really want to compute {loss} loss?"
return loss_map[loss](outputs, gt_instances, indices, **kwargs)
def loss_boxes(self, outputs, gt_instances: List[Instances], indices: List[tuple]):
"""Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size.
"""
# We ignore the regression loss of the track-disappear slots.
# TODO: Make this filter process more elegant.
filtered_idx = []
for src_per_img, tgt_per_img in indices:
keep = tgt_per_img != -1
filtered_idx.append((src_per_img[keep], tgt_per_img[keep]))
indices = filtered_idx
idx = self._get_src_permutation_idx(indices)
src_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat(
[gt_per_img.boxes[i] for gt_per_img, (_, i) in zip(gt_instances, indices)],
dim=0,
)
# for pad target, don't calculate regression loss, judged by whether obj_id=-1
target_obj_ids = torch.cat(
[
gt_per_img.obj_ids[i]
for gt_per_img, (_, i) in zip(gt_instances, indices)
],
dim=0,
        )  # [num_matched]
mask = target_obj_ids != -1
bbox_weights = torch.ones_like(target_boxes) * self.code_weights
avg_factor = src_boxes[mask].size(0)
avg_factor = reduce_mean(target_boxes.new_tensor([avg_factor]))
loss_bbox = self.loss_bboxes(
src_boxes[mask],
target_boxes[mask],
bbox_weights[mask],
avg_factor=avg_factor.item(),
)
losses = {}
losses["loss_bbox"] = loss_bbox
return losses
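    # Shape walk-through (hypothetical numbers): with 16 matched pairs, src_boxes and
    # target_boxes are [16, box_dim]; bbox_weights broadcasts self.code_weights across
    # rows, so dimensions weighted 0.2 contribute proportionally less to the L1 loss.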
def loss_labels(self, outputs, gt_instances: List[Instances], indices, log=False):
"""Classification loss (NLL)
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
indices: [(src_idx, tgt_idx)]
"""
# [bs=1, num_query, num_classes]
src_logits = outputs["pred_logits"]
# batch_idx, src_idx
idx = self._get_src_permutation_idx(indices)
# [bs, num_query]
target_classes = torch.full(
src_logits.shape[:2],
self.num_classes,
dtype=torch.int64,
device=src_logits.device,
)
        # The matched gt index of a disappeared track query is set to -1.
        labels = []
        for gt_per_img, (_, J) in zip(gt_instances, indices):
            labels_per_img = torch.ones_like(J) * self.num_classes
            # slots with J == -1 (disappeared tracks) keep the no-object label
if len(gt_per_img) > 0:
labels_per_img[J != -1] = gt_per_img.labels[J[J != -1]]
labels.append(labels_per_img)
# [num_matched]
target_classes_o = torch.cat(labels)
# [bs, num_query]
target_classes[idx] = target_classes_o
label_weights = torch.ones_like(target_classes)
# float tensor
        avg_factor = target_classes_o.numel()  # positives + matched gts of disappeared tracks
avg_factor = reduce_mean(src_logits.new_tensor([avg_factor]))
loss_ce = self.loss_cls(
src_logits.flatten(0, 1),
target_classes.flatten(0),
label_weights.flatten(0),
avg_factor,
)
losses = {"loss_cls": loss_ce}
if log:
# TODO this should probably be a separate loss, not hacked in this one here
losses["class_error"] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
return losses
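    # Target-construction sketch (hypothetical numbers): with num_classes=10,
    # num_query=300 and matches at query slots [5, 7] onto gt labels [2, 4],
    # target_classes is a [1, 300] tensor filled with 10 (no-object) except
    # target_classes[0, 5] = 2 and target_classes[0, 7] = 4.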
def match_for_single_frame(self, outputs: dict, dec_lvl: int, if_step=False):
# initialize tracklets
outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"}
track_instances: Instances = outputs_without_aux["track_instances"]
pred_logits_i = track_instances.pred_logits # predicted logits of i-th image.
pred_boxes_i = track_instances.pred_boxes # predicted boxes of i-th image.
outputs_i = {
"pred_logits": pred_logits_i.unsqueeze(0),
"pred_boxes": pred_boxes_i.unsqueeze(0),
}
        # process GT: gt instances of the i-th frame.
        gt_instances_i = self.gt_instances[self._current_frame_idx]
obj_idxes = gt_instances_i.obj_ids # real unique IDs
obj_idxes_list = obj_idxes.detach().cpu().numpy().tolist()
obj_idx_to_gt_idx = {
obj_idx: gt_idx for gt_idx, obj_idx in enumerate(obj_idxes_list)
}
# step1. inherit and update the previous tracks.
num_disappear_track = 0
for j in range(len(track_instances)):
obj_id = track_instances.obj_idxes[j].item()
# set new target idx.
            if obj_id >= 0:
                # the tracked object is still among this frame's GT IDs
                if obj_id in obj_idx_to_gt_idx:
                    track_instances.matched_gt_idxes[j] = obj_idx_to_gt_idx[obj_id]
                # the tracked object is no longer in the GT, i.e., it disappeared
                else:
                    num_disappear_track += 1
                    track_instances.matched_gt_idxes[j] = -1  # track-disappear case.
            # slots not tracking any object yet, i.e., without an assigned ID
            else:
                track_instances.matched_gt_idxes[j] = -1
        # previously tracked slots, matched by the inheritance rule above
full_track_idxes = torch.arange(len(track_instances), dtype=torch.long).to(
pred_logits_i.device
)
matched_track_idxes = track_instances.obj_idxes >= 0
prev_matched_indices = torch.stack(
[
full_track_idxes[matched_track_idxes],
track_instances.matched_gt_idxes[matched_track_idxes],
],
dim=1,
).to(pred_logits_i.device)
# step2. select the unmatched slots.
# note that the FP tracks whose obj_idxes are -2 will not be selected here.
unmatched_track_idxes = full_track_idxes[track_instances.obj_idxes == -1]
# step3. select the untracked gt instances (new tracks).
tgt_indexes = track_instances.matched_gt_idxes
tgt_indexes = tgt_indexes[tgt_indexes != -1]
tgt_state = torch.zeros(len(gt_instances_i)).to(pred_logits_i.device)
tgt_state[tgt_indexes] = 1
# new tgt indexes
untracked_tgt_indexes = torch.arange(len(gt_instances_i)).to(
pred_logits_i.device
)[tgt_state == 0]
# [num_untracked]
untracked_gt_instances = gt_instances_i[untracked_tgt_indexes]
def match_for_single_decoder_layer(unmatched_outputs, matcher):
bbox_preds, cls_preds = (
unmatched_outputs["pred_boxes"],
unmatched_outputs["pred_logits"],
)
            bs, num_queries = bbox_preds.shape[:2]
# Also concat the target labels and boxes
targets = [untracked_gt_instances]
if isinstance(targets[0], Instances):
# [num_box], [num_box, 9] (un-normalized bboxes)
gt_labels = torch.cat([gt_per_img.labels for gt_per_img in targets])
gt_bboxes = torch.cat([gt_per_img.boxes for gt_per_img in targets])
else:
gt_labels = torch.cat([v["labels"] for v in targets])
gt_bboxes = torch.cat([v["boxes"] for v in targets])
bbox_pred = bbox_preds[0]
cls_pred = cls_preds[0]
src_idx, tgt_idx = matcher.assign(bbox_pred, cls_pred, gt_bboxes, gt_labels)
if src_idx is None:
return None
# concat src and tgt.
new_matched_indices = torch.stack(
[unmatched_track_idxes[src_idx], untracked_tgt_indexes[tgt_idx]], dim=1
).to(pred_logits_i.device)
return new_matched_indices
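        # The assigner is expected to return local (src_idx, tgt_idx) pairs over the
        # unmatched slots and untracked GTs; indexing unmatched_track_idxes and
        # untracked_tgt_indexes with them lifts the match back to global indices.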
# step4. do matching between the unmatched slots and GTs.
unmatched_outputs = {
# [bs, num_pred, num_classes]
"pred_logits": track_instances.pred_logits[unmatched_track_idxes].unsqueeze(
0
),
# [bs, num_pred, box_dim]
"pred_boxes": track_instances.pred_boxes[unmatched_track_idxes].unsqueeze(
0
),
}
# [num_new_matched, 2], mapping between track index -> GT index
new_matched_indices = match_for_single_decoder_layer(
unmatched_outputs, self.matcher
)
# step5. update obj_idxes according to the new matching result.
# i.e., copy the global unique IDs from GT to tracklets
if new_matched_indices is not None:
track_instances.obj_idxes[
new_matched_indices[:, 0]
] = gt_instances_i.obj_ids[new_matched_indices[:, 1]].long()
track_instances.matched_gt_idxes[
new_matched_indices[:, 0]
] = new_matched_indices[:, 1]
            # step6. merge the new matched pairs and the previously matched pairs.
            # [num_new_matched + num_prev_matched, 2]
matched_indices = torch.cat(
[new_matched_indices, prev_matched_indices], dim=0
)
else:
matched_indices = prev_matched_indices
        # step7. calculate losses.
self.num_samples += len(gt_instances_i) + num_disappear_track
self.sample_device = pred_logits_i.device
for loss in self.losses:
new_track_loss = self.get_loss(
loss,
outputs=outputs_i,
gt_instances=[gt_instances_i],
indices=[(matched_indices[:, 0], matched_indices[:, 1])],
)
self.losses_dict.update(
{
"frame_{}_{}_{}".format(
self._current_frame_idx, key, dec_lvl
): value
for key, value in new_track_loss.items()
}
)
if "aux_outputs" in outputs:
for i, aux_outputs in enumerate(outputs["aux_outputs"]):
unmatched_outputs_layer = {
"pred_logits": aux_outputs["pred_logits"][
0, unmatched_track_idxes
].unsqueeze(0),
"pred_boxes": aux_outputs["pred_boxes"][
0, unmatched_track_idxes
].unsqueeze(0),
}
new_matched_indices_layer = match_for_single_decoder_layer(
unmatched_outputs_layer, self.matcher
)
                if new_matched_indices_layer is not None:
                    matched_indices_layer = torch.cat(
                        [new_matched_indices_layer, prev_matched_indices], dim=0
                    )
                else:
                    matched_indices_layer = prev_matched_indices
for loss in self.losses:
if loss == "masks":
# Intermediate masks losses are too costly to compute, we ignore them.
continue
l_dict = self.get_loss(
loss,
aux_outputs,
gt_instances=[gt_instances_i],
indices=[
(matched_indices_layer[:, 0], matched_indices_layer[:, 1])
],
)
self.losses_dict.update(
{
"frame_{}_aux{}_{}".format(
self._current_frame_idx, i, key
): value
for key, value in l_dict.items()
}
)
if if_step:
self._step()
return track_instances
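    # Typical per-clip driver (a sketch; the surrounding model issues these calls):
    #     criterion.initialize_for_single_clip(gt_instances_per_frame)
    #     for each frame and decoder level:
    #         track_instances = criterion.match_for_single_frame(outputs, dec_lvl, if_step=True)
    #     losses = criterion(outputs, input_data)  # normalizes losses_dict by num_boxes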
def forward(self, outputs, input_data: dict):
        # Per-frame losses are computed during the model's forward pass and returned
        # by the model in outputs['losses_dict'].
losses = outputs.pop("losses_dict")
num_samples = self.get_num_boxes(self.num_samples)
for loss_name, loss in losses.items():
losses[loss_name] /= num_samples
return losses
def prediction_loss(self, track_instances, predictions):
decay_ratio = 1.0
for i in range(self._current_frame_idx, len(self.gt_instances)):
gt_instances_i = self.gt_instances[i] # gt instances of i-th image.
pred_boxes_i = predictions[i - self._current_frame_idx]
obj_idxes = gt_instances_i.obj_ids
obj_idxes_list = obj_idxes.detach().cpu().numpy().tolist()
obj_idx_to_gt_idx = {
obj_idx: gt_idx for gt_idx, obj_idx in enumerate(obj_idxes_list)
}
num_paired = 0
for j in range(len(track_instances)):
obj_id = track_instances.obj_idxes[j].item()
# set new target idx.
if obj_id >= 0:
if obj_id in obj_idx_to_gt_idx:
track_instances.matched_gt_idxes[j] = obj_idx_to_gt_idx[obj_id]
num_paired += 1
else:
                        track_instances.matched_gt_idxes[j] = -1  # track-disappear case.
else:
track_instances.matched_gt_idxes[j] = -1
if num_paired > 0:
if_paired_i = track_instances.matched_gt_idxes >= 0
paired_pred_boxes_i = pred_boxes_i[if_paired_i]
paired_gt_instances = gt_instances_i[
track_instances.matched_gt_idxes[if_paired_i]
]
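                # Assumed box encoding (cx, cy, w, l, cz, h, ...): the z-center is
                # read from index 4, so only the 3D center is supervised here.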
normalized_bboxes = paired_gt_instances.boxes
cx = normalized_bboxes[..., 0:1]
cy = normalized_bboxes[..., 1:2]
cz = normalized_bboxes[..., 4:5]
gt_boxes_i = torch.cat([cx, cy, cz], dim=-1)
pred_loss_i = (
0.2
* decay_ratio
* self.loss_predictions(paired_pred_boxes_i, gt_boxes_i)
.sum(dim=-1)
.mean()
)
self.losses_dict["pred_loss_{}".format(i)] = pred_loss_i
else:
                self.losses_dict["pred_loss_{}".format(i)] = torch.zeros(1, device=pred_boxes_i.device)
decay_ratio = decay_ratio * 0.5
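        # Decay sketch: the frame at offset k from the current frame contributes
        # weight 0.2 * 0.5**k (0.2, 0.1, 0.05, ...), so near-future center
        # predictions dominate the prediction loss.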