Alfred Liu committed on
Commit ·
c2f1911
1
Parent(s): e3f4e57
Add comments to core components
Browse files- models/sparsebev_head.py +6 -1
- models/sparsebev_sampling.py +32 -4
- models/sparsebev_transformer.py +11 -1
models/sparsebev_head.py
CHANGED
|
@@ -70,6 +70,7 @@ class SparseBEVHead(DETRHead):
|
|
| 70 |
query_bbox = self.init_query_bbox.weight.clone() # [Q, 10]
|
| 71 |
#query_bbox[..., :3] = query_bbox[..., :3].sigmoid()
|
| 72 |
|
|
|
|
| 73 |
B = mlvl_feats[0].shape[0]
|
| 74 |
query_bbox, query_feat, attn_mask, mask_dict = self.prepare_for_dn_input(B, query_bbox, self.label_enc, img_metas)
|
| 75 |
|
|
@@ -92,7 +93,7 @@ class SparseBEVHead(DETRHead):
|
|
| 92 |
bbox_preds[..., 5:10],
|
| 93 |
], dim=-1) # [cx, cy, w, l, cz, h, sin, cos, vx, vy]
|
| 94 |
|
| 95 |
-
if mask_dict is not None and mask_dict['pad_size'] > 0:
|
| 96 |
output_known_cls_scores = cls_scores[:, :, :mask_dict['pad_size'], :]
|
| 97 |
output_known_bbox_preds = bbox_preds[:, :, :mask_dict['pad_size'], :]
|
| 98 |
output_cls_scores = cls_scores[:, :, mask_dict['pad_size']:, :]
|
|
@@ -116,6 +117,10 @@ class SparseBEVHead(DETRHead):
|
|
| 116 |
return outs
|
| 117 |
|
| 118 |
def prepare_for_dn_input(self, batch_size, init_query_bbox, label_enc, img_metas):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
device = init_query_bbox.device
|
| 120 |
indicator0 = torch.zeros([self.num_query, 1], device=device)
|
| 121 |
init_query_feat = label_enc.weight[self.num_classes].repeat(self.num_query, 1)
|
|
|
|
| 70 |
query_bbox = self.init_query_bbox.weight.clone() # [Q, 10]
|
| 71 |
#query_bbox[..., :3] = query_bbox[..., :3].sigmoid()
|
| 72 |
|
| 73 |
+
# query denoising
|
| 74 |
B = mlvl_feats[0].shape[0]
|
| 75 |
query_bbox, query_feat, attn_mask, mask_dict = self.prepare_for_dn_input(B, query_bbox, self.label_enc, img_metas)
|
| 76 |
|
|
|
|
| 93 |
bbox_preds[..., 5:10],
|
| 94 |
], dim=-1) # [cx, cy, w, l, cz, h, sin, cos, vx, vy]
|
| 95 |
|
| 96 |
+
if mask_dict is not None and mask_dict['pad_size'] > 0: # if using query denoising
|
| 97 |
output_known_cls_scores = cls_scores[:, :, :mask_dict['pad_size'], :]
|
| 98 |
output_known_bbox_preds = bbox_preds[:, :, :mask_dict['pad_size'], :]
|
| 99 |
output_cls_scores = cls_scores[:, :, mask_dict['pad_size']:, :]
|
|
|
|
| 117 |
return outs
|
| 118 |
|
| 119 |
def prepare_for_dn_input(self, batch_size, init_query_bbox, label_enc, img_metas):
|
| 120 |
+
# mostly borrowed from:
|
| 121 |
+
# - https://github.com/IDEA-Research/DN-DETR/blob/main/models/DN_DAB_DETR/dn_components.py
|
| 122 |
+
# - https://github.com/megvii-research/PETR/blob/main/projects/mmdet3d_plugin/models/dense_heads/petrv2_dnhead.py
|
| 123 |
+
|
| 124 |
device = init_query_bbox.device
|
| 125 |
indicator0 = torch.zeros([self.num_query, 1], device=device)
|
| 126 |
init_query_feat = label_enc.weight[self.num_classes].repeat(self.num_query, 1)
|
models/sparsebev_sampling.py
CHANGED
|
@@ -25,9 +25,25 @@ def make_sample_points(query_bbox, offset, pc_range):
|
|
| 25 |
|
| 26 |
|
| 27 |
def sampling_4d(sample_points, mlvl_feats, scale_weights, lidar2img, image_h, image_w, eps=1e-5):
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
N = 6
|
| 30 |
-
|
| 31 |
sample_points = sample_points.reshape(B, Q, T, G * P, 3)
|
| 32 |
|
| 33 |
# get the projection matrix
|
|
@@ -42,7 +58,7 @@ def sampling_4d(sample_points, mlvl_feats, scale_weights, lidar2img, image_h, im
|
|
| 42 |
sample_points = sample_points.expand(B, Q, N, T, G * P, 4, 1)
|
| 43 |
sample_points = sample_points.transpose(1, 3) # [B, T, N, Q, GP, 4, 1]
|
| 44 |
|
| 45 |
-
# project 3d sampling points to
|
| 46 |
sample_points_cam = torch.matmul(lidar2img, sample_points).squeeze(-1) # [B, T, N, Q, GP, 4]
|
| 47 |
|
| 48 |
# homo coord -> pixel coord
|
|
@@ -62,6 +78,7 @@ def sampling_4d(sample_points, mlvl_feats, scale_weights, lidar2img, image_h, im
|
|
| 62 |
& (sample_points_cam[..., 0:1] < 1.0)
|
| 63 |
).squeeze(-1).float() # [B, T, N, Q, GP]
|
| 64 |
|
|
|
|
| 65 |
if DUMP.enabled:
|
| 66 |
torch.save(torch.cat([sample_points_cam, homo_nonzero], dim=-1),
|
| 67 |
'{}/sample_points_cam_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
|
|
@@ -71,6 +88,7 @@ def sampling_4d(sample_points, mlvl_feats, scale_weights, lidar2img, image_h, im
|
|
| 71 |
valid_mask = valid_mask.permute(0, 1, 3, 4, 2) # [B, T, Q, GP, N]
|
| 72 |
sample_points_cam = sample_points_cam.permute(0, 1, 3, 4, 2, 5) # [B, T, Q, GP, N, 2]
|
| 73 |
|
|
|
|
| 74 |
i_batch = torch.arange(B, dtype=torch.long, device=sample_points.device)
|
| 75 |
i_query = torch.arange(Q, dtype=torch.long, device=sample_points.device)
|
| 76 |
i_time = torch.arange(T, dtype=torch.long, device=sample_points.device)
|
|
@@ -79,21 +97,31 @@ def sampling_4d(sample_points, mlvl_feats, scale_weights, lidar2img, image_h, im
|
|
| 79 |
i_time = i_time.view(1, T, 1, 1, 1).expand(B, T, Q, G * P, 1)
|
| 80 |
i_query = i_query.view(1, 1, Q, 1, 1).expand(B, T, Q, G * P, 1)
|
| 81 |
i_point = i_point.view(1, 1, 1, G * P, 1).expand(B, T, Q, G * P, 1)
|
|
|
|
|
|
|
| 82 |
i_view = torch.argmax(valid_mask, dim=-1)[..., None] # [B, T, Q, GP, 1]
|
| 83 |
|
|
|
|
| 84 |
sample_points_cam = sample_points_cam[i_batch, i_time, i_query, i_point, i_view, :] # [B, Q, GP, 1, 2]
|
| 85 |
valid_mask = valid_mask[i_batch, i_time, i_query, i_point, i_view] # [B, Q, GP, 1]
|
| 86 |
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
| 88 |
sample_points_cam = sample_points_cam.reshape(B, T, Q, G, P, 1, 3)
|
| 89 |
sample_points_cam = sample_points_cam.permute(0, 1, 3, 2, 4, 5, 6) # [B, T, G, Q, P, 1, 3]
|
| 90 |
sample_points_cam = sample_points_cam.reshape(B*T*G, Q, P, 3)
|
| 91 |
|
|
|
|
| 92 |
scale_weights = scale_weights.reshape(B, Q, G, T, P, -1)
|
| 93 |
scale_weights = scale_weights.permute(0, 2, 3, 1, 4, 5)
|
| 94 |
scale_weights = scale_weights.reshape(B*G*T, Q, P, -1)
|
| 95 |
|
|
|
|
| 96 |
final = msmv_sampling(mlvl_feats, sample_points_cam, scale_weights)
|
|
|
|
|
|
|
| 97 |
C = final.shape[2] # [BTG, Q, C, P]
|
| 98 |
final = final.reshape(B, T, G, Q, C, P)
|
| 99 |
final = final.permute(0, 3, 2, 1, 5, 4)
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def sampling_4d(sample_points, mlvl_feats, scale_weights, lidar2img, image_h, image_w, eps=1e-5):
|
| 28 |
+
"""
|
| 29 |
+
Args:
|
| 30 |
+
sample_points: 3D sampling points in shape [B, Q, T, G, P, 3]
|
| 31 |
+
mlvl_feats: list of multi-scale features from neck, each in shape [B*T*G, C, N, H, W]
|
| 32 |
+
scale_weights: weights for multi-scale aggregation, [B, Q, G, T, P, L]
|
| 33 |
+
lidar2img: 4x4 projection matrix in shape [B, TN, 4, 4]
|
| 34 |
+
Symbol meaning:
|
| 35 |
+
B: batch size
|
| 36 |
+
Q: num of queries
|
| 37 |
+
T: num of frames
|
| 38 |
+
G: num of groups (we follow the group sampling mechanism of AdaMixer)
|
| 39 |
+
P: num of sampling points per frame per group
|
| 40 |
+
N: num of views (six for nuScenes)
|
| 41 |
+
L: num of layers of feature pyramid (typically it is 4: C2, C3, C4, C5)
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
B, Q, T, G, P, _ = sample_points.shape # [B, Q, T, G, P, 3]
|
| 45 |
N = 6
|
| 46 |
+
|
| 47 |
sample_points = sample_points.reshape(B, Q, T, G * P, 3)
|
| 48 |
|
| 49 |
# get the projection matrix
|
|
|
|
| 58 |
sample_points = sample_points.expand(B, Q, N, T, G * P, 4, 1)
|
| 59 |
sample_points = sample_points.transpose(1, 3) # [B, T, N, Q, GP, 4, 1]
|
| 60 |
|
| 61 |
+
# project 3d sampling points to N views
|
| 62 |
sample_points_cam = torch.matmul(lidar2img, sample_points).squeeze(-1) # [B, T, N, Q, GP, 4]
|
| 63 |
|
| 64 |
# homo coord -> pixel coord
|
|
|
|
| 78 |
& (sample_points_cam[..., 0:1] < 1.0)
|
| 79 |
).squeeze(-1).float() # [B, T, N, Q, GP]
|
| 80 |
|
| 81 |
+
# for visualization only
|
| 82 |
if DUMP.enabled:
|
| 83 |
torch.save(torch.cat([sample_points_cam, homo_nonzero], dim=-1),
|
| 84 |
'{}/sample_points_cam_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
|
|
|
|
| 88 |
valid_mask = valid_mask.permute(0, 1, 3, 4, 2) # [B, T, Q, GP, N]
|
| 89 |
sample_points_cam = sample_points_cam.permute(0, 1, 3, 4, 2, 5) # [B, T, Q, GP, N, 2]
|
| 90 |
|
| 91 |
+
# prepare batched indexing
|
| 92 |
i_batch = torch.arange(B, dtype=torch.long, device=sample_points.device)
|
| 93 |
i_query = torch.arange(Q, dtype=torch.long, device=sample_points.device)
|
| 94 |
i_time = torch.arange(T, dtype=torch.long, device=sample_points.device)
|
|
|
|
| 97 |
i_time = i_time.view(1, T, 1, 1, 1).expand(B, T, Q, G * P, 1)
|
| 98 |
i_query = i_query.view(1, 1, Q, 1, 1).expand(B, T, Q, G * P, 1)
|
| 99 |
i_point = i_point.view(1, 1, 1, G * P, 1).expand(B, T, Q, G * P, 1)
|
| 100 |
+
|
| 101 |
+
# we only keep at most one valid sampling point, see https://zhuanlan.zhihu.com/p/654821380
|
| 102 |
i_view = torch.argmax(valid_mask, dim=-1)[..., None] # [B, T, Q, GP, 1]
|
| 103 |
|
| 104 |
+
# index the only one sampling point and its valid flag
|
| 105 |
sample_points_cam = sample_points_cam[i_batch, i_time, i_query, i_point, i_view, :] # [B, Q, GP, 1, 2]
|
| 106 |
valid_mask = valid_mask[i_batch, i_time, i_query, i_point, i_view] # [B, Q, GP, 1]
|
| 107 |
|
| 108 |
+
# treat the view index as a new axis for grid_sample and normalize the view index to [0, 1]
|
| 109 |
+
sample_points_cam = torch.cat([sample_points_cam, i_view[..., None].float() / (N - 1)], dim=-1)
|
| 110 |
+
|
| 111 |
+
# reorganize the tensor to stack T and G to the batch dim for better parallelism
|
| 112 |
sample_points_cam = sample_points_cam.reshape(B, T, Q, G, P, 1, 3)
|
| 113 |
sample_points_cam = sample_points_cam.permute(0, 1, 3, 2, 4, 5, 6) # [B, T, G, Q, P, 1, 3]
|
| 114 |
sample_points_cam = sample_points_cam.reshape(B*T*G, Q, P, 3)
|
| 115 |
|
| 116 |
+
# reorganize the tensor to stack T and G to the batch dim for better parallelism
|
| 117 |
scale_weights = scale_weights.reshape(B, Q, G, T, P, -1)
|
| 118 |
scale_weights = scale_weights.permute(0, 2, 3, 1, 4, 5)
|
| 119 |
scale_weights = scale_weights.reshape(B*G*T, Q, P, -1)
|
| 120 |
|
| 121 |
+
# multi-scale multi-view grid sample
|
| 122 |
final = msmv_sampling(mlvl_feats, sample_points_cam, scale_weights)
|
| 123 |
+
|
| 124 |
+
# reorganize the sampled features
|
| 125 |
C = final.shape[2] # [BTG, Q, C, P]
|
| 126 |
final = final.reshape(B, T, G, Q, C, P)
|
| 127 |
final = final.permute(0, 3, 2, 1, 5, 4)
|
models/sparsebev_transformer.py
CHANGED
|
@@ -43,6 +43,7 @@ class SparseBEVTransformerDecoder(BaseModule):
|
|
| 43 |
self.num_layers = num_layers
|
| 44 |
self.pc_range = pc_range
|
| 45 |
|
|
|
|
| 46 |
self.decoder_layer = SparseBEVTransformerDecoderLayer(
|
| 47 |
embed_dims, num_frames, num_points, num_levels, num_classes, code_size, pc_range=pc_range
|
| 48 |
)
|
|
@@ -54,6 +55,7 @@ class SparseBEVTransformerDecoder(BaseModule):
|
|
| 54 |
def forward(self, query_bbox, query_feat, mlvl_feats, attn_mask, img_metas):
|
| 55 |
cls_scores, bbox_preds = [], []
|
| 56 |
|
|
|
|
| 57 |
timestamps = np.array([m['img_timestamp'] for m in img_metas], dtype=np.float64)
|
| 58 |
timestamps = np.reshape(timestamps, [query_bbox.shape[0], -1, 6])
|
| 59 |
time_diff = timestamps[:, :1, :] - timestamps
|
|
@@ -61,10 +63,12 @@ class SparseBEVTransformerDecoder(BaseModule):
|
|
| 61 |
time_diff = torch.from_numpy(time_diff).to(query_bbox.device) # [B, F]
|
| 62 |
img_metas[0]['time_diff'] = time_diff
|
| 63 |
|
|
|
|
| 64 |
lidar2img = np.asarray([m['lidar2img'] for m in img_metas]).astype(np.float32)
|
| 65 |
lidar2img = torch.from_numpy(lidar2img).to(query_bbox.device) # [B, N, 4, 4]
|
| 66 |
img_metas[0]['lidar2img'] = lidar2img
|
| 67 |
|
|
|
|
| 68 |
for lvl, feat in enumerate(mlvl_feats):
|
| 69 |
B, TN, GC, H, W = feat.shape # [B, TN, GC, H, W]
|
| 70 |
N, T, G, C = 6, TN // 6, 4, GC // 4
|
|
@@ -164,6 +168,7 @@ class SparseBEVTransformerDecoderLayer(BaseModule):
|
|
| 164 |
bbox_pred = self.reg_branch(query_feat) # [B, Q, code_size]
|
| 165 |
bbox_pred = self.refine_bbox(query_bbox, bbox_pred)
|
| 166 |
|
|
|
|
| 167 |
time_diff = img_metas[0]['time_diff'] # [B, F]
|
| 168 |
if time_diff.shape[1] > 1:
|
| 169 |
time_diff = time_diff.clone()
|
|
@@ -182,6 +187,7 @@ class SparseBEVTransformerDecoderLayer(BaseModule):
|
|
| 182 |
|
| 183 |
|
| 184 |
class SparseBEVSelfAttention(BaseModule):
|
|
|
|
| 185 |
def __init__(self, embed_dims=256, num_heads=8, dropout=0.1, pc_range=[], init_cfg=None):
|
| 186 |
super().__init__(init_cfg)
|
| 187 |
self.pc_range = pc_range
|
|
@@ -207,8 +213,10 @@ class SparseBEVSelfAttention(BaseModule):
|
|
| 207 |
|
| 208 |
tau = tau.permute(0, 2, 1) # [B, 8, Q]
|
| 209 |
attn_mask = dist[:, None, :, :] * tau[..., None] # [B, 8, Q, Q]
|
| 210 |
-
|
|
|
|
| 211 |
attn_mask[:, :, pre_attn_mask] = float('-inf')
|
|
|
|
| 212 |
attn_mask = attn_mask.flatten(0, 1) # [Bx8, Q, Q]
|
| 213 |
return self.attention(query_feat, attn_mask=attn_mask)
|
| 214 |
|
|
@@ -234,6 +242,7 @@ class SparseBEVSelfAttention(BaseModule):
|
|
| 234 |
|
| 235 |
|
| 236 |
class SparseBEVSampling(BaseModule):
|
|
|
|
| 237 |
def __init__(self, embed_dims=256, num_frames=4, num_groups=4, num_points=8, num_levels=4, pc_range=[], init_cfg=None):
|
| 238 |
super().__init__(init_cfg)
|
| 239 |
|
|
@@ -302,6 +311,7 @@ class SparseBEVSampling(BaseModule):
|
|
| 302 |
|
| 303 |
|
| 304 |
class AdaptiveMixing(nn.Module):
|
|
|
|
| 305 |
def __init__(self, in_dim, in_points, n_groups=1, query_dim=None, out_dim=None, out_points=None):
|
| 306 |
super(AdaptiveMixing, self).__init__()
|
| 307 |
|
|
|
|
| 43 |
self.num_layers = num_layers
|
| 44 |
self.pc_range = pc_range
|
| 45 |
|
| 46 |
+
# params are shared across all decoder layers
|
| 47 |
self.decoder_layer = SparseBEVTransformerDecoderLayer(
|
| 48 |
embed_dims, num_frames, num_points, num_levels, num_classes, code_size, pc_range=pc_range
|
| 49 |
)
|
|
|
|
| 55 |
def forward(self, query_bbox, query_feat, mlvl_feats, attn_mask, img_metas):
|
| 56 |
cls_scores, bbox_preds = [], []
|
| 57 |
|
| 58 |
+
# calculate time difference according to timestamps
|
| 59 |
timestamps = np.array([m['img_timestamp'] for m in img_metas], dtype=np.float64)
|
| 60 |
timestamps = np.reshape(timestamps, [query_bbox.shape[0], -1, 6])
|
| 61 |
time_diff = timestamps[:, :1, :] - timestamps
|
|
|
|
| 63 |
time_diff = torch.from_numpy(time_diff).to(query_bbox.device) # [B, F]
|
| 64 |
img_metas[0]['time_diff'] = time_diff
|
| 65 |
|
| 66 |
+
# organize projections matrix and copy to CUDA
|
| 67 |
lidar2img = np.asarray([m['lidar2img'] for m in img_metas]).astype(np.float32)
|
| 68 |
lidar2img = torch.from_numpy(lidar2img).to(query_bbox.device) # [B, N, 4, 4]
|
| 69 |
img_metas[0]['lidar2img'] = lidar2img
|
| 70 |
|
| 71 |
+
# group image features in advance for sampling, see `sampling_4d` for more details
|
| 72 |
for lvl, feat in enumerate(mlvl_feats):
|
| 73 |
B, TN, GC, H, W = feat.shape # [B, TN, GC, H, W]
|
| 74 |
N, T, G, C = 6, TN // 6, 4, GC // 4
|
|
|
|
| 168 |
bbox_pred = self.reg_branch(query_feat) # [B, Q, code_size]
|
| 169 |
bbox_pred = self.refine_bbox(query_bbox, bbox_pred)
|
| 170 |
|
| 171 |
+
# calculate absolute velocity according to time difference
|
| 172 |
time_diff = img_metas[0]['time_diff'] # [B, F]
|
| 173 |
if time_diff.shape[1] > 1:
|
| 174 |
time_diff = time_diff.clone()
|
|
|
|
| 187 |
|
| 188 |
|
| 189 |
class SparseBEVSelfAttention(BaseModule):
|
| 190 |
+
"""Scale-adaptive Self Attention"""
|
| 191 |
def __init__(self, embed_dims=256, num_heads=8, dropout=0.1, pc_range=[], init_cfg=None):
|
| 192 |
super().__init__(init_cfg)
|
| 193 |
self.pc_range = pc_range
|
|
|
|
| 213 |
|
| 214 |
tau = tau.permute(0, 2, 1) # [B, 8, Q]
|
| 215 |
attn_mask = dist[:, None, :, :] * tau[..., None] # [B, 8, Q, Q]
|
| 216 |
+
|
| 217 |
+
if pre_attn_mask is not None: # for query denoising
|
| 218 |
attn_mask[:, :, pre_attn_mask] = float('-inf')
|
| 219 |
+
|
| 220 |
attn_mask = attn_mask.flatten(0, 1) # [Bx8, Q, Q]
|
| 221 |
return self.attention(query_feat, attn_mask=attn_mask)
|
| 222 |
|
|
|
|
| 242 |
|
| 243 |
|
| 244 |
class SparseBEVSampling(BaseModule):
|
| 245 |
+
"""Adaptive Spatio-temporal Sampling"""
|
| 246 |
def __init__(self, embed_dims=256, num_frames=4, num_groups=4, num_points=8, num_levels=4, pc_range=[], init_cfg=None):
|
| 247 |
super().__init__(init_cfg)
|
| 248 |
|
|
|
|
| 311 |
|
| 312 |
|
| 313 |
class AdaptiveMixing(nn.Module):
|
| 314 |
+
"""Adaptive Mixing"""
|
| 315 |
def __init__(self, in_dim, in_points, n_groups=1, query_dim=None, out_dim=None, out_points=None):
|
| 316 |
super(AdaptiveMixing, self).__init__()
|
| 317 |
|