Alfred Liu committed on
Commit d19bd3e · 1 Parent(s): b0800d3

Code release

.gitignore ADDED
@@ -0,0 +1,53 @@
+ # OS generated files
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Compiled source
+ build
+ debug
+ Debug
+ release
+ Release
+ x64
+ *.so
+ *.whl
+
+ # VS project files
+ *.sln
+ *.vcxproj
+ *.vcxproj.filters
+ *.vcxproj.user
+ *.rc
+ .vs
+
+ # Byte-compiled / optimized / DLL files
+ *__pycache__*
+ *.py[cod]
+ *$py.class
+
+ # Distribution / packaging
+ .Python
+ build
+ develop-eggs
+ dist
+ downloads
+
+ # IDE
+ .idea
+ .vscode
+ pyrightconfig.json
+
+ # Custom
+ data
+ outputs
+ prediction
+ submission
+ checkpoints
+ pretrain
+ *.png
+ *.jpg
README.md CHANGED
@@ -1,2 +1,132 @@
  # SparseBEV
- [ICCV 2023] SparseBEV: High-Performance Sparse 3D Object Detection \\ from Multi-Camera Videos
+
+ This is the official PyTorch implementation of the paper [SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera Videos](https://arxiv.org/abs/2308.09244) (ICCV 2023).
+
+ ## Model Zoo
+
+ | Backbone | Pretrain | Input Size | Epochs | Training Cost | NDS | FPS | Config | Weights |
+ |----------|----------|------------|--------|---------------|-----|-----|--------|---------|
+ | R50 | [nuImages](https://github.com/open-mmlab/mmdetection3d/blob/main/configs/nuimages/cascade-mask-rcnn_r50_fpn_coco-20e_20e_nuim.py) | 704x256 | 36 | 28h (8x2080Ti) | 55.8 | 23.5 | [config](configs/r50_nuimg_704x256_400q_36ep.py) | [weights](https://drive.google.com/file/d/1C_Vn3iiSnSW1Dw1r0DkjJMwvHC5Y3zTN/view?usp=sharing) |
+
+ * FPS is measured on a machine with an AMD 5800X and an RTX 3090.
+ * Run-to-run noise is around 0.3 NDS.
+
+ ## Environment
+
+ Install PyTorch 2.0 + CUDA 11.8:
+
+ ```
+ conda create -n sparsebev python=3.8
+ conda activate sparsebev
+ conda install pytorch==2.0.0 torchvision==0.15.0 pytorch-cuda=11.8 -c pytorch -c nvidia
+ ```
+
+ or PyTorch 1.10.2 + CUDA 10.2 for older GPUs:
+
+ ```
+ conda create -n sparsebev python=3.8
+ conda activate sparsebev
+ conda install pytorch==1.10.2 torchvision==0.11.3 cudatoolkit=10.2 -c pytorch
+ ```
+
+ Install other dependencies:
+
+ ```
+ pip install openmim
+ mim install mmcv-full==1.6.0
+ mim install mmdet==2.28.2
+ mim install mmsegmentation==0.30.0
+ mim install mmdet3d==1.0.0rc6
+ pip install setuptools==59.5.0
+ pip install numpy==1.23.5
+ ```
+
+ Install turbojpeg and pillow-simd to speed up data loading (optional but recommended):
+
+ ```
+ sudo apt-get update
+ sudo apt-get install -y libturbojpeg
+ pip install pyturbojpeg
+ pip uninstall pillow
+ pip install pillow-simd==9.0.0.post1
+ ```
+
+ Compile CUDA extensions:
+
+ ```
+ cd models/csrc
+ python setup.py build_ext --inplace
+ ```
+
+ ## Prepare Dataset
+
+ 1. Download nuScenes from [https://www.nuscenes.org/nuscenes](https://www.nuscenes.org/nuscenes) and put it in `data/nuscenes`.
+ 2. Download the generated info files from [Google Drive](https://drive.google.com/file/d/1uyoUuSRIVScrm_CUpge6V_UzwDT61ODO/view?usp=sharing) and unzip them.
+ 3. Folder structure:
+
+ ```
+ data/nuscenes
+ ├── maps
+ ├── nuscenes_infos_test_sweep.pkl
+ ├── nuscenes_infos_train_mini_sweep.pkl
+ ├── nuscenes_infos_train_sweep.pkl
+ ├── nuscenes_infos_val_mini_sweep.pkl
+ ├── nuscenes_infos_val_sweep.pkl
+ ├── samples
+ ├── sweeps
+ ├── v1.0-mini
+ ├── v1.0-test
+ └── v1.0-trainval
+ ```
+
+ These `*.pkl` files can also be generated with our script `gen_sweep_info.py`.
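As a quick sanity check after generation, each entry in an info file should carry a `sweeps` list. The following is a minimal sketch using a made-up one-sample file (not part of the release; the dict layout mirrors what `gen_sweep_info.py` writes):

```python
import pickle
import tempfile

# hypothetical minimal structure mirroring a generated info file
infos = {'infos': [{'token': 'demo_token', 'cams': {}, 'sweeps': []}]}

# round-trip through a temporary pickle file
with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
    pickle.dump(infos, f)
    path = f.name

with open(path, 'rb') as f:
    loaded = pickle.load(f)

# every sample info should have a (possibly empty) sweeps list
assert all('sweeps' in s for s in loaded['infos'])
```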
+
+ ## Training
+
+ Train SparseBEV with 8 GPUs:
+
+ ```
+ torchrun --nproc_per_node 8 train.py --config configs/r50_nuimg_704x256_400q_36ep.py
+ ```
+
+ Train SparseBEV with 4 GPUs (i.e., the last four GPUs):
+
+ ```
+ export CUDA_VISIBLE_DEVICES=4,5,6,7
+ torchrun --nproc_per_node 4 train.py --config configs/r50_nuimg_704x256_400q_36ep.py
+ ```
+
+ The per-GPU batch size is scaled automatically, so there is no need to modify `batch_size` in the config files.
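The per-GPU scaling can be sketched as follows (a simplification for illustration, not the actual `train.py` logic; the function name is ours):

```python
def per_gpu_batch_size(batch_size: int, world_size: int) -> int:
    """Split the config-level batch size evenly across GPUs."""
    assert batch_size % world_size == 0, "batch_size must divide evenly across GPUs"
    return batch_size // world_size

# with batch_size = 8 in the config: 1 sample per GPU on 8 GPUs, 2 on 4 GPUs
print(per_gpu_batch_size(8, 8), per_gpu_batch_size(8, 4))  # 1 2
```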
+
+ ## Evaluation
+
+ Single-GPU evaluation:
+
+ ```
+ export CUDA_VISIBLE_DEVICES=0
+ python val.py --config configs/r50_nuimg_704x256_400q_36ep.py --weights checkpoints/r50_nuimg_704x256_400q_36ep.pth
+ ```
+
+ Multi-GPU evaluation:
+
+ ```
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ torchrun --nproc_per_node 8 val.py --config configs/r50_nuimg_704x256_400q_36ep.py --weights checkpoints/r50_nuimg_704x256_400q_36ep.pth
+ ```
+
+ ## Timing
+
+ FPS is measured with a single GPU:
+
+ ```
+ export CUDA_VISIBLE_DEVICES=0
+ python timing.py --config configs/r50_nuimg_704x256_400q_36ep.py --weights checkpoints/r50_nuimg_704x256_400q_36ep.pth
+ ```
+
+ ## Acknowledgements
+
+ Many thanks to these excellent open-source projects:
+
+ * 3D Detection: [DETR3D](https://github.com/WangYueFt/detr3d), [PETR](https://github.com/megvii-research/PETR), [BEVFormer](https://github.com/fundamentalvision/BEVFormer), [BEVDet](https://github.com/HuangJunJie2017/BEVDet), [StreamPETR](https://github.com/exiawsh/StreamPETR)
+ * 2D Detection: [AdaMixer](https://github.com/MCG-NJU/AdaMixer), [DN-DETR](https://github.com/IDEA-Research/DN-DETR)
+ * Codebase: [MMDetection3D](https://github.com/open-mmlab/mmdetection3d), [CamLiFlow](https://github.com/MCG-NJU/CamLiFlow)
configs/r101_nuimg_1408x512_900q_24ep.py ADDED
@@ -0,0 +1,96 @@
+ _base_ = ['./r50_nuimg_704x256_400q_36ep.py']
+
+ # For nuScenes we usually do 10-class detection
+ class_names = [
+     'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+     'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+ ]
+
+ # If the point cloud range is changed, the models should also change their
+ # point cloud range accordingly
+ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+ voxel_size = [0.2, 0.2, 8]
+
+ img_backbone = dict(
+     type='ResNet',
+     depth=101,
+     with_cp=True,
+ )
+
+ img_neck = dict(
+     type='FPN',
+     in_channels=[256, 512, 1024, 2048],
+     out_channels=256,
+     num_outs=5,
+ )
+
+ model = dict(
+     img_backbone=img_backbone,
+     img_neck=img_neck,
+     pts_bbox_head=dict(
+         num_query=900,
+         transformer=dict(num_levels=5)),
+ )
+
+ ida_aug_conf = {
+     'resize_lim': (0.38 * 2, 0.55 * 2),
+     'final_dim': (512, 1408),
+     'bot_pct_lim': (0.0, 0.0),
+     'rot_lim': (0.0, 0.0),
+     'H': 900, 'W': 1600,
+     'rand_flip': True,
+ }
+
+ train_pipeline = [
+     dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
+     dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=7),
+     dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+     dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+     dict(type='ObjectNameFilter', classes=class_names),
+     dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True),
+     dict(type='GlobalRotScaleTransImage', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05]),
+     dict(type='DefaultFormatBundle3D', class_names=class_names),
+     dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], meta_keys=(
+         'filename', 'ori_shape', 'img_shape', 'pad_shape', 'lidar2img', 'img_timestamp'))
+ ]
+
+ test_pipeline = [
+     dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
+     dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=7, test_mode=True),
+     dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False),
+     dict(
+         type='MultiScaleFlipAug3D',
+         img_scale=(1600, 900),
+         pts_scale_ratio=1,
+         flip=False,
+         transforms=[
+             dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False),
+             dict(type='Collect3D', keys=['img'], meta_keys=(
+                 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape',
+                 'lidar2img', 'img_timestamp'))
+         ])
+ ]
+
+ data = dict(
+     workers_per_gpu=4,
+     train=dict(pipeline=train_pipeline),
+     val=dict(pipeline=test_pipeline),
+     test=dict(pipeline=test_pipeline)
+ )
+
+ optimizer = dict(
+     type='AdamW',
+     lr=2e-4,
+     paramwise_cfg=dict(custom_keys={
+         'img_backbone': dict(lr_mult=0.2),
+         'sampling_offset': dict(lr_mult=0.1),
+     }),
+     weight_decay=0.01
+ )
+
+ # load pretrained weights
+ load_from = 'pretrain/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth'
+ revise_keys = [('backbone', 'img_backbone')]
+
+ total_epochs = 24
+ eval_config = dict(interval=total_epochs)
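Configs like this one only override fields of their `_base_` file. A minimal sketch of the merge semantics on plain dicts (illustrative only, not the actual `mmcv.Config` machinery):

```python
def merge(base, override):
    """Recursively apply override on top of base, keeping untouched keys."""
    out = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(out.get(key), dict):
            out[key] = merge(out[key], value)  # descend into nested dicts
        else:
            out[key] = value  # leaf values are replaced wholesale
    return out

base = {'model': {'pts_bbox_head': {'num_query': 400, 'num_classes': 10}}}
child = {'model': {'pts_bbox_head': {'num_query': 900}}}

merged = merge(base, child)
# num_query is overridden to 900; num_classes is inherited from the base
print(merged['model']['pts_bbox_head'])
```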
configs/r50_in1k_704x256_900q_36ep.py ADDED
@@ -0,0 +1,21 @@
+ _base_ = ['./r50_nuimg_704x256_400q_36ep.py']
+
+ img_backbone = dict(pretrained='torchvision://resnet50')
+
+ model = dict(
+     img_backbone=img_backbone,
+     pts_bbox_head=dict(num_query=900)
+ )
+
+ optimizer = dict(
+     paramwise_cfg=dict(custom_keys={
+         'img_backbone': dict(lr_mult=0.4),
+         'sampling_offset': dict(lr_mult=0.1),
+     })
+ )
+
+ load_from = None
+ revise_keys = None
+
+ total_epochs = 36
+ eval_config = dict(interval=total_epochs)
configs/r50_nuimg_704x256_400q_36ep.py ADDED
@@ -0,0 +1,236 @@
+ dataset_type = 'CustomNuScenesDataset'
+ dataset_root = 'data/nuscenes/'
+
+ input_modality = dict(
+     use_lidar=False,
+     use_camera=True,
+     use_radar=False,
+     use_map=False,
+     use_external=True
+ )
+
+ # For nuScenes we usually do 10-class detection
+ class_names = [
+     'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+     'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+ ]
+
+ # If the point cloud range is changed, the models should also change their
+ # point cloud range accordingly
+ point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+ voxel_size = [0.2, 0.2, 8]
+
+ # arch config
+ embed_dims = 256
+ num_layers = 6
+ num_query = 400
+ num_frames = 8
+ num_levels = 4
+ num_points = 4
+
+ img_backbone = dict(
+     type='ResNet',
+     depth=50,
+     num_stages=4,
+     out_indices=(0, 1, 2, 3),
+     frozen_stages=1,
+     norm_cfg=dict(type='BN2d', requires_grad=True),
+     norm_eval=True,
+     style='pytorch',
+     with_cp=True)
+ img_neck = dict(
+     type='FPN',
+     in_channels=[256, 512, 1024, 2048],
+     out_channels=embed_dims,
+     num_outs=num_levels)
+ img_norm_cfg = dict(
+     mean=[123.675, 116.280, 103.530],
+     std=[58.395, 57.120, 57.375],
+     to_rgb=True)
+
+ model = dict(
+     type='SparseBEV',
+     data_aug=dict(
+         img_color_aug=True,  # move some augmentations to GPU
+         img_norm_cfg=img_norm_cfg,
+         img_pad_cfg=dict(size_divisor=32)),
+     stop_prev_grad=False,
+     img_backbone=img_backbone,
+     img_neck=img_neck,
+     pts_bbox_head=dict(
+         type='SparseBEVHead',
+         num_classes=10,
+         in_channels=embed_dims,
+         num_query=num_query,
+         query_denoising=True,
+         query_denoising_groups=10,
+         code_size=10,
+         code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+         sync_cls_avg_factor=True,
+         transformer=dict(
+             type='SparseBEVTransformer',
+             embed_dims=embed_dims,
+             num_frames=num_frames,
+             num_points=num_points,
+             num_layers=num_layers,
+             num_levels=num_levels,
+             num_classes=10,
+             code_size=10,
+             pc_range=point_cloud_range),
+         bbox_coder=dict(
+             type='NMSFreeCoder',
+             post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+             pc_range=point_cloud_range,
+             max_num=300,
+             voxel_size=voxel_size,
+             score_threshold=0.05,
+             num_classes=10),
+         positional_encoding=dict(
+             type='SinePositionalEncoding',
+             num_feats=embed_dims // 2,
+             normalize=True,
+             offset=-0.5),
+         loss_cls=dict(
+             type='FocalLoss',
+             use_sigmoid=True,
+             gamma=2.0,
+             alpha=0.25,
+             loss_weight=2.0),
+         loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+         loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+     train_cfg=dict(pts=dict(
+         grid_size=[512, 512, 1],
+         voxel_size=voxel_size,
+         point_cloud_range=point_cloud_range,
+         out_size_factor=4,
+         assigner=dict(
+             type='HungarianAssigner3D',
+             cls_cost=dict(type='FocalLossCost', weight=2.0),
+             reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+             iou_cost=dict(type='IoUCost', weight=0.0),
+         )
+     ))
+ )
+
+ ida_aug_conf = {
+     'resize_lim': (0.38, 0.55),
+     'final_dim': (256, 704),
+     'bot_pct_lim': (0.0, 0.0),
+     'rot_lim': (0.0, 0.0),
+     'H': 900, 'W': 1600,
+     'rand_flip': True,
+ }
+
+ train_pipeline = [
+     dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
+     dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1),
+     dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+     dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+     dict(type='ObjectNameFilter', classes=class_names),
+     dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True),
+     dict(type='GlobalRotScaleTransImage', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05]),
+     dict(type='DefaultFormatBundle3D', class_names=class_names),
+     dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], meta_keys=(
+         'filename', 'ori_shape', 'img_shape', 'pad_shape', 'lidar2img', 'img_timestamp'))
+ ]
+
+ test_pipeline = [
+     dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
+     dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True),
+     dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False),
+     dict(
+         type='MultiScaleFlipAug3D',
+         img_scale=(1600, 900),
+         pts_scale_ratio=1,
+         flip=False,
+         transforms=[
+             dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False),
+             dict(type='Collect3D', keys=['img'], meta_keys=(
+                 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape',
+                 'lidar2img', 'img_timestamp'))
+         ])
+ ]
+
+ data = dict(
+     workers_per_gpu=8,
+     train=dict(
+         type=dataset_type,
+         data_root=dataset_root,
+         ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl',
+         pipeline=train_pipeline,
+         classes=class_names,
+         modality=input_modality,
+         test_mode=False,
+         use_valid_flag=True,
+         box_type_3d='LiDAR'),
+     val=dict(
+         type=dataset_type,
+         data_root=dataset_root,
+         ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl',
+         pipeline=test_pipeline,
+         classes=class_names,
+         modality=input_modality,
+         test_mode=True,
+         box_type_3d='LiDAR'),
+     test=dict(
+         type=dataset_type,
+         data_root=dataset_root,
+         ann_file=dataset_root + 'nuscenes_custom_infos_test.pkl',
+         pipeline=test_pipeline,
+         classes=class_names,
+         modality=input_modality,
+         test_mode=True,
+         box_type_3d='LiDAR')
+ )
+
+ optimizer = dict(
+     type='AdamW',
+     lr=2e-4,
+     paramwise_cfg=dict(custom_keys={
+         'img_backbone': dict(lr_mult=0.1),
+         'sampling_offset': dict(lr_mult=0.1),
+     }),
+     weight_decay=0.01
+ )
+
+ optimizer_config = dict(
+     type='Fp16OptimizerHook',
+     loss_scale=512.0,
+     grad_clip=dict(max_norm=35, norm_type=2)
+ )
+
+ # learning policy
+ lr_config = dict(
+     policy='CosineAnnealing',
+     warmup='linear',
+     warmup_iters=500,
+     warmup_ratio=1.0 / 3,
+     min_lr_ratio=1e-3
+ )
+ total_epochs = 36
+ batch_size = 8
+
+ # load pretrained weights
+ load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth'
+ revise_keys = [('backbone', 'img_backbone')]
+
+ # resume the last training
+ resume_from = None
+
+ # checkpointing
+ checkpoint_config = dict(interval=1, max_keep_ckpts=1)
+
+ # logging
+ log_config = dict(
+     interval=1,
+     hooks=[
+         dict(type='MyTextLoggerHook', interval=1, reset_flag=True),
+         dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True)
+     ]
+ )
+
+ # evaluation
+ eval_config = dict(interval=total_epochs)
+
+ # other flags
+ debug = False
gen_sweep_info.py ADDED
@@ -0,0 +1,112 @@
+ # Generate info files manually
+ import os
+ import mmcv
+ import tqdm
+ import pickle
+ import argparse
+ import numpy as np
+ from nuscenes import NuScenes
+ from pyquaternion import Quaternion
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--data-root', default='data/nuscenes')
+ parser.add_argument('--version', default='v1.0-trainval')
+ args = parser.parse_args()
+
+
+ def get_cam_info(nusc, sample_data):
+     pose_record = nusc.get('ego_pose', sample_data['ego_pose_token'])
+     cs_record = nusc.get('calibrated_sensor', sample_data['calibrated_sensor_token'])
+
+     sensor2ego_translation = cs_record['translation']
+     ego2global_translation = pose_record['translation']
+     sensor2ego_rotation = Quaternion(cs_record['rotation']).rotation_matrix
+     ego2global_rotation = Quaternion(pose_record['rotation']).rotation_matrix
+     cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+
+     sensor2global_rotation = sensor2ego_rotation.T @ ego2global_rotation.T
+     sensor2global_translation = sensor2ego_translation @ ego2global_rotation.T + ego2global_translation
+
+     return {
+         'data_path': os.path.join(args.data_root, sample_data['filename']),
+         'sensor2global_rotation': sensor2global_rotation,
+         'sensor2global_translation': sensor2global_translation,
+         'cam_intrinsic': cam_intrinsic,
+         'timestamp': sample_data['timestamp'],
+     }
+
+
+ def add_sweep_info(nusc, sample_infos):
+     for curr_id in tqdm.tqdm(range(len(sample_infos['infos']))):
+         sample = nusc.get('sample', sample_infos['infos'][curr_id]['token'])
+
+         cam_types = [
+             'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT',
+             'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT'
+         ]
+
+         curr_cams = dict()
+         for cam in cam_types:
+             curr_cams[cam] = nusc.get('sample_data', sample['data'][cam])
+
+         for cam in cam_types:
+             sample_data = nusc.get('sample_data', sample['data'][cam])
+             sweep_cam = get_cam_info(nusc, sample_data)
+             sample_infos['infos'][curr_id]['cams'][cam].update(sweep_cam)
+
+         # remove unnecessary keys
+         for cam in cam_types:
+             del sample_infos['infos'][curr_id]['cams'][cam]['sample_data_token']
+             del sample_infos['infos'][curr_id]['cams'][cam]['sensor2ego_translation']
+             del sample_infos['infos'][curr_id]['cams'][cam]['sensor2ego_rotation']
+             del sample_infos['infos'][curr_id]['cams'][cam]['ego2global_translation']
+             del sample_infos['infos'][curr_id]['cams'][cam]['ego2global_rotation']
+
+         sweep_infos = []
+         if sample['prev'] != '':  # add sweep frames between two key frames
+             for _ in range(5):
+                 sweep_info = dict()
+                 for cam in cam_types:
+                     if curr_cams[cam]['prev'] == '':
+                         sweep_info = sweep_infos[-1]
+                         break
+                     sample_data = nusc.get('sample_data', curr_cams[cam]['prev'])
+                     sweep_cam = get_cam_info(nusc, sample_data)
+                     curr_cams[cam] = sample_data
+                     sweep_info[cam] = sweep_cam
+                 sweep_infos.append(sweep_info)
+
+         sample_infos['infos'][curr_id]['sweeps'] = sweep_infos
+
+     return sample_infos
+
+
+ if __name__ == '__main__':
+     nusc = NuScenes(args.version, args.data_root)
+
+     if args.version == 'v1.0-trainval':
+         sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_train.pkl'), 'rb'))
+         sample_infos = add_sweep_info(nusc, sample_infos)
+         mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_train_sweep.pkl'))
+
+         sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_val.pkl'), 'rb'))
+         sample_infos = add_sweep_info(nusc, sample_infos)
+         mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_val_sweep.pkl'))
+
+     elif args.version == 'v1.0-test':
+         sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_test.pkl'), 'rb'))
+         sample_infos = add_sweep_info(nusc, sample_infos)
+         mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_test_sweep.pkl'))
+
+     elif args.version == 'v1.0-mini':
+         sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_train_mini.pkl'), 'rb'))
+         sample_infos = add_sweep_info(nusc, sample_infos)
+         mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_train_mini_sweep.pkl'))
+
+         sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_val_mini.pkl'), 'rb'))
+         sample_infos = add_sweep_info(nusc, sample_infos)
+         mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_val_mini_sweep.pkl'))
+
+     else:
+         raise ValueError(f'unknown version: {args.version}')
loaders/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .pipelines import __all__
+ from .nuscenes_dataset import CustomNuScenesDataset
+
+ __all__ = [
+     'CustomNuScenesDataset'
+ ]
loaders/builder.py ADDED
@@ -0,0 +1,49 @@
+ from functools import partial
+ from mmcv.parallel import collate
+ from mmcv.runner import get_dist_info
+ from torch.utils.data import DataLoader
+ from mmdet.datasets.builder import worker_init_fn
+ from mmdet.datasets.samplers import DistributedGroupSampler, DistributedSampler, GroupSampler
+
+
+ def build_dataloader(dataset,
+                      samples_per_gpu,
+                      workers_per_gpu,
+                      num_gpus=1,
+                      dist=True,
+                      shuffle=True,
+                      seed=None,
+                      **kwargs):
+
+     rank, world_size = get_dist_info()
+     if dist:
+         # DistributedGroupSampler shuffles the data so that the images on
+         # each GPU fall into the same group
+         if shuffle:
+             sampler = DistributedGroupSampler(
+                 dataset, samples_per_gpu, world_size, rank, seed=seed)
+         else:
+             sampler = DistributedSampler(
+                 dataset, world_size, rank, shuffle=False, seed=seed)
+         batch_size = samples_per_gpu
+         num_workers = workers_per_gpu
+     else:
+         sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
+         batch_size = num_gpus * samples_per_gpu
+         num_workers = num_gpus * workers_per_gpu
+
+     init_fn = partial(
+         worker_init_fn, num_workers=num_workers, rank=rank,
+         seed=seed) if seed is not None else None
+
+     data_loader = DataLoader(
+         dataset,
+         batch_size=batch_size,
+         sampler=sampler,
+         num_workers=num_workers,
+         collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
+         pin_memory=False,
+         worker_init_fn=init_fn,
+         **kwargs)
+
+     return data_loader
loaders/nuscenes_dataset.py ADDED
@@ -0,0 +1,88 @@
+ import os
+ import numpy as np
+ from mmdet.datasets import DATASETS
+ from mmdet3d.datasets import NuScenesDataset
+ from pyquaternion import Quaternion
+
+
+ @DATASETS.register_module()
+ class CustomNuScenesDataset(NuScenesDataset):
+
+     def collect_sweeps(self, index, into_past=60, into_future=0):
+         all_sweeps_prev = []
+         curr_index = index
+         while len(all_sweeps_prev) < into_past:
+             curr_sweeps = self.data_infos[curr_index]['sweeps']
+             if len(curr_sweeps) == 0:
+                 break
+             all_sweeps_prev.extend(curr_sweeps)
+             all_sweeps_prev.append(self.data_infos[curr_index - 1]['cams'])
+             curr_index = curr_index - 1
+
+         all_sweeps_next = []
+         curr_index = index + 1
+         while len(all_sweeps_next) < into_future:
+             if curr_index >= len(self.data_infos):
+                 break
+             curr_sweeps = self.data_infos[curr_index]['sweeps']
+             all_sweeps_next.extend(curr_sweeps[::-1])
+             all_sweeps_next.append(self.data_infos[curr_index]['cams'])
+             curr_index = curr_index + 1
+
+         return all_sweeps_prev, all_sweeps_next
+
+     def get_data_info(self, index):
+         info = self.data_infos[index]
+         sweeps_prev, sweeps_next = self.collect_sweeps(index)
+
+         ego2global_translation = info['ego2global_translation']
+         ego2global_rotation = info['ego2global_rotation']
+         lidar2ego_translation = info['lidar2ego_translation']
+         lidar2ego_rotation = info['lidar2ego_rotation']
+         ego2global_rotation = Quaternion(ego2global_rotation).rotation_matrix
+         lidar2ego_rotation = Quaternion(lidar2ego_rotation).rotation_matrix
+
+         input_dict = dict(
+             sample_idx=info['token'],
+             sweeps={'prev': sweeps_prev, 'next': sweeps_next},
+             timestamp=info['timestamp'] / 1e6,
+             ego2global_translation=ego2global_translation,
+             ego2global_rotation=ego2global_rotation,
+             lidar2ego_translation=lidar2ego_translation,
+             lidar2ego_rotation=lidar2ego_rotation,
+         )
+
+         if self.modality['use_camera']:
+             img_paths = []
+             img_timestamps = []
+             lidar2img_rts = []
+
+             for _, cam_info in info['cams'].items():
+                 img_paths.append(os.path.relpath(cam_info['data_path']))
+                 img_timestamps.append(cam_info['timestamp'] / 1e6)
+
+                 # obtain the lidar-to-image transformation matrix
+                 lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+                 lidar2cam_t = cam_info['sensor2lidar_translation'] @ lidar2cam_r.T
+
+                 lidar2cam_rt = np.eye(4)
+                 lidar2cam_rt[:3, :3] = lidar2cam_r.T
+                 lidar2cam_rt[3, :3] = -lidar2cam_t
+
+                 intrinsic = cam_info['cam_intrinsic']
+                 viewpad = np.eye(4)
+                 viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+                 lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+                 lidar2img_rts.append(lidar2img_rt)
+
+             input_dict.update(dict(
+                 img_filename=img_paths,
+                 img_timestamp=img_timestamps,
+                 lidar2img=lidar2img_rts,
+             ))
+
+         if not self.test_mode:
+             annos = self.get_ann_info(index)
+             input_dict['ann_info'] = annos
+
+         return input_dict
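The `lidar2img` construction used above (a padded intrinsic `viewpad` times the transposed extrinsic) can be sanity-checked in isolation. A sketch with an identity extrinsic and a made-up intrinsic, not part of the release:

```python
import numpy as np

# identity extrinsic: pretend the lidar and camera frames coincide
lidar2cam_rt = np.eye(4)

# hypothetical 3x3 pinhole intrinsic (fx = fy = 1000, principal point (800, 450))
intrinsic = np.array([[1000.0, 0.0, 800.0],
                      [0.0, 1000.0, 450.0],
                      [0.0, 0.0, 1.0]])
viewpad = np.eye(4)
viewpad[:3, :3] = intrinsic

# same composition as in get_data_info()
lidar2img = viewpad @ lidar2cam_rt.T

# a point 10 m along the optical axis projects onto the principal point
pt = lidar2img @ np.array([0.0, 0.0, 10.0, 1.0])
uv = pt[:2] / pt[2]
print(uv)  # [800. 450.]
```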
loaders/pipelines/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .loading import LoadMultiViewImageFromMultiSweeps
+ from .transforms import PadMultiViewImage, NormalizeMultiviewImage, PhotoMetricDistortionMultiViewImage
+
+ __all__ = [
+     'LoadMultiViewImageFromMultiSweeps', 'PadMultiViewImage', 'NormalizeMultiviewImage',
+     'PhotoMetricDistortionMultiViewImage'
+ ]
loaders/pipelines/loading.py ADDED
@@ -0,0 +1,154 @@
+ import os
+ import mmcv
+ import numpy as np
+ from mmdet.datasets.builder import PIPELINES
+ from numpy.linalg import inv
+ from mmcv.runner import get_dist_info
+
+
+ def compose_lidar2img(ego2global_translation_curr,
+                       ego2global_rotation_curr,
+                       lidar2ego_translation_curr,
+                       lidar2ego_rotation_curr,
+                       sensor2global_translation_past,
+                       sensor2global_rotation_past,
+                       cam_intrinsic_past):
+
+     R = sensor2global_rotation_past @ (inv(ego2global_rotation_curr).T @ inv(lidar2ego_rotation_curr).T)
+     T = sensor2global_translation_past @ (inv(ego2global_rotation_curr).T @ inv(lidar2ego_rotation_curr).T)
+     T -= ego2global_translation_curr @ (inv(ego2global_rotation_curr).T @ inv(lidar2ego_rotation_curr).T) + lidar2ego_translation_curr @ inv(lidar2ego_rotation_curr).T
+
+     lidar2cam_r = inv(R.T)
+     lidar2cam_t = T @ lidar2cam_r.T
+
+     lidar2cam_rt = np.eye(4)
+     lidar2cam_rt[:3, :3] = lidar2cam_r.T
+     lidar2cam_rt[3, :3] = -lidar2cam_t
+
+     viewpad = np.eye(4)
+     viewpad[:cam_intrinsic_past.shape[0], :cam_intrinsic_past.shape[1]] = cam_intrinsic_past
+     lidar2img = (viewpad @ lidar2cam_rt.T).astype(np.float32)
+
+     return lidar2img
+
+
+ @PIPELINES.register_module()
+ class LoadMultiViewImageFromMultiSweeps(object):
+     def __init__(self,
+                  sweeps_num=5,
+                  color_type='color',
+                  test_mode=False):
+         self.sweeps_num = sweeps_num
+         self.color_type = color_type
+         self.test_mode = test_mode
+
+         self.train_interval = [4, 8]
+         self.test_interval = 6
+
+         try:
+             mmcv.use_backend('turbojpeg')
+         except ImportError:
+             mmcv.use_backend('cv2')
+
+     def load_offline(self, results):
+         cam_types = [
+             'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
+             'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'
+         ]
+
+         if len(results['sweeps']['prev']) == 0:
+             for _ in range(self.sweeps_num):
+                 for j in range(len(cam_types)):
+                     results['img'].append(results['img'][j])
+                     results['img_timestamp'].append(results['img_timestamp'][j])
+                     results['filename'].append(results['filename'][j])
+                     results['lidar2img'].append(np.copy(results['lidar2img'][j]))
+         else:
+             if self.test_mode:
+                 interval = self.test_interval
+                 choices = [(k + 1) * interval - 1 for k in range(self.sweeps_num)]
+             elif len(results['sweeps']['prev']) <= self.sweeps_num:
+                 pad_len = self.sweeps_num - len(results['sweeps']['prev'])
+                 choices = list(range(len(results['sweeps']['prev']))) + [len(results['sweeps']['prev']) - 1] * pad_len
+             else:
+                 max_interval = len(results['sweeps']['prev']) // self.sweeps_num
+                 max_interval = min(max_interval, self.train_interval[1])
+                 min_interval = min(max_interval, self.train_interval[0])
+                 interval = np.random.randint(min_interval, max_interval + 1)
+                 choices = [(k + 1) * interval - 1 for k in range(self.sweeps_num)]
+
+             for idx in sorted(list(choices)):
+                 sweep_idx = min(idx, len(results['sweeps']['prev']) - 1)
+                 sweep = results['sweeps']['prev'][sweep_idx]
+
+                 if len(sweep.keys()) < len(cam_types):
+                     sweep = results['sweeps']['prev'][sweep_idx - 1]
+
+                 for sensor in cam_types:
+                     results['img'].append(mmcv.imread(sweep[sensor]['data_path'], self.color_type))
+                     results['img_timestamp'].append(sweep[sensor]['timestamp'] / 1e6)
+                     results['filename'].append(os.path.relpath(sweep[sensor]['data_path']))
+                     results['lidar2img'].append(compose_lidar2img(
+                         results['ego2global_translation'],
+                         results['ego2global_rotation'],
+                         results['lidar2ego_translation'],
+                         results['lidar2ego_rotation'],
+                         sweep[sensor]['sensor2global_translation'],
+                         sweep[sensor]['sensor2global_rotation'],
+                         sweep[sensor]['cam_intrinsic'],
+                     ))
+
+         return results
+
+     def load_online(self, results):
+         # only used when measuring FPS
+         assert self.test_mode
+         assert self.test_interval == 6
+
+         cam_types = [
+             'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
+             'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'
+         ]
+
+         if len(results['sweeps']['prev']) == 0:
+             for _ in range(self.sweeps_num):
+                 for j in range(len(cam_types)):
+                     results['img_timestamp'].append(results['img_timestamp'][j])
+                     results['filename'].append(results['filename'][j])
+                     results['lidar2img'].append(np.copy(results['lidar2img'][j]))
+         else:
+             interval = self.test_interval
+             choices = [(k + 1) * interval - 1 for k in range(self.sweeps_num)]
+
+             for idx in sorted(list(choices)):
+                 sweep_idx = min(idx, len(results['sweeps']['prev']) - 1)
+                 sweep = results['sweeps']['prev'][sweep_idx]
+
+                 if len(sweep.keys()) < len(cam_types):
+                     sweep = results['sweeps']['prev'][sweep_idx - 1]
+
+                 for sensor in cam_types:
+                     # skip loading history frames
+                     results['img_timestamp'].append(sweep[sensor]['timestamp'] / 1e6)
+                     results['filename'].append(os.path.relpath(sweep[sensor]['data_path']))
+                     results['lidar2img'].append(compose_lidar2img(
+                         results['ego2global_translation'],
136
+ results['ego2global_rotation'],
137
+ results['lidar2ego_translation'],
138
+ results['lidar2ego_rotation'],
139
+ sweep[sensor]['sensor2global_translation'],
140
+ sweep[sensor]['sensor2global_rotation'],
141
+ sweep[sensor]['cam_intrinsic'],
142
+ ))
143
+
144
+ return results
145
+
146
+ def __call__(self, results):
147
+ if self.sweeps_num == 0:
148
+ return results
149
+
150
+ world_size = get_dist_info()[1]
151
+ if world_size == 1 and self.test_mode:
152
+ return self.load_online(results)
153
+ else:
154
+ return self.load_offline(results)
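The sweep-selection logic in `load_offline`/`load_online` can be exercised in isolation. A minimal sketch (pure Python; `pick_sweep_indices` is a hypothetical helper mirroring the test-mode branch with `test_interval = 6`, not part of the release):

```python
def pick_sweep_indices(num_prev, sweeps_num, test_interval=6):
    # Test-mode branch: take every `test_interval`-th previous sweep,
    # clamped to the available history (as done via min(idx, num_prev - 1)).
    choices = [(k + 1) * test_interval - 1 for k in range(sweeps_num)]
    return [min(idx, num_prev - 1) for idx in sorted(choices)]

# With 5 sweeps and enough history, indices 5, 11, 17, 23, 29 are used;
# with only 10 previous sweeps, later picks clamp to the oldest frame.
print(pick_sweep_indices(num_prev=40, sweeps_num=5))
print(pick_sweep_indices(num_prev=10, sweeps_num=5))
```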
loaders/pipelines/transforms.py ADDED
@@ -0,0 +1,394 @@
+ import mmcv
+ import torch
+ import numpy as np
+ from PIL import Image
+ from numpy import random
+ from mmdet.datasets.builder import PIPELINES
+
+
+ @PIPELINES.register_module()
+ class PadMultiViewImage(object):
+     """Pad the multi-view image.
+     There are two padding modes: (1) pad to a fixed size and (2) pad to the
+     minimum size that is divisible by some number.
+     Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor".
+     Args:
+         size (tuple, optional): Fixed padding size.
+         size_divisor (int, optional): The divisor of padded size.
+         pad_val (float, optional): Padding value, 0 by default.
+     """
+
+     def __init__(self, size=None, size_divisor=None, pad_val=0):
+         self.size = size
+         self.size_divisor = size_divisor
+         self.pad_val = pad_val
+         # only one of size and size_divisor should be valid
+         assert size is not None or size_divisor is not None
+         assert size is None or size_divisor is None
+
+     def _pad_img(self, img):
+         if self.size_divisor is not None:
+             pad_h = int(np.ceil(img.shape[0] / self.size_divisor)) * self.size_divisor
+             pad_w = int(np.ceil(img.shape[1] / self.size_divisor)) * self.size_divisor
+         else:
+             pad_h, pad_w = self.size
+
+         pad_width = ((0, pad_h - img.shape[0]), (0, pad_w - img.shape[1]), (0, 0))
+         img = np.pad(img, pad_width, constant_values=self.pad_val)
+         return img
+
+     def _pad_imgs(self, results):
+         padded_img = [self._pad_img(img) for img in results['img']]
+
+         results['ori_shape'] = [img.shape for img in results['img']]
+         results['img'] = padded_img
+         results['img_shape'] = [img.shape for img in padded_img]
+         results['pad_shape'] = [img.shape for img in padded_img]
+         results['pad_fixed_size'] = self.size
+         results['pad_size_divisor'] = self.size_divisor
+
+     def __call__(self, results):
+         """Call function to pad images, masks, semantic segmentation maps.
+         Args:
+             results (dict): Result dict from loading pipeline.
+         Returns:
+             dict: Updated result dict.
+         """
+         self._pad_imgs(results)
+         return results
+
+     def __repr__(self):
+         repr_str = self.__class__.__name__
+         repr_str += f'(size={self.size}, '
+         repr_str += f'size_divisor={self.size_divisor}, '
+         repr_str += f'pad_val={self.pad_val})'
+         return repr_str
+
+
+ @PIPELINES.register_module()
+ class NormalizeMultiviewImage(object):
+     """Normalize the image.
+     Added key is "img_norm_cfg".
+     Args:
+         mean (sequence): Mean values of 3 channels.
+         std (sequence): Std values of 3 channels.
+         to_rgb (bool): Whether to convert the image from BGR to RGB,
+             default is true.
+     """
+
+     def __init__(self, mean, std, to_rgb=True):
+         self.mean = np.array(mean, dtype=np.float32).reshape(-1)
+         self.std = 1 / np.array(std, dtype=np.float32).reshape(-1)
+         self.to_rgb = to_rgb
+
+     def __call__(self, results):
+         """Call function to normalize images.
+         Args:
+             results (dict): Result dict from loading pipeline.
+         Returns:
+             dict: Normalized results, 'img_norm_cfg' key is added into
+                 result dict.
+         """
+         normalized_imgs = []
+
+         for img in results['img']:
+             img = img.astype(np.float32)
+             if self.to_rgb:
+                 img = img[..., ::-1]
+             img = img - self.mean
+             img = img * self.std
+             normalized_imgs.append(img)
+
+         results['img'] = normalized_imgs
+         results['img_norm_cfg'] = dict(
+             mean=self.mean,
+             std=self.std,
+             to_rgb=self.to_rgb
+         )
+         return results
+
+     def __repr__(self):
+         repr_str = self.__class__.__name__
+         repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
+         return repr_str
+
+
+ @PIPELINES.register_module()
+ class PhotoMetricDistortionMultiViewImage:
+     """Apply photometric distortion to image sequentially, every transformation
+     is applied with a probability of 0.5. The position of random contrast is in
+     second or second to last.
+     1. random brightness
+     2. random contrast (mode 0)
+     3. convert color from BGR to HSV
+     4. random saturation
+     5. random hue
+     6. convert color from HSV to BGR
+     7. random contrast (mode 1)
+     8. randomly swap channels
+     Args:
+         brightness_delta (int): delta of brightness.
+         contrast_range (tuple): range of contrast.
+         saturation_range (tuple): range of saturation.
+         hue_delta (int): delta of hue.
+     """
+
+     def __init__(self,
+                  brightness_delta=32,
+                  contrast_range=(0.5, 1.5),
+                  saturation_range=(0.5, 1.5),
+                  hue_delta=18):
+         self.brightness_delta = brightness_delta
+         self.contrast_lower, self.contrast_upper = contrast_range
+         self.saturation_lower, self.saturation_upper = saturation_range
+         self.hue_delta = hue_delta
+
+     def __call__(self, results):
+         """Call function to perform photometric distortion on images.
+         Args:
+             results (dict): Result dict from loading pipeline.
+         Returns:
+             dict: Result dict with images distorted.
+         """
+         imgs = results['img']
+         new_imgs = []
+         for img in imgs:
+             ori_dtype = img.dtype
+             img = img.astype(np.float32)
+
+             # random brightness
+             if random.randint(2):
+                 delta = random.uniform(-self.brightness_delta,
+                                        self.brightness_delta)
+                 img += delta
+
+             # mode == 0 --> do random contrast first
+             # mode == 1 --> do random contrast last
+             mode = random.randint(2)
+             if mode == 1:
+                 if random.randint(2):
+                     alpha = random.uniform(self.contrast_lower,
+                                            self.contrast_upper)
+                     img *= alpha
+
+             # convert color from BGR to HSV
+             img = mmcv.bgr2hsv(img)
+
+             # random saturation
+             if random.randint(2):
+                 img[..., 1] *= random.uniform(self.saturation_lower,
+                                               self.saturation_upper)
+
+             # random hue
+             if random.randint(2):
+                 img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
+                 img[..., 0][img[..., 0] > 360] -= 360
+                 img[..., 0][img[..., 0] < 0] += 360
+
+             # convert color from HSV to BGR
+             img = mmcv.hsv2bgr(img)
+
+             # random contrast
+             if mode == 0:
+                 if random.randint(2):
+                     alpha = random.uniform(self.contrast_lower,
+                                            self.contrast_upper)
+                     img *= alpha
+
+             # randomly swap channels
+             if random.randint(2):
+                 img = img[..., random.permutation(3)]
+
+             new_imgs.append(img.astype(ori_dtype))
+
+         results['img'] = new_imgs
+         return results
+
+     def __repr__(self):
+         repr_str = self.__class__.__name__
+         repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
+         repr_str += 'contrast_range='
+         repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
+         repr_str += 'saturation_range='
+         repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
+         repr_str += f'hue_delta={self.hue_delta})'
+         return repr_str
+
+
+ @PIPELINES.register_module()
+ class RandomTransformImage(object):
+     def __init__(self, ida_aug_conf=None, training=True):
+         self.ida_aug_conf = ida_aug_conf
+         self.training = training
+
+     def __call__(self, results):
+         resize, resize_dims, crop, flip, rotate = self.sample_augmentation()
+
+         if len(results['lidar2img']) == len(results['img']):
+             for i in range(len(results['img'])):
+                 img = Image.fromarray(np.uint8(results['img'][i]))
+
+                 # resize, resize_dims, crop, flip, rotate = self._sample_augmentation()
+                 img, ida_mat = self.img_transform(
+                     img,
+                     resize=resize,
+                     resize_dims=resize_dims,
+                     crop=crop,
+                     flip=flip,
+                     rotate=rotate,
+                 )
+                 results['img'][i] = np.array(img).astype(np.uint8)
+                 results['lidar2img'][i] = ida_mat @ results['lidar2img'][i]
+
+         elif len(results['img']) == 6:
+             for i in range(len(results['img'])):
+                 img = Image.fromarray(np.uint8(results['img'][i]))
+
+                 # resize, resize_dims, crop, flip, rotate = self._sample_augmentation()
+                 img, ida_mat = self.img_transform(
+                     img,
+                     resize=resize,
+                     resize_dims=resize_dims,
+                     crop=crop,
+                     flip=flip,
+                     rotate=rotate,
+                 )
+                 results['img'][i] = np.array(img).astype(np.uint8)
+
+             for i in range(len(results['lidar2img'])):
+                 results['lidar2img'][i] = ida_mat @ results['lidar2img'][i]
+
+         else:
+             raise ValueError()
+
+         results['ori_shape'] = [img.shape for img in results['img']]
+         results['img_shape'] = [img.shape for img in results['img']]
+         results['pad_shape'] = [img.shape for img in results['img']]
+
+         return results
+
+     def img_transform(self, img, resize, resize_dims, crop, flip, rotate):
+         """
+         https://github.com/Megvii-BaseDetection/BEVStereo/blob/master/dataset/nusc_mv_det_dataset.py#L48
+         """
+         def get_rot(h):
+             return torch.Tensor([
+                 [np.cos(h), np.sin(h)],
+                 [-np.sin(h), np.cos(h)],
+             ])
+
+         ida_rot = torch.eye(2)
+         ida_tran = torch.zeros(2)
+
+         # adjust image
+         img = img.resize(resize_dims)
+         img = img.crop(crop)
+         if flip:
+             img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
+         img = img.rotate(rotate)
+
+         # post-homography transformation
+         ida_rot *= resize
+         ida_tran -= torch.Tensor(crop[:2])
+
+         if flip:
+             A = torch.Tensor([[-1, 0], [0, 1]])
+             b = torch.Tensor([crop[2] - crop[0], 0])
+             ida_rot = A.matmul(ida_rot)
+             ida_tran = A.matmul(ida_tran) + b
+
+         A = get_rot(rotate / 180 * np.pi)
+         b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2
+         b = A.matmul(-b) + b
+
+         ida_rot = A.matmul(ida_rot)
+         ida_tran = A.matmul(ida_tran) + b
+
+         ida_mat = torch.eye(4)
+         ida_mat[:2, :2] = ida_rot
+         ida_mat[:2, 2] = ida_tran
+
+         return img, ida_mat.numpy()
+
+     def sample_augmentation(self):
+         """
+         https://github.com/Megvii-BaseDetection/BEVStereo/blob/master/dataset/nusc_mv_det_dataset.py#L247
+         """
+         H, W = self.ida_aug_conf['H'], self.ida_aug_conf['W']
+         fH, fW = self.ida_aug_conf['final_dim']
+
+         if self.training:
+             resize = np.random.uniform(*self.ida_aug_conf['resize_lim'])
+             resize_dims = (int(W * resize), int(H * resize))
+             newW, newH = resize_dims
+             crop_h = int((1 - np.random.uniform(*self.ida_aug_conf['bot_pct_lim'])) * newH) - fH
+             crop_w = int(np.random.uniform(0, max(0, newW - fW)))
+             crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+             flip = False
+             if self.ida_aug_conf['rand_flip'] and np.random.choice([0, 1]):
+                 flip = True
+             rotate = np.random.uniform(*self.ida_aug_conf['rot_lim'])
+         else:
+             resize = max(fH / H, fW / W)
+             resize_dims = (int(W * resize), int(H * resize))
+             newW, newH = resize_dims
+             crop_h = int((1 - np.mean(self.ida_aug_conf['bot_pct_lim'])) * newH) - fH
+             crop_w = int(max(0, newW - fW) / 2)
+             crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
+             flip = False
+             rotate = 0
+
+         return resize, resize_dims, crop, flip, rotate
+
+
+ @PIPELINES.register_module()
+ class GlobalRotScaleTransImage(object):
+     def __init__(self,
+                  rot_range=[-0.3925, 0.3925],
+                  scale_ratio_range=[0.95, 1.05],
+                  translation_std=[0, 0, 0]):
+         self.rot_range = rot_range
+         self.scale_ratio_range = scale_ratio_range
+         self.translation_std = translation_std
+
+     def __call__(self, results):
+         # random rotate
+         rot_angle = np.random.uniform(*self.rot_range)
+         self.rotate_z(results, rot_angle)
+         results["gt_bboxes_3d"].rotate(np.array(rot_angle))
+
+         # random scale
+         scale_ratio = np.random.uniform(*self.scale_ratio_range)
+         self.scale_xyz(results, scale_ratio)
+         results["gt_bboxes_3d"].scale(scale_ratio)
+
+         # TODO: support translation
+
+         return results
+
+     def rotate_z(self, results, rot_angle):
+         rot_cos = torch.cos(torch.tensor(rot_angle))
+         rot_sin = torch.sin(torch.tensor(rot_angle))
+
+         rot_mat = torch.tensor([
+             [rot_cos, -rot_sin, 0, 0],
+             [rot_sin, rot_cos, 0, 0],
+             [0, 0, 1, 0],
+             [0, 0, 0, 1],
+         ])
+         rot_mat_inv = torch.inverse(rot_mat)
+
+         for view in range(len(results['lidar2img'])):
+             results['lidar2img'][view] = (torch.tensor(results['lidar2img'][view]).float() @ rot_mat_inv).numpy()
+
+     def scale_xyz(self, results, scale_ratio):
+         scale_mat = torch.tensor([
+             [scale_ratio, 0, 0, 0],
+             [0, scale_ratio, 0, 0],
+             [0, 0, scale_ratio, 0],
+             [0, 0, 0, 1],
+         ])
+         scale_mat_inv = torch.inverse(scale_mat)
+
+         for view in range(len(results['lidar2img'])):
+             results['lidar2img'][view] = (torch.tensor(results['lidar2img'][view]).float() @ scale_mat_inv).numpy()
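The `lidar2img` updates in `rotate_z`/`scale_xyz` right-multiply each projection matrix by the inverse of the scene transform, so that a point transformed in lidar space still projects to the same pixel. A numpy sketch of the scaling case (`scale_lidar2img` is a hypothetical stand-in for `scale_xyz`, not part of the release):

```python
import numpy as np

def scale_lidar2img(lidar2img, scale_ratio):
    # Scaling the scene by s is compensated by right-multiplying every
    # lidar2img matrix with the inverse 4x4 scale matrix.
    scale_mat = np.diag([scale_ratio, scale_ratio, scale_ratio, 1.0])
    return lidar2img @ np.linalg.inv(scale_mat)

# Identity projection scaled by 2: spatial entries shrink to 0.5.
print(scale_lidar2img(np.eye(4), 2.0))
```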
models/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .backbones import __all__
+ from .bbox import __all__
+ from .sparsebev import SparseBEV
+ from .sparsebev_head import SparseBEVHead
+ from .sparsebev_transformer import SparseBEVTransformer
+
+ __all__ = [
+     'SparseBEV', 'SparseBEVHead', 'SparseBEVTransformer'
+ ]
models/backbones/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .vovnet import VoVNet
+
+ __all__ = ['VoVNet']
models/backbones/vovnet.py ADDED
@@ -0,0 +1,383 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import warnings
+ import torch.utils.checkpoint as cp
+ from collections import OrderedDict
+ from mmcv.runner import BaseModule
+ from mmdet.models.builder import BACKBONES
+ from torch.nn.modules.batchnorm import _BatchNorm
+
+
+ VoVNet19_slim_dw_eSE = {
+     'stem': [64, 64, 64],
+     'stage_conv_ch': [64, 80, 96, 112],
+     'stage_out_ch': [112, 256, 384, 512],
+     'layer_per_block': 3,
+     'block_per_stage': [1, 1, 1, 1],
+     'eSE': True,
+     'dw': True
+ }
+
+ VoVNet19_dw_eSE = {
+     'stem': [64, 64, 64],
+     'stage_conv_ch': [128, 160, 192, 224],
+     'stage_out_ch': [256, 512, 768, 1024],
+     'layer_per_block': 3,
+     'block_per_stage': [1, 1, 1, 1],
+     'eSE': True,
+     'dw': True
+ }
+
+ VoVNet19_slim_eSE = {
+     'stem': [64, 64, 128],
+     'stage_conv_ch': [64, 80, 96, 112],
+     'stage_out_ch': [112, 256, 384, 512],
+     'layer_per_block': 3,
+     'block_per_stage': [1, 1, 1, 1],
+     'eSE': True,
+     'dw': False
+ }
+
+ VoVNet19_eSE = {
+     'stem': [64, 64, 128],
+     'stage_conv_ch': [128, 160, 192, 224],
+     'stage_out_ch': [256, 512, 768, 1024],
+     'layer_per_block': 3,
+     'block_per_stage': [1, 1, 1, 1],
+     'eSE': True,
+     'dw': False
+ }
+
+ VoVNet39_eSE = {
+     'stem': [64, 64, 128],
+     'stage_conv_ch': [128, 160, 192, 224],
+     'stage_out_ch': [256, 512, 768, 1024],
+     'layer_per_block': 5,
+     'block_per_stage': [1, 1, 2, 2],
+     'eSE': True,
+     'dw': False
+ }
+
+ VoVNet57_eSE = {
+     'stem': [64, 64, 128],
+     'stage_conv_ch': [128, 160, 192, 224],
+     'stage_out_ch': [256, 512, 768, 1024],
+     'layer_per_block': 5,
+     'block_per_stage': [1, 1, 4, 3],
+     'eSE': True,
+     'dw': False
+ }
+
+ VoVNet99_eSE = {
+     'stem': [64, 64, 128],
+     'stage_conv_ch': [128, 160, 192, 224],
+     'stage_out_ch': [256, 512, 768, 1024],
+     'layer_per_block': 5,
+     'block_per_stage': [1, 3, 9, 3],
+     'eSE': True,
+     'dw': False
+ }
+
+ _STAGE_SPECS = {
+     "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE,
+     "V-19-dw-eSE": VoVNet19_dw_eSE,
+     "V-19-slim-eSE": VoVNet19_slim_eSE,
+     "V-19-eSE": VoVNet19_eSE,
+     "V-39-eSE": VoVNet39_eSE,
+     "V-57-eSE": VoVNet57_eSE,
+     "V-99-eSE": VoVNet99_eSE,
+ }
+
+
+ def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1):
+     """3x3 depthwise-separable convolution with padding"""
+     return [
+         (
+             '{}_{}/dw_conv3x3'.format(module_name, postfix),
+             nn.Conv2d(
+                 in_channels,
+                 out_channels,
+                 kernel_size=kernel_size,
+                 stride=stride,
+                 padding=padding,
+                 groups=out_channels,
+                 bias=False
+             )
+         ),
+         (
+             '{}_{}/pw_conv1x1'.format(module_name, postfix),
+             nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False)
+         ),
+         ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)),
+         ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)),
+     ]
+
+
+ def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1):
+     """3x3 convolution with padding"""
+     return [
+         (
+             f"{module_name}_{postfix}/conv",
+             nn.Conv2d(
+                 in_channels,
+                 out_channels,
+                 kernel_size=kernel_size,
+                 stride=stride,
+                 padding=padding,
+                 groups=groups,
+                 bias=False,
+             ),
+         ),
+         (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+         (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+     ]
+
+
+ def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0):
+     """1x1 convolution"""
+     return [
+         (
+             f"{module_name}_{postfix}/conv",
+             nn.Conv2d(
+                 in_channels,
+                 out_channels,
+                 kernel_size=kernel_size,
+                 stride=stride,
+                 padding=padding,
+                 groups=groups,
+                 bias=False,
+             ),
+         ),
+         (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+         (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+     ]
+
+
+ class Hsigmoid(nn.Module):
+     def __init__(self, inplace=True):
+         super(Hsigmoid, self).__init__()
+         self.inplace = inplace
+
+     def forward(self, x):
+         return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+
+
+ class eSEModule(nn.Module):
+     def __init__(self, channel, reduction=4):
+         super(eSEModule, self).__init__()
+         self.avg_pool = nn.AdaptiveAvgPool2d(1)
+         self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+         self.hsigmoid = Hsigmoid()
+
+     def forward(self, x):
+         inputs = x
+         x = self.avg_pool(x)
+         x = self.fc(x)
+         x = self.hsigmoid(x)
+         return inputs * x
+
+
+ class _OSA_module(nn.Module):
+     def __init__(self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False, with_cp=False):
+         super(_OSA_module, self).__init__()
+         self.with_cp = with_cp
+
+         self.identity = identity
+         self.depthwise = depthwise
+         self.isReduced = False
+         self.layers = nn.ModuleList()
+         in_channel = in_ch
+
+         if self.depthwise and in_channel != stage_ch:
+             self.isReduced = True
+             self.conv_reduction = nn.Sequential(
+                 OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0"))
+             )
+
+         for i in range(layer_per_block):
+             if self.depthwise:
+                 self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i))))
+             else:
+                 self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i))))
+             in_channel = stage_ch
+
+         # feature aggregation
+         in_channel = in_ch + layer_per_block * stage_ch
+         self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat")))
+
+         self.ese = eSEModule(concat_ch)
+
+     def _forward(self, x):
+         identity_feat = x
+
+         output = []
+         output.append(x)
+
+         if self.depthwise and self.isReduced:
+             x = self.conv_reduction(x)
+
+         for layer in self.layers:
+             x = layer(x)
+             output.append(x)
+
+         x = torch.cat(output, dim=1)
+         xt = self.concat(x)
+
+         xt = self.ese(xt)
+
+         if self.identity:
+             xt = xt + identity_feat
+
+         return xt
+
+     def forward(self, x):
+         if self.with_cp and self.training and x.requires_grad:
+             return cp.checkpoint(self._forward, x)
+         else:
+             return self._forward(x)
+
+
+ class _OSA_stage(nn.Sequential):
+     def __init__(self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False, with_cp=False):
+         super(_OSA_stage, self).__init__()
+         if not stage_num == 2:
+             self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))
+
+         if block_per_stage != 1:
+             SE = False
+
+         module_name = f"OSA{stage_num}_1"
+         self.add_module(
+             module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise, with_cp=with_cp)
+         )
+
+         for i in range(block_per_stage - 1):
+             if i != block_per_stage - 2:  # last block
+                 SE = False
+             module_name = f"OSA{stage_num}_{i + 2}"
+             self.add_module(
+                 module_name,
+                 _OSA_module(
+                     concat_ch,
+                     stage_ch,
+                     concat_ch,
+                     layer_per_block,
+                     module_name,
+                     SE,
+                     identity=True,
+                     depthwise=depthwise,
+                     with_cp=with_cp
+                 ),
+             )
+
+
+ @BACKBONES.register_module()
+ class VoVNet(BaseModule):
+     def __init__(self, spec_name, input_ch=3, out_features=None, frozen_stages=-1, norm_eval=True, with_cp=False, pretrained=None, init_cfg=None):
+         """
+         Args:
+             input_ch (int): the number of input channels
+             out_features (list[str]): names of the layers whose outputs should
+                 be returned in forward. Can be anything in "stem", "stage2" ...
+         """
+         super(VoVNet, self).__init__(init_cfg)
+         self.frozen_stages = frozen_stages
+         self.norm_eval = norm_eval
+
+         if isinstance(pretrained, str):
+             warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                           'please use "init_cfg" instead')
+             self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+         stage_specs = _STAGE_SPECS[spec_name]
+
+         stem_ch = stage_specs["stem"]
+         config_stage_ch = stage_specs["stage_conv_ch"]
+         config_concat_ch = stage_specs["stage_out_ch"]
+         block_per_stage = stage_specs["block_per_stage"]
+         layer_per_block = stage_specs["layer_per_block"]
+         SE = stage_specs["eSE"]
+         depthwise = stage_specs["dw"]
+
+         self._out_features = out_features
+
+         # Stem module
+         conv_type = dw_conv3x3 if depthwise else conv3x3
+         stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2)
+         stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1)
+         stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2)
+         self.add_module("stem", nn.Sequential(OrderedDict(stem)))
+         current_stride = 4
+         self._out_feature_strides = {"stem": current_stride, "stage2": current_stride}
+         self._out_feature_channels = {"stem": stem_ch[2]}
+
+         stem_out_ch = [stem_ch[2]]
+         in_ch_list = stem_out_ch + config_concat_ch[:-1]
+
+         # OSA stages
+         self.stage_names = []
+         for i in range(4):  # num_stages
+             name = "stage%d" % (i + 2)  # stage 2 ... stage 5
+             self.stage_names.append(name)
+             self.add_module(
+                 name,
+                 _OSA_stage(
+                     in_ch_list[i],
+                     config_stage_ch[i],
+                     config_concat_ch[i],
+                     block_per_stage[i],
+                     layer_per_block,
+                     i + 2,
+                     SE,
+                     depthwise,
+                     with_cp=with_cp
+                 ),
+             )
+
+             self._out_feature_channels[name] = config_concat_ch[i]
+             if not i == 0:
+                 self._out_feature_strides[name] = current_stride = int(current_stride * 2)
+
+         # initialize weights
+         # self._initialize_weights()
+
+     def _initialize_weights(self):
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight)
+
+     def forward(self, x):
+         outputs = {}
+         x = self.stem(x)
+         if "stem" in self._out_features:
+             outputs["stem"] = x
+         for name in self.stage_names:
+             x = getattr(self, name)(x)
+             if name in self._out_features:
+                 outputs[name] = x
+
+         return outputs
+
+     def _freeze_stages(self):
+         if self.frozen_stages >= 0:
+             m = getattr(self, 'stem')
+             m.eval()
+             for param in m.parameters():
+                 param.requires_grad = False
+
+         for i in range(1, self.frozen_stages + 1):
+             m = getattr(self, f'stage{i+1}')
+             m.eval()
+             for param in m.parameters():
+                 param.requires_grad = False
+
+     def train(self, mode=True):
+         """Convert the model into training mode while keeping the
+         normalization layers frozen."""
+         super(VoVNet, self).train(mode)
+         self._freeze_stages()
+         if mode and self.norm_eval:
+             for m in self.modules():
+                 # trick: eval has an effect on BatchNorm only
+                 if isinstance(m, _BatchNorm):
+                     m.eval()
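The `Hsigmoid` gate used by the eSE block above is the hard sigmoid `relu6(x + 3) / 6`. A scalar sketch without torch (assumption: `hsigmoid` below matches the module's element-wise behaviour):

```python
def hsigmoid(x):
    # Hard sigmoid: clamp(x + 3, 0, 6) / 6, i.e. relu6(x + 3) / 6.
    # Saturates at 0 for x <= -3 and at 1 for x >= 3; 0.5 at the origin.
    return min(max(x + 3.0, 0.0), 6.0) / 6.0

print(hsigmoid(0.0))  # 0.5
```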
models/bbox/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .assigners import __all__
+ from .coders import __all__
+ from .match_costs import __all__
models/bbox/assigners/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .hungarian_assigner_3d import HungarianAssigner3D
+
+ __all__ = ['HungarianAssigner3D']
models/bbox/assigners/hungarian_assigner_3d.py ADDED
@@ -0,0 +1,93 @@
+ import torch
2
+
3
+ from mmdet.core.bbox.builder import BBOX_ASSIGNERS
4
+ from mmdet.core.bbox.assigners import AssignResult
5
+ from mmdet.core.bbox.assigners import BaseAssigner
6
+ from mmdet.core.bbox.match_costs import build_match_cost
7
+ from ..utils import normalize_bbox
8
+
9
+ try:
10
    from scipy.optimize import linear_sum_assignment
except ImportError:
    linear_sum_assignment = None


@BBOX_ASSIGNERS.register_module()
class HungarianAssigner3D(BaseAssigner):
    def __init__(self,
                 cls_cost=dict(type='ClassificationCost', weight=1.),
                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
                 iou_cost=dict(type='IoUCost', weight=0.0),
                 pc_range=None):
        self.cls_cost = build_match_cost(cls_cost)
        self.reg_cost = build_match_cost(reg_cost)
        self.iou_cost = build_match_cost(iou_cost)
        self.pc_range = pc_range

    def assign(self,
               bbox_pred,
               cls_pred,
               gt_bboxes,
               gt_labels,
               gt_bboxes_ignore=None,
               code_weights=None,
               with_velo=False):
        assert gt_bboxes_ignore is None, \
            'Only case when gt_bboxes_ignore is None is supported.'
        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)

        # 1. assign -1 by default
        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
                                              -1,
                                              dtype=torch.long)
        assigned_labels = bbox_pred.new_full((num_bboxes, ),
                                             -1,
                                             dtype=torch.long)
        if num_gts == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            if num_gts == 0:
                # No ground truth, assign all to background
                assigned_gt_inds[:] = 0
            return AssignResult(
                num_gts, assigned_gt_inds, None, labels=assigned_labels)

        # 2. compute the weighted costs
        # classification and bbox cost
        cls_cost = self.cls_cost(cls_pred, gt_labels)
        # regression L1 cost
        normalized_gt_bboxes = normalize_bbox(gt_bboxes)

        if code_weights is not None:
            bbox_pred = bbox_pred * code_weights
            normalized_gt_bboxes = normalized_gt_bboxes * code_weights

        if with_velo:
            reg_cost = self.reg_cost(bbox_pred, normalized_gt_bboxes)
        else:
            reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])

        # weighted sum of the two costs above
        cost = cls_cost + reg_cost

        # 3. do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu()
        cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0)

        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')

        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        matched_row_inds = torch.from_numpy(matched_row_inds).to(
            bbox_pred.device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(
            bbox_pred.device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
        return AssignResult(
            num_gts, assigned_gt_inds, None, labels=assigned_labels)
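A minimal standalone sketch of the Hungarian matching step: `scipy.optimize.linear_sum_assignment` takes a `[num_pred, num_gt]` cost matrix and returns the row/column indices of the globally cheapest one-to-one matching, which is how `assign` above pairs predictions with ground-truth boxes. The cost values below are made up for illustration.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# Hypothetical [num_pred=3, num_gt=3] matching cost matrix.
cost = np.array([[4.0, 1.0, 3.0],
                 [2.0, 0.0, 5.0],
                 [3.0, 2.0, 2.0]])

# Rows are predictions, columns are ground truths; the returned indices
# minimize the total cost over all one-to-one assignments.
row_ind, col_ind = linear_sum_assignment(cost)
total = cost[row_ind, col_ind].sum()
# row_ind = [0, 1, 2], col_ind = [1, 0, 2], total = 1 + 2 + 2 = 5
```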
models/bbox/coders/__init__.py ADDED
from .nms_free_coder import NMSFreeCoder

__all__ = ['NMSFreeCoder']
models/bbox/coders/nms_free_coder.py ADDED
import torch

from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
from ..utils import denormalize_bbox


@BBOX_CODERS.register_module()
class NMSFreeCoder(BaseBBoxCoder):
    """Bbox coder for NMS-free detector.

    Args:
        pc_range (list[float]): Range of point cloud.
        post_center_range (list[float]): Limit of the center.
            Default: None.
        max_num (int): Max number of boxes to be kept. Default: 100.
        score_threshold (float): Threshold to filter boxes based on score.
            Default: None.
        num_classes (int): Number of classes. Default: 10.
    """
    def __init__(self,
                 pc_range,
                 voxel_size=None,
                 post_center_range=None,
                 max_num=100,
                 score_threshold=None,
                 num_classes=10):
        self.pc_range = pc_range
        self.voxel_size = voxel_size
        self.post_center_range = post_center_range
        self.max_num = max_num
        self.score_threshold = score_threshold
        self.num_classes = num_classes

    def encode(self):
        pass

    def decode_single(self, cls_scores, bbox_preds):
        """Decode bboxes of a single sample.

        Args:
            cls_scores (Tensor): Outputs from the classification head,
                shape [num_query, cls_out_channels]. Note that
                cls_out_channels should include the background class.
            bbox_preds (Tensor): Outputs from the regression head with
                normalized coordinate format
                (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy).
                Shape [num_query, 10].
        Returns:
            dict: Decoded boxes, scores and labels.
        """
        max_num = self.max_num

        cls_scores = cls_scores.sigmoid()
        scores, indices = cls_scores.view(-1).topk(max_num)
        labels = indices % self.num_classes
        bbox_index = torch.div(indices, self.num_classes, rounding_mode='trunc')
        bbox_preds = bbox_preds[bbox_index]

        final_box_preds = denormalize_bbox(bbox_preds)
        final_scores = scores
        final_preds = labels

        # use score threshold
        if self.score_threshold is not None:
            thresh_mask = final_scores > self.score_threshold

        if self.post_center_range is not None:
            limit = torch.tensor(self.post_center_range, device=scores.device)
            mask = (final_box_preds[..., :3] >= limit[:3]).all(1)
            mask &= (final_box_preds[..., :3] <= limit[3:]).all(1)

            if self.score_threshold is not None:
                mask &= thresh_mask

            boxes3d = final_box_preds[mask]
            scores = final_scores[mask]
            labels = final_preds[mask]
            predictions_dict = {
                'bboxes': boxes3d,
                'scores': scores,
                'labels': labels
            }
        else:
            raise NotImplementedError(
                'Need to reorganize output as a batch; only the case where '
                'post_center_range is not None is supported for now!')

        return predictions_dict

    def decode(self, preds_dicts):
        """Decode bboxes for a batch.

        Args:
            preds_dicts (dict): Contains:
                all_cls_scores (Tensor): Outputs from the classification head,
                    shape [nb_dec, bs, num_query, cls_out_channels]. Note that
                    cls_out_channels should include the background class.
                all_bbox_preds (Tensor): Sigmoid outputs from the regression
                    head with normalized coordinate format
                    (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy).
                    Shape [nb_dec, bs, num_query, 10].
        Returns:
            list[dict]: Decoded boxes for each sample.
        """
        all_cls_scores = preds_dicts['all_cls_scores'][-1]
        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]

        batch_size = all_cls_scores.size()[0]
        predictions_list = []
        for i in range(batch_size):
            predictions_list.append(
                self.decode_single(all_cls_scores[i], all_bbox_preds[i]))

        return predictions_list
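A small stdlib illustration of the index arithmetic in `decode_single` above: after flattening the `[num_query, num_classes]` score matrix, each top-k index encodes both a query and a class. Modulo recovers the class id and integer division recovers the query (bbox) index. The numbers here are hypothetical.

```python
num_classes = 10

# e.g. one index returned by topk on the flattened [num_query, 10] scores:
flat_index = 57

label = flat_index % num_classes        # class id within the query's row
bbox_index = flat_index // num_classes  # which query (row) the score came from

# flat_index 57 = query 5, class 7
assert (bbox_index, label) == (5, 7)
```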
models/bbox/match_costs/__init__.py ADDED
from .match_cost import BBox3DL1Cost

__all__ = ['BBox3DL1Cost']
models/bbox/match_costs/match_cost.py ADDED
import torch
from mmdet.core.bbox.match_costs.builder import MATCH_COST


@MATCH_COST.register_module()
class BBox3DL1Cost(object):
    """BBox3DL1Cost.

    Args:
        weight (int | float, optional): loss weight
    """

    def __init__(self, weight=1.0):
        self.weight = weight

    def __call__(self, bbox_pred, gt_bboxes):
        """
        Args:
            bbox_pred (Tensor): Predicted 3D boxes in normalized format
                (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy).
                Shape [num_query, code_size].
            gt_bboxes (Tensor): Ground truth boxes in the same normalized
                format. Shape [num_gt, code_size].
        Returns:
            torch.Tensor: bbox_cost value with weight
        """
        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
        return bbox_cost * self.weight


@MATCH_COST.register_module()
class BBoxBEVL1Cost(object):
    def __init__(self, weight, pc_range):
        self.weight = weight
        self.pc_range = pc_range

    def __call__(self, bboxes, gt_bboxes):
        pc_start = bboxes.new(self.pc_range[0:2])
        pc_range = bboxes.new(self.pc_range[3:5]) - bboxes.new(self.pc_range[0:2])
        # normalize the box center to [0, 1]
        normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range
        normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range
        reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1)
        return reg_cost * self.weight


@MATCH_COST.register_module()
class IoU3DCost(object):
    def __init__(self, weight):
        self.weight = weight

    def __call__(self, iou):
        iou_cost = -iou
        return iou_cost * self.weight
models/bbox/utils.py ADDED
import torch


def normalize_bbox(bboxes):
    cx = bboxes[..., 0:1]
    cy = bboxes[..., 1:2]
    cz = bboxes[..., 2:3]
    w = bboxes[..., 3:4].log()
    l = bboxes[..., 4:5].log()
    h = bboxes[..., 5:6].log()
    rot = bboxes[..., 6:7]

    if bboxes.size(-1) > 7:
        vx = bboxes[..., 7:8]
        vy = bboxes[..., 8:9]
        out = torch.cat([cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy], dim=-1)
    else:
        out = torch.cat([cx, cy, w, l, cz, h, rot.sin(), rot.cos()], dim=-1)

    return out


def denormalize_bbox(normalized_bboxes):
    rot_sin = normalized_bboxes[..., 6:7]
    rot_cos = normalized_bboxes[..., 7:8]
    rot = torch.atan2(rot_sin, rot_cos)

    cx = normalized_bboxes[..., 0:1]
    cy = normalized_bboxes[..., 1:2]
    cz = normalized_bboxes[..., 4:5]

    w = normalized_bboxes[..., 2:3].exp()
    l = normalized_bboxes[..., 3:4].exp()
    h = normalized_bboxes[..., 5:6].exp()

    if normalized_bboxes.size(-1) > 8:
        vx = normalized_bboxes[..., 8:9]
        vy = normalized_bboxes[..., 9:10]
        out = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1)
    else:
        out = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1)

    return out


def encode_bbox(bboxes, pc_range=None):
    xyz = bboxes[..., 0:3].clone()
    wlh = bboxes[..., 3:6].log()
    rot = bboxes[..., 6:7]

    if pc_range is not None:
        xyz[..., 0] = (xyz[..., 0] - pc_range[0]) / (pc_range[3] - pc_range[0])
        xyz[..., 1] = (xyz[..., 1] - pc_range[1]) / (pc_range[4] - pc_range[1])
        xyz[..., 2] = (xyz[..., 2] - pc_range[2]) / (pc_range[5] - pc_range[2])

    if bboxes.shape[-1] > 7:
        vel = bboxes[..., 7:9].clone()
        return torch.cat([xyz, wlh, rot.sin(), rot.cos(), vel], dim=-1)
    else:
        return torch.cat([xyz, wlh, rot.sin(), rot.cos()], dim=-1)


def decode_bbox(bboxes, pc_range=None):
    xyz = bboxes[..., 0:3].clone()
    wlh = bboxes[..., 3:6].exp()
    rot = torch.atan2(bboxes[..., 6:7], bboxes[..., 7:8])

    if pc_range is not None:
        xyz[..., 0] = xyz[..., 0] * (pc_range[3] - pc_range[0]) + pc_range[0]
        xyz[..., 1] = xyz[..., 1] * (pc_range[4] - pc_range[1]) + pc_range[1]
        xyz[..., 2] = xyz[..., 2] * (pc_range[5] - pc_range[2]) + pc_range[2]

    if bboxes.shape[-1] > 8:
        vel = bboxes[..., 8:10].clone()
        return torch.cat([xyz, wlh, rot, vel], dim=-1)
    else:
        return torch.cat([xyz, wlh, rot], dim=-1)
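A minimal stdlib sketch (no torch) of the yaw encoding used by `encode_bbox`/`decode_bbox` above: the rotation is stored as a (sin, cos) pair and recovered with `atan2`, which avoids the 2π wrap-around ambiguity of regressing the raw angle directly.

```python
import math

def encode_rot(rot):
    # store the angle as a continuous (sin, cos) pair
    return (math.sin(rot), math.cos(rot))

def decode_rot(rot_sin, rot_cos):
    # atan2 recovers the angle in (-pi, pi] from the pair
    return math.atan2(rot_sin, rot_cos)

rot = -2.5  # any angle in (-pi, pi]
assert abs(decode_rot(*encode_rot(rot)) - rot) < 1e-9
```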
models/checkpoint.py ADDED
# https://pytorch.org/docs/stable/_modules/torch/utils/checkpoint.html#checkpoint

import torch
import warnings
import weakref
from typing import Any, Iterable, List, Tuple

__all__ = [
    "checkpoint", "checkpoint_sequential", "CheckpointFunction",
    "check_backward_validity", "detach_variable", "get_device_states",
    "set_device_states",
]


def detach_variable(inputs: Tuple[Any, ...]) -> Tuple[torch.Tensor, ...]:
    if isinstance(inputs, tuple):
        out = []
        for inp in inputs:
            if not isinstance(inp, torch.Tensor):
                out.append(inp)
                continue

            x = inp.detach()
            x.requires_grad = inp.requires_grad
            out.append(x)
        return tuple(out)
    else:
        raise RuntimeError(
            "Only tuple of tensors is supported. Got unsupported input type: ",
            type(inputs).__name__)


def check_backward_validity(inputs: Iterable[Any]) -> None:
    if not any(inp.requires_grad for inp in inputs if isinstance(inp, torch.Tensor)):
        warnings.warn("None of the inputs have requires_grad=True. Gradients will be None")


# We can't know if the run_fn will internally move some args to different devices,
# which would require logic to preserve rng states for those devices as well.
# We could paranoically stash and restore ALL the rng states for all visible devices,
# but that seems very wasteful for most cases. Compromise: Stash the RNG state for
# the device of all Tensor args.
#
# To consider: maybe get_device_states and set_device_states should reside in torch/random.py?
def get_device_states(*args) -> Tuple[List[int], List[torch.Tensor]]:
    # This will not error out if "arg" is a CPU tensor or a non-tensor type because
    # the conditionals short-circuit.
    fwd_gpu_devices = list({arg.get_device() for arg in args
                            if isinstance(arg, torch.Tensor) and arg.is_cuda})

    fwd_gpu_states = []
    for device in fwd_gpu_devices:
        with torch.cuda.device(device):
            fwd_gpu_states.append(torch.cuda.get_rng_state())

    return fwd_gpu_devices, fwd_gpu_states


def set_device_states(devices, states) -> None:
    for device, state in zip(devices, states):
        with torch.cuda.device(device):
            torch.cuda.set_rng_state(state)


def _get_autocast_kwargs():
    gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
                           "dtype": torch.get_autocast_gpu_dtype(),
                           "cache_enabled": torch.is_autocast_cache_enabled()}

    cpu_autocast_kwargs = {"enabled": torch.is_autocast_cpu_enabled(),
                           "dtype": torch.get_autocast_cpu_dtype(),
                           "cache_enabled": torch.is_autocast_cache_enabled()}

    return gpu_autocast_kwargs, cpu_autocast_kwargs


class CheckpointFunction(torch.autograd.Function):

    @staticmethod
    def forward(ctx, run_function, preserve_rng_state, *args):
        check_backward_validity(args)
        ctx.run_function = run_function
        ctx.preserve_rng_state = preserve_rng_state
        # Accommodates the (remote) possibility that autocast is enabled for cpu AND gpu.
        ctx.gpu_autocast_kwargs, ctx.cpu_autocast_kwargs = _get_autocast_kwargs()
        if preserve_rng_state:
            ctx.fwd_cpu_state = torch.get_rng_state()
            # Don't eagerly initialize the cuda context by accident.
            # (If the user intends that the context is initialized later, within their
            # run_function, we SHOULD actually stash the cuda state here. Unfortunately,
            # we have no way to anticipate this will happen before we run the function.)
            ctx.had_cuda_in_fwd = False
            if torch.cuda._initialized:
                ctx.had_cuda_in_fwd = True
                ctx.fwd_gpu_devices, ctx.fwd_gpu_states = get_device_states(*args)

        # Save non-tensor inputs in ctx, keep a placeholder None for tensors
        # to be filled out during the backward.
        ctx.inputs = []
        ctx.tensor_indices = []
        tensor_inputs = []
        for i, arg in enumerate(args):
            if torch.is_tensor(arg):
                tensor_inputs.append(arg)
                ctx.tensor_indices.append(i)
                ctx.inputs.append(None)
            else:
                ctx.inputs.append(arg)

        ctx.save_for_backward(*tensor_inputs)

        with torch.no_grad():
            outputs = run_function(*args)
        return outputs

    @staticmethod
    def backward(ctx, *args):
        if not torch.autograd._is_checkpoint_valid():
            raise RuntimeError(
                "Checkpointing is not compatible with .grad() or when an `inputs` parameter"
                " is passed to .backward(). Please use .backward() and do not pass its `inputs`"
                " argument.")
        # Copy the list to avoid modifying the original list.
        inputs = list(ctx.inputs)
        tensor_indices = ctx.tensor_indices
        tensors = ctx.saved_tensors

        # Fill in inputs with the appropriate saved tensors.
        for i, idx in enumerate(tensor_indices):
            inputs[idx] = tensors[i]

        # Stash the surrounding rng state, and mimic the state that was
        # present at this time during forward. Restore the surrounding state
        # when we're done.
        rng_devices = []
        if ctx.preserve_rng_state and ctx.had_cuda_in_fwd:
            rng_devices = ctx.fwd_gpu_devices
        with torch.random.fork_rng(devices=rng_devices, enabled=ctx.preserve_rng_state):
            if ctx.preserve_rng_state:
                torch.set_rng_state(ctx.fwd_cpu_state)
                if ctx.had_cuda_in_fwd:
                    set_device_states(ctx.fwd_gpu_devices, ctx.fwd_gpu_states)
            detached_inputs = detach_variable(tuple(inputs))
            with torch.enable_grad(), \
                 torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs), \
                 torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):
                outputs = ctx.run_function(*detached_inputs)

        if isinstance(outputs, torch.Tensor):
            outputs = (outputs,)

        # run backward() only with tensors that require grad
        outputs_with_grad = []
        args_with_grad = []
        for i in range(len(outputs)):
            if torch.is_tensor(outputs[i]) and outputs[i].requires_grad:
                outputs_with_grad.append(outputs[i])
                args_with_grad.append(args[i])
        if len(outputs_with_grad) == 0:
            raise RuntimeError(
                "none of the outputs has requires_grad=True,"
                " this checkpoint() is not necessary")
        torch.autograd.backward(outputs_with_grad, args_with_grad)
        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else None
                      for inp in detached_inputs)

        return (None, None) + grads


def checkpoint(function, *args, use_reentrant: bool = True, **kwargs):
    r"""Checkpoint a model or part of the model

    Checkpointing works by trading compute for memory. Rather than storing all
    intermediate activations of the entire computation graph for computing
    backward, the checkpointed part does **not** save intermediate activations,
    and instead recomputes them in the backward pass. It can be applied on any
    part of a model.

    Specifically, in the forward pass, :attr:`function` will run in
    :func:`torch.no_grad` manner, i.e., not storing the intermediate
    activations. Instead, the forward pass saves the inputs tuple and the
    :attr:`function` parameter. In the backward pass, the saved inputs and
    :attr:`function` are retrieved, and the forward pass is computed on
    :attr:`function` again, now tracking the intermediate activations, and then
    the gradients are calculated using these activation values.

    The output of :attr:`function` can contain non-Tensor values and gradient
    recording is only performed for the Tensor values. Note that if the output
    consists of nested structures (ex: custom objects, lists, dicts, etc.)
    consisting of Tensors, these Tensors nested in custom structures will not
    be considered as part of autograd.

    .. warning::
        If :attr:`function` invocation during backward does anything different
        than the one during forward, e.g., due to some global variable, the
        checkpointed version won't be equivalent, and unfortunately it can't be
        detected.

    .. warning::
        If ``use_reentrant=True`` is specified, then if the checkpointed segment
        contains tensors detached from the computational graph by `detach()` or
        `torch.no_grad()`, the backward pass will raise an error. This is
        because `checkpoint` makes all the outputs require gradients which
        causes issues when a tensor is defined to have no gradient in the model.
        To circumvent this, detach the tensors outside of the `checkpoint`
        function. Note that the checkpointed segment can contain tensors
        detached from the computational graph if ``use_reentrant=False`` is
        specified.

    .. warning::
        If ``use_reentrant=True`` is specified, at least one of the inputs needs
        to have :code:`requires_grad=True` if grads are needed for model inputs,
        otherwise the checkpointed part of the model won't have gradients. At
        least one of the outputs needs to have :code:`requires_grad=True` as
        well. Note that this does not apply if ``use_reentrant=False`` is
        specified.

    .. warning::
        If ``use_reentrant=True`` is specified, checkpointing currently only
        supports :func:`torch.autograd.backward` and only if its `inputs`
        argument is not passed. :func:`torch.autograd.grad`
        is not supported. If ``use_reentrant=False`` is specified, checkpointing
        will work with :func:`torch.autograd.grad`.

    Args:
        function: describes what to run in the forward pass of the model or
            part of the model. It should also know how to handle the inputs
            passed as the tuple. For example, in LSTM, if user passes
            ``(activation, hidden)``, :attr:`function` should correctly use the
            first input as ``activation`` and the second input as ``hidden``
        preserve_rng_state(bool, optional): Omit stashing and restoring
            the RNG state during each checkpoint.
            Default: ``True``
        use_reentrant(bool, optional): Use checkpointing
            implementation that requires re-entrant autograd.
            If ``use_reentrant=False`` is specified, ``checkpoint`` will use an
            implementation that does not require re-entrant autograd. This
            allows ``checkpoint`` to support additional functionality, such as
            working as expected with ``torch.autograd.grad`` and support for
            keyword arguments input into the checkpointed function. Note that future
            versions of PyTorch will default to ``use_reentrant=False``.
            Default: ``True``
        args: tuple containing inputs to the :attr:`function`

    Returns:
        Output of running :attr:`function` on :attr:`*args`
    """
    # Hack to mix *args with **kwargs in a python 2.7-compliant way
    preserve = kwargs.pop('preserve_rng_state', True)
    if kwargs and use_reentrant:
        raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs))

    if use_reentrant:
        return CheckpointFunction.apply(function, preserve, *args)
    else:
        return _checkpoint_without_reentrant(
            function,
            preserve,
            *args,
            **kwargs,
        )


def checkpoint_sequential(functions, segments, input, use_reentrant=True, **kwargs):
    r"""A helper function for checkpointing sequential models.

    Sequential models execute a list of modules/functions in order
    (sequentially). Therefore, we can divide such a model in various segments
    and checkpoint each segment. All segments except the last will run in
    :func:`torch.no_grad` manner, i.e., not storing the intermediate
    activations. The inputs of each checkpointed segment will be saved for
    re-running the segment in the backward pass.

    See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works.

    .. warning::
        Checkpointing currently only supports :func:`torch.autograd.backward`
        and only if its `inputs` argument is not passed. :func:`torch.autograd.grad`
        is not supported.

    .. warning::
        At least one of the inputs needs to have :code:`requires_grad=True` if
        grads are needed for model inputs, otherwise the checkpointed part of the
        model won't have gradients.

    .. warning::
        Since PyTorch 1.4, it allows only one Tensor as the input and
        intermediate outputs, just like :class:`torch.nn.Sequential`.

    Args:
        functions: A :class:`torch.nn.Sequential` or the list of modules or
            functions (comprising the model) to run sequentially.
        segments: Number of chunks to create in the model
        input: A Tensor that is input to :attr:`functions`
        preserve_rng_state(bool, optional): Omit stashing and restoring
            the RNG state during each checkpoint.
            Default: ``True``
        use_reentrant(bool, optional): Use checkpointing
            implementation that requires re-entrant autograd.
            If ``use_reentrant=False`` is specified, ``checkpoint`` will use an
            implementation that does not require re-entrant autograd. This
            allows ``checkpoint`` to support additional functionality, such as
            working as expected with ``torch.autograd.grad`` and support for
            keyword arguments input into the checkpointed function.
            Default: ``True``

    Returns:
        Output of running :attr:`functions` sequentially on :attr:`*inputs`

    Example:
        >>> # xdoctest: +SKIP("stub")
        >>> model = nn.Sequential(...)
        >>> input_var = checkpoint_sequential(model, chunks, input_var)
    """
    # Hack for keyword-only parameter in a python 2.7-compliant way
    preserve = kwargs.pop('preserve_rng_state', True)
    if kwargs:
        raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs))

    def run_function(start, end, functions):
        def forward(input):
            for j in range(start, end + 1):
                input = functions[j](input)
            return input
        return forward

    if isinstance(functions, torch.nn.Sequential):
        functions = list(functions.children())

    segment_size = len(functions) // segments
    # the last chunk has to be non-volatile
    end = -1
    for start in range(0, segment_size * (segments - 1), segment_size):
        end = start + segment_size - 1
        input = checkpoint(
            run_function(start, end, functions),
            input,
            use_reentrant=use_reentrant,
            preserve_rng_state=preserve
        )
    return run_function(end + 1, len(functions) - 1, functions)(input)


def _checkpoint_without_reentrant(function, preserve_rng_state=True, *args, **kwargs):
    """Checkpointing without re-entrant autograd

    Args:
        function: describes what to run in the forward pass of the model or
            part of the model. It should also know how to handle the inputs
            passed as the tuple. For example, in LSTM, if user passes
            ``(activation, hidden)``, :attr:`function` should correctly use the
            first input as ``activation`` and the second input as ``hidden``
        preserve_rng_state(bool, optional): Omit stashing and restoring
            the RNG state during each checkpoint.
            Default: ``True``
        *args: Arguments to pass in to the given ``function``.
        **kwargs: Keyword arguments to pass into the given ``function``.
    """
    # Accommodates the (remote) possibility that autocast is enabled for cpu AND gpu.
    gpu_autocast_kwargs, cpu_autocast_kwargs = _get_autocast_kwargs()

    if preserve_rng_state:
        fwd_cpu_state = torch.get_rng_state()
        # Don't eagerly initialize the cuda context by accident.
        # (If the user intends that the context is initialized later, within their
        # run_function, we SHOULD actually stash the cuda state here. Unfortunately,
        # we have no way to anticipate this will happen before we run the function.
        # If they do so, we raise an error.)
        had_cuda_in_fwd = False
        if torch.cuda._initialized:
            had_cuda_in_fwd = True
            fwd_gpu_devices, fwd_gpu_states = get_device_states(*args)

    # Custom class to be able to take weak references
    class Holder():
        pass

    # The Holder object for each of the saved objects is saved directly on the
    # SavedVariable and is cleared when reset_data() is called on it. We MUST make
    # sure that this is the only object having an owning reference to ensure that
    # the Tensor stored in storage is deleted as soon as the corresponding SavedVariable
    # data is cleared.
    storage: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
    weak_holder_list = []

    def pack(x):
        # TODO(varal7): Instead of returning an abstract object, we can return things metadata (such as
        # size, device, ...) to catch certain cases of nondeterministic behavior of the forward
        res = Holder()
        weak_holder_list.append(weakref.ref(res))
        return res

    def unpack(x):
        unpack_counter = 0
        if len(storage) == 0:
            def inner_pack(inner):
                nonlocal unpack_counter
                unpack_counter += 1
                # If the holder went out of scope, the SavedVariable is dead and so
                # the value will never be read from the storage. Skip filling it.
                if weak_holder_list[unpack_counter - 1]() is None:
                    return
                # Use detach here to ensure we don't keep the temporary autograd
                # graph created during the second forward
                storage[weak_holder_list[unpack_counter - 1]()] = inner.detach()
                return

            def inner_unpack(packed):
                raise RuntimeError("You are calling backwards on a tensor that is never exposed. Please open an issue.")

            # Stash the surrounding rng state, and mimic the state that was
            # present at this time during forward. Restore the surrounding state
            # when we're done.
            rng_devices = []
            if preserve_rng_state and had_cuda_in_fwd:
                rng_devices = fwd_gpu_devices
            with torch.random.fork_rng(devices=rng_devices, enabled=preserve_rng_state):
                if preserve_rng_state:
                    torch.set_rng_state(fwd_cpu_state)
                    if had_cuda_in_fwd:
                        set_device_states(fwd_gpu_devices, fwd_gpu_states)

                with torch.enable_grad(), \
                     torch.cuda.amp.autocast(**gpu_autocast_kwargs), \
                     torch.cpu.amp.autocast(**cpu_autocast_kwargs), \
                     torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack):
                    _unused = function(*args, **kwargs)

        if x not in storage:
            raise RuntimeError(
                "Attempt to retrieve a tensor saved by autograd multiple times without checkpoint"
                " recomputation being triggered in between, this is not currently supported. Please"
                " open an issue with details on your use case so that we can prioritize adding this."
            )

        return storage[x]

    with torch.autograd.graph.saved_tensors_hooks(pack, unpack):
        output = function(*args, **kwargs)
        if torch.cuda._initialized and preserve_rng_state and not had_cuda_in_fwd:
            # Cuda was not initialized before running the forward, so we didn't
            # stash the CUDA state.
            raise RuntimeError(
                "PyTorch's CUDA state was initialized in the forward pass "
                "of a Checkpoint, which is not allowed. Please open an issue "
                "if you need this feature.")

    return output
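A toy stdlib sketch of the trade-off `checkpoint` above exploits (not the real `torch.utils.checkpoint` API or autograd mechanics): instead of keeping an intermediate result alive, only the inputs and the function are kept, and the result is recomputed each time it is needed, trading extra compute for lower memory. The class and function names here are hypothetical.

```python
calls = {"count": 0}

def expensive(x):
    # stands in for a checkpointed segment of the network
    calls["count"] += 1
    return x * x + 1

class Checkpointed:
    """Keeps only the input; recomputes the output on demand."""
    def __init__(self, fn, x):
        self.fn, self.x = fn, x  # no intermediate result is stored

    def recompute(self):
        return self.fn(self.x)   # re-run, like the second forward in backward

ckpt = Checkpointed(expensive, 3)
assert calls["count"] == 0   # nothing computed yet, nothing stored
y = ckpt.recompute()         # first forward
z = ckpt.recompute()         # recomputation, as in the backward pass
assert y == z == 10
assert calls["count"] == 2   # paid twice in compute, zero activation memory
```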
models/csrc/__init__.py ADDED
File without changes
models/csrc/msmv_sampling/msmv_sampling.cpp ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ #include "msmv_sampling.h"
+
+ #define MAX_POINT 32
+
+ void ms_deformable_im2col_cuda_c2345(
+ const float* feat_c2,
+ const float* feat_c3,
+ const float* feat_c4,
+ const float* feat_c5,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const float* data_sampling_loc,
+ const float* data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float* data_col
+ );
+
+ void ms_deformable_im2col_cuda_c23456(
+ const float* feat_c2,
+ const float* feat_c3,
+ const float* feat_c4,
+ const float* feat_c5,
+ const float* feat_c6,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const int h_c6, const int w_c6,
+ const float* data_sampling_loc,
+ const float* data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float* data_col
+ );
+
+ void ms_deformable_col2im_cuda_c2345(
+ const float* grad_col,
+ const float* feat_c2,
+ const float* feat_c3,
+ const float* feat_c4,
+ const float* feat_c5,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const float* data_sampling_loc,
+ const float* data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float* grad_value_c2,
+ float* grad_value_c3,
+ float* grad_value_c4,
+ float* grad_value_c5,
+ float* grad_sampling_loc,
+ float* grad_attn_weight
+ );
+
+ void ms_deformable_col2im_cuda_c23456(
+ const float *grad_col,
+ const float *feat_c2,
+ const float *feat_c3,
+ const float *feat_c4,
+ const float *feat_c5,
+ const float *feat_c6,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const int h_c6, const int w_c6,
+ const float *data_sampling_loc,
+ const float *data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float *grad_value_c2,
+ float *grad_value_c3,
+ float *grad_value_c4,
+ float *grad_value_c5,
+ float *grad_value_c6,
+ float *grad_sampling_loc,
+ float *grad_attn_weight
+ );
+
+ at::Tensor ms_deform_attn_cuda_c2345_forward(
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 4]
+ ) {
+ AT_ASSERTM(feat_c2.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c3.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c4.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c5.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+ AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+ AT_ASSERTM(feat_c2.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c3.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c4.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c5.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+ AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+
+ const int batch_size = feat_c2.size(0);
+ const int num_views = feat_c2.size(1);
+ const int channels = feat_c2.size(4);
+ const int num_query = sampling_loc.size(1);
+ const int num_point = sampling_loc.size(2);
+ AT_ASSERTM(num_point <= MAX_POINT, "num_point exceeds MAX_POINT");
+
+ const int h_c2 = feat_c2.size(2);
+ const int w_c2 = feat_c2.size(3);
+ const int h_c3 = feat_c3.size(2);
+ const int w_c3 = feat_c3.size(3);
+ const int h_c4 = feat_c4.size(2);
+ const int w_c4 = feat_c4.size(3);
+ const int h_c5 = feat_c5.size(2);
+ const int w_c5 = feat_c5.size(3);
+
+ auto output = at::zeros({ batch_size, num_query, channels, num_point }, feat_c2.options());
+ ms_deformable_im2col_cuda_c2345(
+ feat_c2.data_ptr<float>(),
+ feat_c3.data_ptr<float>(),
+ feat_c4.data_ptr<float>(),
+ feat_c5.data_ptr<float>(),
+ h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5,
+ sampling_loc.data_ptr<float>(),
+ attn_weight.data_ptr<float>(),
+ batch_size, channels, num_views, num_query, num_point,
+ output.data_ptr<float>()
+ );
+
+ return output;
+ }
+
+ at::Tensor ms_deform_attn_cuda_c23456_forward(
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& feat_c6, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 5]
+ ) {
+ AT_ASSERTM(feat_c2.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c3.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c4.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c5.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c6.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+ AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+ AT_ASSERTM(feat_c2.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c3.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c4.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c5.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c6.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+ AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+
+ const int batch_size = feat_c2.size(0);
+ const int num_views = feat_c2.size(1);
+ const int channels = feat_c2.size(4);
+ const int num_query = sampling_loc.size(1);
+ const int num_point = sampling_loc.size(2);
+ AT_ASSERTM(num_point <= MAX_POINT, "num_point exceeds MAX_POINT");
+
+ const int h_c2 = feat_c2.size(2);
+ const int w_c2 = feat_c2.size(3);
+ const int h_c3 = feat_c3.size(2);
+ const int w_c3 = feat_c3.size(3);
+ const int h_c4 = feat_c4.size(2);
+ const int w_c4 = feat_c4.size(3);
+ const int h_c5 = feat_c5.size(2);
+ const int w_c5 = feat_c5.size(3);
+ const int h_c6 = feat_c6.size(2);
+ const int w_c6 = feat_c6.size(3);
+
+ auto output = at::zeros({ batch_size, num_query, channels, num_point }, feat_c2.options());
+ ms_deformable_im2col_cuda_c23456(
+ feat_c2.data_ptr<float>(),
+ feat_c3.data_ptr<float>(),
+ feat_c4.data_ptr<float>(),
+ feat_c5.data_ptr<float>(),
+ feat_c6.data_ptr<float>(),
+ h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5, h_c6, w_c6,
+ sampling_loc.data_ptr<float>(),
+ attn_weight.data_ptr<float>(),
+ batch_size, channels, num_views, num_query, num_point,
+ output.data_ptr<float>()
+ );
+
+ return output;
+ }
+
+ std::vector<at::Tensor> ms_deform_attn_cuda_c2345_backward(
+ const at::Tensor& grad_output,
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 4]
+ ) {
+ AT_ASSERTM(feat_c2.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c3.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c4.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c5.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+ AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+ AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+ AT_ASSERTM(feat_c2.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c3.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c4.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c5.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+ AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+ AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
+
+ const int batch_size = feat_c2.size(0);
+ const int num_views = feat_c2.size(1);
+ const int channels = feat_c2.size(4);
+ const int num_query = sampling_loc.size(1);
+ const int num_point = sampling_loc.size(2);
+ AT_ASSERTM(num_point <= MAX_POINT, "num_point exceeds MAX_POINT");
+
+ auto grad_value_c2 = at::zeros_like(feat_c2);
+ auto grad_value_c3 = at::zeros_like(feat_c3);
+ auto grad_value_c4 = at::zeros_like(feat_c4);
+ auto grad_value_c5 = at::zeros_like(feat_c5);
+ auto grad_sampling_loc = at::zeros_like(sampling_loc);
+ auto grad_attn_weight = at::zeros_like(attn_weight);
+
+ const int h_c2 = feat_c2.size(2);
+ const int w_c2 = feat_c2.size(3);
+ const int h_c3 = feat_c3.size(2);
+ const int w_c3 = feat_c3.size(3);
+ const int h_c4 = feat_c4.size(2);
+ const int w_c4 = feat_c4.size(3);
+ const int h_c5 = feat_c5.size(2);
+ const int w_c5 = feat_c5.size(3);
+
+ ms_deformable_col2im_cuda_c2345(
+ grad_output.data_ptr<float>(),
+ feat_c2.data_ptr<float>(),
+ feat_c3.data_ptr<float>(),
+ feat_c4.data_ptr<float>(),
+ feat_c5.data_ptr<float>(),
+ h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5,
+ sampling_loc.data_ptr<float>(),
+ attn_weight.data_ptr<float>(),
+ batch_size, channels, num_views, num_query, num_point,
+ grad_value_c2.data_ptr<float>(),
+ grad_value_c3.data_ptr<float>(),
+ grad_value_c4.data_ptr<float>(),
+ grad_value_c5.data_ptr<float>(),
+ grad_sampling_loc.data_ptr<float>(),
+ grad_attn_weight.data_ptr<float>()
+ );
+
+ return {
+ grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_sampling_loc, grad_attn_weight
+ };
+ }
+
+ std::vector<at::Tensor> ms_deform_attn_cuda_c23456_backward(
+ const at::Tensor& grad_output,
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& feat_c6, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 5]
+ ) {
+ AT_ASSERTM(feat_c2.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c3.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c4.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c5.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(feat_c6.is_contiguous(), "value tensor has to be contiguous");
+ AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+ AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+ AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+ AT_ASSERTM(feat_c2.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c3.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c4.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c5.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(feat_c6.is_cuda(), "value must be a CUDA tensor");
+ AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+ AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+ AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
+
+ const int batch_size = feat_c2.size(0);
+ const int num_views = feat_c2.size(1);
+ const int channels = feat_c2.size(4);
+ const int num_query = sampling_loc.size(1);
+ const int num_point = sampling_loc.size(2);
+ AT_ASSERTM(num_point <= MAX_POINT, "num_point exceeds MAX_POINT");
+
+ auto grad_value_c2 = at::zeros_like(feat_c2);
+ auto grad_value_c3 = at::zeros_like(feat_c3);
+ auto grad_value_c4 = at::zeros_like(feat_c4);
+ auto grad_value_c5 = at::zeros_like(feat_c5);
+ auto grad_value_c6 = at::zeros_like(feat_c6);
+ auto grad_sampling_loc = at::zeros_like(sampling_loc);
+ auto grad_attn_weight = at::zeros_like(attn_weight);
+
+ const int h_c2 = feat_c2.size(2);
+ const int w_c2 = feat_c2.size(3);
+ const int h_c3 = feat_c3.size(2);
+ const int w_c3 = feat_c3.size(3);
+ const int h_c4 = feat_c4.size(2);
+ const int w_c4 = feat_c4.size(3);
+ const int h_c5 = feat_c5.size(2);
+ const int w_c5 = feat_c5.size(3);
+ const int h_c6 = feat_c6.size(2);
+ const int w_c6 = feat_c6.size(3);
+
+ ms_deformable_col2im_cuda_c23456(
+ grad_output.data_ptr<float>(),
+ feat_c2.data_ptr<float>(),
+ feat_c3.data_ptr<float>(),
+ feat_c4.data_ptr<float>(),
+ feat_c5.data_ptr<float>(),
+ feat_c6.data_ptr<float>(),
+ h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5, h_c6, w_c6,
+ sampling_loc.data_ptr<float>(),
+ attn_weight.data_ptr<float>(),
+ batch_size, channels, num_views, num_query, num_point,
+ grad_value_c2.data_ptr<float>(),
+ grad_value_c3.data_ptr<float>(),
+ grad_value_c4.data_ptr<float>(),
+ grad_value_c5.data_ptr<float>(),
+ grad_value_c6.data_ptr<float>(),
+ grad_sampling_loc.data_ptr<float>(),
+ grad_attn_weight.data_ptr<float>()
+ );
+
+ return {
+ grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6, grad_sampling_loc, grad_attn_weight
+ };
+ }
+
+ #ifdef TORCH_EXTENSION_NAME
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("_ms_deform_attn_cuda_c2345_forward", &ms_deform_attn_cuda_c2345_forward, "pass");
+ m.def("_ms_deform_attn_cuda_c2345_backward", &ms_deform_attn_cuda_c2345_backward, "pass");
+ m.def("_ms_deform_attn_cuda_c23456_forward", &ms_deform_attn_cuda_c23456_forward, "pass");
+ m.def("_ms_deform_attn_cuda_c23456_backward", &ms_deform_attn_cuda_c23456_backward, "pass");
+ }
+ #endif
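
The forward entry points above reduce, per query point and pyramid level, to one bilinear lookup at a normalized location (with the kernels' `align_corners=True` convention, i.e. pixel = normalized * (size - 1)), scaled by that level's attention weight and summed over levels. As a sanity check, here is a minimal pure-Python sketch of that per-point computation; `bilinear_sample` and `sample_point` are illustrative names, not part of this extension:

```python
import math

def bilinear_sample(feat, h_norm, w_norm):
    """Bilinearly sample a 2-D grid feat[H][W] at a normalized location in [0, 1].

    Mirrors the CUDA kernels' align_corners=True mapping:
    pixel coordinate = normalized coordinate * (size - 1).
    Taps falling outside the grid contribute zero, as in the CUDA code.
    """
    H, W = len(feat), len(feat[0])
    h = h_norm * (H - 1)
    w = w_norm * (W - 1)
    h0, w0 = int(math.floor(h)), int(math.floor(w))
    lh, lw = h - h0, w - w0
    out = 0.0
    for dh, dw, wt in ((0, 0, (1 - lh) * (1 - lw)),
                       (0, 1, (1 - lh) * lw),
                       (1, 0, lh * (1 - lw)),
                       (1, 1, lh * lw)):
        hi, wi = h0 + dh, w0 + dw
        if 0 <= hi < H and 0 <= wi < W:
            out += wt * feat[hi][wi]
    return out

def sample_point(feats, attn_weights, h_norm, w_norm):
    """One sampling point: sum over pyramid levels of attn_weight * bilinear sample."""
    return sum(w * bilinear_sample(f, h_norm, w_norm)
               for f, w in zip(feats, attn_weights))
```

For example, sampling `[[0, 1], [2, 3]]` at the grid center (0.5, 0.5) averages all four taps, giving 1.5.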
models/csrc/msmv_sampling/msmv_sampling.h ADDED
@@ -0,0 +1,43 @@
+ #pragma once
+
+ #include <torch/extension.h>
+
+ at::Tensor ms_deform_attn_cuda_c2345_forward(
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 4]
+ );
+
+ std::vector<at::Tensor> ms_deform_attn_cuda_c2345_backward(
+ const at::Tensor& grad_output,
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 4]
+ );
+
+ at::Tensor ms_deform_attn_cuda_c23456_forward(
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& feat_c6, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 5]
+ );
+
+ std::vector<at::Tensor> ms_deform_attn_cuda_c23456_backward(
+ const at::Tensor& grad_output,
+ const at::Tensor& feat_c2, // [B, N, H, W, C]
+ const at::Tensor& feat_c3, // [B, N, H, W, C]
+ const at::Tensor& feat_c4, // [B, N, H, W, C]
+ const at::Tensor& feat_c5, // [B, N, H, W, C]
+ const at::Tensor& feat_c6, // [B, N, H, W, C]
+ const at::Tensor& sampling_loc, // [B, Q, P, 3]
+ const at::Tensor& attn_weight // [B, Q, P, 5]
+ );
models/csrc/msmv_sampling/msmv_sampling_backward.cu ADDED
@@ -0,0 +1,448 @@
+ /*!
+ * Modified from Deformable DETR
+ */
+
+ #include <cstdio>
+ #include <iostream>
+ #include <algorithm>
+ #include <cstring>
+ #include <cuda_runtime.h>
+ #include <device_launch_parameters.h>
+ #include <torch/extension.h>
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <THC/THCAtomics.cuh>
+
+ #define CUDA_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+ i < (n); \
+ i += blockDim.x * gridDim.x)
+
+ #define CUDA_NUM_THREADS 512
+ #define MAX_POINT 32
+
+ inline int GET_BLOCKS(const int N, const int num_threads)
+ {
+ return (N + num_threads - 1) / num_threads;
+ }
+
+ __device__ void ms_deform_attn_col2im_bilinear(const float *&bottom_data,
+ const int &height, const int &width, const int &channels,
+ const float &h, const float &w, const int &c,
+ const float &top_grad,
+ const float &attn_weight,
+ const float *&grad_value,
+ float *&grad_sampling_loc,
+ float *&grad_attn_weight)
+ {
+ const int h_low = floor(h);
+ const int w_low = floor(w);
+ const int h_high = h_low + 1;
+ const int w_high = w_low + 1;
+
+ const float lh = h - h_low;
+ const float lw = w - w_low;
+ const float hh = 1 - lh, hw = 1 - lw;
+
+ const int w_stride = channels;
+ const int h_stride = width * w_stride;
+ const int h_low_ptr_offset = h_low * h_stride;
+ const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+ const int w_low_ptr_offset = w_low * w_stride;
+ const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+
+ const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+ const float top_grad_value = top_grad * attn_weight;
+ float grad_h_weight = 0, grad_w_weight = 0;
+
+ float *grad_ptr;
+
+ float v1 = 0;
+ if (h_low >= 0 && w_low >= 0)
+ {
+ const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + c;
+ grad_ptr = const_cast<float *>(grad_value + ptr1);
+ v1 = bottom_data[ptr1];
+ grad_h_weight -= hw * v1;
+ grad_w_weight -= hh * v1;
+ atomicAdd(grad_ptr, w1 * top_grad_value);
+ }
+ float v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ {
+ const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + c;
+ grad_ptr = const_cast<float *>(grad_value + ptr2);
+ v2 = bottom_data[ptr2];
+ grad_h_weight -= lw * v2;
+ grad_w_weight += hh * v2;
+ atomicAdd(grad_ptr, w2 * top_grad_value);
+ }
+ float v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ {
+ const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + c;
+ grad_ptr = const_cast<float *>(grad_value + ptr3);
+ v3 = bottom_data[ptr3];
+ grad_h_weight += hw * v3;
+ grad_w_weight -= lh * v3;
+ atomicAdd(grad_ptr, w3 * top_grad_value);
+ }
+ float v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ {
+ const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + c;
+ grad_ptr = const_cast<float *>(grad_value + ptr4);
+ v4 = bottom_data[ptr4];
+ grad_h_weight += lw * v4;
+ grad_w_weight += lh * v4;
+ atomicAdd(grad_ptr, w4 * top_grad_value);
+ }
+
+ const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ atomicAdd(grad_attn_weight, top_grad * val);
+ atomicAdd(grad_sampling_loc, (width - 1) * grad_w_weight * top_grad_value);
+ atomicAdd(grad_sampling_loc + 1, (height - 1) * grad_h_weight * top_grad_value);
+ }
+
+ // Global-memory variant: gradients are accumulated directly with atomicAdd
+ __global__ void ms_deformable_col2im_gpu_kernel_gm_c2345(
+ const float *grad_col,
+ const float *feat_c2,
+ const float *feat_c3,
+ const float *feat_c4,
+ const float *feat_c5,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const float *data_sampling_loc,
+ const float *data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float *grad_value_c2,
+ float *grad_value_c3,
+ float *grad_value_c4,
+ float *grad_value_c5,
+ float *grad_sampling_loc,
+ float *grad_attn_weight)
+ {
+ CUDA_KERNEL_LOOP(index, batch_size * num_query * channels * num_point)
+ { // index ranges over batch_size x num_query x channels x num_point
+
+ int _temp = index;
+ const int p_col = _temp % num_point;
+ _temp /= num_point;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const float top_grad = grad_col[index];
+
+ // Sampling location in range [0, 1]
+ int data_loc_ptr = sampling_index * num_point * 3 + p_col * 3;
+ const float loc_w = data_sampling_loc[data_loc_ptr];
+ const float loc_h = data_sampling_loc[data_loc_ptr + 1];
+ const int loc_v = round(data_sampling_loc[data_loc_ptr + 2] * (num_views - 1));
+
+ // Attn weights
+ int data_weight_ptr = sampling_index * num_point * 4 + p_col * 4;
+
+ const float weight_c2 = data_attn_weight[data_weight_ptr];
+ const float weight_c3 = data_attn_weight[data_weight_ptr + 1];
+ const float weight_c4 = data_attn_weight[data_weight_ptr + 2];
+ const float weight_c5 = data_attn_weight[data_weight_ptr + 3];
+
+ // const float h_im = loc_h * spatial_h - 0.5; // align_corners = False
+ // const float w_im = loc_w * spatial_w - 0.5;
+
+ // C2 Feature
+ float h_im = loc_h * (h_c2 - 1); // align_corners = True
+ float w_im = loc_w * (w_c2 - 1);
+
+ float *grad_location_ptr = grad_sampling_loc + data_loc_ptr;
+ float *grad_weights_ptr = grad_attn_weight + data_weight_ptr;
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c2 && w_im < w_c2)
+ {
+ const float *feat_c2_ptr = feat_c2 + b_col * num_views * h_c2 * w_c2 * channels + loc_v * h_c2 * w_c2 * channels;
+ const float *grad_c2_ptr = grad_value_c2 + b_col * num_views * h_c2 * w_c2 * channels + loc_v * h_c2 * w_c2 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c2_ptr, h_c2, w_c2, channels, h_im, w_im, c_col,
+ top_grad, weight_c2,
+ grad_c2_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+
+ grad_weights_ptr += 1;
+
+ // C3 Feature
+ h_im = loc_h * (h_c3 - 1); // align_corners = True
+ w_im = loc_w * (w_c3 - 1);
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c3 && w_im < w_c3)
+ {
+ const float *feat_c3_ptr = feat_c3 + b_col * num_views * h_c3 * w_c3 * channels + loc_v * h_c3 * w_c3 * channels;
+ const float *grad_c3_ptr = grad_value_c3 + b_col * num_views * h_c3 * w_c3 * channels + loc_v * h_c3 * w_c3 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c3_ptr, h_c3, w_c3, channels, h_im, w_im, c_col,
+ top_grad, weight_c3,
+ grad_c3_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+
+ grad_weights_ptr += 1;
+
+ // C4 Feature
+ h_im = loc_h * (h_c4 - 1); // align_corners = True
+ w_im = loc_w * (w_c4 - 1);
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c4 && w_im < w_c4)
+ {
+ const float *feat_c4_ptr = feat_c4 + b_col * num_views * h_c4 * w_c4 * channels + loc_v * h_c4 * w_c4 * channels;
+ const float *grad_c4_ptr = grad_value_c4 + b_col * num_views * h_c4 * w_c4 * channels + loc_v * h_c4 * w_c4 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c4_ptr, h_c4, w_c4, channels, h_im, w_im, c_col,
+ top_grad, weight_c4,
+ grad_c4_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+
+ grad_weights_ptr += 1;
+
+ // C5 Feature
+ h_im = loc_h * (h_c5 - 1); // align_corners = True
+ w_im = loc_w * (w_c5 - 1);
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c5 && w_im < w_c5)
+ {
+ const float *feat_c5_ptr = feat_c5 + b_col * num_views * h_c5 * w_c5 * channels + loc_v * h_c5 * w_c5 * channels;
+ const float *grad_c5_ptr = grad_value_c5 + b_col * num_views * h_c5 * w_c5 * channels + loc_v * h_c5 * w_c5 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c5_ptr, h_c5, w_c5, channels, h_im, w_im, c_col,
+ top_grad, weight_c5,
+ grad_c5_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+ }
+ }
+
+ __global__ void ms_deformable_col2im_gpu_kernel_gm_c23456(
+ const float *grad_col,
+ const float *feat_c2,
+ const float *feat_c3,
+ const float *feat_c4,
+ const float *feat_c5,
+ const float *feat_c6,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const int h_c6, const int w_c6,
+ const float *data_sampling_loc,
+ const float *data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float *grad_value_c2,
+ float *grad_value_c3,
+ float *grad_value_c4,
+ float *grad_value_c5,
+ float *grad_value_c6,
+ float *grad_sampling_loc,
+ float *grad_attn_weight)
+ {
+ CUDA_KERNEL_LOOP(index, batch_size * num_query * channels * num_point)
+ { // index ranges over batch_size x num_query x channels x num_point
+
+ int _temp = index;
+ const int p_col = _temp % num_point;
+ _temp /= num_point;
+ const int c_col = _temp % channels;
+ _temp /= channels;
+ const int sampling_index = _temp;
+ _temp /= num_query;
+ const int b_col = _temp;
+
+ const float top_grad = grad_col[index];
+
+ // Sampling location in range [0, 1]
+ int data_loc_ptr = sampling_index * num_point * 3 + p_col * 3;
+ const float loc_w = data_sampling_loc[data_loc_ptr];
+ const float loc_h = data_sampling_loc[data_loc_ptr + 1];
+ const int loc_v = round(data_sampling_loc[data_loc_ptr + 2] * (num_views - 1));
+
+ // Attn weights
+ int data_weight_ptr = sampling_index * num_point * 5 + p_col * 5;
+
+ const float weight_c2 = data_attn_weight[data_weight_ptr];
+ const float weight_c3 = data_attn_weight[data_weight_ptr + 1];
+ const float weight_c4 = data_attn_weight[data_weight_ptr + 2];
+ const float weight_c5 = data_attn_weight[data_weight_ptr + 3];
+ const float weight_c6 = data_attn_weight[data_weight_ptr + 4];
+
+ // const float h_im = loc_h * spatial_h - 0.5; // align_corners = False
+ // const float w_im = loc_w * spatial_w - 0.5;
+
+ // C2 Feature
+ float h_im = loc_h * (h_c2 - 1); // align_corners = True
+ float w_im = loc_w * (w_c2 - 1);
+
+ float *grad_location_ptr = grad_sampling_loc + data_loc_ptr;
+ float *grad_weights_ptr = grad_attn_weight + data_weight_ptr;
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c2 && w_im < w_c2)
+ {
+ const float *feat_c2_ptr = feat_c2 + b_col * num_views * h_c2 * w_c2 * channels + loc_v * h_c2 * w_c2 * channels;
+ const float *grad_c2_ptr = grad_value_c2 + b_col * num_views * h_c2 * w_c2 * channels + loc_v * h_c2 * w_c2 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c2_ptr, h_c2, w_c2, channels, h_im, w_im, c_col,
+ top_grad, weight_c2,
+ grad_c2_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+
+ grad_weights_ptr += 1;
+
+ // C3 Feature
+ h_im = loc_h * (h_c3 - 1); // align_corners = True
+ w_im = loc_w * (w_c3 - 1);
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c3 && w_im < w_c3)
+ {
+ const float *feat_c3_ptr = feat_c3 + b_col * num_views * h_c3 * w_c3 * channels + loc_v * h_c3 * w_c3 * channels;
+ const float *grad_c3_ptr = grad_value_c3 + b_col * num_views * h_c3 * w_c3 * channels + loc_v * h_c3 * w_c3 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c3_ptr, h_c3, w_c3, channels, h_im, w_im, c_col,
+ top_grad, weight_c3,
+ grad_c3_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+
+ grad_weights_ptr += 1;
+
+ // C4 Feature
+ h_im = loc_h * (h_c4 - 1); // align_corners = True
+ w_im = loc_w * (w_c4 - 1);
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c4 && w_im < w_c4)
+ {
+ const float *feat_c4_ptr = feat_c4 + b_col * num_views * h_c4 * w_c4 * channels + loc_v * h_c4 * w_c4 * channels;
+ const float *grad_c4_ptr = grad_value_c4 + b_col * num_views * h_c4 * w_c4 * channels + loc_v * h_c4 * w_c4 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c4_ptr, h_c4, w_c4, channels, h_im, w_im, c_col,
+ top_grad, weight_c4,
+ grad_c4_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+
+ grad_weights_ptr += 1;
+
+ // C5 Feature
+ h_im = loc_h * (h_c5 - 1); // align_corners = True
+ w_im = loc_w * (w_c5 - 1);
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c5 && w_im < w_c5)
+ {
+ const float *feat_c5_ptr = feat_c5 + b_col * num_views * h_c5 * w_c5 * channels + loc_v * h_c5 * w_c5 * channels;
+ const float *grad_c5_ptr = grad_value_c5 + b_col * num_views * h_c5 * w_c5 * channels + loc_v * h_c5 * w_c5 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c5_ptr, h_c5, w_c5, channels, h_im, w_im, c_col,
+ top_grad, weight_c5,
+ grad_c5_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+
+ grad_weights_ptr += 1;
+
+ // C6 Feature
+ h_im = loc_h * (h_c6 - 1); // align_corners = True
+ w_im = loc_w * (w_c6 - 1);
+
+ if (h_im > -1 && w_im > -1 && h_im < h_c6 && w_im < w_c6)
+ {
+ const float *feat_c6_ptr = feat_c6 + b_col * num_views * h_c6 * w_c6 * channels + loc_v * h_c6 * w_c6 * channels;
+ const float *grad_c6_ptr = grad_value_c6 + b_col * num_views * h_c6 * w_c6 * channels + loc_v * h_c6 * w_c6 * channels;
+ ms_deform_attn_col2im_bilinear(feat_c6_ptr, h_c6, w_c6, channels, h_im, w_im, c_col,
+ top_grad, weight_c6,
+ grad_c6_ptr, grad_location_ptr, grad_weights_ptr);
+ }
+ }
+ }
+
+ void ms_deformable_col2im_cuda_c2345(
+ const float *grad_col,
+ const float *feat_c2,
+ const float *feat_c3,
+ const float *feat_c4,
+ const float *feat_c5,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const float *data_sampling_loc,
+ const float *data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float *grad_value_c2,
+ float *grad_value_c3,
+ float *grad_value_c4,
+ float *grad_value_c5,
+ float *grad_sampling_loc,
+ float *grad_attn_weight)
+ {
+ const int num_kernels = batch_size * num_query * channels * num_point;
+ const int num_threads = (channels * num_point > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels * num_point;
+
+ ms_deformable_col2im_gpu_kernel_gm_c2345 <<<GET_BLOCKS(num_kernels, num_threads), num_threads>>>(
+ grad_col, feat_c2, feat_c3, feat_c4, feat_c5,
+ h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5,
+ data_sampling_loc, data_attn_weight,
+ batch_size, channels, num_views, num_query, num_point,
+ grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5,
+ grad_sampling_loc, grad_attn_weight);
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in ms_deformable_col2im_cuda_c2345: %s\n", cudaGetErrorString(err));
+ }
+ }
+
+ void ms_deformable_col2im_cuda_c23456(
+ const float *grad_col,
+ const float *feat_c2,
+ const float *feat_c3,
+ const float *feat_c4,
+ const float *feat_c5,
+ const float *feat_c6,
+ const int h_c2, const int w_c2,
+ const int h_c3, const int w_c3,
+ const int h_c4, const int w_c4,
+ const int h_c5, const int w_c5,
+ const int h_c6, const int w_c6,
+ const float *data_sampling_loc,
+ const float *data_attn_weight,
+ const int batch_size,
+ const int channels,
+ const int num_views,
+ const int num_query,
+ const int num_point,
+ float *grad_value_c2,
+ float *grad_value_c3,
+ float *grad_value_c4,
+ float *grad_value_c5,
+ float *grad_value_c6,
+ float *grad_sampling_loc,
+ float *grad_attn_weight)
+ {
+ const int num_kernels = batch_size * num_query * channels * num_point;
+ const int num_threads = (channels * num_point > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels * num_point;
+
+ ms_deformable_col2im_gpu_kernel_gm_c23456 <<<GET_BLOCKS(num_kernels, num_threads), num_threads>>>(
436
+ grad_col, feat_c2, feat_c3, feat_c4, feat_c5, feat_c6,
437
+ h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5, h_c6, w_c6,
438
+ data_sampling_loc, data_attn_weight,
439
+ batch_size, channels, num_views, num_query, num_point,
440
+ grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6,
441
+ grad_sampling_loc, grad_attn_weight);
442
+
443
+ cudaError_t err = cudaGetLastError();
444
+ if (err != cudaSuccess)
445
+ {
446
+ printf("error in ms_deformable_col2im_cuda_c23456: %s\n", cudaGetErrorString(err));
447
+ }
448
+ }
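The backward kernel above scatters the incoming gradient onto the four bilinear corner pixels with the same weights the forward pass used. A pure-Python sketch of that weighting (names are illustrative, not the CUDA API; the real kernel uses `atomicAdd` into `grad_value_*`):

```python
import math

def bilinear_corner_weights(h, w):
    """Corner weights (w1..w4) for a sample at fractional pixel (h, w)."""
    lh, lw = h - math.floor(h), w - math.floor(w)
    hh, hw = 1 - lh, 1 - lw
    return hh * hw, hh * lw, lh * hw, lh * lw

def scatter_value_grad(top_grad, attn_weight, h, w):
    """Gradient each of the four corner pixels receives in col2im."""
    return [wi * top_grad * attn_weight for wi in bilinear_corner_weights(h, w)]

# The corner weights always sum to 1, so the scattered gradients
# conserve top_grad * attn_weight exactly.
grads = scatter_value_grad(1.0, 0.5, h=2.25, w=3.75)
```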
models/csrc/msmv_sampling/msmv_sampling_forward.cu ADDED
@@ -0,0 +1,333 @@
+ /*!
+  * Modified from Deformable DETR
+  */
+
+ #include <cstdio>
+ #include <algorithm>
+ #include <cstring>
+ #include <cuda_runtime.h>
+ #include <device_launch_parameters.h>
+ #include <torch/extension.h>
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <THC/THCAtomics.cuh>
+
+ #define CUDA_KERNEL_LOOP(i, n) \
+     for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+          i < (n); \
+          i += blockDim.x * gridDim.x)
+
+ #define CUDA_NUM_THREADS 512
+ #define MAX_POINT 32
+
+ inline int GET_BLOCKS(const int N, const int num_threads) {
+     return (N + num_threads - 1) / num_threads;
+ }
+
+ __device__ float ms_deform_attn_im2col_bilinear(
+     const float*& bottom_data,
+     const int& height, const int& width, const int& channels,
+     const float& h, const float& w, const int& c) {
+
+     const int h_low = floor(h);
+     const int w_low = floor(w);
+     const int h_high = h_low + 1;
+     const int w_high = w_low + 1;
+
+     const float lh = h - h_low;
+     const float lw = w - w_low;
+     const float hh = 1 - lh, hw = 1 - lw;
+
+     const int w_stride = channels;
+     const int h_stride = width * w_stride;
+     const int h_low_ptr_offset = h_low * h_stride;
+     const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+     const int w_low_ptr_offset = w_low * w_stride;
+     const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+
+     float v1 = 0;
+     if (h_low >= 0 && w_low >= 0) {
+         const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + c;
+         v1 = bottom_data[ptr1];
+     }
+     float v2 = 0;
+     if (h_low >= 0 && w_high <= width - 1) {
+         const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + c;
+         v2 = bottom_data[ptr2];
+     }
+     float v3 = 0;
+     if (h_high <= height - 1 && w_low >= 0) {
+         const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + c;
+         v3 = bottom_data[ptr3];
+     }
+     float v4 = 0;
+     if (h_high <= height - 1 && w_high <= width - 1) {
+         const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + c;
+         v4 = bottom_data[ptr4];
+     }
+
+     const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+     const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+     return val;
+ }
+
+ __global__ void ms_deformable_im2col_gpu_kernel_c2345(
+     const float* feat_c2,
+     const float* feat_c3,
+     const float* feat_c4,
+     const float* feat_c5,
+     const int h_c2, const int w_c2,
+     const int h_c3, const int w_c3,
+     const int h_c4, const int w_c4,
+     const int h_c5, const int w_c5,
+     const float* data_sampling_loc,
+     const float* data_attn_weight,
+     const int batch_size,
+     const int channels,
+     const int num_views,
+     const int num_query,
+     const int num_point,
+     float* data_col) {
+
+     float res[MAX_POINT];
+
+     CUDA_KERNEL_LOOP(index, batch_size * num_query * channels) { // n: bs x query x channels
+         int _temp = index;
+         const int c_col = _temp % channels;
+         _temp /= channels;
+         const int sampling_index = _temp;
+         _temp /= num_query;
+         const int b_col = _temp;
+
+         for (int p_col = 0; p_col < num_point; ++p_col) { res[p_col] = 0; }
+
+         for (int p_col = 0; p_col < num_point; ++p_col) {
+             // Sampling location in range [0, 1]
+             int data_loc_ptr = sampling_index * num_point * 3 + p_col * 3;
+             const float loc_w = data_sampling_loc[data_loc_ptr];
+             const float loc_h = data_sampling_loc[data_loc_ptr + 1];
+             const int loc_v = round(data_sampling_loc[data_loc_ptr + 2] * (num_views - 1));
+
+             // Attn weights
+             int data_weight_ptr = sampling_index * num_point * 4 + p_col * 4;
+             const float weight_c2 = data_attn_weight[data_weight_ptr];
+             const float weight_c3 = data_attn_weight[data_weight_ptr + 1];
+             const float weight_c4 = data_attn_weight[data_weight_ptr + 2];
+             const float weight_c5 = data_attn_weight[data_weight_ptr + 3];
+
+             // const float h_im = loc_h * spatial_h - 0.5; // align_corners = False
+             // const float w_im = loc_w * spatial_w - 0.5;
+
+             // C2 Feature
+             float h_im = loc_h * (h_c2 - 1); // align_corners = True
+             float w_im = loc_w * (w_c2 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c2 && w_im < w_c2) {
+                 const float* feat_c2_ptr = feat_c2 + b_col * num_views * h_c2 * w_c2 * channels + loc_v * h_c2 * w_c2 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c2_ptr, h_c2, w_c2, channels, h_im, w_im, c_col) * weight_c2;
+             }
+
+             // C3 Feature
+             h_im = loc_h * (h_c3 - 1); // align_corners = True
+             w_im = loc_w * (w_c3 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c3 && w_im < w_c3) {
+                 const float* feat_c3_ptr = feat_c3 + b_col * num_views * h_c3 * w_c3 * channels + loc_v * h_c3 * w_c3 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c3_ptr, h_c3, w_c3, channels, h_im, w_im, c_col) * weight_c3;
+             }
+
+             // C4 Feature
+             h_im = loc_h * (h_c4 - 1); // align_corners = True
+             w_im = loc_w * (w_c4 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c4 && w_im < w_c4) {
+                 const float* feat_c4_ptr = feat_c4 + b_col * num_views * h_c4 * w_c4 * channels + loc_v * h_c4 * w_c4 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c4_ptr, h_c4, w_c4, channels, h_im, w_im, c_col) * weight_c4;
+             }
+
+             // C5 Feature
+             h_im = loc_h * (h_c5 - 1); // align_corners = True
+             w_im = loc_w * (w_c5 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c5 && w_im < w_c5) {
+                 const float* feat_c5_ptr = feat_c5 + b_col * num_views * h_c5 * w_c5 * channels + loc_v * h_c5 * w_c5 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c5_ptr, h_c5, w_c5, channels, h_im, w_im, c_col) * weight_c5;
+             }
+         }
+
+         for (int p_col = 0; p_col < num_point; ++p_col) {
+             float* data_col_ptr = data_col + index * num_point + p_col;
+             *data_col_ptr = res[p_col];
+         }
+     }
+ }
+
+ __global__ void ms_deformable_im2col_gpu_kernel_c23456(
+     const float* feat_c2,
+     const float* feat_c3,
+     const float* feat_c4,
+     const float* feat_c5,
+     const float* feat_c6,
+     const int h_c2, const int w_c2,
+     const int h_c3, const int w_c3,
+     const int h_c4, const int w_c4,
+     const int h_c5, const int w_c5,
+     const int h_c6, const int w_c6,
+     const float* data_sampling_loc,
+     const float* data_attn_weight,
+     const int batch_size,
+     const int channels,
+     const int num_views,
+     const int num_query,
+     const int num_point,
+     float* data_col) {
+
+     float res[MAX_POINT];
+
+     CUDA_KERNEL_LOOP(index, batch_size * num_query * channels) { // n: bs x query x channels
+         int _temp = index;
+         const int c_col = _temp % channels;
+         _temp /= channels;
+         const int sampling_index = _temp;
+         _temp /= num_query;
+         const int b_col = _temp;
+
+         for (int p_col = 0; p_col < num_point; ++p_col) { res[p_col] = 0; }
+
+         for (int p_col = 0; p_col < num_point; ++p_col) {
+             // Sampling location in range [0, 1]
+             int data_loc_ptr = sampling_index * num_point * 3 + p_col * 3;
+             const float loc_w = data_sampling_loc[data_loc_ptr];
+             const float loc_h = data_sampling_loc[data_loc_ptr + 1];
+             const int loc_v = round(data_sampling_loc[data_loc_ptr + 2] * (num_views - 1));
+
+             // Attn weights
+             int data_weight_ptr = sampling_index * num_point * 5 + p_col * 5;
+             const float weight_c2 = data_attn_weight[data_weight_ptr];
+             const float weight_c3 = data_attn_weight[data_weight_ptr + 1];
+             const float weight_c4 = data_attn_weight[data_weight_ptr + 2];
+             const float weight_c5 = data_attn_weight[data_weight_ptr + 3];
+             const float weight_c6 = data_attn_weight[data_weight_ptr + 4];
+
+             // const float h_im = loc_h * spatial_h - 0.5; // align_corners = False
+             // const float w_im = loc_w * spatial_w - 0.5;
+
+             // C2 Feature
+             float h_im = loc_h * (h_c2 - 1); // align_corners = True
+             float w_im = loc_w * (w_c2 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c2 && w_im < w_c2) {
+                 const float* feat_c2_ptr = feat_c2 + b_col * num_views * h_c2 * w_c2 * channels + loc_v * h_c2 * w_c2 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c2_ptr, h_c2, w_c2, channels, h_im, w_im, c_col) * weight_c2;
+             }
+
+             // C3 Feature
+             h_im = loc_h * (h_c3 - 1); // align_corners = True
+             w_im = loc_w * (w_c3 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c3 && w_im < w_c3) {
+                 const float* feat_c3_ptr = feat_c3 + b_col * num_views * h_c3 * w_c3 * channels + loc_v * h_c3 * w_c3 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c3_ptr, h_c3, w_c3, channels, h_im, w_im, c_col) * weight_c3;
+             }
+
+             // C4 Feature
+             h_im = loc_h * (h_c4 - 1); // align_corners = True
+             w_im = loc_w * (w_c4 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c4 && w_im < w_c4) {
+                 const float* feat_c4_ptr = feat_c4 + b_col * num_views * h_c4 * w_c4 * channels + loc_v * h_c4 * w_c4 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c4_ptr, h_c4, w_c4, channels, h_im, w_im, c_col) * weight_c4;
+             }
+
+             // C5 Feature
+             h_im = loc_h * (h_c5 - 1); // align_corners = True
+             w_im = loc_w * (w_c5 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c5 && w_im < w_c5) {
+                 const float* feat_c5_ptr = feat_c5 + b_col * num_views * h_c5 * w_c5 * channels + loc_v * h_c5 * w_c5 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c5_ptr, h_c5, w_c5, channels, h_im, w_im, c_col) * weight_c5;
+             }
+
+             // C6 Feature
+             h_im = loc_h * (h_c6 - 1); // align_corners = True
+             w_im = loc_w * (w_c6 - 1);
+
+             if (h_im > -1 && w_im > -1 && h_im < h_c6 && w_im < w_c6) {
+                 const float* feat_c6_ptr = feat_c6 + b_col * num_views * h_c6 * w_c6 * channels + loc_v * h_c6 * w_c6 * channels;
+                 res[p_col] += ms_deform_attn_im2col_bilinear(feat_c6_ptr, h_c6, w_c6, channels, h_im, w_im, c_col) * weight_c6;
+             }
+         }
+
+         for (int p_col = 0; p_col < num_point; ++p_col) {
+             float* data_col_ptr = data_col + index * num_point + p_col;
+             *data_col_ptr = res[p_col];
+         }
+     }
+ }
+
+ void ms_deformable_im2col_cuda_c2345(
+     const float* feat_c2,
+     const float* feat_c3,
+     const float* feat_c4,
+     const float* feat_c5,
+     const int h_c2, const int w_c2,
+     const int h_c3, const int w_c3,
+     const int h_c4, const int w_c4,
+     const int h_c5, const int w_c5,
+     const float* data_sampling_loc,
+     const float* data_attn_weight,
+     const int batch_size,
+     const int channels,
+     const int num_views,
+     const int num_query,
+     const int num_point,
+     float* data_col) {
+
+     const int num_kernels = batch_size * num_query * channels;
+     const int num_threads = CUDA_NUM_THREADS;
+
+     ms_deformable_im2col_gpu_kernel_c2345 <<<GET_BLOCKS(num_kernels, num_threads), num_threads>>> (
+         feat_c2, feat_c3, feat_c4, feat_c5, h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5,
+         data_sampling_loc, data_attn_weight, batch_size, channels, num_views, num_query, num_point, data_col
+     );
+
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess) {
+         printf("error in ms_deformable_im2col_cuda_c2345: %s\n", cudaGetErrorString(err));
+     }
+ }
+
+ void ms_deformable_im2col_cuda_c23456(
+     const float* feat_c2,
+     const float* feat_c3,
+     const float* feat_c4,
+     const float* feat_c5,
+     const float* feat_c6,
+     const int h_c2, const int w_c2,
+     const int h_c3, const int w_c3,
+     const int h_c4, const int w_c4,
+     const int h_c5, const int w_c5,
+     const int h_c6, const int w_c6,
+     const float* data_sampling_loc,
+     const float* data_attn_weight,
+     const int batch_size,
+     const int channels,
+     const int num_views,
+     const int num_query,
+     const int num_point,
+     float* data_col) {
+
+     const int num_kernels = batch_size * num_query * channels;
+     const int num_threads = CUDA_NUM_THREADS;
+
+     ms_deformable_im2col_gpu_kernel_c23456 <<<GET_BLOCKS(num_kernels, num_threads), num_threads>>> (
+         feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, h_c2, w_c2, h_c3, w_c3, h_c4, w_c4, h_c5, w_c5, h_c6, w_c6,
+         data_sampling_loc, data_attn_weight, batch_size, channels, num_views, num_query, num_point, data_col
+     );
+
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess) {
+         printf("error in ms_deformable_im2col_cuda_c23456: %s\n", cudaGetErrorString(err));
+     }
+ }
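The forward kernel samples each feature level with `align_corners = True`, mapping a normalized location in [0, 1] to pixel coordinate `loc * (size - 1)` and zeroing out-of-range corners. A pure-Python reference of that sampling rule (illustrative names, not part of the extension):

```python
import math

def sample_bilinear(feat, h, w):
    """Bilinear lookup mirroring ms_deform_attn_im2col_bilinear:
    corners outside the map contribute zero. `feat` is an H x W list."""
    H, W = len(feat), len(feat[0])
    h_low, w_low = math.floor(h), math.floor(w)
    lh, lw = h - h_low, w - w_low
    val = 0.0
    for hi, wi, wt in [
        (h_low,     w_low,     (1 - lh) * (1 - lw)),
        (h_low,     w_low + 1, (1 - lh) * lw),
        (h_low + 1, w_low,     lh * (1 - lw)),
        (h_low + 1, w_low + 1, lh * lw),
    ]:
        if 0 <= hi < H and 0 <= wi < W:
            val += wt * feat[hi][wi]
    return val

def sample_normalized(feat, loc_h, loc_w):
    """align_corners = True: loc in [0, 1] maps to pixel loc * (size - 1)."""
    H, W = len(feat), len(feat[0])
    return sample_bilinear(feat, loc_h * (H - 1), loc_w * (W - 1))

feat = [[0.0, 1.0], [2.0, 3.0]]
# loc (0, 0) hits the top-left pixel exactly; (1, 1) the bottom-right;
# (0.5, 0.5) averages all four pixels.
```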
models/csrc/setup.py ADDED
@@ -0,0 +1,24 @@
+ from setuptools import setup
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+
+ def get_ext_modules():
+     return [
+         CUDAExtension(
+             name='_msmv_sampling_cuda',
+             sources=[
+                 'msmv_sampling/msmv_sampling.cpp',
+                 'msmv_sampling/msmv_sampling_forward.cu',
+                 'msmv_sampling/msmv_sampling_backward.cu'
+             ],
+             include_dirs=['msmv_sampling']
+         )
+     ]
+
+
+ setup(
+     name='csrc',
+     ext_modules=get_ext_modules(),
+     cmdclass={'build_ext': BuildExtension}
+ )
+
models/csrc/wrapper.py ADDED
@@ -0,0 +1,87 @@
+ import torch
+ import torch.nn.functional as F
+ from ._msmv_sampling_cuda import _ms_deform_attn_cuda_c2345_forward, _ms_deform_attn_cuda_c2345_backward
+ from ._msmv_sampling_cuda import _ms_deform_attn_cuda_c23456_forward, _ms_deform_attn_cuda_c23456_backward
+
+
+ def msmv_sampling_pytorch(mlvl_feats, sampling_locations, scale_weights):
+     """
+     mlvl_feats: list of [B, N, H_i, W_i, C]
+     sampling_locations: [B, Q, P, 3]
+     scale_weights: [B, Q, P, len(mlvl_feats)]
+     """
+     assert scale_weights.shape[-1] == len(mlvl_feats)
+
+     B, _, _, _, C = mlvl_feats[0].shape
+     _, Q, P, _ = sampling_locations.shape
+
+     sampling_locations = sampling_locations * 2 - 1
+     sampling_locations = sampling_locations[:, :, :, None, :]  # [B, Q, P, 1, 3]
+
+     final = torch.zeros([B, C, Q, P], device=mlvl_feats[0].device)
+
+     for lvl, feat in enumerate(mlvl_feats):
+         feat = feat.permute(0, 4, 1, 2, 3)
+         out = F.grid_sample(
+             feat, sampling_locations, mode='bilinear',
+             padding_mode='zeros', align_corners=True,
+         )[..., 0]  # [B, C, Q, P]
+         out = out * scale_weights[..., lvl].reshape(B, 1, Q, P)
+         final += out
+
+     return final.permute(0, 2, 1, 3)
+
+
+ class MSMVSamplingC2345(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights):
+         ctx.save_for_backward(feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights)
+
+         assert callable(_ms_deform_attn_cuda_c2345_forward)
+         return _ms_deform_attn_cuda_c2345_forward(
+             feat_c2, feat_c3, feat_c4, feat_c5,
+             sampling_locations, scale_weights)
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights = ctx.saved_tensors
+
+         assert callable(_ms_deform_attn_cuda_c2345_backward)
+         grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_sampling_loc, grad_attn_weight = \
+             _ms_deform_attn_cuda_c2345_backward(
+                 grad_output.contiguous(),
+                 feat_c2, feat_c3, feat_c4, feat_c5,
+                 sampling_locations, scale_weights
+             )
+
+         return grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_sampling_loc, grad_attn_weight
+
+
+ class MSMVSamplingC23456(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights):
+         ctx.save_for_backward(feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights)
+
+         assert callable(_ms_deform_attn_cuda_c23456_forward)
+         return _ms_deform_attn_cuda_c23456_forward(
+             feat_c2, feat_c3, feat_c4, feat_c5, feat_c6,
+             sampling_locations, scale_weights)
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights = ctx.saved_tensors
+
+         assert callable(_ms_deform_attn_cuda_c23456_backward)
+         grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6, grad_sampling_loc, grad_attn_weight = \
+             _ms_deform_attn_cuda_c23456_backward(
+                 grad_output.contiguous(),
+                 feat_c2, feat_c3, feat_c4, feat_c5, feat_c6,
+                 sampling_locations, scale_weights
+             )
+
+         return grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6, grad_sampling_loc, grad_attn_weight
+
+
+ def msmv_sampling(mlvl_feats, sampling_locations, scale_weights):
+     if len(mlvl_feats) == 4:
+         return MSMVSamplingC2345.apply(*mlvl_feats, sampling_locations, scale_weights)
+     elif len(mlvl_feats) == 5:
+         return MSMVSamplingC23456.apply(*mlvl_feats, sampling_locations, scale_weights)
+     else:
+         return msmv_sampling_pytorch(mlvl_feats, sampling_locations, scale_weights)
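The PyTorch fallback maps the [0, 1] sampling locations to `grid_sample`'s [-1, 1] range via `loc * 2 - 1`, while the CUDA kernels sample at `loc * (size - 1)`. With `align_corners=True` these two conventions address the same pixel; a small pure-Python check (no torch needed):

```python
def to_grid_sample_coord(loc):
    """wrapper.py's mapping from [0, 1] to grid_sample's [-1, 1] range."""
    return loc * 2 - 1

def grid_to_pixel(g, size):
    """With align_corners=True, grid_sample maps g in [-1, 1] to pixel
    (g + 1) / 2 * (size - 1)."""
    return (g + 1) / 2 * (size - 1)

# Both paths land on the same pixel coordinate as the CUDA kernels'
# loc * (size - 1), for any loc in [0, 1].
loc, size = 0.3, 25
pixel_via_grid = grid_to_pixel(to_grid_sample_coord(loc), size)
pixel_via_cuda = loc * (size - 1)
```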
models/sparsebev.py ADDED
@@ -0,0 +1,322 @@
1
+ import queue
2
+ import torch
3
+ import numpy as np
4
+ from mmcv.runner import force_fp32, auto_fp16
5
+ from mmcv.runner import get_dist_info
6
+ from mmcv.runner.fp16_utils import cast_tensor_type
7
+ from mmdet.models import DETECTORS
8
+ from mmdet3d.core import bbox3d2result
9
+ from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
10
+ from .utils import GridMask, pad_multiple, GpuPhotoMetricDistortion
11
+
12
+
13
+ @DETECTORS.register_module()
14
+ class SparseBEV(MVXTwoStageDetector):
15
+ def __init__(self,
16
+ data_aug=None,
17
+ stop_prev_grad=False,
18
+ pts_voxel_layer=None,
19
+ pts_voxel_encoder=None,
20
+ pts_middle_encoder=None,
21
+ pts_fusion_layer=None,
22
+ img_backbone=None,
23
+ pts_backbone=None,
24
+ img_neck=None,
25
+ pts_neck=None,
26
+ pts_bbox_head=None,
27
+ img_roi_head=None,
28
+ img_rpn_head=None,
29
+ train_cfg=None,
30
+ test_cfg=None,
31
+ pretrained=None):
32
+ super(SparseBEV, self).__init__(pts_voxel_layer, pts_voxel_encoder,
33
+ pts_middle_encoder, pts_fusion_layer,
34
+ img_backbone, pts_backbone, img_neck, pts_neck,
35
+ pts_bbox_head, img_roi_head, img_rpn_head,
36
+ train_cfg, test_cfg, pretrained)
37
+ self.data_aug = data_aug
38
+ self.stop_prev_grad = stop_prev_grad
39
+ self.color_aug = GpuPhotoMetricDistortion()
40
+ self.grid_mask = GridMask(ratio=0.5, prob=0.7)
41
+ self.use_grid_mask = True
42
+
43
+ self.memory = {}
44
+ self.queue = queue.Queue()
45
+
46
+ @auto_fp16(apply_to=('img'), out_fp32=True)
47
+ def extract_img_feat(self, img):
48
+ if self.use_grid_mask:
49
+ img = self.grid_mask(img)
50
+
51
+ img_feats = self.img_backbone(img)
52
+
53
+ if isinstance(img_feats, dict):
54
+ img_feats = list(img_feats.values())
55
+
56
+ if self.with_img_neck:
57
+ img_feats = self.img_neck(img_feats)
58
+
59
+ return img_feats
60
+
61
+ def extract_feat(self, img, img_metas):
62
+ if isinstance(img, list):
63
+ img = torch.stack(img, dim=0)
64
+
65
+ assert img.dim() == 5
66
+
67
+ B, N, C, H, W = img.size()
68
+ img = img.view(B * N, C, H, W)
69
+ img = img.float()
70
+
71
+ # move some augmentations to GPU
72
+ if self.data_aug is not None:
73
+ if 'img_color_aug' in self.data_aug and self.data_aug['img_color_aug'] and self.training:
74
+ img = self.color_aug(img)
75
+
76
+ if 'img_norm_cfg' in self.data_aug:
77
+ img_norm_cfg = self.data_aug['img_norm_cfg']
78
+
79
+ norm_mean = torch.tensor(img_norm_cfg['mean'], device=img.device)
80
+ norm_std = torch.tensor(img_norm_cfg['std'], device=img.device)
81
+
82
+ if img_norm_cfg['to_rgb']:
83
+ img = img[:, [2, 1, 0], :, :] # BGR to RGB
84
+
85
+ img = img - norm_mean.reshape(1, 3, 1, 1)
86
+ img = img / norm_std.reshape(1, 3, 1, 1)
87
+
88
+ for b in range(B):
89
+ img_shape = (img.shape[2], img.shape[3], img.shape[1])
90
+ img_metas[b]['img_shape'] = [img_shape for _ in range(N)]
91
+ img_metas[b]['ori_shape'] = [img_shape for _ in range(N)]
92
+
93
+ if 'img_pad_cfg' in self.data_aug:
94
+ img_pad_cfg = self.data_aug['img_pad_cfg']
95
+ img = pad_multiple(img, img_metas, size_divisor=img_pad_cfg['size_divisor'])
96
+
97
+ input_shape = img.shape[-2:]
98
+ # update real input shape of each single img
99
+ for img_meta in img_metas:
100
+ img_meta.update(input_shape=input_shape)
101
+
102
+ if self.training and self.stop_prev_grad:
103
+ H, W = input_shape
104
+ img = img.reshape(B, -1, 6, C, H, W)
105
+
106
+ img_grad = img[:, :1]
107
+ img_nograd = img[:, 1:]
108
+
109
+ all_img_feats = [self.extract_img_feat(img_grad.reshape(-1, C, H, W))]
110
+
111
+ with torch.no_grad():
112
+ self.eval()
113
+ for k in range(img_nograd.shape[1]):
114
+ all_img_feats.append(self.extract_img_feat(img_nograd[:, k].reshape(-1, C, H, W)))
115
+ self.train()
116
+
117
+ img_feats = []
118
+ for lvl in range(len(all_img_feats[0])):
119
+ C, H, W = all_img_feats[0][lvl].shape[1:]
120
+ img_feat = torch.cat([feat[lvl].reshape(B, -1, 6, C, H, W) for feat in all_img_feats], dim=1)
121
+ img_feat = img_feat.reshape(-1, C, H, W)
122
+ img_feats.append(img_feat)
123
+ else:
124
+ img_feats = self.extract_img_feat(img)
125
+
126
+ img_feats_reshaped = []
127
+ for img_feat in img_feats:
128
+ BN, C, H, W = img_feat.size()
129
+ img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
130
+
131
+ return img_feats_reshaped
132
+
133
+ def forward_pts_train(self,
134
+ pts_feats,
135
+ gt_bboxes_3d,
136
+ gt_labels_3d,
137
+ img_metas,
138
+ gt_bboxes_ignore=None):
139
+ """Forward function for point cloud branch.
140
+ Args:
141
+ pts_feats (list[torch.Tensor]): Features of point cloud branch
142
+ gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
143
+ boxes for each sample.
144
+ gt_labels_3d (list[torch.Tensor]): Ground truth labels for
145
+ boxes of each sampole
146
+ img_metas (list[dict]): Meta information of samples.
147
+ gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
148
+ boxes to be ignored. Defaults to None.
149
+ Returns:
150
+ dict: Losses of each branch.
151
+ """
152
+ outs = self.pts_bbox_head(pts_feats, img_metas)
153
+ loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
154
+ losses = self.pts_bbox_head.loss(*loss_inputs)
155
+
156
+ return losses
157
+
158
+ @force_fp32(apply_to=('img', 'points'))
159
+ def forward(self, return_loss=True, **kwargs):
160
+ """Calls either forward_train or forward_test depending on whether
161
+ return_loss=True.
162
+ Note this setting will change the expected inputs. When
163
+ `return_loss=True`, img and img_metas are single-nested (i.e.
164
+ torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
165
+ img_metas should be double nested (i.e. list[torch.Tensor],
166
+ list[list[dict]]), with the outer list indicating test time
167
+ augmentations.
168
+ """
169
+ if return_loss:
170
+ return self.forward_train(**kwargs)
171
+ else:
172
+ return self.forward_test(**kwargs)
173
+
174
+ def forward_train(self,
175
+ points=None,
176
+ img_metas=None,
177
+ gt_bboxes_3d=None,
178
+ gt_labels_3d=None,
179
+ gt_labels=None,
180
+ gt_bboxes=None,
181
+ img=None,
182
+ proposals=None,
183
+ gt_bboxes_ignore=None,
184
+ img_depth=None,
185
+ img_mask=None):
186
+ """Forward training function.
187
+ Args:
188
+ points (list[torch.Tensor], optional): Points of each sample.
189
+ Defaults to None.
190
+ img_metas (list[dict], optional): Meta information of each sample.
191
+ Defaults to None.
192
+ gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
193
+ Ground truth 3D boxes. Defaults to None.
194
+ gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
195
+ of 3D boxes. Defaults to None.
196
+ gt_labels (list[torch.Tensor], optional): Ground truth labels
197
+ of 2D boxes in images. Defaults to None.
198
+ gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
199
+ images. Defaults to None.
200
+ img (torch.Tensor optional): Images of each sample with shape
201
+ (N, C, H, W). Defaults to None.
202
+ proposals ([list[torch.Tensor], optional): Predicted proposals
203
+ used for training Fast RCNN. Defaults to None.
204
+ gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
205
+ 2D boxes in images to be ignored. Defaults to None.
206
+ Returns:
207
+ dict: Losses of different branches.
208
+ """
209
+ img_feats = self.extract_feat(img, img_metas)
210
+
211
+ for i in range(len(img_metas)):
212
+ img_metas[i]['gt_bboxes_3d'] = gt_bboxes_3d[i]
213
+ img_metas[i]['gt_labels_3d'] = gt_labels_3d[i]
214
+
215
+ losses = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore)
216
+
217
+ return losses
218
+
219
+ def forward_test(self, img_metas, img=None, **kwargs):
220
+ for var, name in [(img_metas, 'img_metas')]:
221
+ if not isinstance(var, list):
222
+ raise TypeError('{} must be a list, but got {}'.format(
223
+ name, type(var)))
224
+ img = [img] if img is None else img
225
+ return self.simple_test(img_metas[0], img[0], **kwargs)
226
+
227
+ def simple_test_pts(self, x, img_metas, rescale=False):
228
+ outs = self.pts_bbox_head(x, img_metas)
229
+ bbox_list = self.pts_bbox_head.get_bboxes(outs, img_metas[0], rescale=rescale)
230
+
231
+ bbox_results = [
232
+ bbox3d2result(bboxes, scores, labels)
233
+ for bboxes, scores, labels in bbox_list
234
+ ]
235
+
236
+ return bbox_results
237
+
238
+ def simple_test(self, img_metas, img=None, rescale=False):
239
+ world_size = get_dist_info()[1]
240
+ if world_size == 1: # online
241
+            return self.simple_test_online(img_metas, img, rescale)
+        elif world_size > 1:  # offline
+            return self.simple_test_offline(img_metas, img, rescale)
+
+    def simple_test_offline(self, img_metas, img=None, rescale=False):
+        self.fp16_enabled = False
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+
+        bbox_list = [dict() for _ in range(len(img_metas))]
+        bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+
+        return bbox_list
+
+    def simple_test_online(self, img_metas, img=None, rescale=False):
+        self.fp16_enabled = False
+        assert len(img_metas) == 1  # batch_size = 1
+
+        B, N, C, H, W = img.shape
+        img = img.reshape(B, N // 6, 6, C, H, W)
+
+        img_filenames = img_metas[0]['filename']
+        num_frames = len(img_filenames) // 6
+        # assert num_frames == img.shape[1]
+
+        img_shape = (H, W, C)
+        img_metas[0]['img_shape'] = [img_shape for _ in range(len(img_filenames))]
+        img_metas[0]['ori_shape'] = [img_shape for _ in range(len(img_filenames))]
+        img_metas[0]['pad_shape'] = [img_shape for _ in range(len(img_filenames))]
+
+        img_feats_large, img_metas_large = [], []
+
+        for i in range(num_frames):
+            img_indices = list(np.arange(i * 6, (i + 1) * 6))
+
+            img_curr_large = img[:, 0]  # [B, 6, C, H, W]
+            img_metas_curr_large = [{}]
+
+            for k in img_metas[0].keys():
+                if isinstance(img_metas[0][k], list):
+                    img_metas_curr_large[0][k] = [img_metas[0][k][i] for i in img_indices]
+
+            if img_filenames[img_indices[0]] in self.memory:
+                img_feats_curr_large = self.memory[img_filenames[img_indices[0]]]
+            else:
+                assert i == 0
+                img_feats_curr_large = self.extract_feat(img_curr_large, img_metas_curr_large)
+                self.memory[img_filenames[img_indices[0]]] = img_feats_curr_large
+                self.queue.put(img_filenames[img_indices[0]])
+
+            img_feats_large.append(img_feats_curr_large)
+            img_metas_large.append(img_metas_curr_large)
+
+        # reorganize
+        feat_levels = len(img_feats_large[0])
+        img_feats_large_reorganized = []
+        for j in range(feat_levels):
+            feat_l = torch.cat([img_feats_large[i][j] for i in range(len(img_feats_large))], dim=0)
+            feat_l = feat_l.flatten(0, 1)[None, ...]
+            img_feats_large_reorganized.append(feat_l)
+
+        img_metas_large_reorganized = img_metas_large[0]
+        for i in range(1, len(img_metas_large)):
+            for k, v in img_metas_large[i][0].items():
+                if isinstance(v, list):
+                    img_metas_large_reorganized[0][k].extend(v)
+
+        img_feats = img_feats_large_reorganized
+        img_metas = img_metas_large_reorganized
+        img_feats = cast_tensor_type(img_feats, torch.half, torch.float32)
+
+        bbox_list = [dict() for _ in range(1)]
+        bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+
+        while self.queue.qsize() >= 8:
+            pop_key = self.queue.get()
+            self.memory.pop(pop_key)
+
+        return bbox_list
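The online test path above memoizes per-frame image features in `self.memory` keyed by the first camera filename, tracks insertion order in `self.queue`, and evicts oldest-first once the cache reaches 8 frames. The same pattern can be sketched in isolation; `FeatureCache`, `max_size`, and `get_or_compute` are hypothetical names for illustration only.

```python
# Minimal sketch of the FIFO feature cache used in simple_test_online.
# FeatureCache / get_or_compute are hypothetical names, not part of the repo.
import queue

class FeatureCache:
    def __init__(self, max_size=8):
        self.memory = {}            # filename -> cached feature
        self.queue = queue.Queue()  # insertion order, for oldest-first eviction
        self.max_size = max_size

    def get_or_compute(self, key, compute_fn):
        if key in self.memory:
            return self.memory[key]      # cache hit: reuse the stored feature
        feat = compute_fn()              # cache miss: extract features once
        self.memory[key] = feat
        self.queue.put(key)
        # mirror the `while self.queue.qsize() >= 8` eviction loop
        while self.queue.qsize() >= self.max_size:
            self.memory.pop(self.queue.get())
        return feat
```

With batch size 1 and sequential frames, each keyframe's backbone features are therefore computed exactly once and reused by later sliding windows.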
models/sparsebev_head.py ADDED
@@ -0,0 +1,469 @@
+import math
+import torch
+import torch.nn as nn
+from mmcv.runner import force_fp32
+from mmdet.core import multi_apply, reduce_mean
+from mmdet.models import HEADS
+from mmdet.models.dense_heads import DETRHead
+from mmdet3d.core.bbox.coders import build_bbox_coder
+from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes
+from .bbox.utils import normalize_bbox, encode_bbox
+
+
+@HEADS.register_module()
+class SparseBEVHead(DETRHead):
+    def __init__(self,
+                 *args,
+                 num_classes,
+                 in_channels,
+                 query_denoising=True,
+                 query_denoising_groups=10,
+                 bbox_coder=None,
+                 code_size=10,
+                 code_weights=[1.0] * 10,
+                 train_cfg=dict(),
+                 test_cfg=dict(max_per_img=100),
+                 **kwargs):
+        self.code_size = code_size
+        self.code_weights = code_weights
+        self.num_classes = num_classes
+        self.in_channels = in_channels
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.fp16_enabled = False
+        self.embed_dims = in_channels
+
+        super(SparseBEVHead, self).__init__(num_classes, in_channels, train_cfg=train_cfg, test_cfg=test_cfg, **kwargs)
+
+        self.code_weights = nn.Parameter(torch.tensor(self.code_weights), requires_grad=False)
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.pc_range = self.bbox_coder.pc_range
+
+        self.dn_enabled = query_denoising
+        self.dn_group_num = query_denoising_groups
+        self.dn_weight = 1.0
+        self.dn_bbox_noise_scale = 0.5
+        self.dn_label_noise_scale = 0.5
+
+    def _init_layers(self):
+        self.init_query_bbox = nn.Embedding(self.num_query, 10)  # (x, y, z, w, l, h, sin, cos, vx, vy)
+        self.label_enc = nn.Embedding(self.num_classes + 1, self.embed_dims - 1)  # DAB-DETR
+
+        nn.init.zeros_(self.init_query_bbox.weight[:, 2:3])
+        nn.init.zeros_(self.init_query_bbox.weight[:, 8:10])
+        nn.init.constant_(self.init_query_bbox.weight[:, 5:6], 1.5)
+
+        grid_size = int(math.sqrt(self.num_query))
+        assert grid_size * grid_size == self.num_query
+        x = y = torch.arange(grid_size)
+        xx, yy = torch.meshgrid(x, y, indexing='ij')  # [0, grid_size - 1]
+        xy = torch.cat([xx[..., None], yy[..., None]], dim=-1)
+        xy = (xy + 0.5) / grid_size  # [0.5, grid_size - 0.5] / grid_size ~= (0, 1)
+        with torch.no_grad():
+            self.init_query_bbox.weight[:, :2] = xy.reshape(-1, 2)  # [Q, 2]
+
+    def init_weights(self):
+        self.transformer.init_weights()
+
+    def forward(self, mlvl_feats, img_metas):
+        query_bbox = self.init_query_bbox.weight.clone()  # [Q, 10]
+        # query_bbox[..., :3] = query_bbox[..., :3].sigmoid()
+
+        B = mlvl_feats[0].shape[0]
+        query_bbox, query_feat, attn_mask, mask_dict = self.prepare_for_dn_input(B, query_bbox, self.label_enc, img_metas)
+
+        cls_scores, bbox_preds = self.transformer(
+            query_bbox,
+            query_feat,
+            mlvl_feats,
+            attn_mask=attn_mask,
+            img_metas=img_metas,
+        )
+
+        bbox_preds[..., 0] = bbox_preds[..., 0] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]
+        bbox_preds[..., 1] = bbox_preds[..., 1] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]
+        bbox_preds[..., 2] = bbox_preds[..., 2] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2]
+
+        bbox_preds = torch.cat([
+            bbox_preds[..., 0:2],
+            bbox_preds[..., 3:5],
+            bbox_preds[..., 2:3],
+            bbox_preds[..., 5:10],
+        ], dim=-1)  # [cx, cy, w, l, cz, h, sin, cos, vx, vy]
+
+        if mask_dict is not None and mask_dict['pad_size'] > 0:
+            output_known_cls_scores = cls_scores[:, :, :mask_dict['pad_size'], :]
+            output_known_bbox_preds = bbox_preds[:, :, :mask_dict['pad_size'], :]
+            output_cls_scores = cls_scores[:, :, mask_dict['pad_size']:, :]
+            output_bbox_preds = bbox_preds[:, :, mask_dict['pad_size']:, :]
+            mask_dict['output_known_lbs_bboxes'] = (output_known_cls_scores, output_known_bbox_preds)
+            outs = {
+                'all_cls_scores': output_cls_scores,
+                'all_bbox_preds': output_bbox_preds,
+                'enc_cls_scores': None,
+                'enc_bbox_preds': None,
+                'dn_mask_dict': mask_dict,
+            }
+        else:
+            outs = {
+                'all_cls_scores': cls_scores,
+                'all_bbox_preds': bbox_preds,
+                'enc_cls_scores': None,
+                'enc_bbox_preds': None,
+            }
+
+        return outs
+
+    def prepare_for_dn_input(self, batch_size, init_query_bbox, label_enc, img_metas):
+        device = init_query_bbox.device
+        indicator0 = torch.zeros([self.num_query, 1], device=device)
+        init_query_feat = label_enc.weight[self.num_classes].repeat(self.num_query, 1)
+        init_query_feat = torch.cat([init_query_feat, indicator0], dim=1)
+
+        if self.training and self.dn_enabled:
+            targets = [{
+                'bboxes': torch.cat([m['gt_bboxes_3d'].gravity_center,
+                                     m['gt_bboxes_3d'].tensor[:, 3:]], dim=1).cuda(),
+                'labels': m['gt_labels_3d'].cuda().long()
+            } for m in img_metas]
+
+            known = [torch.ones_like(t['labels'], device=device) for t in targets]
+            known_num = [sum(k) for k in known]
+
+            # can be modified to selectively denoise some labels or boxes; also known label prediction
+            unmask_bbox = unmask_label = torch.cat(known)
+            labels = torch.cat([t['labels'] for t in targets]).clone()
+            bboxes = torch.cat([t['bboxes'] for t in targets]).clone()
+            batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])
+
+            known_indice = torch.nonzero(unmask_label + unmask_bbox)
+            known_indice = known_indice.view(-1)
+
+            # add noise
+            known_indice = known_indice.repeat(self.dn_group_num, 1).view(-1)
+            known_labels = labels.repeat(self.dn_group_num, 1).view(-1)
+            known_bid = batch_idx.repeat(self.dn_group_num, 1).view(-1)
+            known_bboxs = bboxes.repeat(self.dn_group_num, 1)  # 9
+            known_labels_expand = known_labels.clone()
+            known_bbox_expand = known_bboxs.clone()
+
+            # noise on the box
+            if self.dn_bbox_noise_scale > 0:
+                wlh = known_bbox_expand[..., 3:6].clone()
+                rand_prob = torch.rand_like(known_bbox_expand) * 2 - 1.0
+                known_bbox_expand[..., 0:3] += torch.mul(rand_prob[..., 0:3], wlh / 2) * self.dn_bbox_noise_scale
+                # known_bbox_expand[..., 3:6] += torch.mul(rand_prob[..., 3:6], wlh) * self.dn_bbox_noise_scale
+                # known_bbox_expand[..., 6:7] += torch.mul(rand_prob[..., 6:7], 3.14159) * self.dn_bbox_noise_scale
+
+            known_bbox_expand = encode_bbox(known_bbox_expand, self.pc_range)
+            known_bbox_expand[..., 0:3].clamp_(min=0.0, max=1.0)
+            # nn.init.constant(known_bbox_expand[..., 8:10], 0.0)
+
+            # noise on the label
+            if self.dn_label_noise_scale > 0:
+                p = torch.rand_like(known_labels_expand.float())
+                chosen_indice = torch.nonzero(p < self.dn_label_noise_scale).view(-1)  # usually half of bbox noise
+                new_label = torch.randint_like(chosen_indice, 0, self.num_classes)  # randomly put a new one here
+                known_labels_expand.scatter_(0, chosen_indice, new_label)
+
+            known_feat_expand = label_enc(known_labels_expand)
+            indicator1 = torch.ones([known_feat_expand.shape[0], 1], device=device)  # add dn part indicator
+            known_feat_expand = torch.cat([known_feat_expand, indicator1], dim=1)
+
+            # construct final query
+            dn_single_pad = int(max(known_num))
+            dn_pad_size = int(dn_single_pad * self.dn_group_num)
+            dn_query_bbox = torch.zeros([dn_pad_size, init_query_bbox.shape[-1]], device=device)
+            dn_query_feat = torch.zeros([dn_pad_size, self.embed_dims], device=device)
+            input_query_bbox = torch.cat([dn_query_bbox, init_query_bbox], dim=0).repeat(batch_size, 1, 1)
+            input_query_feat = torch.cat([dn_query_feat, init_query_feat], dim=0).repeat(batch_size, 1, 1)
+
+            if len(known_num):
+                map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num])  # [1,2, 1,2,3]
+                map_known_indice = torch.cat([map_known_indice + dn_single_pad * i for i in range(self.dn_group_num)]).long()
+
+            if len(known_bid):
+                input_query_bbox[known_bid.long(), map_known_indice] = known_bbox_expand
+                input_query_feat[(known_bid.long(), map_known_indice)] = known_feat_expand
+
+            total_size = dn_pad_size + self.num_query
+            attn_mask = torch.ones([total_size, total_size], device=device) < 0
+
+            # matching queries cannot see the reconstruction (denoising) part
+            attn_mask[dn_pad_size:, :dn_pad_size] = True
+            for i in range(self.dn_group_num):
+                if i == 0:
+                    attn_mask[dn_single_pad * i:dn_single_pad * (i + 1), dn_single_pad * (i + 1):dn_pad_size] = True
+                if i == self.dn_group_num - 1:
+                    attn_mask[dn_single_pad * i:dn_single_pad * (i + 1), :dn_single_pad * i] = True
+                else:
+                    attn_mask[dn_single_pad * i:dn_single_pad * (i + 1), dn_single_pad * (i + 1):dn_pad_size] = True
+                    attn_mask[dn_single_pad * i:dn_single_pad * (i + 1), :dn_single_pad * i] = True
+
+            mask_dict = {
+                'known_indice': torch.as_tensor(known_indice).long(),
+                'batch_idx': torch.as_tensor(batch_idx).long(),
+                'map_known_indice': torch.as_tensor(map_known_indice).long(),
+                'known_lbs_bboxes': (known_labels, known_bboxs),
+                'pad_size': dn_pad_size
+            }
+        else:
+            input_query_bbox = init_query_bbox.repeat(batch_size, 1, 1)
+            input_query_feat = init_query_feat.repeat(batch_size, 1, 1)
+            attn_mask = None
+            mask_dict = None
+
+        return input_query_bbox, input_query_feat, attn_mask, mask_dict
+
+    def prepare_for_dn_loss(self, mask_dict):
+        cls_scores, bbox_preds = mask_dict['output_known_lbs_bboxes']
+        known_labels, known_bboxs = mask_dict['known_lbs_bboxes']
+        map_known_indice = mask_dict['map_known_indice'].long()
+        known_indice = mask_dict['known_indice'].long()
+        batch_idx = mask_dict['batch_idx'].long()
+        bid = batch_idx[known_indice]
+        num_tgt = known_indice.numel()
+
+        if len(cls_scores) > 0:
+            cls_scores = cls_scores.permute(1, 2, 0, 3)[(bid, map_known_indice)].permute(1, 0, 2)
+            bbox_preds = bbox_preds.permute(1, 2, 0, 3)[(bid, map_known_indice)].permute(1, 0, 2)
+
+        return known_labels, known_bboxs, cls_scores, bbox_preds, num_tgt
+
+    def dn_loss_single(self,
+                       cls_scores,
+                       bbox_preds,
+                       known_bboxs,
+                       known_labels,
+                       num_total_pos=None):
+        # Compute the average number of gt boxes across all gpus
+        num_total_pos = cls_scores.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1.0).item()
+
+        # cls loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        bbox_weights = torch.ones_like(bbox_preds)
+        label_weights = torch.ones_like(known_labels)
+        loss_cls = self.loss_cls(
+            cls_scores,
+            known_labels.long(),
+            label_weights,
+            avg_factor=num_total_pos
+        )
+
+        # regression L1 loss
+        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_bbox(known_bboxs)
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+        loss_bbox = self.loss_bbox(
+            bbox_preds[isnotnan, :10],
+            normalized_bbox_targets[isnotnan, :10],
+            bbox_weights[isnotnan, :10],
+            avg_factor=num_total_pos
+        )
+
+        loss_cls = self.dn_weight * torch.nan_to_num(loss_cls)
+        loss_bbox = self.dn_weight * torch.nan_to_num(loss_bbox)
+
+        return loss_cls, loss_bbox
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def calc_dn_loss(self, loss_dict, preds_dicts, num_dec_layers):
+        known_labels, known_bboxs, cls_scores, bbox_preds, num_tgt = \
+            self.prepare_for_dn_loss(preds_dicts['dn_mask_dict'])
+
+        all_known_bboxs_list = [known_bboxs for _ in range(num_dec_layers)]
+        all_known_labels_list = [known_labels for _ in range(num_dec_layers)]
+        all_num_tgts_list = [num_tgt for _ in range(num_dec_layers)]
+
+        dn_losses_cls, dn_losses_bbox = multi_apply(
+            self.dn_loss_single, cls_scores, bbox_preds,
+            all_known_bboxs_list, all_known_labels_list, all_num_tgts_list)
+
+        loss_dict['loss_cls_dn'] = dn_losses_cls[-1]
+        loss_dict['loss_bbox_dn'] = dn_losses_bbox[-1]
+
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i in zip(dn_losses_cls[:-1], dn_losses_bbox[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls_dn'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox_dn'] = loss_bbox_i
+            num_dec_layer += 1
+
+        return loss_dict
+
+    def _get_target_single(self,
+                           cls_score,
+                           bbox_pred,
+                           gt_labels,
+                           gt_bboxes,
+                           gt_bboxes_ignore=None):
+        num_bboxes = bbox_pred.size(0)
+
+        # assigner and sampler
+        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, gt_labels, gt_bboxes_ignore, self.code_weights, True)
+        sampling_result = self.sampler.sample(assign_result, bbox_pred, gt_bboxes)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label targets
+        labels = gt_bboxes.new_full((num_bboxes, ), self.num_classes, dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :9]
+        bbox_weights = torch.zeros_like(bbox_pred)
+        bbox_weights[pos_inds] = 1.0
+
+        # DETR
+        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds)
+
+    def get_targets(self,
+                    cls_scores_list,
+                    bbox_preds_list,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_bboxes_ignore_list=None):
+        assert gt_bboxes_ignore_list is None, \
+            'Only supports for gt_bboxes_ignore setting to None.'
+        num_imgs = len(cls_scores_list)
+        gt_bboxes_ignore_list = [gt_bboxes_ignore_list for _ in range(num_imgs)]
+
+        (labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
+            self._get_target_single, cls_scores_list, bbox_preds_list,
+            gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list)
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def loss_single(self,
+                    cls_scores,
+                    bbox_preds,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_bboxes_ignore_list=None):
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+
+        cls_avg_factor = max(cls_avg_factor, 1)
+        loss_cls = self.loss_cls(
+            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes across all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # regression L1 loss
+        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_bbox(bbox_targets)
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+
+        loss_bbox = self.loss_bbox(
+            bbox_preds[isnotnan, :10],
+            normalized_bbox_targets[isnotnan, :10],
+            bbox_weights[isnotnan, :10],
+            avg_factor=num_total_pos
+        )
+
+        loss_cls = torch.nan_to_num(loss_cls)
+        loss_bbox = torch.nan_to_num(loss_bbox)
+
+        return loss_cls, loss_bbox
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def loss(self,
+             gt_bboxes_list,
+             gt_labels_list,
+             preds_dicts,
+             gt_bboxes_ignore=None):
+        assert gt_bboxes_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'for gt_bboxes_ignore setting to None.'
+
+        all_cls_scores = preds_dicts['all_cls_scores']
+        all_bbox_preds = preds_dicts['all_bbox_preds']
+        enc_cls_scores = preds_dicts['enc_cls_scores']
+        enc_bbox_preds = preds_dicts['enc_bbox_preds']
+
+        num_dec_layers = len(all_cls_scores)
+        device = gt_labels_list[0].device
+        gt_bboxes_list = [torch.cat(
+            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+            dim=1).to(device) for gt_bboxes in gt_bboxes_list]
+
+        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+        all_gt_bboxes_ignore_list = [gt_bboxes_ignore for _ in range(num_dec_layers)]
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single, all_cls_scores, all_bbox_preds,
+            all_gt_bboxes_list, all_gt_labels_list,
+            all_gt_bboxes_ignore_list)
+
+        loss_dict = dict()
+        # loss of proposal generated from encode feature map
+        if enc_cls_scores is not None:
+            binary_labels_list = [
+                torch.zeros_like(gt_labels_list[i])
+                for i in range(len(all_gt_labels_list))
+            ]
+            enc_loss_cls, enc_losses_bbox = \
+                self.loss_single(enc_cls_scores, enc_bbox_preds,
+                                 gt_bboxes_list, binary_labels_list, gt_bboxes_ignore)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+
+        if 'dn_mask_dict' in preds_dicts and preds_dicts['dn_mask_dict'] is not None:
+            loss_dict = self.calc_dn_loss(loss_dict, preds_dicts, num_dec_layers)
+
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            num_dec_layer += 1
+        return loss_dict
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
+        preds_dicts = self.bbox_coder.decode(preds_dicts)
+        num_samples = len(preds_dicts)
+        ret_list = []
+        for i in range(num_samples):
+            preds = preds_dicts[i]
+            bboxes = preds['bboxes']
+            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+            bboxes = LiDARInstance3DBoxes(bboxes, 9)
+            scores = preds['scores']
+            labels = preds['labels']
+            ret_list.append([bboxes, scores, labels])
+        return ret_list
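`prepare_for_dn_input` builds a boolean attention mask (True = blocked) so that each denoising group only attends to itself and the matching queries never attend to any denoising slot. A minimal numpy sketch of the same masking rules, with the hypothetical helper name `build_dn_attn_mask`:

```python
# Sketch of the denoising attention mask from prepare_for_dn_input.
# build_dn_attn_mask is a hypothetical name; True means "attention blocked".
import numpy as np

def build_dn_attn_mask(dn_single_pad, dn_group_num, num_query):
    pad = dn_single_pad * dn_group_num       # total denoising slots
    total = pad + num_query                  # DN slots first, then matching queries
    mask = np.zeros((total, total), dtype=bool)
    mask[pad:, :pad] = True                  # matching part cannot see the DN part
    for i in range(dn_group_num):
        lo, hi = dn_single_pad * i, dn_single_pad * (i + 1)
        mask[lo:hi, hi:pad] = True           # group i cannot see later groups
        mask[lo:hi, :lo] = True              # group i cannot see earlier groups
    return mask
```

The DN rows are left free to attend to the matching queries, matching the original loop's behavior; only the reverse direction and cross-group attention are blocked.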
models/sparsebev_sampling.py ADDED
@@ -0,0 +1,102 @@
+import torch
+import torch.nn.functional as F
+from .bbox.utils import decode_bbox
+from .utils import rotation_3d_in_axis, DUMP
+from .csrc.wrapper import msmv_sampling, msmv_sampling_pytorch
+
+
+def make_sample_points(query_bbox, offset, pc_range):
+    '''
+    query_bbox: [B, Q, 10]
+    offset: [B, Q, num_points, 4], normalized by stride
+    '''
+    query_bbox = decode_bbox(query_bbox, pc_range)  # [B, Q, 9]
+
+    xyz = query_bbox[..., 0:3]  # [B, Q, 3]
+    wlh = query_bbox[..., 3:6]  # [B, Q, 3]
+    ang = query_bbox[..., 6:7]  # [B, Q, 1]
+
+    delta_xyz = offset[..., 0:3]  # [B, Q, P, 3]
+    delta_xyz = wlh[:, :, None, :] * delta_xyz  # [B, Q, P, 3]
+    delta_xyz = rotation_3d_in_axis(delta_xyz, ang)  # [B, Q, P, 3]
+    sample_xyz = xyz[:, :, None, :] + delta_xyz  # [B, Q, P, 3]
+
+    return sample_xyz  # [B, Q, P, 3]
+
+
+def sampling_4d(sample_points, mlvl_feats, scale_weights, lidar2img, image_h, image_w, eps=1e-5):
+    B, Q, T, G, P, _ = sample_points.shape  # [B, Q, T, G, P, 4]
+    N = 6
+
+    sample_points = sample_points.reshape(B, Q, T, G * P, 3)
+
+    # get the projection matrix
+    lidar2img = lidar2img[:, :, None, None, :, :]  # [B, TN, 1, 1, 4, 4]
+    lidar2img = lidar2img.expand(B, T*N, Q, G * P, 4, 4)
+    lidar2img = lidar2img.reshape(B, T, N, Q, G*P, 4, 4)
+
+    # expand the points
+    ones = torch.ones_like(sample_points[..., :1])
+    sample_points = torch.cat([sample_points, ones], dim=-1)  # [B, Q, GP, 4]
+    sample_points = sample_points[:, :, None, ..., None]  # [B, Q, T, GP, 4]
+    sample_points = sample_points.expand(B, Q, N, T, G * P, 4, 1)
+    sample_points = sample_points.transpose(1, 3)  # [B, T, N, Q, GP, 4, 1]
+
+    # project 3d sampling points to image
+    sample_points_cam = torch.matmul(lidar2img, sample_points).squeeze(-1)  # [B, T, N, Q, GP, 4]
+
+    # homo coord -> pixel coord
+    homo = sample_points_cam[..., 2:3]
+    homo_nonzero = torch.maximum(homo, torch.zeros_like(homo) + eps)
+    sample_points_cam = sample_points_cam[..., 0:2] / homo_nonzero  # [B, T, N, Q, GP, 2]
+
+    # normalize
+    sample_points_cam[..., 0] /= image_w
+    sample_points_cam[..., 1] /= image_h
+
+    # check if out of image
+    valid_mask = ((homo > eps)
+                  & (sample_points_cam[..., 1:2] > 0.0)
+                  & (sample_points_cam[..., 1:2] < 1.0)
+                  & (sample_points_cam[..., 0:1] > 0.0)
+                  & (sample_points_cam[..., 0:1] < 1.0)
+                  ).squeeze(-1).float()  # [B, T, N, Q, GP]
+
+    if DUMP.enabled:
+        torch.save(torch.cat([sample_points_cam, homo_nonzero], dim=-1),
+                   '{}/sample_points_cam_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
+        torch.save(valid_mask,
+                   '{}/sample_points_cam_valid_mask_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
+
+    valid_mask = valid_mask.permute(0, 1, 3, 4, 2)  # [B, T, Q, GP, N]
+    sample_points_cam = sample_points_cam.permute(0, 1, 3, 4, 2, 5)  # [B, T, Q, GP, N, 2]
+
+    i_batch = torch.arange(B, dtype=torch.long, device=sample_points.device)
+    i_query = torch.arange(Q, dtype=torch.long, device=sample_points.device)
+    i_time = torch.arange(T, dtype=torch.long, device=sample_points.device)
+    i_point = torch.arange(G * P, dtype=torch.long, device=sample_points.device)
+    i_batch = i_batch.view(B, 1, 1, 1, 1).expand(B, T, Q, G * P, 1)
+    i_time = i_time.view(1, T, 1, 1, 1).expand(B, T, Q, G * P, 1)
+    i_query = i_query.view(1, 1, Q, 1, 1).expand(B, T, Q, G * P, 1)
+    i_point = i_point.view(1, 1, 1, G * P, 1).expand(B, T, Q, G * P, 1)
+    i_view = torch.argmax(valid_mask, dim=-1)[..., None]  # [B, T, Q, GP, 1]
+
+    sample_points_cam = sample_points_cam[i_batch, i_time, i_query, i_point, i_view, :]  # [B, Q, GP, 1, 2]
+    valid_mask = valid_mask[i_batch, i_time, i_query, i_point, i_view]  # [B, Q, GP, 1]
+
+    sample_points_cam = torch.cat([sample_points_cam, i_view[..., None].float() / 5], dim=-1)
+    sample_points_cam = sample_points_cam.reshape(B, T, Q, G, P, 1, 3)
+    sample_points_cam = sample_points_cam.permute(0, 1, 3, 2, 4, 5, 6)  # [B, T, G, Q, P, 1, 3]
+    sample_points_cam = sample_points_cam.reshape(B*T*G, Q, P, 3)
+
+    scale_weights = scale_weights.reshape(B, Q, G, T, P, -1)
+    scale_weights = scale_weights.permute(0, 2, 3, 1, 4, 5)
+    scale_weights = scale_weights.reshape(B*G*T, Q, P, -1)
+
+    final = msmv_sampling(mlvl_feats, sample_points_cam, scale_weights)
+    C = final.shape[2]  # [BTG, Q, C, P]
+    final = final.reshape(B, T, G, Q, C, P)
+    final = final.permute(0, 3, 2, 1, 5, 4)
+    final = final.flatten(3, 4)  # [B, Q, G, FP, C]
+
+    return final
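The core of `sampling_4d` is a pinhole projection: lift each 3D point to homogeneous coordinates, multiply by the 4x4 `lidar2img` matrix, divide by the clamped depth, and keep only points that land in front of the camera and inside the image. A single-point numpy sketch (the helper name `project_points` is hypothetical):

```python
# Numpy sketch of the projection step in sampling_4d.
# project_points is a hypothetical name, not part of the repo.
import numpy as np

def project_points(points_xyz, lidar2img, image_w, image_h, eps=1e-5):
    ones = np.ones_like(points_xyz[..., :1])
    pts = np.concatenate([points_xyz, ones], axis=-1)   # homogeneous coords [P, 4]
    cam = pts @ lidar2img.T                             # apply 4x4 projection
    depth = cam[..., 2:3]
    uv = cam[..., :2] / np.maximum(depth, eps)          # pixel coords, depth clamped
    uv_norm = uv / np.array([image_w, image_h])         # normalize to (0, 1)
    valid = ((depth[..., 0] > eps)                      # in front of the camera
             & (uv_norm[..., 0] > 0) & (uv_norm[..., 0] < 1)
             & (uv_norm[..., 1] > 0) & (uv_norm[..., 1] < 1))
    return uv_norm, valid
```

The depth clamp mirrors `homo_nonzero` above: it avoids division blow-ups for points at or behind the image plane, while the validity mask still rejects them.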
models/sparsebev_transformer.py ADDED
@@ -0,0 +1,370 @@
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import torch.nn.functional as F
5
+ from mmcv.runner import BaseModule
6
+ from mmcv.cnn import bias_init_with_prob
7
+ from mmcv.cnn.bricks.transformer import MultiheadAttention, FFN
8
+ from mmdet.models.utils.builder import TRANSFORMER
9
+ from .bbox.utils import decode_bbox
10
+ from .utils import inverse_sigmoid, DUMP
11
+ from .sparsebev_sampling import sampling_4d, make_sample_points
12
+ from .checkpoint import checkpoint as cp
13
+
14
+
15
+ @TRANSFORMER.register_module()
16
+ class SparseBEVTransformer(BaseModule):
17
+ def __init__(self, embed_dims, num_frames=8, num_points=4, num_layers=6, num_levels=4, num_classes=10, code_size=10, pc_range=[], init_cfg=None):
18
+ assert init_cfg is None, 'To prevent abnormal initialization ' \
19
+ 'behavior, init_cfg is not allowed to be set'
20
+ super(SparseBEVTransformer, self).__init__(init_cfg=init_cfg)
21
+
22
+ self.embed_dims = embed_dims
23
+ self.pc_range = pc_range
24
+
25
+ self.decoder = SparseBEVTransformerDecoder(embed_dims, num_frames, num_points, num_layers, num_levels, num_classes, code_size, pc_range=pc_range)
26
+
27
+ @torch.no_grad()
28
+ def init_weights(self):
29
+ self.decoder.init_weights()
30
+
31
+ def forward(self, query_bbox, query_feat, mlvl_feats, attn_mask, img_metas):
32
+ cls_scores, bbox_preds = self.decoder(query_bbox, query_feat, mlvl_feats, attn_mask, img_metas)
33
+
34
+ cls_scores = torch.nan_to_num(cls_scores)
35
+ bbox_preds = torch.nan_to_num(bbox_preds)
36
+
37
+ return cls_scores, bbox_preds
38
+
39
+
40
+ class SparseBEVTransformerDecoder(BaseModule):
41
+ def __init__(self, embed_dims, num_frames=8, num_points=4, num_layers=6, num_levels=4, num_classes=10, code_size=10, pc_range=[], init_cfg=None):
42
+ super(SparseBEVTransformerDecoder, self).__init__(init_cfg)
43
+ self.num_layers = num_layers
44
+ self.pc_range = pc_range
45
+
46
+ self.decoder_layer = SparseBEVTransformerDecoderLayer(
47
+ embed_dims, num_frames, num_points, num_levels, num_classes, code_size, pc_range=pc_range
48
+ )
49
+
50
+ @torch.no_grad()
51
+ def init_weights(self):
52
+ self.decoder_layer.init_weights()
53
+
54
+ def forward(self, query_bbox, query_feat, mlvl_feats, attn_mask, img_metas):
55
+ cls_scores, bbox_preds = [], []
56
+
57
+ timestamps = np.array([m['img_timestamp'] for m in img_metas], dtype=np.float64)
58
+ timestamps = np.reshape(timestamps, [query_bbox.shape[0], -1, 6])
59
+ time_diff = timestamps[:, :1, :] - timestamps
60
+ time_diff = np.mean(time_diff, axis=-1).astype(np.float32) # [B, F]
61
+ time_diff = torch.from_numpy(time_diff).to(query_bbox.device) # [B, F]
62
+ img_metas[0]['time_diff'] = time_diff
63
+
64
+ lidar2img = np.asarray([m['lidar2img'] for m in img_metas]).astype(np.float32)
65
+ lidar2img = torch.from_numpy(lidar2img).to(query_bbox.device) # [B, N, 4, 4]
66
+ img_metas[0]['lidar2img'] = lidar2img
67
+
68
+ for lvl, feat in enumerate(mlvl_feats):
69
+ B, TN, GC, H, W = feat.shape # [B, TN, GC, H, W]
70
+ N, T, G, C = 6, TN // 6, 4, GC // 4
71
+ feat = feat.reshape(B, T, N, G, C, H, W)
72
+ feat = feat.permute(0, 1, 3, 2, 5, 6, 4) # [B, T, G, N, H, W, C]
73
+ feat = feat.reshape(B*T*G, N, H, W, C) # [BTG, C, N, H, W]
74
+ mlvl_feats[lvl] = feat.contiguous()
75
+
76
+ for i in range(self.num_layers):
77
+ DUMP.stage_count = i
78
+
79
+ query_feat, cls_score, bbox_pred = self.decoder_layer(
80
+ query_bbox, query_feat, mlvl_feats, attn_mask, img_metas
81
+ )
82
+ query_bbox = bbox_pred.clone().detach()
83
+
84
+ cls_scores.append(cls_score)
85
+ bbox_preds.append(bbox_pred)
86
+
87
+ cls_scores = torch.stack(cls_scores)
88
+ bbox_preds = torch.stack(bbox_preds)
89
+
90
+ return cls_scores, bbox_preds
91
+
92
+
93
+ class SparseBEVTransformerDecoderLayer(BaseModule):
94
+ def __init__(self, embed_dims, num_frames=8, num_points=4, num_levels=4, num_classes=10, code_size=10, num_cls_fcs=2, num_reg_fcs=2, pc_range=[], init_cfg=None):
95
+ super(SparseBEVTransformerDecoderLayer, self).__init__(init_cfg)
96
+
97
+ self.embed_dims = embed_dims
98
+ self.num_classes = num_classes
99
+ self.code_size = code_size
100
+ self.pc_range = pc_range
101
+
102
+ self.position_encoder = nn.Sequential(
103
+ nn.Linear(3, self.embed_dims),
104
+ nn.LayerNorm(self.embed_dims),
105
+ nn.ReLU(inplace=True),
106
+ nn.Linear(self.embed_dims, self.embed_dims),
107
+ nn.LayerNorm(self.embed_dims),
108
+ nn.ReLU(inplace=True),
109
+ )
110
+
111
+ self.self_attn = SparseBEVSelfAttention(embed_dims, num_heads=8, dropout=0.1, pc_range=pc_range)
112
+ self.sampling = SparseBEVSampling(embed_dims, num_frames=num_frames, num_groups=4, num_points=num_points, num_levels=num_levels, pc_range=pc_range)
113
+ self.mixing = AdaptiveMixing(in_dim=embed_dims, in_points=num_points * num_frames, n_groups=4, out_points=128)
114
+ self.ffn = FFN(embed_dims, feedforward_channels=512, ffn_drop=0.1)
115
+
116
+ self.norm1 = nn.LayerNorm(embed_dims)
117
+ self.norm2 = nn.LayerNorm(embed_dims)
118
+ self.norm3 = nn.LayerNorm(embed_dims)
119
+
120
+ cls_branch = []
121
+ for _ in range(num_cls_fcs):
122
+ cls_branch.append(nn.Linear(self.embed_dims, self.embed_dims))
123
+ cls_branch.append(nn.LayerNorm(self.embed_dims))
124
+ cls_branch.append(nn.ReLU(inplace=True))
125
+ cls_branch.append(nn.Linear(self.embed_dims, self.num_classes))
126
+ self.cls_branch = nn.Sequential(*cls_branch)
127
+
128
+ reg_branch = []
129
+ for _ in range(num_reg_fcs):
130
+ reg_branch.append(nn.Linear(self.embed_dims, self.embed_dims))
131
+ reg_branch.append(nn.ReLU(inplace=True))
132
+ reg_branch.append(nn.Linear(self.embed_dims, self.code_size))
133
+ self.reg_branch = nn.Sequential(*reg_branch)
134
+
135
+ @torch.no_grad()
136
+ def init_weights(self):
137
+ self.self_attn.init_weights()
138
+ self.sampling.init_weights()
139
+ self.mixing.init_weights()
140
+
141
+ bias_init = bias_init_with_prob(0.01)
142
+ nn.init.constant_(self.cls_branch[-1].bias, bias_init)
143
+
144
+ def refine_bbox(self, bbox_proposal, bbox_delta):
145
+ xyz = inverse_sigmoid(bbox_proposal[..., 0:3])
146
+ xyz_delta = bbox_delta[..., 0:3]
147
+ xyz_new = torch.sigmoid(xyz_delta + xyz)
148
+
149
+ return torch.cat([xyz_new, bbox_delta[..., 3:]], dim=-1)
150
+
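`refine_bbox` above applies the regressed delta in inverse-sigmoid (logit) space, so the refined center always lands back inside the normalized (0, 1) range after the sigmoid. A minimal scalar sketch of that idea (the helper name `refine_center` is hypothetical, not from the repo):

```python
import math

def refine_center(cx, delta, eps=1e-5):
    # Scalar sketch of refine_bbox for one coordinate: move in logit space,
    # then squash back with sigmoid so the result stays inside (0, 1).
    cx = min(max(cx, eps), 1 - eps)
    logit = math.log(cx / (1 - cx))
    return 1 / (1 + math.exp(-(logit + delta)))

print(refine_center(0.5, 0.0))  # zero delta leaves the center at 0.5
print(refine_center(0.9, 5.0))  # a large delta still stays below 1.0
```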
151
+ def forward(self, query_bbox, query_feat, mlvl_feats, attn_mask, img_metas):
152
+ """
153
+ query_bbox: [B, Q, 10] [cx, cy, cz, w, h, d, rot.sin, rot.cos, vx, vy]
154
+ """
155
+ query_pos = self.position_encoder(query_bbox[..., :3])
156
+ query_feat = query_feat + query_pos
157
+
158
+ query_feat = self.norm1(self.self_attn(query_bbox, query_feat, attn_mask))
159
+ sampled_feat = self.sampling(query_bbox, query_feat, mlvl_feats, img_metas)
160
+ query_feat = self.norm2(self.mixing(sampled_feat, query_feat))
161
+ query_feat = self.norm3(self.ffn(query_feat))
162
+
163
+ cls_score = self.cls_branch(query_feat) # [B, Q, num_classes]
164
+ bbox_pred = self.reg_branch(query_feat) # [B, Q, code_size]
165
+ bbox_pred = self.refine_bbox(query_bbox, bbox_pred)
166
+
167
+ time_diff = img_metas[0]['time_diff'] # [B, F]
168
+ if time_diff.shape[1] > 1:
169
+ time_diff = time_diff.clone()
170
+ time_diff[time_diff < 1e-5] = 1.0
171
+ bbox_pred[..., 8:] = bbox_pred[..., 8:] / time_diff[:, 1:2, None]
172
+
173
+ if DUMP.enabled:
174
+ query_bbox_dec = decode_bbox(query_bbox, self.pc_range)
175
+ bbox_pred_dec = decode_bbox(bbox_pred, self.pc_range)
176
+ cls_score_sig = torch.sigmoid(cls_score)
177
+ torch.save(query_bbox_dec, '{}/query_bbox_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
178
+ torch.save(bbox_pred_dec, '{}/bbox_pred_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
179
+ torch.save(cls_score_sig, '{}/cls_score_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
180
+
181
+ return query_feat, cls_score, bbox_pred
182
+
183
+
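The time normalization above converts the regressed displacement over the frame window into a velocity, guarding near-zero time gaps by substituting 1.0. A minimal scalar sketch of that step (the function name is hypothetical):

```python
def displacement_to_velocity(disp, dt, eps=1e-5):
    # Sketch of the time normalization: the head regresses a displacement
    # over the frame window; dividing by a (guarded) nonzero time gap
    # yields a velocity, mirroring time_diff[time_diff < 1e-5] = 1.0.
    dt = 1.0 if dt < eps else dt
    return disp / dt

print(displacement_to_velocity(1.5, 0.5))  # 1.5 m over 0.5 s -> 3.0 m/s
print(displacement_to_velocity(1.5, 0.0))  # dt guarded to 1.0 -> 1.5
```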
184
+ class SparseBEVSelfAttention(BaseModule):
185
+ def __init__(self, embed_dims=256, num_heads=8, dropout=0.1, pc_range=[], init_cfg=None):
186
+ super().__init__(init_cfg)
187
+ self.pc_range = pc_range
188
+
189
+ self.attention = MultiheadAttention(embed_dims, num_heads, dropout, batch_first=True)
190
+ self.gen_tau = nn.Linear(embed_dims, num_heads)
191
+
192
+ @torch.no_grad()
193
+ def init_weights(self):
194
+ nn.init.zeros_(self.gen_tau.weight)
195
+ nn.init.uniform_(self.gen_tau.bias, 0.0, 2.0)
196
+
197
+ def inner_forward(self, query_bbox, query_feat, pre_attn_mask):
198
+ """
199
+ query_bbox: [B, Q, 10]
200
+ query_feat: [B, Q, C]
201
+ """
202
+ dist = self.calc_bbox_dists(query_bbox)
203
+ tau = self.gen_tau(query_feat) # [B, Q, 8]
204
+
205
+ if DUMP.enabled:
206
+ torch.save(tau, '{}/sasa_tau_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count))
207
+
208
+ tau = tau.permute(0, 2, 1) # [B, 8, Q]
209
+ attn_mask = dist[:, None, :, :] * tau[..., None] # [B, 8, Q, Q]
210
+ if pre_attn_mask is not None:
211
+ attn_mask[:, :, pre_attn_mask] = float('-inf')
212
+ attn_mask = attn_mask.flatten(0, 1) # [Bx8, Q, Q]
213
+ return self.attention(query_feat, attn_mask=attn_mask)
214
+
215
+ def forward(self, query_bbox, query_feat, pre_attn_mask):
216
+ if self.training and query_feat.requires_grad:
217
+ return cp(self.inner_forward, query_bbox, query_feat, pre_attn_mask, use_reentrant=False)
218
+ else:
219
+ return self.inner_forward(query_bbox, query_feat, pre_attn_mask)
220
+
221
+ @torch.no_grad()
222
+ def calc_bbox_dists(self, bboxes):
223
+ centers = decode_bbox(bboxes, self.pc_range)[..., :2] # [B, Q, 2]
224
+
225
+ dist = []
226
+ for b in range(centers.shape[0]):
227
+ dist_b = torch.norm(centers[b].reshape(-1, 1, 2) - centers[b].reshape(1, -1, 2), dim=-1)
228
+ dist.append(dist_b[None, ...])
229
+
230
+ dist = torch.cat(dist, dim=0) # [B, Q, Q]
231
+ dist = -dist
232
+
233
+ return dist
234
+
235
+
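`calc_bbox_dists` builds the scale-adaptive self-attention bias: negative pairwise Euclidean distances between BEV centers, later multiplied per-head by the learned `tau`. A minimal pure-Python sketch of the distance matrix (the helper name is hypothetical):

```python
import math

def neg_pairwise_dists(centers):
    # Sketch of calc_bbox_dists: negative pairwise Euclidean distances
    # between BEV centers; nearer queries get a less negative bias.
    return [[-math.dist(a, b) for b in centers] for a in centers]

centers = [(0.0, 0.0), (3.0, 4.0)]
bias = neg_pairwise_dists(centers)
# diagonal is 0 (a query attends freely to itself), off-diagonal is -5
```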
236
+ class SparseBEVSampling(BaseModule):
237
+ def __init__(self, embed_dims=256, num_frames=4, num_groups=4, num_points=8, num_levels=4, pc_range=[], init_cfg=None):
238
+ super().__init__(init_cfg)
239
+
240
+ self.num_frames = num_frames
241
+ self.num_points = num_points
242
+ self.num_groups = num_groups
243
+ self.num_levels = num_levels
244
+ self.pc_range = pc_range
245
+
246
+ self.sampling_offset = nn.Linear(embed_dims, num_groups * num_points * 3)
247
+ self.scale_weights = nn.Linear(embed_dims, num_groups * num_points * num_levels)
248
+
249
+ def init_weights(self):
250
+ bias = self.sampling_offset.bias.data.view(self.num_groups * self.num_points, 3)
251
+ nn.init.zeros_(self.sampling_offset.weight)
252
+ nn.init.uniform_(bias[:, 0:3], -0.5, 0.5)
253
+
254
+ def inner_forward(self, query_bbox, query_feat, mlvl_feats, img_metas):
255
+ '''
256
+ query_bbox: [B, Q, 10]
257
+ query_feat: [B, Q, C]
258
+ '''
259
+ B, Q = query_bbox.shape[:2]
260
+ image_h, image_w, _ = img_metas[0]['img_shape'][0]
261
+
262
+ # sampling offset of all frames
263
+ sampling_offset = self.sampling_offset(query_feat)
264
+ sampling_offset = sampling_offset.view(B, Q, self.num_groups * self.num_points, 3)
265
+ sampling_points = make_sample_points(query_bbox, sampling_offset, self.pc_range) # [B, Q, GP, 3]
266
+ sampling_points = sampling_points.reshape(B, Q, 1, self.num_groups, self.num_points, 3)
267
+ sampling_points = sampling_points.expand(B, Q, self.num_frames, self.num_groups, self.num_points, 3)
268
+
269
+ # warp sample points based on velocity
270
+ time_diff = img_metas[0]['time_diff'] # [B, F]
271
+ time_diff = time_diff[:, None, :, None] # [B, 1, F, 1]
272
+ vel = query_bbox[..., 8:].detach() # [B, Q, 2]
273
+ vel = vel[:, :, None, :] # [B, Q, 1, 2]
274
+ dist = vel * time_diff # [B, Q, F, 2]
275
+ dist = dist[:, :, :, None, None, :] # [B, Q, F, 1, 1, 2]
276
+ sampling_points = torch.cat([
277
+ sampling_points[..., 0:2] - dist,
278
+ sampling_points[..., 2:3]
279
+ ], dim=-1)
280
+
281
+ # scale weights
282
+ scale_weights = self.scale_weights(query_feat).view(B, Q, self.num_groups, 1, self.num_points, self.num_levels)
283
+ scale_weights = torch.softmax(scale_weights, dim=-1)
284
+ scale_weights = scale_weights.expand(B, Q, self.num_groups, self.num_frames, self.num_points, self.num_levels)
285
+
286
+ # sampling
287
+ sampled_feats = sampling_4d(
288
+ sampling_points,
289
+ mlvl_feats,
290
+ scale_weights,
291
+ img_metas[0]['lidar2img'],
292
+ image_h, image_w
293
+ ) # [B, Q, G, FP, C]
294
+
295
+ return sampled_feats
296
+
297
+ def forward(self, query_bbox, query_feat, mlvl_feats, img_metas):
298
+ if self.training and query_feat.requires_grad:
299
+ return cp(self.inner_forward, query_bbox, query_feat, mlvl_feats, img_metas, use_reentrant=False)
300
+ else:
301
+ return self.inner_forward(query_bbox, query_feat, mlvl_feats, img_metas)
302
+
303
+
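The velocity warp in `SparseBEVSampling.inner_forward` shifts each sampling point backwards by the object's displacement `v * dt`, so the same object is sampled at its position in earlier frames. A minimal scalar sketch (the function name is hypothetical):

```python
def warp_point(x, y, vx, vy, dt):
    # Sketch of the velocity warp: a sampling point at (x, y) is moved back
    # by the object's displacement v * dt to track it into an earlier frame.
    return x - vx * dt, y - vy * dt

print(warp_point(10.0, 5.0, 2.0, 0.0, 0.5))  # (9.0, 5.0)
```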
304
+ class AdaptiveMixing(nn.Module):
305
+ def __init__(self, in_dim, in_points, n_groups=1, query_dim=None, out_dim=None, out_points=None):
306
+ super(AdaptiveMixing, self).__init__()
307
+
308
+ out_dim = out_dim if out_dim is not None else in_dim
309
+ out_points = out_points if out_points is not None else in_points
310
+ query_dim = query_dim if query_dim is not None else in_dim
311
+
312
+ self.query_dim = query_dim
313
+ self.in_dim = in_dim
314
+ self.in_points = in_points
315
+ self.n_groups = n_groups
316
+ self.out_dim = out_dim
317
+ self.out_points = out_points
318
+
319
+ self.eff_in_dim = in_dim // n_groups
320
+ self.eff_out_dim = out_dim // n_groups
321
+
322
+ self.m_parameters = self.eff_in_dim * self.eff_out_dim
323
+ self.s_parameters = self.in_points * self.out_points
324
+ self.total_parameters = self.m_parameters + self.s_parameters
325
+
326
+ self.parameter_generator = nn.Linear(self.query_dim, self.n_groups * self.total_parameters)
327
+ self.out_proj = nn.Linear(self.eff_out_dim * self.out_points * self.n_groups, self.query_dim)
328
+ self.act = nn.ReLU(inplace=True)
329
+
330
+ @torch.no_grad()
331
+ def init_weights(self):
332
+ nn.init.zeros_(self.parameter_generator.weight)
333
+
334
+ def inner_forward(self, x, query):
335
+ B, Q, G, P, C = x.shape
336
+ assert G == self.n_groups
337
+ assert P == self.in_points
338
+ assert C == self.eff_in_dim
339
+
340
+ '''generate mixing parameters'''
341
+ params = self.parameter_generator(query)
342
+ params = params.reshape(B*Q, G, -1)
343
+ out = x.reshape(B*Q, G, P, C)
344
+
345
+ M, S = params.split([self.m_parameters, self.s_parameters], 2)
346
+ M = M.reshape(B*Q, G, self.eff_in_dim, self.eff_out_dim)
347
+ S = S.reshape(B*Q, G, self.out_points, self.in_points)
348
+
349
+ '''adaptive channel mixing'''
350
+ out = torch.matmul(out, M)
351
+ out = F.layer_norm(out, [out.size(-2), out.size(-1)])
352
+ out = self.act(out)
353
+
354
+ '''adaptive point mixing'''
355
+ out = torch.matmul(S, out) # implicitly transpose and matmul
356
+ out = F.layer_norm(out, [out.size(-2), out.size(-1)])
357
+ out = self.act(out)
358
+
359
+ '''linear transformation to query dim'''
360
+ out = out.reshape(B, Q, -1)
361
+ out = self.out_proj(out)
362
+ out = query + out
363
+
364
+ return out
365
+
366
+ def forward(self, x, query):
367
+ if self.training and x.requires_grad:
368
+ return cp(self.inner_forward, x, query, use_reentrant=False)
369
+ else:
370
+ return self.inner_forward(x, query)
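`AdaptiveMixing` generates its per-query mixing weights dynamically: one channel-mixing matrix (`eff_in_dim x eff_out_dim`) and one point-mixing matrix (`in_points x out_points`) per group. A minimal sketch of the parameter-count bookkeeping, matching `parameter_generator`'s output width (the helper name is hypothetical):

```python
def mixing_param_count(in_dim, in_points, n_groups, out_dim=None, out_points=None):
    # Sketch of AdaptiveMixing's bookkeeping: per-group channel-mixing
    # matrix (eff_in_dim x eff_out_dim) plus point-mixing matrix
    # (in_points x out_points), summed over all groups.
    out_dim = out_dim or in_dim
    out_points = out_points or in_points
    eff_in, eff_out = in_dim // n_groups, out_dim // n_groups
    m = eff_in * eff_out
    s = in_points * out_points
    return n_groups * (m + s)

# Config used in the decoder layer above: embed_dims=256, 4 groups,
# in_points = num_points * num_frames = 4 * 8 = 32, out_points = 128.
print(mixing_param_count(256, 32, 4, out_points=128))  # 32768
```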
models/utils.py ADDED
@@ -0,0 +1,309 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from numpy import random
6
+
7
+
8
+ class GridMask(nn.Module):
9
+ def __init__(self, ratio=0.5, prob=0.7):
10
+ super(GridMask, self).__init__()
11
+ self.ratio = ratio
12
+ self.prob = prob
13
+
14
+ def forward(self, x):
15
+ if np.random.rand() > self.prob or not self.training:
16
+ return x
17
+
18
+ n, c, h, w = x.size()
19
+ x = x.view(-1, h, w)
20
+ hh = int(1.5 * h)
21
+ ww = int(1.5 * w)
22
+
23
+ d = np.random.randint(2, h)
24
+ l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
25
+ mask = np.ones((hh, ww), np.uint8)
26
+ st_h = np.random.randint(d)
27
+ st_w = np.random.randint(d)
28
+
29
+ for i in range(hh // d):
30
+ s = d*i + st_h
31
+ t = min(s + l, hh)
32
+ mask[s:t, :] = 0
33
+
34
+ for i in range(ww // d):
35
+ s = d*i + st_w
36
+ t = min(s + l, ww)
37
+ mask[:, s:t] = 0
38
+
39
+ mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w]
40
+ mask = torch.tensor(mask, dtype=x.dtype, device=x.device)
41
+ mask = 1 - mask
42
+ mask = mask.expand_as(x)
43
+ x = x * mask
44
+
45
+ return x.view(n, c, h, w)
46
+
47
+
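`GridMask` zeroes out a fraction `ratio` of each grid period `d`, clamping the stripe width so at least one pixel is kept and at least one is dropped. A minimal sketch of that width computation (the helper name is hypothetical):

```python
def grid_stripe(d, ratio):
    # Sketch of GridMask's stripe width: round ratio * d to the nearest
    # integer, then clamp to [1, d - 1] so the mask is never all-keep
    # or all-drop within one period.
    return min(max(int(d * ratio + 0.5), 1), d - 1)

print(grid_stripe(10, 0.5))  # 5 of every 10 pixels are masked
print(grid_stripe(2, 0.9))   # clamped to d - 1 = 1
```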
48
+ def rotation_3d_in_axis(points, angles):
49
+ assert points.shape[-1] == 3
50
+ assert angles.shape[-1] == 1
51
+ angles = angles[..., 0]
52
+
53
+ n_points = points.shape[-2]
54
+ input_dims = angles.shape
55
+
56
+ if len(input_dims) > 1:
57
+ points = points.reshape(-1, n_points, 3)
58
+ angles = angles.reshape(-1)
59
+
60
+ rot_sin = torch.sin(angles)
61
+ rot_cos = torch.cos(angles)
62
+ ones = torch.ones_like(rot_cos)
63
+ zeros = torch.zeros_like(rot_cos)
64
+
65
+ rot_mat_T = torch.stack([
66
+ rot_cos, rot_sin, zeros,
67
+ -rot_sin, rot_cos, zeros,
68
+ zeros, zeros, ones,
69
+ ]).transpose(0, 1).reshape(-1, 3, 3)
70
+
71
+ points = torch.bmm(points, rot_mat_T)
72
+
73
+ if len(input_dims) > 1:
74
+ points = points.reshape(*input_dims, n_points, 3)
75
+
76
+ return points
77
+
78
+
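`rotation_3d_in_axis` rotates points about the z axis via `points @ rot_mat_T`, with `rot_mat_T` rows `[cos, sin, 0]`, `[-sin, cos, 0]`, `[0, 0, 1]` — i.e. a standard counter-clockwise yaw rotation. A minimal scalar sketch of the xy part (the function name is hypothetical):

```python
import math

def rotate_xy(x, y, angle):
    # Scalar sketch of the z-axis rotation: row vector [x, y] times
    # rot_mat_T gives a counter-clockwise rotation by `angle`.
    return (x * math.cos(angle) - y * math.sin(angle),
            x * math.sin(angle) + y * math.cos(angle))

print(rotate_xy(1.0, 0.0, math.pi / 2))  # ~(0, 1)
```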
79
+ def inverse_sigmoid(x, eps=1e-5):
80
+ """Inverse of the sigmoid function.
+ Args:
+ x (Tensor): Input tensor with values in [0, 1].
+ eps (float): Epsilon to avoid numerical overflow. Defaults to 1e-5.
+ Returns:
+ Tensor: The inverse sigmoid of x, with the same shape as the input.
+ """
91
+ x = x.clamp(min=0, max=1)
92
+ x1 = x.clamp(min=eps)
93
+ x2 = (1 - x).clamp(min=eps)
94
+ return torch.log(x1 / x2)
95
+
96
+
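A minimal scalar sketch of the clamped logit above, checking that it round-trips through the sigmoid away from the clamped endpoints:

```python
import math

def inverse_sigmoid(x, eps=1e-5):
    # Scalar sketch of the clamped logit: clamp into [0, 1], floor both
    # numerator and denominator at eps, then take log(x / (1 - x)).
    x = min(max(x, 0.0), 1.0)
    x1 = max(x, eps)
    x2 = max(1 - x, eps)
    return math.log(x1 / x2)

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

# Round trip: sigmoid(inverse_sigmoid(p)) ~ p away from the endpoints.
print(sigmoid(inverse_sigmoid(0.3)))
```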
97
+ def pad_multiple(inputs, img_metas, size_divisor=32):
98
+ _, _, img_h, img_w = inputs.shape
99
+
100
+ pad_h = 0 if img_h % size_divisor == 0 else size_divisor - (img_h % size_divisor)
101
+ pad_w = 0 if img_w % size_divisor == 0 else size_divisor - (img_w % size_divisor)
102
+
103
+ B = len(img_metas)
104
+ N = len(img_metas[0]['ori_shape'])
105
+
106
+ for b in range(B):
107
+ img_metas[b]['img_shape'] = [(img_h + pad_h, img_w + pad_w, 3) for _ in range(N)]
108
+ img_metas[b]['pad_shape'] = [(img_h + pad_h, img_w + pad_w, 3) for _ in range(N)]
109
+
110
+ if pad_h == 0 and pad_w == 0:
111
+ return inputs
112
+ else:
113
+ return F.pad(inputs, [0, pad_w, 0, pad_h], value=0)
114
+
115
+
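`pad_multiple` pads the image so both spatial sizes become multiples of `size_divisor` (needed for the FPN strides). A minimal sketch of the padding arithmetic (the helper name is hypothetical):

```python
def pad_amount(size, divisor=32):
    # Sketch of pad_multiple's arithmetic: how much to pad so `size`
    # becomes a multiple of `divisor` (0 if it already is).
    return 0 if size % divisor == 0 else divisor - (size % divisor)

print(pad_amount(450))  # 450 -> 480, pad 30
print(pad_amount(256))  # already a multiple of 32, pad 0
```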
116
+ def rgb_to_hsv(image: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
117
+ r"""Convert an image from RGB to HSV.
118
+
+ The image data is assumed to be in the range (0, 255).
+
+ Args:
+ image: RGB image to be converted to HSV with shape of :math:`(*, 3, H, W)`.
+ eps: scalar to enforce numerical stability.
+
+ Returns:
+ HSV version of the image with shape of :math:`(*, 3, H, W)`.
+ The H channel is in the range 0..360, S in 0..1, and V in 0..255.
+
+ .. note::
+ See a working example `here <https://kornia-tutorials.readthedocs.io/en/latest/
+ color_conversions.html>`__.
+
+ Example:
+ >>> input = torch.rand(2, 3, 4, 5) * 255
+ >>> output = rgb_to_hsv(input) # 2x3x4x5
138
+ """
139
+ if not isinstance(image, torch.Tensor):
140
+ raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
141
+
142
+ if len(image.shape) < 3 or image.shape[-3] != 3:
143
+ raise ValueError(f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
144
+
145
+ image = image / 255.0
146
+
147
+ max_rgb, argmax_rgb = image.max(-3)
148
+ min_rgb, argmin_rgb = image.min(-3)
149
+ deltac = max_rgb - min_rgb
150
+
151
+ v = max_rgb
152
+ s = deltac / (max_rgb + eps)
153
+
154
+ deltac = torch.where(deltac == 0, torch.ones_like(deltac), deltac)
155
+ rc, gc, bc = torch.unbind((max_rgb.unsqueeze(-3) - image), dim=-3)
156
+
157
+ h1 = bc - gc
158
+ h2 = (rc - bc) + 2.0 * deltac
159
+ h3 = (gc - rc) + 4.0 * deltac
160
+
161
+ h = torch.stack((h1, h2, h3), dim=-3) / deltac.unsqueeze(-3)
162
+ h = torch.gather(h, dim=-3, index=argmax_rgb.unsqueeze(-3)).squeeze(-3)
163
+ h = (h / 6.0) % 1.0
164
+
165
+ h = h * 360.0
166
+ v = v * 255.0
167
+
168
+ return torch.stack((h, s, v), dim=-3)
169
+
170
+
171
+ def hsv_to_rgb(image: torch.Tensor) -> torch.Tensor:
172
+ r"""Convert an image from HSV to RGB.
173
+
174
+ The H channel is assumed to be in the range 0..360, S in 0..1, and V in 0..255.
+
+ Args:
+ image: HSV image to be converted to RGB with shape of :math:`(*, 3, H, W)`.
+
+ Returns:
+ RGB version of the image with shape of :math:`(*, 3, H, W)`.
+
+ Example:
+ >>> hsv = rgb_to_hsv(torch.rand(2, 3, 4, 5) * 255)
+ >>> output = hsv_to_rgb(hsv) # 2x3x4x5
185
+ """
186
+ if not isinstance(image, torch.Tensor):
187
+ raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")
188
+
189
+ if len(image.shape) < 3 or image.shape[-3] != 3:
190
+ raise ValueError(f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")
191
+
192
+ h: torch.Tensor = image[..., 0, :, :] / 360.0
193
+ s: torch.Tensor = image[..., 1, :, :]
194
+ v: torch.Tensor = image[..., 2, :, :] / 255.0
195
+
196
+ hi: torch.Tensor = torch.floor(h * 6) % 6
197
+ f: torch.Tensor = ((h * 6) % 6) - hi
198
+ one: torch.Tensor = torch.tensor(1.0, device=image.device, dtype=image.dtype)
199
+ p: torch.Tensor = v * (one - s)
200
+ q: torch.Tensor = v * (one - f * s)
201
+ t: torch.Tensor = v * (one - (one - f) * s)
202
+
203
+ hi = hi.long()
204
+ indices: torch.Tensor = torch.stack([hi, hi + 6, hi + 12], dim=-3)
205
+ out = torch.stack((v, q, p, p, t, v, t, v, v, q, p, p, p, p, t, v, v, q), dim=-3)
206
+ out = torch.gather(out, -3, indices)
207
+ out = out * 255.0
208
+
209
+ return out
210
+
211
+
212
+ class GpuPhotoMetricDistortion:
213
+ """Apply photometric distortion to image sequentially, every transformation
214
+ is applied with a probability of 0.5. The position of random contrast is in
215
+ second or second to last.
216
+ 1. random brightness
217
+ 2. random contrast (mode 0)
218
+ 3. convert color from BGR to HSV
219
+ 4. random saturation
220
+ 5. random hue
221
+ 6. convert color from HSV to BGR
222
+ 7. random contrast (mode 1)
223
+ 8. randomly swap channels
224
+ Args:
225
+ brightness_delta (int): delta of brightness.
226
+ contrast_range (tuple): range of contrast.
227
+ saturation_range (tuple): range of saturation.
228
+ hue_delta (int): delta of hue.
229
+ """
230
+
231
+ def __init__(self,
232
+ brightness_delta=32,
233
+ contrast_range=(0.5, 1.5),
234
+ saturation_range=(0.5, 1.5),
235
+ hue_delta=18):
236
+ self.brightness_delta = brightness_delta
237
+ self.contrast_lower, self.contrast_upper = contrast_range
238
+ self.saturation_lower, self.saturation_upper = saturation_range
239
+ self.hue_delta = hue_delta
240
+
241
+ def __call__(self, imgs):
242
+ """Call function to perform photometric distortion on images.
243
+ Args:
244
+ imgs (Tensor): Batch of images with shape [N, 3, H, W] in BGR order.
+ Returns:
+ Tensor: Distorted images with the same shape and channel order.
247
+ """
248
+ imgs = imgs[:, [2, 1, 0], :, :] # BGR to RGB
249
+
250
+ contrast_modes = []
251
+ for _ in range(imgs.shape[0]):
252
+ # mode == 0 --> do random contrast first
253
+ # mode == 1 --> do random contrast last
254
+ contrast_modes.append(random.randint(2))
255
+
256
+ for idx in range(imgs.shape[0]):
257
+ # random brightness
258
+ if random.randint(2):
259
+ delta = random.uniform(-self.brightness_delta, self.brightness_delta)
260
+ imgs[idx] += delta
261
+
262
+ if contrast_modes[idx] == 0:
263
+ if random.randint(2):
264
+ alpha = random.uniform(self.contrast_lower, self.contrast_upper)
265
+ imgs[idx] *= alpha
266
+
267
+ # convert color from RGB to HSV (channels were swapped to RGB above)
268
+ imgs = rgb_to_hsv(imgs)
269
+
270
+ for idx in range(imgs.shape[0]):
271
+ # random saturation
272
+ if random.randint(2):
273
+ imgs[idx, 1] *= random.uniform(self.saturation_lower, self.saturation_upper)
274
+
275
+ # random hue
276
+ if random.randint(2):
277
+ imgs[idx, 0] += random.uniform(-self.hue_delta, self.hue_delta)
278
+
279
+ imgs[:, 0][imgs[:, 0] > 360] -= 360
280
+ imgs[:, 0][imgs[:, 0] < 0] += 360
281
+
282
+ # convert color from HSV to RGB
283
+ imgs = hsv_to_rgb(imgs)
284
+
285
+ for idx in range(imgs.shape[0]):
286
+ # random contrast
287
+ if contrast_modes[idx] == 1:
288
+ if random.randint(2):
289
+ alpha = random.uniform(self.contrast_lower, self.contrast_upper)
290
+ imgs[idx] *= alpha
291
+
292
+ # randomly swap channels
293
+ if random.randint(2):
294
+ imgs[idx] = imgs[idx, random.permutation(3)]
295
+
296
+ imgs = imgs[:, [2, 1, 0], :, :] # RGB to BGR
297
+
298
+ return imgs
299
+
300
+
301
+ class DumpConfig:
302
+ def __init__(self):
303
+ self.enabled = False
304
+ self.out_dir = 'outputs'
305
+ self.stage_count = 0
306
+ self.frame_count = 0
307
+
308
+
309
+ DUMP = DumpConfig()
timing.py ADDED
@@ -0,0 +1,100 @@
1
+ import time
2
+ import utils
3
+ import logging
4
+ import argparse
5
+ import importlib
6
+ import torch
7
+ import torch.distributed
8
+ import torch.backends.cudnn as cudnn
9
+ from mmcv import Config, DictAction
10
+ from mmcv.parallel import MMDataParallel
11
+ from mmcv.runner import load_checkpoint
12
+ from mmdet.apis import set_random_seed
13
+ from mmdet3d.datasets import build_dataset, build_dataloader
14
+ from mmdet3d.models import build_model
15
+
16
+
17
+ def main():
18
+ parser = argparse.ArgumentParser(description='Validate a detector')
19
+ parser.add_argument('--config', required=True)
20
+ parser.add_argument('--weights', required=True)
21
+ parser.add_argument('--num_warmup', default=10)
22
+ parser.add_argument('--samples', default=500)
23
+ parser.add_argument('--log-interval', default=50, help='interval of logging')
24
+ parser.add_argument('--override', nargs='+', action=DictAction)
25
+ args = parser.parse_args()
26
+
27
+ # parse configs
28
+ cfgs = Config.fromfile(args.config)
29
+ if args.override is not None:
30
+ cfgs.merge_from_dict(args.override)
31
+
32
+ # register custom module
33
+ importlib.import_module('models')
34
+ importlib.import_module('loaders')
35
+
36
+ # MMCV, please shut up
37
+ from mmcv.utils.logging import logger_initialized
38
+ logger_initialized['root'] = logging.Logger(__name__, logging.WARNING)
39
+ logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING)
40
+ utils.init_logging(None, cfgs.debug)
41
+
42
+ # you need GPUs
43
+ assert torch.cuda.is_available() and torch.cuda.device_count() == 1
44
+ logging.info('Using GPU: %s' % torch.cuda.get_device_name(0))
45
+ torch.cuda.set_device(0)
46
+
47
+ logging.info('Setting random seed: 0')
48
+ set_random_seed(0, deterministic=True)
49
+ cudnn.benchmark = True
50
+
51
+ logging.info('Loading validation set from %s' % cfgs.data.val.data_root)
52
+ val_dataset = build_dataset(cfgs.data.val)
53
+ val_loader = build_dataloader(
54
+ val_dataset,
55
+ samples_per_gpu=1,
56
+ workers_per_gpu=cfgs.data.workers_per_gpu,
57
+ num_gpus=1,
58
+ dist=False,
59
+ shuffle=False,
60
+ seed=0,
61
+ )
62
+
63
+ logging.info('Creating model: %s' % cfgs.model.type)
64
+ model = build_model(cfgs.model)
65
+ model.cuda()
66
+
67
+ assert torch.cuda.device_count() == 1
68
+ model = MMDataParallel(model, [0])
69
+
70
+ logging.info('Loading checkpoint from %s' % args.weights)
71
+ load_checkpoint(
72
+ model, args.weights, map_location='cuda', strict=False,
73
+ logger=logging.Logger(__name__, logging.ERROR)
74
+ )
75
+ model.eval()
76
+
77
+ pure_inf_time = 0
78
+ with torch.no_grad():
79
+ for i, data in enumerate(val_loader):
80
+ torch.cuda.synchronize()
81
+ start_time = time.perf_counter()
82
+
83
+ model(return_loss=False, rescale=True, **data)
84
+
85
+ torch.cuda.synchronize()
86
+ elapsed = time.perf_counter() - start_time
87
+
88
+ if i >= args.num_warmup:
89
+ pure_inf_time += elapsed
90
+ if (i + 1) % args.log_interval == 0:
91
+ fps = (i + 1 - args.num_warmup) / pure_inf_time
92
+ print(f'Done sample [{i + 1:<3}/ {args.samples}], '
93
+ f'fps: {fps:.1f} sample / s')
94
+
95
+ if (i + 1) == args.samples:
96
+ break
97
+
98
+
99
+ if __name__ == '__main__':
100
+ main()
train.py ADDED
@@ -0,0 +1,180 @@
1
+ import os
2
+ import utils
3
+ import shutil
4
+ import logging
5
+ import argparse
6
+ import importlib
7
+ import torch
8
+ import torch.distributed as dist
9
+ from datetime import datetime
10
+ from mmcv import Config, DictAction
11
+ from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
12
+ from mmcv.runner import EpochBasedRunner, build_optimizer, load_checkpoint
13
+ from mmdet.apis import set_random_seed
14
+ from mmdet.core import DistEvalHook, EvalHook
15
+ from mmdet3d.datasets import build_dataset
16
+ from mmdet3d.models import build_model
17
+ from loaders.builder import build_dataloader
18
+
19
+
20
+ def main():
21
+ parser = argparse.ArgumentParser(description='Train a detector')
22
+ parser.add_argument('--config', required=True)
23
+ parser.add_argument('--override', nargs='+', action=DictAction)
24
+ parser.add_argument('--local_rank', type=int, default=0)
25
+ parser.add_argument('--world_size', type=int, default=1)
26
+ args = parser.parse_args()
27
+
28
+ # parse configs
29
+ cfgs = Config.fromfile(args.config)
30
+ if args.override is not None:
31
+ cfgs.merge_from_dict(args.override)
32
+
33
+ # register custom module
34
+ importlib.import_module('models')
35
+ importlib.import_module('loaders')
36
+
37
+ # MMCV, please shut up
38
+ from mmcv.utils.logging import logger_initialized
39
+ logger_initialized['root'] = logging.Logger(__name__, logging.WARNING)
40
+ logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING)
41
+ logger_initialized['mmdet3d'] = logging.Logger(__name__, logging.WARNING)
42
+
43
+ # you need GPUs
44
+ assert torch.cuda.is_available()
45
+
46
+ # determine local_rank and world_size
47
+ if 'LOCAL_RANK' not in os.environ:
48
+ os.environ['LOCAL_RANK'] = str(args.local_rank)
49
+
50
+ if 'WORLD_SIZE' not in os.environ:
51
+ os.environ['WORLD_SIZE'] = str(args.world_size)
52
+
53
+ local_rank = int(os.environ['LOCAL_RANK'])
54
+ world_size = int(os.environ['WORLD_SIZE'])
55
+
56
+ if local_rank == 0:
57
+ # resume or start a new run
58
+ if cfgs.resume_from is not None:
59
+ assert os.path.isfile(cfgs.resume_from)
60
+ work_dir = os.path.dirname(cfgs.resume_from)
61
+ else:
62
+ run_name = ''
63
+ if not cfgs.debug:
64
+ run_name = input('Name your run (leave blank for default): ')
65
+ if run_name == '':
66
+ run_name = datetime.now().strftime("%Y-%m-%d/%H-%M-%S")
67
+
68
+ work_dir = os.path.join('outputs', cfgs.model.type, run_name)
69
+ if os.path.exists(work_dir): # must be an empty dir
70
+ if input('Path "%s" already exists, overwrite it? [Y/n] ' % work_dir) == 'n':
71
+ print('Bye.')
72
+ exit(0)
73
+ shutil.rmtree(work_dir)
74
+
75
+ os.makedirs(work_dir, exist_ok=False)
76
+
77
+ # init logging, backup code
78
+ utils.init_logging(os.path.join(work_dir, 'train.log'), cfgs.debug)
79
+ utils.backup_code(work_dir)
80
+ logging.info('Logs will be saved to %s' % work_dir)
81
+
82
+ else:
83
+ # disable logging on other workers
84
+ logging.root.disabled = True
85
+ work_dir = '/tmp'
86
+
87
+ logging.info('Using GPU: %s' % torch.cuda.get_device_name(local_rank))
88
+ torch.cuda.set_device(local_rank)
89
+
90
+ if world_size > 1:
91
+ logging.info('Initializing DDP with %d GPUs...' % world_size)
92
+ dist.init_process_group('nccl', init_method='env://')
93
+
94
+ logging.info('Setting random seed: 0')
95
+ set_random_seed(0, deterministic=True)
96
+
97
+ logging.info('Loading training set from %s' % cfgs.dataset_root)
98
+ train_dataset = build_dataset(cfgs.data.train)
99
+ train_loader = build_dataloader(
100
+ train_dataset,
101
+ samples_per_gpu=cfgs.batch_size // world_size,
102
+ workers_per_gpu=cfgs.data.workers_per_gpu,
103
+ num_gpus=world_size,
104
+ dist=world_size > 1,
105
+ shuffle=True,
106
+ seed=0,
107
+ )
108
+
109
+ logging.info('Loading validation set from %s' % cfgs.dataset_root)
110
+ val_dataset = build_dataset(cfgs.data.val)
111
+ val_loader = build_dataloader(
112
+ val_dataset,
113
+ samples_per_gpu=1,
114
+ workers_per_gpu=cfgs.data.workers_per_gpu,
115
+ num_gpus=world_size,
116
+ dist=world_size > 1,
117
+ shuffle=False
118
+ )
119
+
120
+ logging.info('Creating model: %s' % cfgs.model.type)
121
+ model = build_model(cfgs.model)
122
+ model.init_weights()
123
+ model.cuda()
124
+ model.train()
125
+
126
+ n_params = sum([p.numel() for p in model.parameters() if p.requires_grad])
127
+ logging.info('Trainable parameters: %d (%.1fM)' % (n_params, n_params / 1e6))
128
+ logging.info('Batch size per GPU: %d' % (cfgs.batch_size // world_size))
129
+
130
+ if world_size > 1:
131
+ model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False)
132
+ else:
133
+ model = MMDataParallel(model, [0])
134
+
135
+ logging.info('Creating optimizer: %s' % cfgs.optimizer.type)
136
+ optimizer = build_optimizer(model, cfgs.optimizer)
137
+
138
+ runner = EpochBasedRunner(
139
+ model,
140
+ optimizer=optimizer,
141
+ work_dir=work_dir,
142
+ logger=logging.root,
143
+ max_epochs=cfgs.total_epochs,
144
+ meta=dict(),
145
+ )
146
+
147
+ runner.register_lr_hook(cfgs.lr_config)
148
+ runner.register_optimizer_hook(cfgs.optimizer_config)
149
+ runner.register_checkpoint_hook(cfgs.checkpoint_config)
150
+ runner.register_logger_hooks(cfgs.log_config)
151
+ runner.register_timer_hook(dict(type='IterTimerHook'))
152
+ runner.register_custom_hooks(dict(type='DistSamplerSeedHook'))
153
+
154
+ if cfgs.eval_config['interval'] > 0:
155
+ if world_size > 1:
156
+ runner.register_hook(DistEvalHook(val_loader, interval=cfgs.eval_config['interval'], gpu_collect=True))
157
+ else:
158
+ runner.register_hook(EvalHook(val_loader, interval=cfgs.eval_config['interval']))
159
+
160
+ if cfgs.resume_from is not None:
161
+ logging.info('Resuming from %s' % cfgs.resume_from)
162
+ runner.resume(cfgs.resume_from)
163
+
164
+ elif cfgs.load_from is not None:
165
+ logging.info('Loading checkpoint from %s' % cfgs.load_from)
166
+ if cfgs.revise_keys is not None:
167
+ load_checkpoint(
168
+ model, cfgs.load_from, map_location='cpu',
169
+ revise_keys=cfgs.revise_keys
170
+ )
171
+ else:
172
+ load_checkpoint(
173
+ model, cfgs.load_from, map_location='cpu',
174
+ )
175
+
176
+ runner.run([train_loader], [('train', 1)])
177
+
178
+
179
+ if __name__ == '__main__':
180
+ main()
utils.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ import sys
+ import glob
+ import torch
+ import shutil
+ import logging
+ import datetime
+ from mmcv.runner.hooks import HOOKS
+ from mmcv.runner.hooks.logger import LoggerHook, TextLoggerHook
+ from mmcv.runner.dist_utils import master_only
+ from torch.utils.tensorboard import SummaryWriter
+
+
+ def init_logging(filename=None, debug=False):
+     logging.root = logging.RootLogger('DEBUG' if debug else 'INFO')
+     formatter = logging.Formatter('[%(asctime)s][%(levelname)s] - %(message)s')
+
+     stream_handler = logging.StreamHandler(sys.stdout)
+     stream_handler.setFormatter(formatter)
+     logging.root.addHandler(stream_handler)
+
+     if filename is not None:
+         file_handler = logging.FileHandler(filename)
+         file_handler.setFormatter(formatter)
+         logging.root.addHandler(file_handler)
+
+
+ def backup_code(work_dir, verbose=False):
+     base_dir = os.path.dirname(os.path.abspath(__file__))
+     for pattern in ['*.py', 'configs/*.py', 'models/*.py', 'loaders/*.py', 'loaders/pipelines/*.py']:
+         for file in glob.glob(pattern):
+             src = os.path.join(base_dir, file)
+             dst = os.path.join(work_dir, 'backup', os.path.dirname(file))
+
+             if verbose:
+                 logging.info('Copying %s -> %s' % (os.path.relpath(src), os.path.relpath(dst)))
+
+             os.makedirs(dst, exist_ok=True)
+             shutil.copy2(src, dst)
+
+
+ @HOOKS.register_module()
+ class MyTextLoggerHook(TextLoggerHook):
+     def _log_info(self, log_dict, runner):
+         # print exp name for users to distinguish experiments
+         # at every ``interval_exp_name`` iterations and at the end of each epoch
+         if runner.meta is not None and 'exp_name' in runner.meta:
+             if (self.every_n_iters(runner, self.interval_exp_name)) or (
+                     self.by_epoch and self.end_of_epoch(runner)):
+                 exp_info = f'Exp name: {runner.meta["exp_name"]}'
+                 runner.logger.info(exp_info)
+
+         # by epoch: Epoch [4][100/1000]
+         # by iter: Iter [100/100000]
+         if self.by_epoch:
+             log_str = f'Epoch [{log_dict["epoch"]}/{runner.max_epochs}]' \
+                       f'[{log_dict["iter"]}/{len(runner.data_loader)}] '
+         else:
+             log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}] '
+
+         log_str += 'loss: %.2f, ' % log_dict['loss']
+
+         if 'time' in log_dict.keys():
+             # MOD: skip the first iteration since its timing is not accurate
+             if runner.iter == self.start_iter:
+                 time_sec_avg = log_dict['time']
+             else:
+                 self.time_sec_tot += (log_dict['time'] * self.interval)
+                 time_sec_avg = self.time_sec_tot / (runner.iter - self.start_iter)
+
+             eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
+             eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
+             log_str += f'eta: {eta_str}, '
+             log_str += f'time: {log_dict["time"]:.2f}s, ' \
+                        f'data: {log_dict["data_time"] * 1000:.0f}ms, '
+             # GPU memory statistics
+             if torch.cuda.is_available():
+                 log_str += f'mem: {log_dict["memory"]}M'
+
+         runner.logger.info(log_str)
+
+     def log(self, runner):
+         if 'eval_iter_num' in runner.log_buffer.output:
+             # this doesn't modify runner.iter and is independent of by_epoch
+             cur_iter = runner.log_buffer.output.pop('eval_iter_num')
+         else:
+             cur_iter = self.get_iter(runner, inner_iter=True)
+
+         log_dict = {
+             'mode': self.get_mode(runner),
+             'epoch': self.get_epoch(runner),
+             'iter': cur_iter
+         }
+
+         # only record lr of the first param group
+         cur_lr = runner.current_lr()
+         if isinstance(cur_lr, list):
+             log_dict['lr'] = cur_lr[0]
+         else:
+             assert isinstance(cur_lr, dict)
+             log_dict['lr'] = {}
+             for k, lr_ in cur_lr.items():
+                 assert isinstance(lr_, list)
+                 log_dict['lr'].update({k: lr_[0]})
+
+         if 'time' in runner.log_buffer.output:
+             # GPU memory statistics
+             if torch.cuda.is_available():
+                 log_dict['memory'] = self._get_max_memory(runner)
+
+         log_dict = dict(log_dict, **runner.log_buffer.output)
+
+         # MOD: disable writing to files
+         # self._dump_log(log_dict, runner)
+         self._log_info(log_dict, runner)
+
+         return log_dict
+
+     def after_train_epoch(self, runner):
+         if runner.log_buffer.ready:
+             metrics = self.get_loggable_tags(runner)
+             runner.logger.info('--- Evaluation Results ---')
+             runner.logger.info('mAP: %.4f' % metrics['val/pts_bbox_NuScenes/mAP'])
+             runner.logger.info('mATE: %.4f' % metrics['val/pts_bbox_NuScenes/mATE'])
+             runner.logger.info('mASE: %.4f' % metrics['val/pts_bbox_NuScenes/mASE'])
+             runner.logger.info('mAOE: %.4f' % metrics['val/pts_bbox_NuScenes/mAOE'])
+             runner.logger.info('mAVE: %.4f' % metrics['val/pts_bbox_NuScenes/mAVE'])
+             runner.logger.info('mAAE: %.4f' % metrics['val/pts_bbox_NuScenes/mAAE'])
+             runner.logger.info('NDS: %.4f' % metrics['val/pts_bbox_NuScenes/NDS'])
+
+
+ @HOOKS.register_module()
+ class MyTensorboardLoggerHook(LoggerHook):
+     def __init__(self, log_dir=None, interval=10, ignore_last=True, reset_flag=False, by_epoch=True):
+         super(MyTensorboardLoggerHook, self).__init__(
+             interval, ignore_last, reset_flag, by_epoch)
+         self.log_dir = log_dir
+
+     @master_only
+     def before_run(self, runner):
+         super(MyTensorboardLoggerHook, self).before_run(runner)
+         if self.log_dir is None:
+             self.log_dir = runner.work_dir
+         self.writer = SummaryWriter(self.log_dir)
+
+     @master_only
+     def log(self, runner):
+         tags = self.get_loggable_tags(runner)
+
+         for key, value in tags.items():
+             # MOD: merge into the 'train' group
+             if key == 'learning_rate':
+                 key = 'train/learning_rate'
+
+             # MOD: skip momentum
+             ignore = False
+             if key == 'momentum':
+                 ignore = True
+
+             # MOD: skip intermediate losses
+             for i in range(5):
+                 if key[:13] == 'train/d%d.loss' % i:
+                     ignore = True
+
+             if key[:3] == 'val':
+                 metric_name = key[22:]  # strip the 'val/pts_bbox_NuScenes/' prefix
+                 if metric_name in ['mAP', 'mATE', 'mASE', 'mAOE', 'mAVE', 'mAAE', 'NDS']:
+                     key = 'val/' + metric_name
+                 else:
+                     ignore = True
+
+             if self.get_mode(runner) == 'train' and key[:5] != 'train':
+                 ignore = True
+
+             if self.get_mode(runner) != 'train' and key[:3] != 'val':
+                 ignore = True
+
+             if ignore:
+                 continue
+
+             if key[:5] == 'train':
+                 self.writer.add_scalar(key, value, self.get_iter(runner))
+             elif key[:3] == 'val':
+                 self.writer.add_scalar(key, value, self.get_epoch(runner))
+
+     @master_only
+     def after_run(self, runner):
+         self.writer.close()
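The ETA logic in `MyTextLoggerHook._log_info` above averages the per-iteration time accumulated since `start_iter` (skipping the warm-up first iteration) and extrapolates over the remaining iterations. The arithmetic, isolated as a sketch with made-up numbers:

```python
import datetime

def eta_string(time_sec_tot, start_iter, cur_iter, max_iters):
    # Average seconds per iteration since start_iter (the first
    # iteration is excluded because it includes warm-up cost).
    time_sec_avg = time_sec_tot / (cur_iter - start_iter)
    # Extrapolate over the iterations that remain.
    eta_sec = time_sec_avg * (max_iters - cur_iter - 1)
    return str(datetime.timedelta(seconds=int(eta_sec)))

# 100 iterations logged so far at 0.5 s each, 10,100 iterations total
print(eta_string(50.0, 0, 100, 10100))  # 0.5 * 9999 s -> '1:23:19'
```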
val.py ADDED
@@ -0,0 +1,137 @@
+ import os
+ import utils
+ import logging
+ import argparse
+ import importlib
+ import torch
+ import torch.distributed as dist
+ import torch.backends.cudnn as cudnn
+ from mmcv import Config
+ from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+ from mmcv.runner import load_checkpoint
+ from mmdet.apis import set_random_seed, multi_gpu_test, single_gpu_test
+ from mmdet3d.datasets import build_dataset, build_dataloader
+ from mmdet3d.models import build_model
+
+
+ def evaluate(dataset, results, epoch):
+     metrics = dataset.evaluate(results, jsonfile_prefix=None)
+
+     mAP = metrics['pts_bbox_NuScenes/mAP']
+     mATE = metrics['pts_bbox_NuScenes/mATE']
+     mASE = metrics['pts_bbox_NuScenes/mASE']
+     mAOE = metrics['pts_bbox_NuScenes/mAOE']
+     mAVE = metrics['pts_bbox_NuScenes/mAVE']
+     mAAE = metrics['pts_bbox_NuScenes/mAAE']
+     NDS = metrics['pts_bbox_NuScenes/NDS']
+
+     logging.info('--- Evaluation Results (Epoch %d) ---' % epoch)
+     logging.info('mAP: %.4f' % mAP)
+     logging.info('mATE: %.4f' % mATE)
+     logging.info('mASE: %.4f' % mASE)
+     logging.info('mAOE: %.4f' % mAOE)
+     logging.info('mAVE: %.4f' % mAVE)
+     logging.info('mAAE: %.4f' % mAAE)
+     logging.info('NDS: %.4f' % NDS)
+
+     return {
+         'mAP': mAP,
+         'mATE': mATE,
+         'mASE': mASE,
+         'mAOE': mAOE,
+         'mAVE': mAVE,
+         'mAAE': mAAE,
+         'NDS': NDS,
+     }
+
+
+ def main():
+     parser = argparse.ArgumentParser(description='Validate a detector')
+     parser.add_argument('--config', required=True)
+     parser.add_argument('--weights', required=True)
+     parser.add_argument('--local_rank', type=int, default=0)
+     parser.add_argument('--world_size', type=int, default=1)
+     parser.add_argument('--batch_size', type=int, default=1)
+     args = parser.parse_args()
+
+     # parse configs
+     cfgs = Config.fromfile(args.config)
+
+     # register custom modules
+     importlib.import_module('models')
+     importlib.import_module('loaders')
+
+     # silence mmcv's default loggers
+     from mmcv.utils.logging import logger_initialized
+     logger_initialized['root'] = logging.Logger(__name__, logging.WARNING)
+     logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING)
+
+     # a GPU is required
+     assert torch.cuda.is_available()
+
+     # determine local_rank and world_size
+     if 'LOCAL_RANK' not in os.environ:
+         os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+     if 'WORLD_SIZE' not in os.environ:
+         os.environ['WORLD_SIZE'] = str(args.world_size)
+
+     local_rank = int(os.environ['LOCAL_RANK'])
+     world_size = int(os.environ['WORLD_SIZE'])
+
+     if local_rank == 0:
+         utils.init_logging(None, cfgs.debug)
+     else:
+         logging.root.disabled = True
+
+     logging.info('Using GPU: %s' % torch.cuda.get_device_name(local_rank))
+     torch.cuda.set_device(local_rank)
+
+     if world_size > 1:
+         logging.info('Initializing DDP with %d GPUs...' % world_size)
+         dist.init_process_group('nccl', init_method='env://')
+
+     logging.info('Setting random seed: 0')
+     set_random_seed(0, deterministic=True)
+     cudnn.benchmark = True
+
+     logging.info('Loading validation set from %s' % cfgs.data.val.data_root)
+     val_dataset = build_dataset(cfgs.data.val)
+     val_loader = build_dataloader(
+         val_dataset,
+         samples_per_gpu=args.batch_size,
+         workers_per_gpu=cfgs.data.workers_per_gpu,
+         num_gpus=world_size,
+         dist=world_size > 1,
+         shuffle=False,
+         seed=0,
+     )
+
+     logging.info('Creating model: %s' % cfgs.model.type)
+     model = build_model(cfgs.model)
+     model.cuda()
+
+     if world_size > 1:
+         model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False)
+     else:
+         model = MMDataParallel(model, [0])
+
+     if os.path.isfile(args.weights):
+         logging.info('Loading checkpoint from %s' % args.weights)
+         load_checkpoint(
+             model, args.weights, map_location='cuda', strict=True,
+             logger=logging.Logger(__name__, logging.ERROR)
+         )
+
+     if world_size > 1:
+         results = multi_gpu_test(model, val_loader, gpu_collect=True)
+     else:
+         results = single_gpu_test(model, val_loader)
+
+     if local_rank == 0:
+         evaluate(val_dataset, results, -1)
+
+
+ if __name__ == '__main__':
+     main()
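For reference, the NDS value that `evaluate` logs is computed inside the nuScenes devkit as a weighted combination of mAP and the five true-positive error metrics (mATE, mASE, mAOE, mAVE, mAAE), each clipped to 1 and inverted. A sketch of that formula with made-up metric values:

```python
def nds(mAP, tp_errors):
    # nuScenes Detection Score:
    # NDS = (5 * mAP + sum over the 5 TP metrics of (1 - min(1, err))) / 10
    return (5 * mAP + sum(1 - min(1.0, e) for e in tp_errors)) / 10

# hypothetical metrics: mATE, mASE, mAOE, mAVE, mAAE
score = nds(0.448, [0.581, 0.271, 0.373, 0.247, 0.190])
print(round(score, 3))  # 0.558
```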