# Camera-only BEVFormer detection config (model type 'BEVFormer', lidar and
# radar disabled in ``input_modality``) reading data from 'data/nuscenes/'.
#
# NOTE(review): this file is meant to be consumed by an mmcv-style config
# loader, which presumably exposes every top-level name as a config key.
# For that reason no new helper names are introduced here; repeated values
# are written in terms of the names the file already defined, so each value
# has a single source of truth.

# --- Spatial extent and label set -------------------------------------------
# Six-element range list and three-element voxel size, reused by the
# pipelines, the transformer encoder, the box coder and the assigner.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# --- Shared sizes -----------------------------------------------------------
_dim_ = 256        # embedding / channel width used throughout the head
_pos_dim_ = 128    # num_feats of the learned positional encoding
_ffn_dim_ = 512    # feedforward_channels of encoder and decoder layers
_num_levels_ = 4   # num_levels of the encoder's deformable attention
bev_h_ = 200
bev_w_ = 200
queue_length = 4

# Mean/std with to_rgb=False, applied by NormalizeMultiviewImage.
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)

# --- Dataset locations ------------------------------------------------------
dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
info_root = 'data/infos/'
ann_file_train = 'data/infos/nuscenes_infos_temporal_train.pkl'
ann_file_val = 'data/infos/nuscenes_infos_temporal_val.pkl'
# The test split points at the same info file as the validation split.
ann_file_test = 'data/infos/nuscenes_infos_temporal_val.pkl'

input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)
file_client_args = dict(backend='disk')

# --- Pipelines --------------------------------------------------------------
train_pipeline = [
    dict(
        type='LoadMultiViewImageFromFilesInCeph',
        to_float32=True,
        file_client_args=file_client_args,
        img_root=''),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
]

test_pipeline = [
    dict(
        type='LoadMultiViewImageFromFilesInCeph',
        to_float32=True,
        file_client_args=file_client_args,
        img_root=''),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=class_names,
                with_label=False),
            dict(type='CustomCollect3D', keys=['img'])
        ])
]

# Point-cloud based pipeline. Its class list is intentionally kept as a
# literal: the order differs from ``class_names`` above.
# NOTE(review): nothing in this file references eval_pipeline - confirm
# whether it is still needed.
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=file_client_args),
    dict(
        type='DefaultFormatBundle3D',
        class_names=[
            'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
            'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
        ],
        with_label=False),
    dict(type='Collect3D', keys=['points'])
]

# --- Data loaders -----------------------------------------------------------
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=ann_file_train,
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        box_type_3d='LiDAR',
        use_valid_flag=True,
        bev_size=(bev_h_, bev_w_),
        queue_length=queue_length),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=ann_file_val,
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
        bev_size=(bev_h_, bev_w_),
        samples_per_gpu=1),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=ann_file_test,
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
        bev_size=(bev_h_, bev_w_)),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))

evaluation = dict(interval=6, pipeline=test_pipeline)

# --- Model ------------------------------------------------------------------
model = dict(
    type='BEVFormer',
    use_grid_mask=True,
    video_test_mode=True,
    img_backbone=dict(
        type='ResNet',
        depth=101,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        style='caffe',
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, False, True, True)),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=_dim_,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=bev_h_,
        bev_w=bev_w_,
        num_query=900,
        num_classes=10,
        in_channels=_dim_,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=_dim_,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=point_cloud_range,
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=_dim_,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=point_cloud_range,
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=_dim_,
                                num_points=8,
                                num_levels=_num_levels_),
                            embed_dims=_dim_)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=_dim_,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=_dim_,
                            num_levels=1)
                    ],
                    feedforward_channels=_ffn_dim_,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=_pos_dim_,
            row_num_embed=bev_h_,
            col_num_embed=bev_w_),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=voxel_size,
            point_cloud_range=point_cloud_range,
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range))))

# --- Optimisation schedule --------------------------------------------------
optimizer = dict(
    type='AdamW',
    lr=0.0002,
    # The image backbone gets a 0.1 learning-rate multiplier.
    paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.1))),
    weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.3333333333333333,
    min_lr_ratio=0.001)
total_epochs = 24
runner = dict(type='EpochBasedRunner', max_epochs=24)

# --- Runtime ----------------------------------------------------------------
checkpoint_config = dict(interval=1)
log_config = dict(
    interval=50,
    hooks=[dict(type='TextLoggerHook'),
           dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = 'projects/work_dirs/bevformer/base_bevformer/'
load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
resume_from = None
workflow = [('train', 1)]
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
logger_name = 'mmdet'
gpu_ids = range(0, 1)