# UniAD2.0_R101_nuScenes/config/base_bevformer.py
# Zizizi-hao's picture
# UniAD V2.0 training config file
# 39bf596 verified
# Six-number spatial range reused by the range-based settings in this config.
# NOTE(review): presumably [x_min, y_min, z_min, x_max, y_max, z_max] in
# metres -- confirm against the transforms that consume it.
point_cloud_range = [
    -51.2, -51.2, -5.0,
    51.2, 51.2, 3.0,
]

# The ten object category names used by the dataset and pipelines.
class_names = [
    'car',
    'truck',
    'construction_vehicle',
    'bus',
    'trailer',
    'barrier',
    'motorcycle',
    'bicycle',
    'pedestrian',
    'traffic_cone',
]

# Dataset class (looked up by name) and the root directory of the raw data.
dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'

# Input switches: only the camera and "external" flags are turned on.
input_modality = {
    'use_lidar': False,
    'use_camera': True,
    'use_radar': False,
    'use_map': False,
    'use_external': True,
}

# File-client settings; the 'disk' backend is selected.
file_client_args = {'backend': 'disk'}
# Training-time sample pipeline: an ordered list of transform configs, each
# selected by its 'type' key.
train_pipeline = [
    # Multi-view image loading from the 'disk' backend, converted to float32.
    {
        'type': 'LoadMultiViewImageFromFilesInCeph',
        'to_float32': True,
        'file_client_args': {'backend': 'disk'},
        'img_root': '',
    },
    # Photometric distortion step (present only in the training pipeline).
    {'type': 'PhotoMetricDistortionMultiViewImage'},
    # 3D boxes and labels are requested; attribute labels are not.
    {
        'type': 'LoadAnnotations3D',
        'with_bbox_3d': True,
        'with_label_3d': True,
        'with_attr_label': False,
    },
    # Object filtering by spatial range and by category name.
    {
        'type': 'ObjectRangeFilter',
        'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
    },
    {
        'type': 'ObjectNameFilter',
        'classes': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
    },
    # Per-channel mean subtraction with unit std and no channel reordering.
    {
        'type': 'NormalizeMultiviewImage',
        'mean': [103.53, 116.28, 123.675],
        'std': [1.0, 1.0, 1.0],
        'to_rgb': False,
    },
    {'type': 'PadMultiViewImage', 'size_divisor': 32},
    # Formatting and selection of the keys handed on from this pipeline.
    {
        'type': 'DefaultFormatBundle3D',
        'class_names': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
    },
    {
        'type': 'CustomCollect3D',
        'keys': ['gt_bboxes_3d', 'gt_labels_3d', 'img'],
    },
]
# Test-time sample pipeline: image loading, normalisation and padding,
# followed by a MultiScaleFlipAug3D wrapper holding the final formatting and
# key-collection steps. No annotation keys are collected here.
test_pipeline = [
    {
        'type': 'LoadMultiViewImageFromFilesInCeph',
        'to_float32': True,
        'file_client_args': {'backend': 'disk'},
        'img_root': '',
    },
    {
        'type': 'NormalizeMultiviewImage',
        'mean': [103.53, 116.28, 123.675],
        'std': [1.0, 1.0, 1.0],
        'to_rgb': False,
    },
    {'type': 'PadMultiViewImage', 'size_divisor': 32},
    # Single scale with flipping disabled; the nested transforms are listed
    # under 'transforms'.
    {
        'type': 'MultiScaleFlipAug3D',
        'img_scale': (1600, 900),
        'pts_scale_ratio': 1,
        'flip': False,
        'transforms': [
            {
                'type': 'DefaultFormatBundle3D',
                'class_names': [
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone',
                ],
                'with_label': False,
            },
            {'type': 'CustomCollect3D', 'keys': ['img']},
        ],
    },
]
# Point-based pipeline: loads points plus multi-sweep points and collects only
# the 'points' key.
# NOTE(review): the class order below differs from the `class_names` list at
# the top of this file; it is reproduced unchanged -- confirm whether the
# difference is intentional.
eval_pipeline = [
    {
        'type': 'LoadPointsFromFile',
        'coord_type': 'LIDAR',
        'load_dim': 5,
        'use_dim': 5,
        'file_client_args': {'backend': 'disk'},
    },
    {
        'type': 'LoadPointsFromMultiSweeps',
        'sweeps_num': 10,
        'file_client_args': {'backend': 'disk'},
    },
    {
        'type': 'DefaultFormatBundle3D',
        'class_names': [
            'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
            'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier',
        ],
        'with_label': False,
    },
    {'type': 'Collect3D', 'keys': ['points']},
]
# Dataset / dataloader configuration. The 'train', 'val' and 'test' entries
# each describe one dataset split with its own fully spelled-out pipeline;
# the two sampler entries at the end select the sampler types by name.
data = {
    'samples_per_gpu': 1,
    'workers_per_gpu': 4,

    # ---- training split ----------------------------------------------------
    'train': {
        'type': 'CustomNuScenesDataset',
        'data_root': 'data/nuscenes/',
        'ann_file': 'data/infos/nuscenes_infos_temporal_train.pkl',
        'pipeline': [
            {
                'type': 'LoadMultiViewImageFromFilesInCeph',
                'to_float32': True,
                'file_client_args': {'backend': 'disk'},
                'img_root': '',
            },
            {'type': 'PhotoMetricDistortionMultiViewImage'},
            {
                'type': 'LoadAnnotations3D',
                'with_bbox_3d': True,
                'with_label_3d': True,
                'with_attr_label': False,
            },
            {
                'type': 'ObjectRangeFilter',
                'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            },
            {
                'type': 'ObjectNameFilter',
                'classes': [
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone',
                ],
            },
            {
                'type': 'NormalizeMultiviewImage',
                'mean': [103.53, 116.28, 123.675],
                'std': [1.0, 1.0, 1.0],
                'to_rgb': False,
            },
            {'type': 'PadMultiViewImage', 'size_divisor': 32},
            {
                'type': 'DefaultFormatBundle3D',
                'class_names': [
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone',
                ],
            },
            {
                'type': 'CustomCollect3D',
                'keys': ['gt_bboxes_3d', 'gt_labels_3d', 'img'],
            },
        ],
        'classes': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        'modality': {
            'use_lidar': False,
            'use_camera': True,
            'use_radar': False,
            'use_map': False,
            'use_external': True,
        },
        'test_mode': False,
        'box_type_3d': 'LiDAR',
        'use_valid_flag': True,
        'bev_size': (200, 200),
        'queue_length': 4,
    },

    # ---- validation split --------------------------------------------------
    # Same annotation file and pipeline as the 'test' entry below; this entry
    # additionally carries its own samples_per_gpu=1.
    'val': {
        'type': 'CustomNuScenesDataset',
        'data_root': 'data/nuscenes/',
        'ann_file': 'data/infos/nuscenes_infos_temporal_val.pkl',
        'pipeline': [
            {
                'type': 'LoadMultiViewImageFromFilesInCeph',
                'to_float32': True,
                'file_client_args': {'backend': 'disk'},
                'img_root': '',
            },
            {
                'type': 'NormalizeMultiviewImage',
                'mean': [103.53, 116.28, 123.675],
                'std': [1.0, 1.0, 1.0],
                'to_rgb': False,
            },
            {'type': 'PadMultiViewImage', 'size_divisor': 32},
            {
                'type': 'MultiScaleFlipAug3D',
                'img_scale': (1600, 900),
                'pts_scale_ratio': 1,
                'flip': False,
                'transforms': [
                    {
                        'type': 'DefaultFormatBundle3D',
                        'class_names': [
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone',
                        ],
                        'with_label': False,
                    },
                    {'type': 'CustomCollect3D', 'keys': ['img']},
                ],
            },
        ],
        'classes': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        'modality': {
            'use_lidar': False,
            'use_camera': True,
            'use_radar': False,
            'use_map': False,
            'use_external': True,
        },
        'test_mode': True,
        'box_type_3d': 'LiDAR',
        'bev_size': (200, 200),
        'samples_per_gpu': 1,
    },

    # ---- test split --------------------------------------------------------
    # Points at the validation annotation file.
    'test': {
        'type': 'CustomNuScenesDataset',
        'data_root': 'data/nuscenes/',
        'ann_file': 'data/infos/nuscenes_infos_temporal_val.pkl',
        'pipeline': [
            {
                'type': 'LoadMultiViewImageFromFilesInCeph',
                'to_float32': True,
                'file_client_args': {'backend': 'disk'},
                'img_root': '',
            },
            {
                'type': 'NormalizeMultiviewImage',
                'mean': [103.53, 116.28, 123.675],
                'std': [1.0, 1.0, 1.0],
                'to_rgb': False,
            },
            {'type': 'PadMultiViewImage', 'size_divisor': 32},
            {
                'type': 'MultiScaleFlipAug3D',
                'img_scale': (1600, 900),
                'pts_scale_ratio': 1,
                'flip': False,
                'transforms': [
                    {
                        'type': 'DefaultFormatBundle3D',
                        'class_names': [
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone',
                        ],
                        'with_label': False,
                    },
                    {'type': 'CustomCollect3D', 'keys': ['img']},
                ],
            },
        ],
        'classes': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        'modality': {
            'use_lidar': False,
            'use_camera': True,
            'use_radar': False,
            'use_map': False,
            'use_external': True,
        },
        'test_mode': True,
        'box_type_3d': 'LiDAR',
        'bev_size': (200, 200),
    },

    # ---- samplers ----------------------------------------------------------
    'shuffler_sampler': {'type': 'DistributedGroupSampler'},
    'nonshuffler_sampler': {'type': 'DistributedSampler'},
}
# Evaluation settings: an interval plus the pipeline used for evaluation.
# NOTE(review): the interval is presumably counted in epochs, given the
# epoch-based runner configured in this file -- confirm.
evaluation = {
    'interval': 6,
    'pipeline': [
        {
            'type': 'LoadMultiViewImageFromFilesInCeph',
            'to_float32': True,
            'file_client_args': {'backend': 'disk'},
            'img_root': '',
        },
        {
            'type': 'NormalizeMultiviewImage',
            'mean': [103.53, 116.28, 123.675],
            'std': [1.0, 1.0, 1.0],
            'to_rgb': False,
        },
        {'type': 'PadMultiViewImage', 'size_divisor': 32},
        {
            'type': 'MultiScaleFlipAug3D',
            'img_scale': (1600, 900),
            'pts_scale_ratio': 1,
            'flip': False,
            'transforms': [
                {
                    'type': 'DefaultFormatBundle3D',
                    'class_names': [
                        'car', 'truck', 'construction_vehicle', 'bus',
                        'trailer', 'barrier', 'motorcycle', 'bicycle',
                        'pedestrian', 'traffic_cone',
                    ],
                    'with_label': False,
                },
                {'type': 'CustomCollect3D', 'keys': ['img']},
            ],
        },
    ],
}
# Runtime settings: checkpointing, logging, distributed backend, working
# directory, initial weights and plugin loading.
checkpoint_config = {'interval': 1}

# Log every 50 units with a text logger and a TensorBoard logger.
# NOTE(review): the unit is presumably iterations -- confirm.
log_config = {
    'interval': 50,
    'hooks': [
        {'type': hook_name}
        for hook_name in ('TextLoggerHook', 'TensorboardLoggerHook')
    ],
}

dist_params = {'backend': 'nccl'}
log_level = 'INFO'

# Output directory and the checkpoint that weights are initialised from;
# no run is resumed.
work_dir = 'projects/work_dirs/bevformer/base_bevformer/'
load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
resume_from = None

# A single ('train', 1) stage.
workflow = [('train', 1)]

# Plugin switch and the plugin directory.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'
# Voxel size triple.
# NOTE(review): presumably (x, y, z) extents in metres -- confirm.
voxel_size = [0.2, 0.2, 8]

# Image normalisation: per-channel means, unit std, no channel reordering.
img_norm_cfg = {
    'mean': [103.53, 116.28, 123.675],
    'std': [1.0, 1.0, 1.0],
    'to_rgb': False,
}

# Model width and the sizes derived from it.
_dim_ = 256
_pos_dim_ = _dim_ // 2  # 128
_ffn_dim_ = _dim_ * 2  # 512
_num_levels_ = 4

# BEV grid height / width and the temporal queue length.
bev_h_ = 200
bev_w_ = 200
queue_length = 4
# Model definition: an image backbone and neck feeding a BEVFormer detection
# head, plus the training-time assigner settings. Every component is selected
# by its 'type' name.
model = {
    'type': 'BEVFormer',
    'use_grid_mask': True,
    'video_test_mode': True,

    # ---- image backbone: ResNet-101, DCNv2 enabled on the last two stages --
    'img_backbone': {
        'type': 'ResNet',
        'depth': 101,
        'num_stages': 4,
        'out_indices': (1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN2d', 'requires_grad': False},
        'norm_eval': True,
        'style': 'caffe',
        'dcn': {
            'type': 'DCNv2',
            'deform_groups': 1,
            'fallback_on_stride': False,
        },
        'stage_with_dcn': (False, False, True, True),
    },

    # ---- image neck: three input levels -> four 256-channel outputs --------
    'img_neck': {
        'type': 'FPN',
        'in_channels': [512, 1024, 2048],
        'out_channels': 256,
        'start_level': 0,
        'add_extra_convs': 'on_output',
        'num_outs': 4,
        'relu_before_extra_convs': True,
    },

    # ---- detection head ----------------------------------------------------
    'pts_bbox_head': {
        'type': 'BEVFormerHead',
        'bev_h': 200,
        'bev_w': 200,
        'num_query': 900,
        'num_classes': 10,
        'in_channels': 256,
        'sync_cls_avg_factor': True,
        'with_box_refine': True,
        'as_two_stage': False,
        'transformer': {
            'type': 'PerceptionTransformer',
            'rotate_prev_bev': True,
            'use_shift': True,
            'use_can_bus': True,
            'embed_dims': 256,
            # Six encoder layers: temporal self-attention followed by spatial
            # cross-attention and an FFN, each followed by a norm.
            'encoder': {
                'type': 'BEVFormerEncoder',
                'num_layers': 6,
                'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                'num_points_in_pillar': 4,
                'return_intermediate': False,
                'transformerlayers': {
                    'type': 'BEVFormerLayer',
                    'attn_cfgs': [
                        {
                            'type': 'TemporalSelfAttention',
                            'embed_dims': 256,
                            'num_levels': 1,
                        },
                        {
                            'type': 'SpatialCrossAttention',
                            'pc_range': [
                                -51.2, -51.2, -5.0, 51.2, 51.2, 3.0,
                            ],
                            'deformable_attention': {
                                'type': 'MSDeformableAttention3D',
                                'embed_dims': 256,
                                'num_points': 8,
                                'num_levels': 4,
                            },
                            'embed_dims': 256,
                        },
                    ],
                    'feedforward_channels': 512,
                    'ffn_dropout': 0.1,
                    'operation_order': (
                        'self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
                        'norm',
                    ),
                },
            },
            # Six decoder layers that return intermediate outputs.
            'decoder': {
                'type': 'DetectionTransformerDecoder',
                'num_layers': 6,
                'return_intermediate': True,
                'transformerlayers': {
                    'type': 'DetrTransformerDecoderLayer',
                    'attn_cfgs': [
                        {
                            'type': 'MultiheadAttention',
                            'embed_dims': 256,
                            'num_heads': 8,
                            'dropout': 0.1,
                        },
                        {
                            'type': 'CustomMSDeformableAttention',
                            'embed_dims': 256,
                            'num_levels': 1,
                        },
                    ],
                    'feedforward_channels': 512,
                    'ffn_dropout': 0.1,
                    'operation_order': (
                        'self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
                        'norm',
                    ),
                },
            },
        },
        # Box coder; post_center_range is wider than pc_range on every axis.
        'bbox_coder': {
            'type': 'NMSFreeCoder',
            'post_center_range': [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            'max_num': 300,
            'voxel_size': [0.2, 0.2, 8],
            'num_classes': 10,
        },
        'positional_encoding': {
            'type': 'LearnedPositionalEncoding',
            'num_feats': 128,
            'row_num_embed': 200,
            'col_num_embed': 200,
        },
        # Losses; the IoU loss is configured with zero weight.
        'loss_cls': {
            'type': 'FocalLoss',
            'use_sigmoid': True,
            'gamma': 2.0,
            'alpha': 0.25,
            'loss_weight': 2.0,
        },
        'loss_bbox': {'type': 'L1Loss', 'loss_weight': 0.25},
        'loss_iou': {'type': 'GIoULoss', 'loss_weight': 0.0},
    },

    # ---- training-time settings --------------------------------------------
    # The assigner's cost weights match the loss weights above (2.0 / 0.25 /
    # 0.0).
    'train_cfg': {
        'pts': {
            'grid_size': [512, 512, 1],
            'voxel_size': [0.2, 0.2, 8],
            'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            'out_size_factor': 4,
            'assigner': {
                'type': 'HungarianAssigner3D',
                'cls_cost': {'type': 'FocalLossCost', 'weight': 2.0},
                'reg_cost': {'type': 'BBox3DL1Cost', 'weight': 0.25},
                'iou_cost': {'type': 'IoUCost', 'weight': 0.0},
                'pc_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            },
        },
    },
}
# Annotation info files, all located under `info_root`.
info_root = 'data/infos/'
ann_file_train = info_root + 'nuscenes_infos_temporal_train.pkl'
ann_file_val = info_root + 'nuscenes_infos_temporal_val.pkl'
# The test entry points at the validation info file.
ann_file_test = info_root + 'nuscenes_infos_temporal_val.pkl'
# AdamW optimiser; the `img_backbone` custom key carries an lr multiplier of
# 0.1.
optimizer = {
    'type': 'AdamW',
    'lr': 2e-4,
    'paramwise_cfg': {
        'custom_keys': {
            'img_backbone': {'lr_mult': 0.1},
        },
    },
    'weight_decay': 0.01,
}

# Gradient clipping with max_norm=35 and norm_type=2.
optimizer_config = {'grad_clip': {'max_norm': 35, 'norm_type': 2}}

# Cosine-annealing schedule with a 500-step linear warm-up starting at one
# third of the base value.
lr_config = {
    'policy': 'CosineAnnealing',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 1.0 / 3,
    'min_lr_ratio': 1e-3,
}

# Training length; the runner is given the same epoch count.
total_epochs = 24
runner = {'type': 'EpochBasedRunner', 'max_epochs': total_epochs}

logger_name = 'mmdet'
gpu_ids = range(1)