# Scalar settings: feature widths, BEV grid size and the by-epoch switch.
_dim_ = 256
_ffn_dim_ = 512
_num_levels_ = 1
_pos_dim_ = 128
# Learning-rate auto-scaling is switched off (`enable` is False).
auto_scale_lr = {'base_batch_size': 16, 'enable': False}
bev_h_ = 50
bev_w_ = 50
by_epoch = False
# The ten object category names; list order is kept exactly as before.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
    'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]
# Two custom hooks, both pointed at the same `repo_id`: one of type
# 'CheckpointUploader' and one of type 'CheckpointResumer'.
custom_hooks = [
    {
        'type': 'CheckpointUploader',
        'repo_id': '5421Project',
        'interval': 1,
        'by_epoch': False,
        'clean_local': False,
    },
    {
        'type': 'CheckpointResumer',
        'repo_id': '5421Project',
        'resume_type': 'last',
    },
]
# Legacy-style `data` section: sampler choices, per-GPU batch/worker counts
# and one dataset description per split (test / train / val).
# NOTE(review): the `val` split sets `frame=()` and additionally carries a
# `frames=[-3, -2, -1]` key, while `test` passes the same offsets as `frame`.
# This looks like a possible key typo -- confirm which keyword the dataset
# class actually reads before relying on either.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=4,
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'),
    train=dict(
        type='CustomNuScenesDataset',
        data_root='data/nuscenes/v1.0-mini/',
        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
        bev_size=(50, 50),
        box_type_3d='LiDAR',
        queue_length=4,
        test_mode=False,
        use_valid_flag=True,
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        modality=dict(
            use_camera=True,
            use_lidar=False,
            use_radar=False,
            use_map=False,
            use_external=False),
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='LoadAnnotations3D',
                with_bbox_3d=True,
                with_label_3d=True),
            dict(
                type='ObjectRangeFilter',
                point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
            dict(
                type='ObjectNameFilter',
                classes=[
                    'car', 'truck', 'construction_vehicle', 'bus',
                    'trailer', 'barrier', 'motorcycle', 'bicycle',
                    'pedestrian', 'traffic_cone',
                ]),
            dict(type='PhotoMetricDistortionMultiViewImage'),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='CustomDefaultFormatBundle3D',
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus',
                    'trailer', 'barrier', 'motorcycle', 'bicycle',
                    'pedestrian', 'traffic_cone',
                ]),
            dict(
                type='CustomCollect3D',
                keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
            dict(type='TypeConverter'),
        ]),
    val=dict(
        type='CustomNuScenesDataset',
        data_root='data/nuscenes/v1.0-mini/',
        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
        bev_size=(50, 50),
        samples_per_gpu=1,
        test_mode=True,
        frame=(),
        frames=[-3, -2, -1],
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        modality=dict(
            use_camera=True,
            use_lidar=False,
            use_radar=False,
            use_map=False,
            use_external=False),
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(800, 450),
                pts_scale_ratio=[1.0],
                flip=False,
                transforms=[
                    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
                    dict(type='PadMultiViewImage', size_divisor=32),
                    dict(
                        type='CustomDefaultFormatBundle3D',
                        class_names=[
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone',
                        ]),
                    dict(type='CustomCollect3D', keys=['img']),
                ]),
        ]),
    test=dict(
        type='CustomNuScenesDataset',
        data_root='data/nuscenes/v1.0-mini/',
        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
        bev_size=(50, 50),
        test_mode=True,
        frame=[-3, -2, -1],
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        modality=dict(
            use_camera=True,
            use_lidar=False,
            use_radar=False,
            use_map=False,
            use_external=False),
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(800, 450),
                pts_scale_ratio=[1.0],
                flip=False,
                transforms=[
                    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
                    dict(type='PadMultiViewImage', size_divisor=32),
                    dict(
                        type='CustomDefaultFormatBundle3D',
                        class_names=[
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone',
                        ]),
                    dict(type='CustomCollect3D', keys=['img']),
                ]),
        ]))
# Dataset location and dataset class name used throughout the config.
data_root = 'data/nuscenes/v1.0-mini/'
dataset_type = 'CustomNuScenesDataset'
# Stand-alone decoder description: six layers, each running
# self-attention -> norm -> cross-attention -> norm -> FFN -> norm.
decoder = dict(
    type='DetectionTransformerDecoder',
    num_layers=6,
    return_intermediate=True,
    transformerlayers=dict(
        type='DetrTransformerDecoderLayer',
        attn_cfgs=[
            dict(
                type='MultiheadAttention',
                embed_dims=256,
                num_heads=8,
                dropout=0.1),
            dict(
                type='CustomMSDeformableAttention',
                embed_dims=256,
                num_levels=1),
        ],
        ffn_cfgs=dict(
            type='FFN',
            feedforward_channels=512,
            num_fcs=2,
            ffn_drop=0.1),
        operation_order=(
            'self_attn', 'norm', 'cross_attn',
            'norm', 'ffn', 'norm')))
# Default runner hooks, keyed by role. Checkpoint and logger hooks both use
# an interval of 1 and are configured not to count by epoch.
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHookV2',
        by_epoch=False,
        interval=1,
        max_keep_ckpts=1,
        save_best=['loss', 'mAP', 'NDS']),
    logger=dict(
        type='LoggerHook',
        interval=1,
        interval_exp_name=1000,
        log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    runtime_info=dict(type='RuntimeInfoHook'),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    timer=dict(type='IterTimerHook'))
# Stand-alone encoder description: three layers, each running
# self-attention -> norm -> cross-attention -> norm -> FFN -> norm.
encoder = dict(
    type='BEVFormerEncoder',
    num_layers=3,
    num_points_in_pillar=8,
    pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
    return_intermediate=False,
    transformerlayers=dict(
        type='BEVFormerLayer',
        attn_cfgs=[
            dict(
                type='TemporalSelfAttention',
                embed_dims=256,
                num_levels=1),
            dict(
                type='SpatialCrossAttention',
                embed_dims=256,
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                deformable_attention=dict(
                    type='MSDeformableAttention3D',
                    embed_dims=256,
                    num_levels=1,
                    num_points=8)),
        ],
        ffn_cfgs=dict(
            type='FFN',
            feedforward_channels=512,
            num_fcs=2,
            ffn_drop=0.1),
        operation_order=(
            'self_attn', 'norm', 'cross_attn',
            'norm', 'ffn', 'norm')))
# Runtime environment and miscellaneous run settings.
env_cfg = {'dist_cfg': {'backend': 'nccl'}}
experiment_name = 'debug'
file_client_args = {'backend': 'disk'}
# Relative frame offsets (three preceding steps, as listed).
frames = [-3, -2, -1]
# A single-element range containing only index 0.
gpu_ids = range(1)
# Per-channel mean/std used for image normalisation, with RGB conversion on.
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_rgb': True,
}
# Sensor modality switches: only the camera flag is enabled.
input_modality = {
    'use_camera': True,
    'use_lidar': False,
    'use_radar': False,
    'use_map': False,
    'use_external': False,
}
# Intervals, launcher and checkpoint-loading settings.
interval = 1
launcher = 'none'
load_from = None
log_interval = 1
log_processor = {'window_size': 20}
# Legacy-style learning-rate schedule description.
lr_config = {
    'policy': 'CosineAnnealing',
    'warmup': 'linear',
    'warmup_iters': 500,
    'warmup_ratio': 0.3333333333333333,
    'min_lr_ratio': 0.001,
}
# Training length limits.
max_epochs = 5
max_iters = 2
# Full detector description: image backbone and neck, the BEV detection head
# (with its encoder/decoder transformer, box coder and losses) and the
# training-time assigner settings.
model = dict(
    type='BEVFormerDetector',
    use_grid_mask=True,
    video_test_mode=True,
    pretrained=dict(img='torchvision://resnet50'),
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(3,),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch'),
    img_neck=dict(
        type='FPN',
        in_channels=[2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=1,
        relu_before_extra_convs=True),
    pts_bbox_head=dict(
        type='BEVFormerHead',
        bev_h=50,
        bev_w=50,
        num_query=900,
        num_classes=10,
        in_channels=256,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        transformer=dict(
            type='PerceptionTransformer',
            num_cams=6,
            num_feature_levels=1,
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=256,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=3,
                num_points_in_pillar=8,
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=256,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            embed_dims=256,
                            pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=256,
                                num_levels=1,
                                num_points=8)),
                    ],
                    ffn_cfgs=dict(
                        type='FFN',
                        feedforward_channels=512,
                        num_fcs=2,
                        ffn_drop=0.1),
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn',
                        'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=256,
                            num_levels=1),
                    ],
                    ffn_cfgs=dict(
                        type='FFN',
                        feedforward_channels=512,
                        num_fcs=2,
                        ffn_drop=0.1),
                    operation_order=(
                        'self_attn', 'norm', 'cross_attn',
                        'norm', 'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            max_num=300,
            voxel_size=[0.2, 0.2, 8],
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=128,
            row_num_embed=50,
            col_num_embed=50),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.5),
        loss_iou=dict(type='GIoULoss', loss_weight=0.25)),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=[0.2, 0.2, 8],
            point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                iou_cost=dict(type='SmoothL1Cost', weight=0.25),
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))))
# Optimiser settings. The same AdamW parameters appear both wrapped
# (`optim_wrapper`) and bare (`optimizer`).
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.01))
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01)
param_scheduler = dict(type='MultiStepLR', milestones=[1, 2])
# Six-value range in the order it is used by the range filters and coders.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# Stand-alone detection-head description: BEV grid size, query/class counts,
# the encoder/decoder transformer, the box coder, positional encoding and
# the three loss terms.
pts_bbox_head = dict(
    type='BEVFormerHead',
    bev_h=50,
    bev_w=50,
    num_query=900,
    num_classes=10,
    in_channels=256,
    sync_cls_avg_factor=True,
    with_box_refine=True,
    as_two_stage=False,
    transformer=dict(
        type='PerceptionTransformer',
        num_cams=6,
        num_feature_levels=1,
        rotate_prev_bev=True,
        use_shift=True,
        use_can_bus=True,
        embed_dims=256,
        encoder=dict(
            type='BEVFormerEncoder',
            num_layers=3,
            num_points_in_pillar=8,
            pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            return_intermediate=False,
            transformerlayers=dict(
                type='BEVFormerLayer',
                attn_cfgs=[
                    dict(
                        type='TemporalSelfAttention',
                        embed_dims=256,
                        num_levels=1),
                    dict(
                        type='SpatialCrossAttention',
                        embed_dims=256,
                        pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                        deformable_attention=dict(
                            type='MSDeformableAttention3D',
                            embed_dims=256,
                            num_levels=1,
                            num_points=8)),
                ],
                ffn_cfgs=dict(
                    type='FFN',
                    feedforward_channels=512,
                    num_fcs=2,
                    ffn_drop=0.1),
                operation_order=(
                    'self_attn', 'norm', 'cross_attn',
                    'norm', 'ffn', 'norm'))),
        decoder=dict(
            type='DetectionTransformerDecoder',
            num_layers=6,
            return_intermediate=True,
            transformerlayers=dict(
                type='DetrTransformerDecoderLayer',
                attn_cfgs=[
                    dict(
                        type='MultiheadAttention',
                        embed_dims=256,
                        num_heads=8,
                        dropout=0.1),
                    dict(
                        type='CustomMSDeformableAttention',
                        embed_dims=256,
                        num_levels=1),
                ],
                ffn_cfgs=dict(
                    type='FFN',
                    feedforward_channels=512,
                    num_fcs=2,
                    ffn_drop=0.1),
                operation_order=(
                    'self_attn', 'norm', 'cross_attn',
                    'norm', 'ffn', 'norm')))),
    bbox_coder=dict(
        type='NMSFreeCoder',
        post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
        max_num=300,
        voxel_size=[0.2, 0.2, 8],
        num_classes=10),
    positional_encoding=dict(
        type='LearnedPositionalEncoding',
        num_feats=128,
        row_num_embed=50,
        col_num_embed=50),
    loss_cls=dict(
        type='FocalLoss',
        use_sigmoid=True,
        gamma=2.0,
        alpha=0.25,
        loss_weight=2.0),
    loss_bbox=dict(type='L1Loss', loss_weight=0.5),
    loss_iou=dict(type='GIoULoss', loss_weight=0.25))
# Small scalar settings plus the test-loop limit.
queue_length = 4
resume = False
scales = [0.5]
test_cfg = {'max_iters': 1}
# Dataloader used for testing: batch size 1, `num_workers=0`, batches built
# by the registered `test_collate` function.
# The sampler deliberately does NOT shuffle. Evaluation order should be
# deterministic so repeated test runs see the same samples, which matters
# when the test loop is capped at a single iteration. The detector is also
# configured with `video_test_mode=True` elsewhere in this file, so frames
# are presumably expected in sequence -- TODO confirm.
test_dataloader = dict(
    batch_size=1,
    num_workers=0,
    collate_fn=dict(type='test_collate'),
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='CustomNuScenesDataset',
        data_root='data/nuscenes/v1.0-mini/',
        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
        bev_size=(50, 50),
        test_mode=True,
        frame=[-3, -2, -1],
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        modality=dict(
            use_camera=True,
            use_lidar=False,
            use_radar=False,
            use_map=False,
            use_external=False),
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(
                type='MultiScaleFlipAug3D',
                img_scale=(800, 450),
                pts_scale_ratio=[1.0],
                flip=False,
                transforms=[
                    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
                    dict(type='PadMultiViewImage', size_divisor=32),
                    dict(
                        type='CustomDefaultFormatBundle3D',
                        class_names=[
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone',
                        ]),
                    dict(type='CustomCollect3D', keys=['img']),
                ]),
        ]))
# Test-time evaluator: a single metric entry pointing at the validation
# annotation file of the v1.0-mini split.
test_evaluator = dict(
    metrics=[
        dict(
            type='src.NuScenesMetric',
            version='v1.0-mini',
            data_root='data/nuscenes/v1.0-mini/',
            ann_file='data/nuscenes/v1.0-mini/'
                     'nuscenes_infos_temporal_val.pkl'),
    ])
test_max_iters = 1
# Test-time pipeline: load the multi-view images, normalise them, then run
# the wrapped transforms (rescale, pad, format, collect) inside a
# 'MultiScaleFlipAug3D' step with flipping disabled.
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='NormalizeMultiviewImage',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(800, 450),
        pts_scale_ratio=[1.0],
        flip=False,
        transforms=[
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='CustomDefaultFormatBundle3D',
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus',
                    'trailer', 'barrier', 'motorcycle', 'bicycle',
                    'pedestrian', 'traffic_cone',
                ]),
            dict(type='CustomCollect3D', keys=['img']),
        ]),
]
# Training-loop settings: not epoch-based, validation interval of 1.
train_cfg = {
    'by_epoch': False,
    'max_epochs': 5,
    'max_iters': 2,
    'val_interval': 1,
}
# Dataloader used for training: batch size 1, `num_workers=0`, shuffled
# sampling, batches built by the registered `train_collate` function.
train_dataloader = dict(
    batch_size=1,
    num_workers=0,
    collate_fn=dict(type='train_collate'),
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='CustomNuScenesDataset',
        data_root='data/nuscenes/v1.0-mini/',
        ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
        bev_size=(50, 50),
        box_type_3d='LiDAR',
        queue_length=4,
        test_mode=False,
        use_valid_flag=True,
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        modality=dict(
            use_camera=True,
            use_lidar=False,
            use_radar=False,
            use_map=False,
            use_external=False),
        pipeline=[
            dict(type='LoadMultiViewImageFromFiles', to_float32=True),
            dict(
                type='LoadAnnotations3D',
                with_bbox_3d=True,
                with_label_3d=True),
            dict(
                type='ObjectRangeFilter',
                point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
            dict(
                type='ObjectNameFilter',
                classes=[
                    'car', 'truck', 'construction_vehicle', 'bus',
                    'trailer', 'barrier', 'motorcycle', 'bicycle',
                    'pedestrian', 'traffic_cone',
                ]),
            dict(type='PhotoMetricDistortionMultiViewImage'),
            dict(
                type='NormalizeMultiviewImage',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
            dict(type='PadMultiViewImage', size_divisor=32),
            dict(
                type='CustomDefaultFormatBundle3D',
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus',
                    'trailer', 'barrier', 'motorcycle', 'bicycle',
                    'pedestrian', 'traffic_cone',
                ]),
            dict(
                type='CustomCollect3D',
                keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
            dict(type='TypeConverter'),
        ]))
# Training pipeline, applied in list order: load images and 3D annotations,
# filter objects by range and name, photometric distortion, normalise,
# rescale, pad, format, collect the listed keys, then a 'TypeConverter' step.
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(
        type='LoadAnnotations3D',
        with_bbox_3d=True,
        with_label_3d=True),
    dict(
        type='ObjectRangeFilter',
        point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
    dict(
        type='ObjectNameFilter',
        classes=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ]),
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(
        type='NormalizeMultiviewImage',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='CustomDefaultFormatBundle3D',
        class_names=[
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ]),
    dict(
        type='CustomCollect3D',
        keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
    dict(type='TypeConverter'),
]
# Stand-alone transformer description: a 3-layer encoder and a 6-layer
# decoder sharing an embedding width of 256, plus camera/feature-level
# counts and the shift / CAN-bus / previous-BEV-rotation switches.
transformer = dict(
    type='PerceptionTransformer',
    num_cams=6,
    num_feature_levels=1,
    rotate_prev_bev=True,
    use_shift=True,
    use_can_bus=True,
    embed_dims=256,
    encoder=dict(
        type='BEVFormerEncoder',
        num_layers=3,
        num_points_in_pillar=8,
        pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
        return_intermediate=False,
        transformerlayers=dict(
            type='BEVFormerLayer',
            attn_cfgs=[
                dict(
                    type='TemporalSelfAttention',
                    embed_dims=256,
                    num_levels=1),
                dict(
                    type='SpatialCrossAttention',
                    embed_dims=256,
                    pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                    deformable_attention=dict(
                        type='MSDeformableAttention3D',
                        embed_dims=256,
                        num_levels=1,
                        num_points=8)),
            ],
            ffn_cfgs=dict(
                type='FFN',
                feedforward_channels=512,
                num_fcs=2,
                ffn_drop=0.1),
            operation_order=(
                'self_attn', 'norm', 'cross_attn',
                'norm', 'ffn', 'norm'))),
    decoder=dict(
        type='DetectionTransformerDecoder',
        num_layers=6,
        return_intermediate=True,
        transformerlayers=dict(
            type='DetrTransformerDecoderLayer',
            attn_cfgs=[
                dict(
                    type='MultiheadAttention',
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.1),
                dict(
                    type='CustomMSDeformableAttention',
                    embed_dims=256,
                    num_levels=1),
            ],
            ffn_cfgs=dict(
                type='FFN',
                feedforward_channels=512,
                num_fcs=2,
                ffn_drop=0.1),
            operation_order=(
                'self_attn', 'norm', 'cross_attn',
                'norm', 'ffn', 'norm'))))
# Validation-loop settings: a single iteration per validation run.
val_cfg = {'max_iters': 1}
# Validation dataloader: batch size 1, no worker processes, reading the
# nuScenes v1.0-mini temporal validation split through 'CustomNuScenesDataset'.
val_dataloader = {
    'batch_size': 1,
    'collate_fn': {'type': 'test_collate'},
    'dataset': {
        'ann_file': 'data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
        'bev_size': (50, 50),
        'classes': [
            'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
            'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
        ],
        'data_root': 'data/nuscenes/v1.0-mini/',
        # NOTE(review): both `frame` (empty) and `frames` are set here;
        # confirm which key the dataset class actually reads.
        'frame': (),
        'frames': [-3, -2, -1],
        # Camera-only input.
        'modality': {
            'use_camera': True,
            'use_external': False,
            'use_lidar': False,
            'use_map': False,
            'use_radar': False,
        },
        # Load -> normalize -> single-scale, no-flip test-time wrapper.
        'pipeline': [
            {'to_float32': True, 'type': 'LoadMultiViewImageFromFiles'},
            {
                'mean': [123.675, 116.28, 103.53],
                'std': [58.395, 57.12, 57.375],
                'to_rgb': True,
                'type': 'NormalizeMultiviewImage',
            },
            {
                'flip': False,
                'img_scale': (800, 450),
                'pts_scale_ratio': [1.0],
                'transforms': [
                    {
                        'scales': [0.5],
                        'type': 'RandomScaleImageMultiViewImage',
                    },
                    {'size_divisor': 32, 'type': 'PadMultiViewImage'},
                    {
                        'class_names': [
                            'car', 'truck', 'construction_vehicle', 'bus',
                            'trailer', 'barrier', 'motorcycle', 'bicycle',
                            'pedestrian', 'traffic_cone',
                        ],
                        'type': 'CustomDefaultFormatBundle3D',
                    },
                    {'keys': ['img'], 'type': 'CustomCollect3D'},
                ],
                'type': 'MultiScaleFlipAug3D',
            },
        ],
        'samples_per_gpu': 1,
        'test_mode': True,
        'type': 'CustomNuScenesDataset',
    },
    'num_workers': 0,
    # NOTE(review): shuffling is enabled for validation; confirm this is
    # intentional.
    'sampler': {'shuffle': True, 'type': 'DefaultSampler'},
}
# Validation evaluator: one 'src.NuScenesMetric' entry scoring against the
# nuScenes v1.0-mini temporal validation annotations (camera-only modality).
val_evaluator = {
    'metrics': [
        {
            'ann_file':
            'data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
            'classes': [
                'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                'traffic_cone',
            ],
            'data_root': 'data/nuscenes/v1.0-mini/',
            'jsonfile_prefix': 'results',
            'modality': {
                'use_camera': True,
                'use_external': False,
                'use_lidar': False,
                'use_map': False,
                'use_radar': False,
            },
            'plot_every_run': True,
            'plot_examples': 1,
            'type': 'src.NuScenesMetric',
            'version': 'v1.0-mini',
        },
    ],
}
# Validation schedule scalars and the dataset version tag.
# NOTE(review): val_max_iters equals val_cfg's max_iters (both 1); presumably
# val_cfg was derived from it before this config was flattened — confirm.
val_interval = 1
val_max_iters = 1
# Same string as the `version` passed to the metric in val_evaluator.
version = 'v1.0-mini'
# Visualizer with two backends: local storage and TensorBoard.
visualizer = {
    'type': 'Visualizer',
    'vis_backends': [
        {'type': 'LocalVisBackend'},
        {'type': 'TensorboardVisBackend'},
    ],
}
# Three-component voxel size; the last entry is an int, not a float.
# NOTE(review): presumably ordered (x, y, z) — confirm against the consumer.
voxel_size = [0.2, 0.2, 8]
# Relative output directory name.
# NOTE(review): presumably where the runner writes logs and checkpoints — confirm.
work_dir = 'experiment'