debug/20260314_223104/logs.log
2026/03/14 22:31:06 - bevformer - INFO -
------------------------------------------------------------
System environment:
sys.platform: darwin
Python: 3.10.0 (default, Oct 17 2021, 11:56:26) [Clang 13.0.0 ]
CUDA available: False
numpy_random_seed: 321
GCC: Apple clang version 15.0.0 (clang-1500.3.9.4)
PyTorch: 2.10.0
PyTorch compiling details: PyTorch built with:
- GCC 4.2
- C++ Version: 201703
- clang 15.0.0
- OpenMP 201811
- LAPACK is enabled (usually provided by MKL)
- NNPACK is enabled
- CPU capability usage: DEFAULT
- Build settings: BLAS_INFO=accelerate, BUILD_TYPE=Release, COMMIT_SHA=449b1768410104d3ed79d3bcfe4ba1d65c7f22c0, CXX_COMPILER=/usr/bin/c++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_PYTORCH_QNNPACK -DAT_BUILD_ARM_VEC256_WITH_SLEEF -DUSE_XNNPACK -DUSE_PYTORCH_METAL_EXPORT -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DUSE_COREML_DELEGATE -O2 -fPIC -DC10_NODEPRECATED -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=braced-scalar-init -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wvla-extension -Wsuggest-override -Wnewline-eof -Winconsistent-missing-override -Winconsistent-missing-destructor-override -Wno-pass-failed -Wno-error=old-style-cast -Wconstant-conversion -Qunused-arguments -faligned-new -fno-math-errno -fno-trapping-math -Werror=format -DUSE_MPS -Wno-missing-braces, LAPACK_INFO=accelerate, TORCH_VERSION=2.10.0, USE_CUDA=OFF, USE_CUDNN=OFF, USE_CUSPARSELT=OFF, USE_EIGEN_FOR_BLAS=ON, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=OFF, USE_MKLDNN=OFF, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, USE_XCCL=OFF, USE_XPU=OFF,
TorchVision: 0.25.0
OpenCV: 4.13.0
MMEngine: 0.8.5
Runtime environment:
dist_cfg: {'backend': 'nccl'}
seed: 321
Distributed launcher: none
Distributed training: False
GPU number: 1
------------------------------------------------------------
2026/03/14 22:31:06 - bevformer - INFO - Set random seed to 321, deterministic: False
2026/03/14 22:31:06 - bevformer - INFO - Building model
2026/03/14 22:31:07 - bevformer - INFO - Model architecture:
BEVFormerDetector(
(data_preprocessor): BaseDataPreprocessor()
(pts_bbox_head): BEVFormerHead(
(loss_cls): FocalLoss()
(loss_bbox): L1Loss()
(loss_iou): GIoULoss()
(transformer): PerceptionTransformer(
(encoder): BEVFormerEncoder(
(layers): ModuleList(
(0-2): 3 x BEVFormerLayer(
(attentions): ModuleList(
(0): TemporalSelfAttention(
(sampling_offsets): Linear(in_features=512, out_features=128, bias=True)
(attention_weights): Linear(in_features=512, out_features=64, bias=True)
(value_proj): Linear(in_features=256, out_features=256, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(1): SpatialCrossAttention(
(deformable_attention): MSDeformableAttention3D(
(sampling_offsets): Linear(in_features=256, out_features=128, bias=True)
(attention_weights): Linear(in_features=256, out_features=64, bias=True)
(value_proj): Linear(in_features=256, out_features=256, bias=True)
)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0-2): 3 x LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(decoder): DetectionTransformerDecoder(
(layers): ModuleList(
(0-5): 6 x DetrTransformerDecoderLayer(
(attentions): ModuleList(
(0): MultiheadAttention(
(q_proj): Linear(in_features=256, out_features=256, bias=True)
(k_proj): Linear(in_features=256, out_features=256, bias=True)
(v_proj): Linear(in_features=256, out_features=256, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(1): CustomMSDeformableAttention(
(sampling_offsets): Linear(in_features=256, out_features=64, bias=True)
(attention_weights): Linear(in_features=256, out_features=32, bias=True)
(value_proj): Linear(in_features=256, out_features=256, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(ffns): ModuleList(
(0): FFN(
(layers): Sequential(
(0): Sequential(
(0): Linear(in_features=256, out_features=512, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.1, inplace=False)
)
(1): Linear(in_features=512, out_features=256, bias=True)
(2): Dropout(p=0.1, inplace=False)
)
(dropout_layer): Identity()
(gamma2): Identity()
)
)
(norms): ModuleList(
(0-2): 3 x LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(reference_points): Linear(in_features=256, out_features=3, bias=True)
(can_bus_mlp): Sequential(
(0): Linear(in_features=18, out_features=128, bias=True)
(1): ReLU(inplace=True)
(2): Linear(in_features=128, out_features=256, bias=True)
(3): ReLU(inplace=True)
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(cls_branches): ModuleList(
(0-5): 6 x Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(2): ReLU(inplace=True)
(3): Linear(in_features=256, out_features=10, bias=True)
)
)
(reg_branches): ModuleList(
(0-5): 6 x Sequential(
(0): Linear(in_features=256, out_features=256, bias=True)
(1): ReLU()
(2): Linear(in_features=256, out_features=256, bias=True)
(3): ReLU()
(4): Linear(in_features=256, out_features=10, bias=True)
)
)
(bev_embedding): Embedding(2500, 256)
(object_query_embedding): Embedding(900, 512)
(positional_encoding): LearnedPositionalEncoding(num_feats=128, row_num_embed=50, col_num_embed=50)
)
(img_backbone): ResNet(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer2): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(3): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer3): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(3): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(4): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(5): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
(layer4): ResLayer(
(0): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(1): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
(2): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
init_cfg={'type': 'Constant', 'val': 0, 'override': {'name': 'norm3'}}
)
)
init_cfg=[{'type': 'Kaiming', 'layer': 'Conv2d'}, {'type': 'Constant', 'val': 1, 'layer': ['_BatchNorm', 'GroupNorm']}]
(img_neck): FPN(
(lateral_convs): ModuleList(
(0): ConvModule(
(conv): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
)
)
(fpn_convs): ModuleList(
(0): ConvModule(
(conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
init_cfg={'type': 'Xavier', 'layer': 'Conv2d', 'distribution': 'uniform'}
(grid_mask): GridMask()
)
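The Linear shapes in the architecture dump above determine parameter counts directly. As a quick sanity check (a standalone sketch, not part of the training code), a `Linear(in_features, out_features, bias=True)` layer holds `in_features * out_features + out_features` parameters:

```python
def linear_params(in_features: int, out_features: int, bias: bool = True) -> int:
    """Parameter count of a fully connected layer as printed in the dump."""
    return in_features * out_features + (out_features if bias else 0)

# sampling_offsets in TemporalSelfAttention: Linear(512 -> 128, bias=True)
print(linear_params(512, 128))  # 65664
# value_proj: Linear(256 -> 256, bias=True)
print(linear_params(256, 256))  # 65792
```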
2026/03/14 22:31:07 - bevformer - INFO - Wrapping model
2026/03/14 22:31:07 - bevformer - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.
2026/03/14 22:31:07 - bevformer - INFO - Hooks will be executed in the following order:
before_run:
(VERY_HIGH ) RuntimeInfoHook
(BELOW_NORMAL) LoggerHook
(VERY_LOW ) CheckpointHookV2
(VERY_LOW ) CheckpointUploader
(VERY_LOW ) CheckpointResumer
--------------------
before_train:
(VERY_HIGH ) RuntimeInfoHook
(NORMAL ) IterTimerHook
(VERY_LOW ) CheckpointHookV2
(VERY_LOW ) CheckpointUploader
(VERY_LOW ) CheckpointResumer
--------------------
before_train_epoch:
(VERY_HIGH ) RuntimeInfoHook
(NORMAL ) IterTimerHook
(NORMAL ) DistSamplerSeedHook
--------------------
before_train_iter:
(VERY_HIGH ) RuntimeInfoHook
(NORMAL ) IterTimerHook
--------------------
after_train_iter:
(VERY_HIGH ) RuntimeInfoHook
(NORMAL ) IterTimerHook
(BELOW_NORMAL) LoggerHook
(LOW ) ParamSchedulerHook
(VERY_LOW ) CheckpointHookV2
(VERY_LOW ) CheckpointUploader
--------------------
after_train_epoch:
(NORMAL ) IterTimerHook
(LOW ) ParamSchedulerHook
(VERY_LOW ) CheckpointHookV2
(VERY_LOW ) CheckpointUploader
--------------------
before_val:
(VERY_HIGH ) RuntimeInfoHook
(VERY_LOW ) CheckpointHookV2
(VERY_LOW ) CheckpointResumer
--------------------
before_val_epoch:
(NORMAL ) IterTimerHook
--------------------
before_val_iter:
(NORMAL ) IterTimerHook
--------------------
after_val_iter:
(NORMAL ) IterTimerHook
(BELOW_NORMAL) LoggerHook
--------------------
after_val_epoch:
(VERY_HIGH ) RuntimeInfoHook
(NORMAL ) IterTimerHook
(BELOW_NORMAL) LoggerHook
(LOW ) ParamSchedulerHook
(VERY_LOW ) CheckpointHookV2
(VERY_LOW ) CheckpointUploader
--------------------
after_val:
(VERY_HIGH ) RuntimeInfoHook
--------------------
after_train:
(VERY_HIGH ) RuntimeInfoHook
(VERY_LOW ) CheckpointHookV2
(VERY_LOW ) CheckpointUploader
--------------------
before_test:
(VERY_HIGH ) RuntimeInfoHook
--------------------
before_test_epoch:
(NORMAL ) IterTimerHook
--------------------
before_test_iter:
(NORMAL ) IterTimerHook
--------------------
after_test_iter:
(NORMAL ) IterTimerHook
(BELOW_NORMAL) LoggerHook
--------------------
after_test_epoch:
(VERY_HIGH ) RuntimeInfoHook
(NORMAL ) IterTimerHook
(BELOW_NORMAL) LoggerHook
--------------------
after_test:
(VERY_HIGH ) RuntimeInfoHook
--------------------
after_run:
(BELOW_NORMAL) LoggerHook
--------------------
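The hook table above is ordered by priority, with higher-priority hooks running first at each stage. A minimal re-implementation of that ordering (the numeric values are illustrative assumptions modeled on MMEngine's `Priority` enum, where a lower number means earlier execution):

```python
# Illustrative priority values (lower number = earlier execution); the
# exact numbers are an assumption, not MMEngine's actual constants.
PRIORITY = {'VERY_HIGH': 10, 'NORMAL': 50, 'BELOW_NORMAL': 60,
            'LOW': 70, 'VERY_LOW': 90}

hooks = [
    ('CheckpointHookV2', 'VERY_LOW'),
    ('RuntimeInfoHook', 'VERY_HIGH'),
    ('LoggerHook', 'BELOW_NORMAL'),
    ('IterTimerHook', 'NORMAL'),
    ('ParamSchedulerHook', 'LOW'),
]

# A stable sort by priority reproduces the after_train_iter order in the log.
ordered = [name for name, p in sorted(hooks, key=lambda h: PRIORITY[h[1]])]
print(ordered)
```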
2026/03/14 22:31:08 - bevformer - INFO - Config:
_dim_ = 256
_ffn_dim_ = 512
_num_levels_ = 1
_pos_dim_ = 128
auto_scale_lr = dict(base_batch_size=16, enable=False)
bev_h_ = 50
bev_w_ = 50
by_epoch = False
class_names = [
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
]
custom_hooks = [
dict(
by_epoch=False,
clean_local=False,
interval=5,
repo_id='5421Project',
type='CheckpointUploader'),
dict(repo_id='5421Project', resume_type='last', type='CheckpointResumer'),
]
data = dict(
nonshuffler_sampler=dict(type='DistributedSampler'),
samples_per_gpu=1,
shuffler_sampler=dict(type='DistributedGroupSampler'),
test=dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
bev_size=(
50,
50,
),
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
data_root='data/nuscenes/v1.0-mini/',
frame=[
-3,
-2,
-1,
],
modality=dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False),
pipeline=[
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(
flip=False,
img_scale=(
800,
450,
),
pts_scale_ratio=[
1.0,
],
transforms=[
dict(
scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(keys=[
'img',
], type='CustomCollect3D'),
],
type='MultiScaleFlipAug3D'),
],
test_mode=True,
type='CustomNuScenesDataset'),
train=dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
bev_size=(
50,
50,
),
box_type_3d='LiDAR',
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
data_root='data/nuscenes/v1.0-mini/',
modality=dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False),
pipeline=[
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True),
dict(
point_cloud_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
type='ObjectRangeFilter'),
dict(
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='ObjectNameFilter'),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(
keys=[
'gt_bboxes_3d',
'gt_labels_3d',
'img',
],
type='CustomCollect3D'),
dict(type='TypeConverter'),
],
queue_length=4,
test_mode=False,
type='CustomNuScenesDataset',
use_valid_flag=True),
val=dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
bev_size=(
50,
50,
),
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
data_root='data/nuscenes/v1.0-mini/',
frame=(),
frames=[
-3,
-2,
-1,
],
modality=dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False),
pipeline=[
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(
flip=False,
img_scale=(
800,
450,
),
pts_scale_ratio=[
1.0,
],
transforms=[
dict(
scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(keys=[
'img',
], type='CustomCollect3D'),
],
type='MultiScaleFlipAug3D'),
],
samples_per_gpu=1,
test_mode=True,
type='CustomNuScenesDataset'),
workers_per_gpu=4)
data_root = 'data/nuscenes/v1.0-mini/'
dataset_type = 'CustomNuScenesDataset'
decoder = dict(
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
attn_cfgs=[
dict(
dropout=0.1,
embed_dims=256,
num_heads=8,
type='MultiheadAttention'),
dict(
embed_dims=256,
num_levels=1,
type='CustomMSDeformableAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='DetrTransformerDecoderLayer'),
type='DetectionTransformerDecoder')
default_hooks = dict(
checkpoint=dict(
by_epoch=False,
interval=5,
max_keep_ckpts=1,
save_best=[
'loss',
'mAP',
'NDS',
],
type='CheckpointHookV2'),
logger=dict(
interval=2,
interval_exp_name=1000,
log_metric_by_epoch=False,
type='LoggerHook'),
param_scheduler=dict(type='ParamSchedulerHook'),
runtime_info=dict(type='RuntimeInfoHook'),
sampler_seed=dict(type='DistSamplerSeedHook'),
timer=dict(type='IterTimerHook'))
encoder = dict(
num_layers=3,
num_points_in_pillar=8,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
return_intermediate=False,
transformerlayers=dict(
attn_cfgs=[
dict(embed_dims=256, num_levels=1, type='TemporalSelfAttention'),
dict(
deformable_attention=dict(
embed_dims=256,
num_levels=1,
num_points=8,
type='MSDeformableAttention3D'),
embed_dims=256,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
type='SpatialCrossAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='BEVFormerLayer'),
type='BEVFormerEncoder')
env_cfg = dict(dist_cfg=dict(backend='nccl'))
experiment_name = 'debug'
file_client_args = dict(backend='disk')
frames = [
-3,
-2,
-1,
]
gpu_ids = range(0, 1)
img_norm_cfg = dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True)
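The `img_norm_cfg` values above are the standard ImageNet RGB statistics. For reference, `NormalizeMultiviewImage` applies per-channel `(x - mean) / std`; the helper below is an illustrative sketch of that arithmetic, not the pipeline's actual implementation:

```python
def normalize_pixel(rgb,
                    mean=(123.675, 116.28, 103.53),
                    std=(58.395, 57.12, 57.375)):
    """Per-channel (x - mean) / std, matching img_norm_cfg with to_rgb=True."""
    return [(x - m) / s for x, m, s in zip(rgb, mean, std)]

# A pixel exactly at the channel means lands at zero after normalization.
print(normalize_pixel((123.675, 116.28, 103.53)))
```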
input_modality = dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False)
interval = 5
launcher = 'none'
load_from = None
log_interval = 2
log_processor = dict(window_size=20)
lr_config = dict(
min_lr_ratio=0.001,
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.3333333333333333)
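The `lr_config` above describes a linear warmup followed by cosine annealing. A self-contained sketch of that schedule using the config's numbers (this is an illustration of the formula, not mmengine's scheduler; `total_iters` is a hypothetical value, since this debug run caps `max_iters` at 10, well inside the 500-iteration warmup):

```python
import math

def lr_at(it, base_lr=1e-4, warmup_iters=500, warmup_ratio=1/3,
          min_lr_ratio=0.001, total_iters=2000):
    """Linear warmup to base_lr, then cosine decay to base_lr * min_lr_ratio."""
    if it < warmup_iters:
        # ramp linearly from base_lr * warmup_ratio up to base_lr
        frac = it / warmup_iters
        return base_lr * (warmup_ratio + (1 - warmup_ratio) * frac)
    min_lr = base_lr * min_lr_ratio
    progress = (it - warmup_iters) / (total_iters - warmup_iters)
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * progress))

print(lr_at(0))      # starts at base_lr * warmup_ratio
print(lr_at(500))    # reaches base_lr at the end of warmup
print(lr_at(2000))   # decays to base_lr * min_lr_ratio
```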
max_epochs = 5
max_iters = 10
model = dict(
img_backbone=dict(
depth=50,
frozen_stages=1,
norm_cfg=dict(requires_grad=False, type='BN'),
norm_eval=True,
num_stages=4,
out_indices=(3, ),
style='pytorch',
type='ResNet'),
img_neck=dict(
add_extra_convs='on_output',
in_channels=[
2048,
],
num_outs=1,
out_channels=256,
relu_before_extra_convs=True,
start_level=0,
type='FPN'),
pretrained=dict(img='torchvision://resnet50'),
pts_bbox_head=dict(
as_two_stage=False,
bbox_coder=dict(
max_num=300,
num_classes=10,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
post_center_range=[
-61.2,
-61.2,
-10.0,
61.2,
61.2,
10.0,
],
type='NMSFreeCoder',
voxel_size=[
0.2,
0.2,
8,
]),
bev_h=50,
bev_w=50,
in_channels=256,
loss_bbox=dict(loss_weight=0.5, type='L1Loss'),
loss_cls=dict(
alpha=0.25,
gamma=2.0,
loss_weight=2.0,
type='FocalLoss',
use_sigmoid=True),
loss_iou=dict(loss_weight=0.25, type='GIoULoss'),
num_classes=10,
num_query=900,
positional_encoding=dict(
col_num_embed=50,
num_feats=128,
row_num_embed=50,
type='LearnedPositionalEncoding'),
sync_cls_avg_factor=True,
transformer=dict(
decoder=dict(
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
attn_cfgs=[
dict(
dropout=0.1,
embed_dims=256,
num_heads=8,
type='MultiheadAttention'),
dict(
embed_dims=256,
num_levels=1,
type='CustomMSDeformableAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512,
ffn_drop=0.1,
num_fcs=2,
type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='DetrTransformerDecoderLayer'),
type='DetectionTransformerDecoder'),
embed_dims=256,
encoder=dict(
num_layers=3,
num_points_in_pillar=8,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
return_intermediate=False,
transformerlayers=dict(
attn_cfgs=[
dict(
embed_dims=256,
num_levels=1,
type='TemporalSelfAttention'),
dict(
deformable_attention=dict(
embed_dims=256,
num_levels=1,
num_points=8,
type='MSDeformableAttention3D'),
embed_dims=256,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
type='SpatialCrossAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512,
ffn_drop=0.1,
num_fcs=2,
type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='BEVFormerLayer'),
type='BEVFormerEncoder'),
num_cams=6,
num_feature_levels=1,
rotate_prev_bev=True,
type='PerceptionTransformer',
use_can_bus=True,
use_shift=True),
type='BEVFormerHead',
with_box_refine=True),
train_cfg=dict(
pts=dict(
assigner=dict(
cls_cost=dict(type='FocalCost', weight=2.0),
iou_cost=dict(type='SmoothL1Cost', weight=0.25),
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
type='HungarianAssigner3D'),
grid_size=[
512,
512,
1,
],
out_size_factor=4,
point_cloud_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
voxel_size=[
0.2,
0.2,
8,
])),
type='BEVFormerDetector',
use_grid_mask=True,
video_test_mode=True)
optim_wrapper = dict(
optimizer=dict(lr=0.0001, type='AdamW', weight_decay=0.01),
type='OptimWrapper')
optimizer = dict(lr=0.0001, type='AdamW', weight_decay=0.01)
param_scheduler = dict(
milestones=[
1,
2,
], type='MultiStepLR')
point_cloud_range = [
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
]
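The `point_cloud_range` above, together with `bev_h = bev_w = 50`, fixes the metric resolution of the BEV grid. A quick check of that arithmetic (a standalone sketch, not part of the config):

```python
# [x_min, y_min, z_min, x_max, y_max, z_max] from point_cloud_range
pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
bev_h, bev_w = 50, 50

cell_x = (pc_range[3] - pc_range[0]) / bev_w   # meters per BEV cell in x
cell_y = (pc_range[4] - pc_range[1]) / bev_h   # meters per BEV cell in y
pillar_height = pc_range[5] - pc_range[2]      # full z extent of one pillar

# grid_size=512 with voxel_size=0.2 spans the same 102.4 m extent.
print(cell_x, cell_y, pillar_height, 512 * 0.2)
```
Each of the 2500 BEV queries thus covers roughly a 2.048 m x 2.048 m ground patch, and the 8 m z extent matches the voxel_size z component of 8.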
pts_bbox_head = dict(
as_two_stage=False,
bbox_coder=dict(
max_num=300,
num_classes=10,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
post_center_range=[
-61.2,
-61.2,
-10.0,
61.2,
61.2,
10.0,
],
type='NMSFreeCoder',
voxel_size=[
0.2,
0.2,
8,
]),
bev_h=50,
bev_w=50,
in_channels=256,
loss_bbox=dict(loss_weight=0.5, type='L1Loss'),
loss_cls=dict(
alpha=0.25,
gamma=2.0,
loss_weight=2.0,
type='FocalLoss',
use_sigmoid=True),
loss_iou=dict(loss_weight=0.25, type='GIoULoss'),
num_classes=10,
num_query=900,
positional_encoding=dict(
col_num_embed=50,
num_feats=128,
row_num_embed=50,
type='LearnedPositionalEncoding'),
sync_cls_avg_factor=True,
transformer=dict(
decoder=dict(
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
attn_cfgs=[
dict(
dropout=0.1,
embed_dims=256,
num_heads=8,
type='MultiheadAttention'),
dict(
embed_dims=256,
num_levels=1,
type='CustomMSDeformableAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512,
ffn_drop=0.1,
num_fcs=2,
type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='DetrTransformerDecoderLayer'),
type='DetectionTransformerDecoder'),
embed_dims=256,
encoder=dict(
num_layers=3,
num_points_in_pillar=8,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
return_intermediate=False,
transformerlayers=dict(
attn_cfgs=[
dict(
embed_dims=256,
num_levels=1,
type='TemporalSelfAttention'),
dict(
deformable_attention=dict(
embed_dims=256,
num_levels=1,
num_points=8,
type='MSDeformableAttention3D'),
embed_dims=256,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
type='SpatialCrossAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512,
ffn_drop=0.1,
num_fcs=2,
type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='BEVFormerLayer'),
type='BEVFormerEncoder'),
num_cams=6,
num_feature_levels=1,
rotate_prev_bev=True,
type='PerceptionTransformer',
use_can_bus=True,
use_shift=True),
type='BEVFormerHead',
with_box_refine=True)
queue_length = 4
resume = False
scales = [
0.5,
]
test_cfg = dict(max_iters=1)
test_dataloader = dict(
batch_size=1,
collate_fn=dict(type='test_collate'),
dataset=dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
bev_size=(
50,
50,
),
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
data_root='data/nuscenes/v1.0-mini/',
frame=[
-3,
-2,
-1,
],
modality=dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False),
pipeline=[
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(
flip=False,
img_scale=(
800,
450,
),
pts_scale_ratio=[
1.0,
],
transforms=[
dict(
scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(keys=[
'img',
], type='CustomCollect3D'),
],
type='MultiScaleFlipAug3D'),
],
test_mode=True,
type='CustomNuScenesDataset'),
num_workers=0,
sampler=dict(shuffle=True, type='DefaultSampler'))
test_evaluator = dict(metrics=[
dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
data_root='data/nuscenes/v1.0-mini/',
type='src.NuScenesMetric',
version='v1.0-mini'),
])
test_max_iters = 1
test_pipeline = [
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(
flip=False,
img_scale=(
800,
450,
),
pts_scale_ratio=[
1.0,
],
transforms=[
dict(scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(keys=[
'img',
], type='CustomCollect3D'),
],
type='MultiScaleFlipAug3D'),
]
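The test pipeline scales the 800x450 input by 0.5 and then pads each dimension to a multiple of 32. The resulting shape can be checked with pure arithmetic (assuming the scale op truncates to int; the exact rounding inside `RandomScaleImageMultiViewImage` may differ):

```python
import math

def pad_to_divisor(size, divisor=32):
    """Round a dimension up to the next multiple of `divisor`,
    mirroring PadMultiViewImage(size_divisor=32)."""
    return math.ceil(size / divisor) * divisor

w, h = 800, 450
scale = 0.5
sw, sh = int(w * scale), int(h * scale)            # 400 x 225 after scaling
padded = (pad_to_divisor(sw), pad_to_divisor(sh))  # (416, 256) after padding
```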
train_cfg = dict(by_epoch=False, max_epochs=5, max_iters=10, val_interval=5)
train_dataloader = dict(
batch_size=1,
collate_fn=dict(type='train_collate'),
dataset=dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
bev_size=(
50,
50,
),
box_type_3d='LiDAR',
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
data_root='data/nuscenes/v1.0-mini/',
modality=dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False),
pipeline=[
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True),
dict(
point_cloud_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
type='ObjectRangeFilter'),
dict(
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='ObjectNameFilter'),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(
keys=[
'gt_bboxes_3d',
'gt_labels_3d',
'img',
],
type='CustomCollect3D'),
dict(type='TypeConverter'),
],
queue_length=4,
test_mode=False,
type='CustomNuScenesDataset',
use_valid_flag=True),
num_workers=0,
sampler=dict(shuffle=True, type='DefaultSampler'))
train_pipeline = [
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
point_cloud_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
type='ObjectRangeFilter'),
dict(
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='ObjectNameFilter'),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(
keys=[
'gt_bboxes_3d',
'gt_labels_3d',
'img',
], type='CustomCollect3D'),
dict(type='TypeConverter'),
]
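`ObjectRangeFilter` in the train pipeline drops ground-truth boxes that fall outside `point_cloud_range`. A simplified sketch of that behavior on plain tuples — the real transform operates on box tensors in BEV (x/y) coordinates and also clamps yaw:

```python
def object_range_filter(boxes, labels, pc_range):
    """Keep (box, label) pairs whose box center (x, y) lies inside pc_range.

    boxes: list of (x, y, z, ...) tuples with the center first.
    pc_range: [x_min, y_min, z_min, x_max, y_max, z_max].
    """
    x_min, y_min = pc_range[0], pc_range[1]
    x_max, y_max = pc_range[3], pc_range[4]
    kept = [
        (box, label)
        for box, label in zip(boxes, labels)
        if x_min <= box[0] <= x_max and y_min <= box[1] <= y_max
    ]
    return [b for b, _ in kept], [l for _, l in kept]

# The second box sits at x=80 m, outside the 51.2 m range, and is dropped
# together with its label.
boxes, labels = object_range_filter(
    [(10.0, 5.0, 0.0), (80.0, 0.0, 0.0)], [0, 1],
    [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
)
```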
transformer = dict(
decoder=dict(
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
attn_cfgs=[
dict(
dropout=0.1,
embed_dims=256,
num_heads=8,
type='MultiheadAttention'),
dict(
embed_dims=256,
num_levels=1,
type='CustomMSDeformableAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='DetrTransformerDecoderLayer'),
type='DetectionTransformerDecoder'),
embed_dims=256,
encoder=dict(
num_layers=3,
num_points_in_pillar=8,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
return_intermediate=False,
transformerlayers=dict(
attn_cfgs=[
dict(
embed_dims=256, num_levels=1,
type='TemporalSelfAttention'),
dict(
deformable_attention=dict(
embed_dims=256,
num_levels=1,
num_points=8,
type='MSDeformableAttention3D'),
embed_dims=256,
pc_range=[
-51.2,
-51.2,
-5.0,
51.2,
51.2,
3.0,
],
type='SpatialCrossAttention'),
],
ffn_cfgs=dict(
feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
operation_order=(
'self_attn',
'norm',
'cross_attn',
'norm',
'ffn',
'norm',
),
type='BEVFormerLayer'),
type='BEVFormerEncoder'),
num_cams=6,
num_feature_levels=1,
rotate_prev_bev=True,
type='PerceptionTransformer',
use_can_bus=True,
use_shift=True)
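Both the encoder (`BEVFormerLayer`) and decoder (`DetrTransformerDecoderLayer`) are driven by the same `operation_order` tuple. A minimal sketch of the dispatch loop such a layer runs, with stub callables standing in for the attention, norm, and FFN modules (names are illustrative):

```python
def run_layer(x, ops,
              operation_order=('self_attn', 'norm', 'cross_attn',
                               'norm', 'ffn', 'norm')):
    """Apply the layer's sub-modules in the order the config declares.

    ops: dict mapping op name -> callable, except 'norm', which maps to a
    list consumed position by position (a layer holds one norm per 'norm'
    slot in operation_order).
    """
    norm_idx = 0
    for name in operation_order:
        if name == 'norm':
            x = ops['norm'][norm_idx](x)
            norm_idx += 1
        else:
            x = ops[name](x)
    return x

# Record the execution order with identity stubs.
trace = []
ops = {
    'self_attn': lambda x: (trace.append('self_attn'), x)[1],
    'cross_attn': lambda x: (trace.append('cross_attn'), x)[1],
    'ffn': lambda x: (trace.append('ffn'), x)[1],
    'norm': [lambda x: (trace.append('norm'), x)[1] for _ in range(3)],
}
run_layer(0, ops)
```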
val_cfg = dict(max_iters=1)
val_dataloader = dict(
batch_size=1,
collate_fn=dict(type='test_collate'),
dataset=dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
bev_size=(
50,
50,
),
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
data_root='data/nuscenes/v1.0-mini/',
frame=(),
frames=[
-3,
-2,
-1,
],
modality=dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False),
pipeline=[
dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
dict(
mean=[
123.675,
116.28,
103.53,
],
std=[
58.395,
57.12,
57.375,
],
to_rgb=True,
type='NormalizeMultiviewImage'),
dict(
flip=False,
img_scale=(
800,
450,
),
pts_scale_ratio=[
1.0,
],
transforms=[
dict(
scales=[
0.5,
], type='RandomScaleImageMultiViewImage'),
dict(size_divisor=32, type='PadMultiViewImage'),
dict(
class_names=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
type='CustomDefaultFormatBundle3D'),
dict(keys=[
'img',
], type='CustomCollect3D'),
],
type='MultiScaleFlipAug3D'),
],
samples_per_gpu=1,
test_mode=True,
type='CustomNuScenesDataset'),
num_workers=0,
sampler=dict(shuffle=True, type='DefaultSampler'))
val_evaluator = dict(metrics=[
dict(
ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
classes=[
'car',
'truck',
'construction_vehicle',
'bus',
'trailer',
'barrier',
'motorcycle',
'bicycle',
'pedestrian',
'traffic_cone',
],
data_root='data/nuscenes/v1.0-mini/',
jsonfile_prefix='results',
modality=dict(
use_camera=True,
use_external=False,
use_lidar=False,
use_map=False,
use_radar=False),
plot_every_run=True,
plot_examples=1,
type='src.NuScenesMetric',
version='v1.0-mini'),
])
val_interval = 5
val_max_iters = 1
version = 'v1.0-mini'
visualizer = dict(
type='Visualizer',
vis_backends=[
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
])
voxel_size = [
0.2,
0.2,
8,
]
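For orientation, the `pc_range` used throughout this config spans 102.4 m in x and y; with the coder's 0.2 m `voxel_size` that corresponds to a 512x512 reference grid, while the BEV queries here are only 50x50 (`bev_h` x `bev_w`), i.e. roughly 2.05 m of ground per BEV cell. A quick arithmetic check (the grid-size interpretation of `voxel_size` is an assumption about how the coder uses it):

```python
pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
bev_h = bev_w = 50

extent_x = pc_range[3] - pc_range[0]   # 102.4 m
extent_y = pc_range[4] - pc_range[1]   # 102.4 m

# round() guards against float noise in 102.4 / 0.2.
voxel_grid = (round(extent_x / voxel_size[0]),
              round(extent_y / voxel_size[1]))   # (512, 512)
bev_cell_m = extent_x / bev_w                    # ~2.048 m per BEV query cell
```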
work_dir = 'experiment'
2026/03/14 22:31:08 - bevformer - INFO - See full config in 'experiment/debug/bevformer_tiny_test.py'.
2026/03/14 22:31:10 - bevformer - INFO - Checkpoints will be saved to 'experiment/debug' after every 5 steps.
2026/03/14 22:31:10 - bevformer - INFO - Initialize best checkpoints by train phase.
2026/03/14 22:31:10 - bevformer - INFO - Set best path for 'loss' None.
2026/03/14 22:31:10 - bevformer - INFO - Set best path for 'mAP' None.
2026/03/14 22:31:10 - bevformer - INFO - Set best path for 'NDS' None.
2026/03/14 22:31:10 - bevformer - INFO - The best checkpoints will be saved to 'experiment/debug' based on ['loss', 'mAP', 'NDS'] with rules ['less', 'greater', 'greater'] after every 5 steps.
2026/03/14 22:31:10 - bevformer - INFO - Keep maximum 1 checkpoints in local.
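The best-checkpoint rules pair 'loss' with 'less' and 'mAP'/'NDS' with 'greater'. A minimal sketch of that comparison, using values from this log: at iter 10 the running loss (50.9854) does not beat the best so far (43.6422), so no new best-loss checkpoint is written, while the new NDS does beat its previous best:

```python
import operator

# 'less' means a smaller value wins; 'greater' means a larger one wins.
RULES = {'loss': operator.lt, 'mAP': operator.gt, 'NDS': operator.gt}

def is_new_best(metric, current, best):
    """True if `current` beats `best` under the metric's rule."""
    return RULES[metric](current, best)

loss_improved = is_new_best('loss', 50.985355377197266, 43.642234802246094)
nds_improved = is_new_best('NDS', 0.02213961767715449, 1.7806957170100794e-05)
```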
2026/03/14 22:31:13 - bevformer - INFO - Created '5421Project/debug' to save checkpoints (see 'https://huggingface.co/5421Project/debug').
2026/03/14 22:31:13 - bevformer - INFO - Checkpoints will be pushed to repo 'https://huggingface.co/5421Project/debug' after every 5 steps.
2026/03/14 22:31:55 - bevformer - INFO - Epoch(train) [1][ 2/323] lr: 1.0000e-04 eta: 0:02:49 time: 21.1329 data_time: 1.0395 loss: 54.7820 loss_cls: 2.2636 loss_bbox: 7.1453 d0.loss_cls: 2.2556 d0.loss_bbox: 6.7686 d1.loss_cls: 2.2362 d1.loss_bbox: 6.8347 d2.loss_cls: 2.1509 d2.loss_bbox: 6.7891 d3.loss_cls: 2.2262 d3.loss_bbox: 6.8406 d4.loss_cls: 2.2757 d4.loss_bbox: 6.9953
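With `with_box_refine=True` and six decoder layers, the head reports one classification and one bbox loss per layer (`d0`..`d4` plus the final layer), and the logged `loss` is simply their sum. Checking that against the per-term values of the first logged step:

```python
# Per-layer losses from the first logged training step above (iter 2):
# final layer first, then d0..d4.
loss_cls = [2.2636, 2.2556, 2.2362, 2.1509, 2.2262, 2.2757]
loss_bbox = [7.1453, 6.7686, 6.8347, 6.7891, 6.8406, 6.9953]

total = sum(loss_cls) + sum(loss_bbox)
# Matches the logged loss of 54.7820 up to the 4-decimal rounding of each term.
```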
2026/03/14 22:32:35 - bevformer - INFO - Epoch(train) [1][ 4/323] lr: 1.0000e-04 eta: 0:02:03 time: 20.5688 data_time: 1.0071 loss: 52.4912 loss_cls: 2.2311 loss_bbox: 6.6259 d0.loss_cls: 2.2306 d0.loss_bbox: 6.5213 d1.loss_cls: 2.2324 d1.loss_bbox: 6.5294 d2.loss_cls: 2.1107 d2.loss_bbox: 6.5639 d3.loss_cls: 2.1472 d3.loss_bbox: 6.5297 d4.loss_cls: 2.2651 d4.loss_bbox: 6.5040
2026/03/14 22:32:56 - bevformer - INFO - Saving checkpoint at 5 iterations
2026/03/14 22:32:57 - bevformer - INFO - Saving best checkpoints...
2026/03/14 22:32:57 - bevformer - INFO - Set 'best_score_loss' to +/-inf as it is not in message hub.
2026/03/14 22:32:57 - bevformer - INFO - [loss]: Best score: inf, current score: 43.642234802246094
2026/03/14 22:32:58 - bevformer - INFO - The best checkpoint with 43.6422 loss at 5 iter is saved to 'best_loss_iter_5.pth'.
2026/03/14 22:32:58 - bevformer - INFO - Pushing checkpoint at 5 steps...
2026/03/14 22:34:31 - bevformer - INFO - Pushed last checkpoint 'experiment/debug/iter_5.pth' to repo
2026/03/14 22:34:35 - bevformer - INFO - Pushed best checkpoint 'best_loss_iter_5.pth' of [loss]...
2026/03/14 22:34:49 - bevformer - INFO - Epoch(val) [0][81/81] NDS: 0.0000 mAP: 0.0000 data_time: 0.8768 time: 2.6421
2026/03/14 22:34:49 - bevformer - INFO - Save best checkpoints after val epoch.
2026/03/14 22:34:49 - bevformer - INFO - Saving best checkpoints...
2026/03/14 22:34:49 - bevformer - INFO - Set 'best_score_mAP' to +/-inf as it is not in message hub.
2026/03/14 22:34:49 - bevformer - INFO - [mAP]: Best score: -inf, current score: 3.561391434020159e-05
2026/03/14 22:34:50 - bevformer - INFO - The best checkpoint with 0.0000 mAP at 5 iter is saved to 'best_mAP_iter_5.pth'.
2026/03/14 22:34:50 - bevformer - INFO - Set 'best_score_NDS' to +/-inf as it is not in message hub.
2026/03/14 22:34:50 - bevformer - INFO - [NDS]: Best score: -inf, current score: 1.7806957170100794e-05
2026/03/14 22:34:51 - bevformer - INFO - The best checkpoint with 0.0000 NDS at 5 iter is saved to 'best_NDS_iter_5.pth'.
2026/03/14 22:34:51 - bevformer - INFO - Resaving checkpoint at 5 iter...
2026/03/14 22:34:55 - bevformer - INFO - Pushed best checkpoint 'best_mAP_iter_5.pth' of [mAP]...
2026/03/14 22:34:57 - bevformer - INFO - Pushed best checkpoint 'best_NDS_iter_5.pth' of [NDS]...
2026/03/14 22:35:20 - bevformer - INFO - Epoch(train) [1][ 6/323] lr: 1.0000e-04 eta: 0:01:36 time: 24.1120 data_time: 4.3466 loss: 51.6401 loss_cls: 2.1800 loss_bbox: 6.4342 d0.loss_cls: 2.1850 d0.loss_bbox: 6.4393 d1.loss_cls: 2.2004 d1.loss_bbox: 6.4757 d2.loss_cls: 2.0870 d2.loss_bbox: 6.4939 d3.loss_cls: 2.1014 d3.loss_bbox: 6.4365 d4.loss_cls: 2.2068 d4.loss_bbox: 6.3998
2026/03/14 22:36:00 - bevformer - INFO - Epoch(train) [1][ 8/323] lr: 1.0000e-04 eta: 0:00:46 time: 23.1149 data_time: 3.4998 loss: 49.8073 loss_cls: 2.1378 loss_bbox: 6.1821 d0.loss_cls: 2.1482 d0.loss_bbox: 6.1880 d1.loss_cls: 2.1749 d1.loss_bbox: 6.1841 d2.loss_cls: 2.0729 d2.loss_bbox: 6.2197 d3.loss_cls: 2.0748 d3.loss_bbox: 6.1526 d4.loss_cls: 2.1567 d4.loss_bbox: 6.1156
2026/03/14 22:36:42 - bevformer - INFO - Epoch(train) [1][ 10/323] lr: 1.0000e-04 eta: 0:00:00 time: 22.7019 data_time: 2.9867 loss: 50.4065 loss_cls: 2.1042 loss_bbox: 6.3351 d0.loss_cls: 2.1262 d0.loss_bbox: 6.3189 d1.loss_cls: 2.1572 d1.loss_bbox: 6.2772 d2.loss_cls: 2.0582 d2.loss_bbox: 6.3471 d3.loss_cls: 2.0630 d3.loss_bbox: 6.2461 d4.loss_cls: 2.1094 d4.loss_bbox: 6.2639
2026/03/14 22:36:42 - bevformer - INFO - Saving checkpoint at 10 iterations
2026/03/14 22:36:44 - bevformer - INFO - Saving best checkpoints...
2026/03/14 22:36:44 - bevformer - INFO - Got best score ['loss'] from message hub
2026/03/14 22:36:44 - bevformer - INFO - [loss]: Best score: 43.642234802246094, current score: 50.985355377197266
2026/03/14 22:36:44 - bevformer - INFO - Pushing checkpoint at 10 steps...
2026/03/14 22:37:18 - bevformer - INFO - Pushed last checkpoint 'experiment/debug/iter_10.pth' to repo
2026/03/14 22:37:19 - bevformer - INFO - Removed 'iter_5.pth' from repo
2026/03/14 22:37:32 - bevformer - INFO - Epoch(val) [0][81/81] NDS: 0.0221 mAP: 0.0000 data_time: 0.7784 time: 2.4680
2026/03/14 22:37:32 - bevformer - INFO - Save best checkpoints after val epoch.
2026/03/14 22:37:32 - bevformer - INFO - Saving best checkpoints...
2026/03/14 22:37:32 - bevformer - INFO - Got best score ['mAP'] from message hub
2026/03/14 22:37:32 - bevformer - INFO - [mAP]: Best score: 3.561391434020159e-05, current score: 7.014590347923641e-07
2026/03/14 22:37:32 - bevformer - INFO - Got best score ['NDS'] from message hub
2026/03/14 22:37:32 - bevformer - INFO - [NDS]: Best score: 1.7806957170100794e-05, current score: 0.02213961767715449
2026/03/14 22:37:32 - bevformer - INFO - The previous best checkpoint 'experiment/debug/best_NDS_iter_5.pth' is removed
2026/03/14 22:37:33 - bevformer - INFO - The best checkpoint with 0.0221 NDS at 10 iter is saved to 'best_NDS_iter_10.pth'.
2026/03/14 22:37:33 - bevformer - INFO - Resaving checkpoint at 10 iter...
2026/03/14 22:37:36 - bevformer - INFO - Pushed best checkpoint 'best_NDS_iter_10.pth' of [NDS]...
2026/03/14 22:37:36 - bevformer - INFO - Removed 'best_NDS_iter_5.pth' from repo
2026/03/14 22:37:37 - bevformer - INFO - Pushing visualizing data and safetensors to repo after training...