nguyenminh4099 commited on
Commit
e6a866d
·
verified ·
1 Parent(s): 7e34541

Upload folder using huggingface_hub

Browse files
20260315_230250/20260315_230250.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"lr": 0.0001, "data_time": 0.954918372631073, "loss": 44.09534311294556, "loss_cls": 1.1228834331035613, "loss_bbox": 6.337317788600922, "d0.loss_cls": 1.108318066596985, "d0.loss_bbox": 6.259250831604004, "d1.loss_cls": 1.111513575911522, "d1.loss_bbox": 6.144867360591888, "d2.loss_cls": 1.1080957710742951, "d2.loss_bbox": 6.104123330116272, "d3.loss_cls": 1.1203179508447647, "d3.loss_bbox": 6.3368358492851256, "d4.loss_cls": 1.1214783400297166, "d4.loss_bbox": 6.220340991020203, "time": 21.755193567276002, "epoch": 1, "iter": 650, "step": 650}
2
+ {"lr": 0.0001, "data_time": 0.9646973729133606, "loss": 51.661175918579104, "loss_cls": 1.1298476964235307, "loss_bbox": 7.4403127312660216, "d0.loss_cls": 1.085216408967972, "d0.loss_bbox": 7.6048126220703125, "d1.loss_cls": 1.1213543117046356, "d1.loss_bbox": 7.538022649288178, "d2.loss_cls": 1.1103688687086106, "d2.loss_bbox": 7.412710273265839, "d3.loss_cls": 1.1244247049093246, "d3.loss_bbox": 7.577581775188446, "d4.loss_cls": 1.126863345503807, "d4.loss_bbox": 7.389661097526551, "time": 22.624796402454376, "epoch": 1, "iter": 700, "step": 700}
3
+ {"lr": 0.0001, "data_time": 0.9243666768074036, "loss": 38.63754949569702, "loss_cls": 1.0912210047245026, "loss_bbox": 5.242098152637482, "d0.loss_cls": 1.0618977814912796, "d0.loss_bbox": 5.441217350959778, "d1.loss_cls": 1.0883103013038635, "d1.loss_bbox": 5.398028314113617, "d2.loss_cls": 1.0953371733427049, "d2.loss_bbox": 5.400306010246277, "d3.loss_cls": 1.0949742108583451, "d3.loss_bbox": 5.344762396812439, "d4.loss_cls": 1.0946346282958985, "d4.loss_bbox": 5.284762263298035, "time": 23.798150968551635, "epoch": 1, "iter": 750, "step": 750}
4
+ {"NDS": 0.024444029379540926, "mAP": 0.0003303450412910843, "data_time": 0.5486190546126593, "time": 2.0810610907418385, "step": 750}
5
+ {"lr": 0.0001, "data_time": 0.9359340071678162, "loss": 40.09088401794433, "loss_cls": 1.189671701192856, "loss_bbox": 5.4612990617752075, "d0.loss_cls": 1.1360573798418045, "d0.loss_bbox": 5.618711662292481, "d1.loss_cls": 1.170571580529213, "d1.loss_bbox": 5.556303668022156, "d2.loss_cls": 1.1710273623466492, "d2.loss_bbox": 5.482773923873902, "d3.loss_cls": 1.1886319637298584, "d3.loss_bbox": 5.466277146339417, "d4.loss_cls": 1.1805710077285767, "d4.loss_bbox": 5.468987131118775, "time": 25.543924260139466, "epoch": 1, "iter": 800, "step": 800}
6
+ {"lr": 0.0001, "data_time": 0.9578536748886108, "loss": 51.03642435073853, "loss_cls": 1.1871638536453246, "loss_bbox": 7.277057945728302, "d0.loss_cls": 1.1480519831180573, "d0.loss_bbox": 7.431847703456879, "d1.loss_cls": 1.1863121330738067, "d1.loss_bbox": 7.32298082113266, "d2.loss_cls": 1.172393587231636, "d2.loss_bbox": 7.3522356986999515, "d3.loss_cls": 1.1807974189519883, "d3.loss_bbox": 7.363990998268127, "d4.loss_cls": 1.1806713700294496, "d4.loss_bbox": 7.23292065858841, "time": 25.078794717788696, "epoch": 1, "iter": 850, "step": 850}
7
+ {"lr": 0.0001, "data_time": 0.9448531866073608, "loss": 47.00728120803833, "loss_cls": 1.1108986586332321, "loss_bbox": 6.683765578269958, "d0.loss_cls": 1.058606892824173, "d0.loss_bbox": 6.775075042247773, "d1.loss_cls": 1.1145205676555634, "d1.loss_bbox": 6.805619835853577, "d2.loss_cls": 1.1085663586854935, "d2.loss_bbox": 6.700556540489197, "d3.loss_cls": 1.1268153429031371, "d3.loss_bbox": 6.825287544727326, "d4.loss_cls": 1.123038759827614, "d4.loss_bbox": 6.574530410766601, "time": 23.162017893791198, "epoch": 1, "iter": 900, "step": 900}
8
+ {"lr": 0.0001, "data_time": 0.9400171756744384, "loss": 41.35070466995239, "loss_cls": 1.158398947119713, "loss_bbox": 5.760552084445953, "d0.loss_cls": 1.089666599035263, "d0.loss_bbox": 5.752838516235352, "d1.loss_cls": 1.1419324547052383, "d1.loss_bbox": 5.745536303520202, "d2.loss_cls": 1.1454829275608063, "d2.loss_bbox": 5.826562070846558, "d3.loss_cls": 1.1492114216089249, "d3.loss_bbox": 5.729734838008881, "d4.loss_cls": 1.1491299778223039, "d4.loss_bbox": 5.701658976078034, "time": 22.88226662874222, "epoch": 1, "iter": 950, "step": 950}
9
+ {"lr": 0.0001, "data_time": 0.9400394678115844, "loss": 46.44832859039307, "loss_cls": 1.1757118165493012, "loss_bbox": 6.591959166526794, "d0.loss_cls": 1.098275139927864, "d0.loss_bbox": 6.536907982826233, "d1.loss_cls": 1.1439443498849868, "d1.loss_bbox": 6.561067795753479, "d2.loss_cls": 1.150243878364563, "d2.loss_bbox": 6.466058671474457, "d3.loss_cls": 1.160718309879303, "d3.loss_bbox": 6.50878232717514, "d4.loss_cls": 1.1740066707134247, "d4.loss_bbox": 6.8806525349617, "time": 23.910873460769654, "epoch": 1, "iter": 1000, "step": 1000}
10
+ {"NDS": 0.024568420980261345, "mAP": 0.00024522344729063566, "data_time": 0.5547568256204779, "time": 2.091413226994601, "step": 1000}
11
+ {"lr": 0.0001, "data_time": 0.9280791401863098, "loss": 35.56269006729126, "loss_cls": 1.1439091116189957, "loss_bbox": 4.727340304851532, "d0.loss_cls": 1.0815186262130738, "d0.loss_bbox": 4.890256083011627, "d1.loss_cls": 1.1172061949968337, "d1.loss_bbox": 4.832208633422852, "d2.loss_cls": 1.1246428489685059, "d2.loss_bbox": 4.8471301078796385, "d3.loss_cls": 1.1306370705366136, "d3.loss_bbox": 4.7437060356140135, "d4.loss_cls": 1.1405282199382782, "d4.loss_bbox": 4.783606648445129, "time": 22.772711980342866, "epoch": 1, "iter": 1050, "step": 1050}
12
+ {"lr": 0.0001, "data_time": 0.9757530927658081, "loss": 42.73475952148438, "loss_cls": 1.1595562100410461, "loss_bbox": 5.88040109872818, "d0.loss_cls": 1.0938430309295655, "d0.loss_bbox": 5.977837765216828, "d1.loss_cls": 1.1540126234292984, "d1.loss_bbox": 5.96634818315506, "d2.loss_cls": 1.1451045274734497, "d2.loss_bbox": 6.051124310493469, "d3.loss_cls": 1.1639264404773713, "d3.loss_bbox": 5.975519037246704, "d4.loss_cls": 1.1618152916431428, "d4.loss_bbox": 6.005270600318909, "time": 22.53203227519989, "epoch": 1, "iter": 1100, "step": 1100}
20260315_230250/config.py ADDED
@@ -0,0 +1,1320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _dim_ = 256
2
+ _ffn_dim_ = 512
3
+ _num_levels_ = 1
4
+ _pos_dim_ = 128
5
+ auto_scale_lr = dict(base_batch_size=16, enable=False)
6
+ bev_h_ = 50
7
+ bev_w_ = 50
8
+ by_epoch = False
9
+ class_names = [
10
+ 'car',
11
+ 'truck',
12
+ 'construction_vehicle',
13
+ 'bus',
14
+ 'trailer',
15
+ 'barrier',
16
+ 'motorcycle',
17
+ 'bicycle',
18
+ 'pedestrian',
19
+ 'traffic_cone',
20
+ ]
21
+ custom_hooks = [
22
+ dict(
23
+ by_epoch=False,
24
+ clean_local=False,
25
+ interval=250,
26
+ repo_id='5421Project',
27
+ type='CheckpointUploader'),
28
+ dict(repo_id='5421Project', resume_type='last', type='CheckpointResumer'),
29
+ ]
30
+ data = dict(
31
+ nonshuffler_sampler=dict(type='DistributedSampler'),
32
+ samples_per_gpu=1,
33
+ shuffler_sampler=dict(type='DistributedGroupSampler'),
34
+ test=dict(
35
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
36
+ bev_size=(
37
+ 50,
38
+ 50,
39
+ ),
40
+ classes=[
41
+ 'car',
42
+ 'truck',
43
+ 'construction_vehicle',
44
+ 'bus',
45
+ 'trailer',
46
+ 'barrier',
47
+ 'motorcycle',
48
+ 'bicycle',
49
+ 'pedestrian',
50
+ 'traffic_cone',
51
+ ],
52
+ data_root='data/nuscenes/v1.0-mini/',
53
+ frame=[
54
+ -3,
55
+ -2,
56
+ -1,
57
+ ],
58
+ modality=dict(
59
+ use_camera=True,
60
+ use_external=False,
61
+ use_lidar=False,
62
+ use_map=False,
63
+ use_radar=False),
64
+ pipeline=[
65
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
66
+ dict(
67
+ mean=[
68
+ 123.675,
69
+ 116.28,
70
+ 103.53,
71
+ ],
72
+ std=[
73
+ 58.395,
74
+ 57.12,
75
+ 57.375,
76
+ ],
77
+ to_rgb=True,
78
+ type='NormalizeMultiviewImage'),
79
+ dict(
80
+ flip=False,
81
+ img_scale=(
82
+ 800,
83
+ 450,
84
+ ),
85
+ pts_scale_ratio=[
86
+ 1.0,
87
+ ],
88
+ transforms=[
89
+ dict(
90
+ scales=[
91
+ 0.5,
92
+ ], type='RandomScaleImageMultiViewImage'),
93
+ dict(size_divisor=32, type='PadMultiViewImage'),
94
+ dict(
95
+ class_names=[
96
+ 'car',
97
+ 'truck',
98
+ 'construction_vehicle',
99
+ 'bus',
100
+ 'trailer',
101
+ 'barrier',
102
+ 'motorcycle',
103
+ 'bicycle',
104
+ 'pedestrian',
105
+ 'traffic_cone',
106
+ ],
107
+ type='CustomDefaultFormatBundle3D'),
108
+ dict(keys=[
109
+ 'img',
110
+ ], type='CustomCollect3D'),
111
+ ],
112
+ type='MultiScaleFlipAug3D'),
113
+ ],
114
+ test_mode=True,
115
+ type='CustomNuScenesDataset'),
116
+ train=dict(
117
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
118
+ bev_size=(
119
+ 50,
120
+ 50,
121
+ ),
122
+ box_type_3d='LiDAR',
123
+ classes=[
124
+ 'car',
125
+ 'truck',
126
+ 'construction_vehicle',
127
+ 'bus',
128
+ 'trailer',
129
+ 'barrier',
130
+ 'motorcycle',
131
+ 'bicycle',
132
+ 'pedestrian',
133
+ 'traffic_cone',
134
+ ],
135
+ data_root='data/nuscenes/v1.0-mini/',
136
+ modality=dict(
137
+ use_camera=True,
138
+ use_external=False,
139
+ use_lidar=False,
140
+ use_map=False,
141
+ use_radar=False),
142
+ pipeline=[
143
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
144
+ dict(
145
+ type='LoadAnnotations3D',
146
+ with_bbox_3d=True,
147
+ with_label_3d=True),
148
+ dict(
149
+ point_cloud_range=[
150
+ -51.2,
151
+ -51.2,
152
+ -5.0,
153
+ 51.2,
154
+ 51.2,
155
+ 3.0,
156
+ ],
157
+ type='ObjectRangeFilter'),
158
+ dict(
159
+ classes=[
160
+ 'car',
161
+ 'truck',
162
+ 'construction_vehicle',
163
+ 'bus',
164
+ 'trailer',
165
+ 'barrier',
166
+ 'motorcycle',
167
+ 'bicycle',
168
+ 'pedestrian',
169
+ 'traffic_cone',
170
+ ],
171
+ type='ObjectNameFilter'),
172
+ dict(type='PhotoMetricDistortionMultiViewImage'),
173
+ dict(
174
+ mean=[
175
+ 123.675,
176
+ 116.28,
177
+ 103.53,
178
+ ],
179
+ std=[
180
+ 58.395,
181
+ 57.12,
182
+ 57.375,
183
+ ],
184
+ to_rgb=True,
185
+ type='NormalizeMultiviewImage'),
186
+ dict(scales=[
187
+ 0.5,
188
+ ], type='RandomScaleImageMultiViewImage'),
189
+ dict(size_divisor=32, type='PadMultiViewImage'),
190
+ dict(
191
+ class_names=[
192
+ 'car',
193
+ 'truck',
194
+ 'construction_vehicle',
195
+ 'bus',
196
+ 'trailer',
197
+ 'barrier',
198
+ 'motorcycle',
199
+ 'bicycle',
200
+ 'pedestrian',
201
+ 'traffic_cone',
202
+ ],
203
+ type='CustomDefaultFormatBundle3D'),
204
+ dict(
205
+ keys=[
206
+ 'gt_bboxes_3d',
207
+ 'gt_labels_3d',
208
+ 'img',
209
+ ],
210
+ type='CustomCollect3D'),
211
+ dict(type='TypeConverter'),
212
+ ],
213
+ queue_length=4,
214
+ test_mode=False,
215
+ type='CustomNuScenesDataset',
216
+ use_valid_flag=True),
217
+ val=dict(
218
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
219
+ bev_size=(
220
+ 50,
221
+ 50,
222
+ ),
223
+ classes=[
224
+ 'car',
225
+ 'truck',
226
+ 'construction_vehicle',
227
+ 'bus',
228
+ 'trailer',
229
+ 'barrier',
230
+ 'motorcycle',
231
+ 'bicycle',
232
+ 'pedestrian',
233
+ 'traffic_cone',
234
+ ],
235
+ data_root='data/nuscenes/v1.0-mini/',
236
+ frame=(),
237
+ frames=[
238
+ -3,
239
+ -2,
240
+ -1,
241
+ ],
242
+ modality=dict(
243
+ use_camera=True,
244
+ use_external=False,
245
+ use_lidar=False,
246
+ use_map=False,
247
+ use_radar=False),
248
+ pipeline=[
249
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
250
+ dict(
251
+ mean=[
252
+ 123.675,
253
+ 116.28,
254
+ 103.53,
255
+ ],
256
+ std=[
257
+ 58.395,
258
+ 57.12,
259
+ 57.375,
260
+ ],
261
+ to_rgb=True,
262
+ type='NormalizeMultiviewImage'),
263
+ dict(
264
+ flip=False,
265
+ img_scale=(
266
+ 800,
267
+ 450,
268
+ ),
269
+ pts_scale_ratio=[
270
+ 1.0,
271
+ ],
272
+ transforms=[
273
+ dict(
274
+ scales=[
275
+ 0.5,
276
+ ], type='RandomScaleImageMultiViewImage'),
277
+ dict(size_divisor=32, type='PadMultiViewImage'),
278
+ dict(
279
+ class_names=[
280
+ 'car',
281
+ 'truck',
282
+ 'construction_vehicle',
283
+ 'bus',
284
+ 'trailer',
285
+ 'barrier',
286
+ 'motorcycle',
287
+ 'bicycle',
288
+ 'pedestrian',
289
+ 'traffic_cone',
290
+ ],
291
+ type='CustomDefaultFormatBundle3D'),
292
+ dict(keys=[
293
+ 'img',
294
+ ], type='CustomCollect3D'),
295
+ ],
296
+ type='MultiScaleFlipAug3D'),
297
+ ],
298
+ samples_per_gpu=1,
299
+ test_mode=True,
300
+ type='CustomNuScenesDataset'),
301
+ workers_per_gpu=4)
302
+ data_root = 'data/nuscenes/v1.0-mini/'
303
+ dataset_type = 'CustomNuScenesDataset'
304
+ decoder = dict(
305
+ num_layers=6,
306
+ return_intermediate=True,
307
+ transformerlayers=dict(
308
+ attn_cfgs=[
309
+ dict(
310
+ dropout=0.1,
311
+ embed_dims=256,
312
+ num_heads=8,
313
+ type='MultiheadAttention'),
314
+ dict(
315
+ embed_dims=256,
316
+ num_levels=1,
317
+ type='CustomMSDeformableAttention'),
318
+ ],
319
+ ffn_cfgs=dict(
320
+ feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
321
+ operation_order=(
322
+ 'self_attn',
323
+ 'norm',
324
+ 'cross_attn',
325
+ 'norm',
326
+ 'ffn',
327
+ 'norm',
328
+ ),
329
+ type='DetrTransformerDecoderLayer'),
330
+ type='DetectionTransformerDecoder')
331
+ default_hooks = dict(
332
+ checkpoint=dict(
333
+ by_epoch=False,
334
+ interval=250,
335
+ max_keep_ckpts=1,
336
+ save_best=[
337
+ 'loss',
338
+ 'mAP',
339
+ 'NDS',
340
+ ],
341
+ type='CheckpointHookV2'),
342
+ logger=dict(
343
+ interval=50,
344
+ interval_exp_name=1000,
345
+ log_metric_by_epoch=False,
346
+ type='LoggerHook'),
347
+ param_scheduler=dict(type='ParamSchedulerHook'),
348
+ runtime_info=dict(type='RuntimeInfoHook'),
349
+ sampler_seed=dict(type='DistSamplerSeedHook'),
350
+ timer=dict(type='IterTimerHook'))
351
+ encoder = dict(
352
+ num_layers=3,
353
+ num_points_in_pillar=8,
354
+ pc_range=[
355
+ -51.2,
356
+ -51.2,
357
+ -5.0,
358
+ 51.2,
359
+ 51.2,
360
+ 3.0,
361
+ ],
362
+ return_intermediate=False,
363
+ transformerlayers=dict(
364
+ attn_cfgs=[
365
+ dict(embed_dims=256, num_levels=1, type='TemporalSelfAttention'),
366
+ dict(
367
+ deformable_attention=dict(
368
+ embed_dims=256,
369
+ num_levels=1,
370
+ num_points=8,
371
+ type='MSDeformableAttention3D'),
372
+ embed_dims=256,
373
+ pc_range=[
374
+ -51.2,
375
+ -51.2,
376
+ -5.0,
377
+ 51.2,
378
+ 51.2,
379
+ 3.0,
380
+ ],
381
+ type='SpatialCrossAttention'),
382
+ ],
383
+ ffn_cfgs=dict(
384
+ feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
385
+ operation_order=(
386
+ 'self_attn',
387
+ 'norm',
388
+ 'cross_attn',
389
+ 'norm',
390
+ 'ffn',
391
+ 'norm',
392
+ ),
393
+ type='BEVFormerLayer'),
394
+ type='BEVFormerEncoder')
395
+ env_cfg = dict(dist_cfg=dict(backend='nccl'))
396
+ experiment_name = 'baseline-v0.1'
397
+ file_client_args = dict(backend='disk')
398
+ frames = [
399
+ -3,
400
+ -2,
401
+ -1,
402
+ ]
403
+ gpu_ids = range(0, 1)
404
+ img_norm_cfg = dict(
405
+ mean=[
406
+ 123.675,
407
+ 116.28,
408
+ 103.53,
409
+ ],
410
+ std=[
411
+ 58.395,
412
+ 57.12,
413
+ 57.375,
414
+ ],
415
+ to_rgb=True)
416
+ input_modality = dict(
417
+ use_camera=True,
418
+ use_external=False,
419
+ use_lidar=False,
420
+ use_map=False,
421
+ use_radar=False)
422
+ interval = 250
423
+ launcher = 'none'
424
+ load_from = None
425
+ log_interval = 50
426
+ log_processor = dict(window_size=20)
427
+ lr_config = dict(
428
+ min_lr_ratio=0.001,
429
+ policy='CosineAnnealing',
430
+ warmup='linear',
431
+ warmup_iters=500,
432
+ warmup_ratio=0.3333333333333333)
433
+ max_epochs = 5
434
+ max_iters = 1100
435
+ model = dict(
436
+ img_backbone=dict(
437
+ depth=50,
438
+ frozen_stages=1,
439
+ norm_cfg=dict(requires_grad=False, type='BN'),
440
+ norm_eval=True,
441
+ num_stages=4,
442
+ out_indices=(3, ),
443
+ style='pytorch',
444
+ type='ResNet'),
445
+ img_neck=dict(
446
+ add_extra_convs='on_output',
447
+ in_channels=[
448
+ 2048,
449
+ ],
450
+ num_outs=1,
451
+ out_channels=256,
452
+ relu_before_extra_convs=True,
453
+ start_level=0,
454
+ type='FPN'),
455
+ pretrained=dict(img='torchvision://resnet50'),
456
+ pts_bbox_head=dict(
457
+ as_two_stage=False,
458
+ bbox_coder=dict(
459
+ max_num=300,
460
+ num_classes=10,
461
+ pc_range=[
462
+ -51.2,
463
+ -51.2,
464
+ -5.0,
465
+ 51.2,
466
+ 51.2,
467
+ 3.0,
468
+ ],
469
+ post_center_range=[
470
+ -61.2,
471
+ -61.2,
472
+ -10.0,
473
+ 61.2,
474
+ 61.2,
475
+ 10.0,
476
+ ],
477
+ type='NMSFreeCoder',
478
+ voxel_size=[
479
+ 0.2,
480
+ 0.2,
481
+ 8,
482
+ ]),
483
+ bev_h=50,
484
+ bev_w=50,
485
+ in_channels=256,
486
+ loss_bbox=dict(loss_weight=0.5, type='L1Loss'),
487
+ loss_cls=dict(
488
+ alpha=0.25,
489
+ gamma=2.0,
490
+ loss_weight=2.0,
491
+ type='FocalLoss',
492
+ use_sigmoid=True),
493
+ loss_iou=dict(loss_weight=0.25, type='GIoULoss'),
494
+ num_classes=10,
495
+ num_query=900,
496
+ positional_encoding=dict(
497
+ col_num_embed=50,
498
+ num_feats=128,
499
+ row_num_embed=50,
500
+ type='LearnedPositionalEncoding'),
501
+ sync_cls_avg_factor=True,
502
+ transformer=dict(
503
+ decoder=dict(
504
+ num_layers=6,
505
+ return_intermediate=True,
506
+ transformerlayers=dict(
507
+ attn_cfgs=[
508
+ dict(
509
+ dropout=0.1,
510
+ embed_dims=256,
511
+ num_heads=8,
512
+ type='MultiheadAttention'),
513
+ dict(
514
+ embed_dims=256,
515
+ num_levels=1,
516
+ type='CustomMSDeformableAttention'),
517
+ ],
518
+ ffn_cfgs=dict(
519
+ feedforward_channels=512,
520
+ ffn_drop=0.1,
521
+ num_fcs=2,
522
+ type='FFN'),
523
+ operation_order=(
524
+ 'self_attn',
525
+ 'norm',
526
+ 'cross_attn',
527
+ 'norm',
528
+ 'ffn',
529
+ 'norm',
530
+ ),
531
+ type='DetrTransformerDecoderLayer'),
532
+ type='DetectionTransformerDecoder'),
533
+ embed_dims=256,
534
+ encoder=dict(
535
+ num_layers=3,
536
+ num_points_in_pillar=8,
537
+ pc_range=[
538
+ -51.2,
539
+ -51.2,
540
+ -5.0,
541
+ 51.2,
542
+ 51.2,
543
+ 3.0,
544
+ ],
545
+ return_intermediate=False,
546
+ transformerlayers=dict(
547
+ attn_cfgs=[
548
+ dict(
549
+ embed_dims=256,
550
+ num_levels=1,
551
+ type='TemporalSelfAttention'),
552
+ dict(
553
+ deformable_attention=dict(
554
+ embed_dims=256,
555
+ num_levels=1,
556
+ num_points=8,
557
+ type='MSDeformableAttention3D'),
558
+ embed_dims=256,
559
+ pc_range=[
560
+ -51.2,
561
+ -51.2,
562
+ -5.0,
563
+ 51.2,
564
+ 51.2,
565
+ 3.0,
566
+ ],
567
+ type='SpatialCrossAttention'),
568
+ ],
569
+ ffn_cfgs=dict(
570
+ feedforward_channels=512,
571
+ ffn_drop=0.1,
572
+ num_fcs=2,
573
+ type='FFN'),
574
+ operation_order=(
575
+ 'self_attn',
576
+ 'norm',
577
+ 'cross_attn',
578
+ 'norm',
579
+ 'ffn',
580
+ 'norm',
581
+ ),
582
+ type='BEVFormerLayer'),
583
+ type='BEVFormerEncoder'),
584
+ num_cams=6,
585
+ num_feature_levels=1,
586
+ rotate_prev_bev=True,
587
+ type='PerceptionTransformer',
588
+ use_can_bus=True,
589
+ use_shift=True),
590
+ type='BEVFormerHead',
591
+ with_box_refine=True),
592
+ train_cfg=dict(
593
+ pts=dict(
594
+ assigner=dict(
595
+ cls_cost=dict(type='FocalCost', weight=2.0),
596
+ iou_cost=dict(type='SmoothL1Cost', weight=0.25),
597
+ pc_range=[
598
+ -51.2,
599
+ -51.2,
600
+ -5.0,
601
+ 51.2,
602
+ 51.2,
603
+ 3.0,
604
+ ],
605
+ reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
606
+ type='HungarianAssigner3D'),
607
+ grid_size=[
608
+ 512,
609
+ 512,
610
+ 1,
611
+ ],
612
+ out_size_factor=4,
613
+ point_cloud_range=[
614
+ -51.2,
615
+ -51.2,
616
+ -5.0,
617
+ 51.2,
618
+ 51.2,
619
+ 3.0,
620
+ ],
621
+ voxel_size=[
622
+ 0.2,
623
+ 0.2,
624
+ 8,
625
+ ])),
626
+ type='BEVFormerDetector',
627
+ use_grid_mask=True,
628
+ video_test_mode=True)
629
+ optim_wrapper = dict(
630
+ optimizer=dict(lr=0.0001, type='AdamW', weight_decay=0.01),
631
+ type='OptimWrapper')
632
+ optimizer = dict(lr=0.0001, type='AdamW', weight_decay=0.01)
633
+ param_scheduler = dict(
634
+ milestones=[
635
+ 1,
636
+ 2,
637
+ ], type='MultiStepLR')
638
+ point_cloud_range = [
639
+ -51.2,
640
+ -51.2,
641
+ -5.0,
642
+ 51.2,
643
+ 51.2,
644
+ 3.0,
645
+ ]
646
+ pts_bbox_head = dict(
647
+ as_two_stage=False,
648
+ bbox_coder=dict(
649
+ max_num=300,
650
+ num_classes=10,
651
+ pc_range=[
652
+ -51.2,
653
+ -51.2,
654
+ -5.0,
655
+ 51.2,
656
+ 51.2,
657
+ 3.0,
658
+ ],
659
+ post_center_range=[
660
+ -61.2,
661
+ -61.2,
662
+ -10.0,
663
+ 61.2,
664
+ 61.2,
665
+ 10.0,
666
+ ],
667
+ type='NMSFreeCoder',
668
+ voxel_size=[
669
+ 0.2,
670
+ 0.2,
671
+ 8,
672
+ ]),
673
+ bev_h=50,
674
+ bev_w=50,
675
+ in_channels=256,
676
+ loss_bbox=dict(loss_weight=0.5, type='L1Loss'),
677
+ loss_cls=dict(
678
+ alpha=0.25,
679
+ gamma=2.0,
680
+ loss_weight=2.0,
681
+ type='FocalLoss',
682
+ use_sigmoid=True),
683
+ loss_iou=dict(loss_weight=0.25, type='GIoULoss'),
684
+ num_classes=10,
685
+ num_query=900,
686
+ positional_encoding=dict(
687
+ col_num_embed=50,
688
+ num_feats=128,
689
+ row_num_embed=50,
690
+ type='LearnedPositionalEncoding'),
691
+ sync_cls_avg_factor=True,
692
+ transformer=dict(
693
+ decoder=dict(
694
+ num_layers=6,
695
+ return_intermediate=True,
696
+ transformerlayers=dict(
697
+ attn_cfgs=[
698
+ dict(
699
+ dropout=0.1,
700
+ embed_dims=256,
701
+ num_heads=8,
702
+ type='MultiheadAttention'),
703
+ dict(
704
+ embed_dims=256,
705
+ num_levels=1,
706
+ type='CustomMSDeformableAttention'),
707
+ ],
708
+ ffn_cfgs=dict(
709
+ feedforward_channels=512,
710
+ ffn_drop=0.1,
711
+ num_fcs=2,
712
+ type='FFN'),
713
+ operation_order=(
714
+ 'self_attn',
715
+ 'norm',
716
+ 'cross_attn',
717
+ 'norm',
718
+ 'ffn',
719
+ 'norm',
720
+ ),
721
+ type='DetrTransformerDecoderLayer'),
722
+ type='DetectionTransformerDecoder'),
723
+ embed_dims=256,
724
+ encoder=dict(
725
+ num_layers=3,
726
+ num_points_in_pillar=8,
727
+ pc_range=[
728
+ -51.2,
729
+ -51.2,
730
+ -5.0,
731
+ 51.2,
732
+ 51.2,
733
+ 3.0,
734
+ ],
735
+ return_intermediate=False,
736
+ transformerlayers=dict(
737
+ attn_cfgs=[
738
+ dict(
739
+ embed_dims=256,
740
+ num_levels=1,
741
+ type='TemporalSelfAttention'),
742
+ dict(
743
+ deformable_attention=dict(
744
+ embed_dims=256,
745
+ num_levels=1,
746
+ num_points=8,
747
+ type='MSDeformableAttention3D'),
748
+ embed_dims=256,
749
+ pc_range=[
750
+ -51.2,
751
+ -51.2,
752
+ -5.0,
753
+ 51.2,
754
+ 51.2,
755
+ 3.0,
756
+ ],
757
+ type='SpatialCrossAttention'),
758
+ ],
759
+ ffn_cfgs=dict(
760
+ feedforward_channels=512,
761
+ ffn_drop=0.1,
762
+ num_fcs=2,
763
+ type='FFN'),
764
+ operation_order=(
765
+ 'self_attn',
766
+ 'norm',
767
+ 'cross_attn',
768
+ 'norm',
769
+ 'ffn',
770
+ 'norm',
771
+ ),
772
+ type='BEVFormerLayer'),
773
+ type='BEVFormerEncoder'),
774
+ num_cams=6,
775
+ num_feature_levels=1,
776
+ rotate_prev_bev=True,
777
+ type='PerceptionTransformer',
778
+ use_can_bus=True,
779
+ use_shift=True),
780
+ type='BEVFormerHead',
781
+ with_box_refine=True)
782
+ queue_length = 4
783
+ resume = True
784
+ scales = [
785
+ 0.5,
786
+ ]
787
+ test_cfg = dict(max_iters=1)
788
+ test_dataloader = dict(
789
+ batch_size=1,
790
+ collate_fn=dict(type='test_collate'),
791
+ dataset=dict(
792
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
793
+ bev_size=(
794
+ 50,
795
+ 50,
796
+ ),
797
+ classes=[
798
+ 'car',
799
+ 'truck',
800
+ 'construction_vehicle',
801
+ 'bus',
802
+ 'trailer',
803
+ 'barrier',
804
+ 'motorcycle',
805
+ 'bicycle',
806
+ 'pedestrian',
807
+ 'traffic_cone',
808
+ ],
809
+ data_root='data/nuscenes/v1.0-mini/',
810
+ frame=[
811
+ -3,
812
+ -2,
813
+ -1,
814
+ ],
815
+ modality=dict(
816
+ use_camera=True,
817
+ use_external=False,
818
+ use_lidar=False,
819
+ use_map=False,
820
+ use_radar=False),
821
+ pipeline=[
822
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
823
+ dict(
824
+ mean=[
825
+ 123.675,
826
+ 116.28,
827
+ 103.53,
828
+ ],
829
+ std=[
830
+ 58.395,
831
+ 57.12,
832
+ 57.375,
833
+ ],
834
+ to_rgb=True,
835
+ type='NormalizeMultiviewImage'),
836
+ dict(
837
+ flip=False,
838
+ img_scale=(
839
+ 800,
840
+ 450,
841
+ ),
842
+ pts_scale_ratio=[
843
+ 1.0,
844
+ ],
845
+ transforms=[
846
+ dict(
847
+ scales=[
848
+ 0.5,
849
+ ], type='RandomScaleImageMultiViewImage'),
850
+ dict(size_divisor=32, type='PadMultiViewImage'),
851
+ dict(
852
+ class_names=[
853
+ 'car',
854
+ 'truck',
855
+ 'construction_vehicle',
856
+ 'bus',
857
+ 'trailer',
858
+ 'barrier',
859
+ 'motorcycle',
860
+ 'bicycle',
861
+ 'pedestrian',
862
+ 'traffic_cone',
863
+ ],
864
+ type='CustomDefaultFormatBundle3D'),
865
+ dict(keys=[
866
+ 'img',
867
+ ], type='CustomCollect3D'),
868
+ ],
869
+ type='MultiScaleFlipAug3D'),
870
+ ],
871
+ test_mode=True,
872
+ type='CustomNuScenesDataset'),
873
+ num_workers=0,
874
+ sampler=dict(shuffle=True, type='DefaultSampler'))
875
+ test_evaluator = dict(metrics=[
876
+ dict(
877
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
878
+ data_root='data/nuscenes/v1.0-mini/',
879
+ type='src.NuScenesMetric',
880
+ version='v1.0-mini'),
881
+ ])
882
+ test_max_iters = 1
883
+ test_pipeline = [
884
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
885
+ dict(
886
+ mean=[
887
+ 123.675,
888
+ 116.28,
889
+ 103.53,
890
+ ],
891
+ std=[
892
+ 58.395,
893
+ 57.12,
894
+ 57.375,
895
+ ],
896
+ to_rgb=True,
897
+ type='NormalizeMultiviewImage'),
898
+ dict(
899
+ flip=False,
900
+ img_scale=(
901
+ 800,
902
+ 450,
903
+ ),
904
+ pts_scale_ratio=[
905
+ 1.0,
906
+ ],
907
+ transforms=[
908
+ dict(scales=[
909
+ 0.5,
910
+ ], type='RandomScaleImageMultiViewImage'),
911
+ dict(size_divisor=32, type='PadMultiViewImage'),
912
+ dict(
913
+ class_names=[
914
+ 'car',
915
+ 'truck',
916
+ 'construction_vehicle',
917
+ 'bus',
918
+ 'trailer',
919
+ 'barrier',
920
+ 'motorcycle',
921
+ 'bicycle',
922
+ 'pedestrian',
923
+ 'traffic_cone',
924
+ ],
925
+ type='CustomDefaultFormatBundle3D'),
926
+ dict(keys=[
927
+ 'img',
928
+ ], type='CustomCollect3D'),
929
+ ],
930
+ type='MultiScaleFlipAug3D'),
931
+ ]
932
+ train_cfg = dict(
933
+ by_epoch=False, max_epochs=5, max_iters=1100, val_interval=250)
934
+ train_dataloader = dict(
935
+ batch_size=1,
936
+ collate_fn=dict(type='train_collate'),
937
+ dataset=dict(
938
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_train.pkl',
939
+ bev_size=(
940
+ 50,
941
+ 50,
942
+ ),
943
+ box_type_3d='LiDAR',
944
+ classes=[
945
+ 'car',
946
+ 'truck',
947
+ 'construction_vehicle',
948
+ 'bus',
949
+ 'trailer',
950
+ 'barrier',
951
+ 'motorcycle',
952
+ 'bicycle',
953
+ 'pedestrian',
954
+ 'traffic_cone',
955
+ ],
956
+ data_root='data/nuscenes/v1.0-mini/',
957
+ modality=dict(
958
+ use_camera=True,
959
+ use_external=False,
960
+ use_lidar=False,
961
+ use_map=False,
962
+ use_radar=False),
963
+ pipeline=[
964
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
965
+ dict(
966
+ type='LoadAnnotations3D',
967
+ with_bbox_3d=True,
968
+ with_label_3d=True),
969
+ dict(
970
+ point_cloud_range=[
971
+ -51.2,
972
+ -51.2,
973
+ -5.0,
974
+ 51.2,
975
+ 51.2,
976
+ 3.0,
977
+ ],
978
+ type='ObjectRangeFilter'),
979
+ dict(
980
+ classes=[
981
+ 'car',
982
+ 'truck',
983
+ 'construction_vehicle',
984
+ 'bus',
985
+ 'trailer',
986
+ 'barrier',
987
+ 'motorcycle',
988
+ 'bicycle',
989
+ 'pedestrian',
990
+ 'traffic_cone',
991
+ ],
992
+ type='ObjectNameFilter'),
993
+ dict(type='PhotoMetricDistortionMultiViewImage'),
994
+ dict(
995
+ mean=[
996
+ 123.675,
997
+ 116.28,
998
+ 103.53,
999
+ ],
1000
+ std=[
1001
+ 58.395,
1002
+ 57.12,
1003
+ 57.375,
1004
+ ],
1005
+ to_rgb=True,
1006
+ type='NormalizeMultiviewImage'),
1007
+ dict(scales=[
1008
+ 0.5,
1009
+ ], type='RandomScaleImageMultiViewImage'),
1010
+ dict(size_divisor=32, type='PadMultiViewImage'),
1011
+ dict(
1012
+ class_names=[
1013
+ 'car',
1014
+ 'truck',
1015
+ 'construction_vehicle',
1016
+ 'bus',
1017
+ 'trailer',
1018
+ 'barrier',
1019
+ 'motorcycle',
1020
+ 'bicycle',
1021
+ 'pedestrian',
1022
+ 'traffic_cone',
1023
+ ],
1024
+ type='CustomDefaultFormatBundle3D'),
1025
+ dict(
1026
+ keys=[
1027
+ 'gt_bboxes_3d',
1028
+ 'gt_labels_3d',
1029
+ 'img',
1030
+ ],
1031
+ type='CustomCollect3D'),
1032
+ dict(type='TypeConverter'),
1033
+ ],
1034
+ queue_length=4,
1035
+ test_mode=False,
1036
+ type='CustomNuScenesDataset',
1037
+ use_valid_flag=True),
1038
+ num_workers=0,
1039
+ sampler=dict(shuffle=True, type='DefaultSampler'))
1040
+ train_pipeline = [
1041
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
1042
+ dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
1043
+ dict(
1044
+ point_cloud_range=[
1045
+ -51.2,
1046
+ -51.2,
1047
+ -5.0,
1048
+ 51.2,
1049
+ 51.2,
1050
+ 3.0,
1051
+ ],
1052
+ type='ObjectRangeFilter'),
1053
+ dict(
1054
+ classes=[
1055
+ 'car',
1056
+ 'truck',
1057
+ 'construction_vehicle',
1058
+ 'bus',
1059
+ 'trailer',
1060
+ 'barrier',
1061
+ 'motorcycle',
1062
+ 'bicycle',
1063
+ 'pedestrian',
1064
+ 'traffic_cone',
1065
+ ],
1066
+ type='ObjectNameFilter'),
1067
+ dict(type='PhotoMetricDistortionMultiViewImage'),
1068
+ dict(
1069
+ mean=[
1070
+ 123.675,
1071
+ 116.28,
1072
+ 103.53,
1073
+ ],
1074
+ std=[
1075
+ 58.395,
1076
+ 57.12,
1077
+ 57.375,
1078
+ ],
1079
+ to_rgb=True,
1080
+ type='NormalizeMultiviewImage'),
1081
+ dict(scales=[
1082
+ 0.5,
1083
+ ], type='RandomScaleImageMultiViewImage'),
1084
+ dict(size_divisor=32, type='PadMultiViewImage'),
1085
+ dict(
1086
+ class_names=[
1087
+ 'car',
1088
+ 'truck',
1089
+ 'construction_vehicle',
1090
+ 'bus',
1091
+ 'trailer',
1092
+ 'barrier',
1093
+ 'motorcycle',
1094
+ 'bicycle',
1095
+ 'pedestrian',
1096
+ 'traffic_cone',
1097
+ ],
1098
+ type='CustomDefaultFormatBundle3D'),
1099
+ dict(
1100
+ keys=[
1101
+ 'gt_bboxes_3d',
1102
+ 'gt_labels_3d',
1103
+ 'img',
1104
+ ], type='CustomCollect3D'),
1105
+ dict(type='TypeConverter'),
1106
+ ]
1107
+ transformer = dict(
1108
+ decoder=dict(
1109
+ num_layers=6,
1110
+ return_intermediate=True,
1111
+ transformerlayers=dict(
1112
+ attn_cfgs=[
1113
+ dict(
1114
+ dropout=0.1,
1115
+ embed_dims=256,
1116
+ num_heads=8,
1117
+ type='MultiheadAttention'),
1118
+ dict(
1119
+ embed_dims=256,
1120
+ num_levels=1,
1121
+ type='CustomMSDeformableAttention'),
1122
+ ],
1123
+ ffn_cfgs=dict(
1124
+ feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
1125
+ operation_order=(
1126
+ 'self_attn',
1127
+ 'norm',
1128
+ 'cross_attn',
1129
+ 'norm',
1130
+ 'ffn',
1131
+ 'norm',
1132
+ ),
1133
+ type='DetrTransformerDecoderLayer'),
1134
+ type='DetectionTransformerDecoder'),
1135
+ embed_dims=256,
1136
+ encoder=dict(
1137
+ num_layers=3,
1138
+ num_points_in_pillar=8,
1139
+ pc_range=[
1140
+ -51.2,
1141
+ -51.2,
1142
+ -5.0,
1143
+ 51.2,
1144
+ 51.2,
1145
+ 3.0,
1146
+ ],
1147
+ return_intermediate=False,
1148
+ transformerlayers=dict(
1149
+ attn_cfgs=[
1150
+ dict(
1151
+ embed_dims=256, num_levels=1,
1152
+ type='TemporalSelfAttention'),
1153
+ dict(
1154
+ deformable_attention=dict(
1155
+ embed_dims=256,
1156
+ num_levels=1,
1157
+ num_points=8,
1158
+ type='MSDeformableAttention3D'),
1159
+ embed_dims=256,
1160
+ pc_range=[
1161
+ -51.2,
1162
+ -51.2,
1163
+ -5.0,
1164
+ 51.2,
1165
+ 51.2,
1166
+ 3.0,
1167
+ ],
1168
+ type='SpatialCrossAttention'),
1169
+ ],
1170
+ ffn_cfgs=dict(
1171
+ feedforward_channels=512, ffn_drop=0.1, num_fcs=2, type='FFN'),
1172
+ operation_order=(
1173
+ 'self_attn',
1174
+ 'norm',
1175
+ 'cross_attn',
1176
+ 'norm',
1177
+ 'ffn',
1178
+ 'norm',
1179
+ ),
1180
+ type='BEVFormerLayer'),
1181
+ type='BEVFormerEncoder'),
1182
+ num_cams=6,
1183
+ num_feature_levels=1,
1184
+ rotate_prev_bev=True,
1185
+ type='PerceptionTransformer',
1186
+ use_can_bus=True,
1187
+ use_shift=True)
1188
+ val_cfg = dict(max_iters=1)
1189
+ val_dataloader = dict(
1190
+ batch_size=1,
1191
+ collate_fn=dict(type='test_collate'),
1192
+ dataset=dict(
1193
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
1194
+ bev_size=(
1195
+ 50,
1196
+ 50,
1197
+ ),
1198
+ classes=[
1199
+ 'car',
1200
+ 'truck',
1201
+ 'construction_vehicle',
1202
+ 'bus',
1203
+ 'trailer',
1204
+ 'barrier',
1205
+ 'motorcycle',
1206
+ 'bicycle',
1207
+ 'pedestrian',
1208
+ 'traffic_cone',
1209
+ ],
1210
+ data_root='data/nuscenes/v1.0-mini/',
1211
+ frame=(),
1212
+ frames=[
1213
+ -3,
1214
+ -2,
1215
+ -1,
1216
+ ],
1217
+ modality=dict(
1218
+ use_camera=True,
1219
+ use_external=False,
1220
+ use_lidar=False,
1221
+ use_map=False,
1222
+ use_radar=False),
1223
+ pipeline=[
1224
+ dict(to_float32=True, type='LoadMultiViewImageFromFiles'),
1225
+ dict(
1226
+ mean=[
1227
+ 123.675,
1228
+ 116.28,
1229
+ 103.53,
1230
+ ],
1231
+ std=[
1232
+ 58.395,
1233
+ 57.12,
1234
+ 57.375,
1235
+ ],
1236
+ to_rgb=True,
1237
+ type='NormalizeMultiviewImage'),
1238
+ dict(
1239
+ flip=False,
1240
+ img_scale=(
1241
+ 800,
1242
+ 450,
1243
+ ),
1244
+ pts_scale_ratio=[
1245
+ 1.0,
1246
+ ],
1247
+ transforms=[
1248
+ dict(
1249
+ scales=[
1250
+ 0.5,
1251
+ ], type='RandomScaleImageMultiViewImage'),
1252
+ dict(size_divisor=32, type='PadMultiViewImage'),
1253
+ dict(
1254
+ class_names=[
1255
+ 'car',
1256
+ 'truck',
1257
+ 'construction_vehicle',
1258
+ 'bus',
1259
+ 'trailer',
1260
+ 'barrier',
1261
+ 'motorcycle',
1262
+ 'bicycle',
1263
+ 'pedestrian',
1264
+ 'traffic_cone',
1265
+ ],
1266
+ type='CustomDefaultFormatBundle3D'),
1267
+ dict(keys=[
1268
+ 'img',
1269
+ ], type='CustomCollect3D'),
1270
+ ],
1271
+ type='MultiScaleFlipAug3D'),
1272
+ ],
1273
+ samples_per_gpu=1,
1274
+ test_mode=True,
1275
+ type='CustomNuScenesDataset'),
1276
+ num_workers=0,
1277
+ sampler=dict(shuffle=True, type='DefaultSampler'))
1278
+ val_evaluator = dict(metrics=[
1279
+ dict(
1280
+ ann_file='data/nuscenes/v1.0-mini/nuscenes_infos_temporal_val.pkl',
1281
+ classes=[
1282
+ 'car',
1283
+ 'truck',
1284
+ 'construction_vehicle',
1285
+ 'bus',
1286
+ 'trailer',
1287
+ 'barrier',
1288
+ 'motorcycle',
1289
+ 'bicycle',
1290
+ 'pedestrian',
1291
+ 'traffic_cone',
1292
+ ],
1293
+ data_root='data/nuscenes/v1.0-mini/',
1294
+ jsonfile_prefix='results',
1295
+ modality=dict(
1296
+ use_camera=True,
1297
+ use_external=False,
1298
+ use_lidar=False,
1299
+ use_map=False,
1300
+ use_radar=False),
1301
+ plot_every_run=True,
1302
+ plot_examples=1,
1303
+ type='src.NuScenesMetric',
1304
+ version='v1.0-mini'),
1305
+ ])
1306
+ val_interval = 250
1307
+ val_max_iters = 1
1308
+ version = 'v1.0-mini'
1309
+ visualizer = dict(
1310
+ type='Visualizer',
1311
+ vis_backends=[
1312
+ dict(type='LocalVisBackend'),
1313
+ dict(type='TensorboardVisBackend'),
1314
+ ])
1315
+ voxel_size = [
1316
+ 0.2,
1317
+ 0.2,
1318
+ 8,
1319
+ ]
1320
+ work_dir = 'experiment'
20260315_230250/events.out.tfevents.1773630171.Minhs-MacBook-Air.local.51042.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e1d802d6fa0d639a46d990a586203e22af4136a2384e402b40767d6fe4d176b
3
+ size 46632
20260315_230250/scalars.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"lr": 0.0001, "data_time": 0.954918372631073, "loss": 44.09534311294556, "loss_cls": 1.1228834331035613, "loss_bbox": 6.337317788600922, "d0.loss_cls": 1.108318066596985, "d0.loss_bbox": 6.259250831604004, "d1.loss_cls": 1.111513575911522, "d1.loss_bbox": 6.144867360591888, "d2.loss_cls": 1.1080957710742951, "d2.loss_bbox": 6.104123330116272, "d3.loss_cls": 1.1203179508447647, "d3.loss_bbox": 6.3368358492851256, "d4.loss_cls": 1.1214783400297166, "d4.loss_bbox": 6.220340991020203, "time": 21.755193567276002, "epoch": 1, "iter": 650, "step": 650}
2
+ {"lr": 0.0001, "data_time": 0.9646973729133606, "loss": 51.661175918579104, "loss_cls": 1.1298476964235307, "loss_bbox": 7.4403127312660216, "d0.loss_cls": 1.085216408967972, "d0.loss_bbox": 7.6048126220703125, "d1.loss_cls": 1.1213543117046356, "d1.loss_bbox": 7.538022649288178, "d2.loss_cls": 1.1103688687086106, "d2.loss_bbox": 7.412710273265839, "d3.loss_cls": 1.1244247049093246, "d3.loss_bbox": 7.577581775188446, "d4.loss_cls": 1.126863345503807, "d4.loss_bbox": 7.389661097526551, "time": 22.624796402454376, "epoch": 1, "iter": 700, "step": 700}
3
+ {"lr": 0.0001, "data_time": 0.9243666768074036, "loss": 38.63754949569702, "loss_cls": 1.0912210047245026, "loss_bbox": 5.242098152637482, "d0.loss_cls": 1.0618977814912796, "d0.loss_bbox": 5.441217350959778, "d1.loss_cls": 1.0883103013038635, "d1.loss_bbox": 5.398028314113617, "d2.loss_cls": 1.0953371733427049, "d2.loss_bbox": 5.400306010246277, "d3.loss_cls": 1.0949742108583451, "d3.loss_bbox": 5.344762396812439, "d4.loss_cls": 1.0946346282958985, "d4.loss_bbox": 5.284762263298035, "time": 23.798150968551635, "epoch": 1, "iter": 750, "step": 750}
4
+ {"NDS": 0.024444029379540926, "mAP": 0.0003303450412910843, "data_time": 0.5486190546126593, "time": 2.0810610907418385, "step": 750}
5
+ {"lr": 0.0001, "data_time": 0.9359340071678162, "loss": 40.09088401794433, "loss_cls": 1.189671701192856, "loss_bbox": 5.4612990617752075, "d0.loss_cls": 1.1360573798418045, "d0.loss_bbox": 5.618711662292481, "d1.loss_cls": 1.170571580529213, "d1.loss_bbox": 5.556303668022156, "d2.loss_cls": 1.1710273623466492, "d2.loss_bbox": 5.482773923873902, "d3.loss_cls": 1.1886319637298584, "d3.loss_bbox": 5.466277146339417, "d4.loss_cls": 1.1805710077285767, "d4.loss_bbox": 5.468987131118775, "time": 25.543924260139466, "epoch": 1, "iter": 800, "step": 800}
6
+ {"lr": 0.0001, "data_time": 0.9578536748886108, "loss": 51.03642435073853, "loss_cls": 1.1871638536453246, "loss_bbox": 7.277057945728302, "d0.loss_cls": 1.1480519831180573, "d0.loss_bbox": 7.431847703456879, "d1.loss_cls": 1.1863121330738067, "d1.loss_bbox": 7.32298082113266, "d2.loss_cls": 1.172393587231636, "d2.loss_bbox": 7.3522356986999515, "d3.loss_cls": 1.1807974189519883, "d3.loss_bbox": 7.363990998268127, "d4.loss_cls": 1.1806713700294496, "d4.loss_bbox": 7.23292065858841, "time": 25.078794717788696, "epoch": 1, "iter": 850, "step": 850}
7
+ {"lr": 0.0001, "data_time": 0.9448531866073608, "loss": 47.00728120803833, "loss_cls": 1.1108986586332321, "loss_bbox": 6.683765578269958, "d0.loss_cls": 1.058606892824173, "d0.loss_bbox": 6.775075042247773, "d1.loss_cls": 1.1145205676555634, "d1.loss_bbox": 6.805619835853577, "d2.loss_cls": 1.1085663586854935, "d2.loss_bbox": 6.700556540489197, "d3.loss_cls": 1.1268153429031371, "d3.loss_bbox": 6.825287544727326, "d4.loss_cls": 1.123038759827614, "d4.loss_bbox": 6.574530410766601, "time": 23.162017893791198, "epoch": 1, "iter": 900, "step": 900}
8
+ {"lr": 0.0001, "data_time": 0.9400171756744384, "loss": 41.35070466995239, "loss_cls": 1.158398947119713, "loss_bbox": 5.760552084445953, "d0.loss_cls": 1.089666599035263, "d0.loss_bbox": 5.752838516235352, "d1.loss_cls": 1.1419324547052383, "d1.loss_bbox": 5.745536303520202, "d2.loss_cls": 1.1454829275608063, "d2.loss_bbox": 5.826562070846558, "d3.loss_cls": 1.1492114216089249, "d3.loss_bbox": 5.729734838008881, "d4.loss_cls": 1.1491299778223039, "d4.loss_bbox": 5.701658976078034, "time": 22.88226662874222, "epoch": 1, "iter": 950, "step": 950}
9
+ {"lr": 0.0001, "data_time": 0.9400394678115844, "loss": 46.44832859039307, "loss_cls": 1.1757118165493012, "loss_bbox": 6.591959166526794, "d0.loss_cls": 1.098275139927864, "d0.loss_bbox": 6.536907982826233, "d1.loss_cls": 1.1439443498849868, "d1.loss_bbox": 6.561067795753479, "d2.loss_cls": 1.150243878364563, "d2.loss_bbox": 6.466058671474457, "d3.loss_cls": 1.160718309879303, "d3.loss_bbox": 6.50878232717514, "d4.loss_cls": 1.1740066707134247, "d4.loss_bbox": 6.8806525349617, "time": 23.910873460769654, "epoch": 1, "iter": 1000, "step": 1000}
10
+ {"NDS": 0.024568420980261345, "mAP": 0.00024522344729063566, "data_time": 0.5547568256204779, "time": 2.091413226994601, "step": 1000}
11
+ {"lr": 0.0001, "data_time": 0.9280791401863098, "loss": 35.56269006729126, "loss_cls": 1.1439091116189957, "loss_bbox": 4.727340304851532, "d0.loss_cls": 1.0815186262130738, "d0.loss_bbox": 4.890256083011627, "d1.loss_cls": 1.1172061949968337, "d1.loss_bbox": 4.832208633422852, "d2.loss_cls": 1.1246428489685059, "d2.loss_bbox": 4.8471301078796385, "d3.loss_cls": 1.1306370705366136, "d3.loss_bbox": 4.7437060356140135, "d4.loss_cls": 1.1405282199382782, "d4.loss_bbox": 4.783606648445129, "time": 22.772711980342866, "epoch": 1, "iter": 1050, "step": 1050}
12
+ {"lr": 0.0001, "data_time": 0.9757530927658081, "loss": 42.73475952148438, "loss_cls": 1.1595562100410461, "loss_bbox": 5.88040109872818, "d0.loss_cls": 1.0938430309295655, "d0.loss_bbox": 5.977837765216828, "d1.loss_cls": 1.1540126234292984, "d1.loss_bbox": 5.96634818315506, "d2.loss_cls": 1.1451045274734497, "d2.loss_bbox": 6.051124310493469, "d3.loss_cls": 1.1639264404773713, "d3.loss_bbox": 5.975519037246704, "d4.loss_cls": 1.1618152916431428, "d4.loss_bbox": 6.005270600318909, "time": 22.53203227519989, "epoch": 1, "iter": 1100, "step": 1100}