fangmingguo commited on
Commit
eb94ef6
·
verified ·
1 Parent(s): 910dcb3

Delete inference_axmodel.py

Browse files
Files changed (1) hide show
  1. inference_axmodel.py +0 -1001
inference_axmodel.py DELETED
@@ -1,1001 +0,0 @@
1
- #!/usr/bin/env python3
2
- import argparse
3
- import json
4
- import os
5
- import os.path as osp
6
- import cv2
7
- import numpy as np
8
- import axengine as axe
9
- from collections import defaultdict
10
- from tqdm import tqdm
11
-
12
-
13
def parse_args():
    """Build and parse the command-line arguments for this script."""
    ap = argparse.ArgumentParser(description='BEVFormer AXEngine Inference from Extracted Data')
    # Positional arguments.
    ap.add_argument('model', help='AXModel path')
    ap.add_argument('config_json', help='JSON config file path')
    ap.add_argument('data_dir', help='extracted data directory (extracted_data)')
    # Optional arguments.
    ap.add_argument('--output-dir', default='./inference_results_extracted', help='output directory')
    ap.add_argument('--score-thr', type=float, default=0.1, help='score threshold')
    ap.add_argument('--fps', type=int, default=3, help='video fps')
    ap.add_argument('--start-scene', type=int, default=0, help='start scene index')
    ap.add_argument('--end-scene', type=int, default=None, help='end scene index (None for all)')
    return ap.parse_args()
24
-
25
-
26
def load_axmodel(axmodel_path):
    """Create an axengine inference session for the given model file."""
    # Only the AxEngine provider is used for on-device inference.
    return axe.InferenceSession(axmodel_path, providers=['AxEngineExecutionProvider'])
31
-
32
-
33
def load_config_from_json(config_path):
    """Load configuration from a JSON file.

    Args:
        config_path: path to the JSON config file.

    Returns:
        The parsed configuration (typically a dict).
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)
38
-
39
-
40
def preprocess_image(img_path, img_norm_cfg, target_size=(480, 800)):
    """Load an image and convert it to a normalized CHW float32 array.

    Args:
        img_path: path to image file
        img_norm_cfg: normalization config with 'mean', 'std', 'to_rgb'
        target_size: (H, W) target size

    Returns:
        (C, H, W) normalized numpy array, float32

    Raises:
        ValueError: when the image cannot be read from disk.
    """
    image = cv2.imread(img_path)
    if image is None:
        raise ValueError(f"Cannot load image: {img_path}")

    # The normalization stats are RGB, so swap channels when requested.
    if img_norm_cfg.get('to_rgb', True):
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # cv2.resize takes (W, H) while target_size is (H, W).
    if image.shape[:2] != target_size:
        image = cv2.resize(image, (target_size[1], target_size[0]))

    mean = np.array(img_norm_cfg.get('mean', [123.675, 116.28, 103.53]), dtype=np.float32)
    std = np.array(img_norm_cfg.get('std', [58.395, 57.12, 57.375]), dtype=np.float32)

    normalized = (image.astype(np.float32) - mean) / std
    # (H, W, C) -> (C, H, W)
    return normalized.transpose(2, 0, 1)
73
-
74
-
75
def load_data(data_dir, scene_name, frame_idx):
    """Load one frame of extracted data.

    Args:
        data_dir: data directory path
        scene_name: scene name (scene token)
        frame_idx: frame index (sample index)

    Returns:
        img: (1, N, C, H, W) numpy array of preprocessed camera images
        lidar2img: (1, N, 4, 4) numpy array of projection matrices
        can_bus: (1, 18) numpy array of ego signals
        meta: dict with frame metadata
    """
    scene_dir = osp.join(data_dir, scene_name)

    # Frame metadata drives everything else (normalization, shapes, matrices).
    with open(osp.join(scene_dir, f'meta_{frame_idx:06d}.json'), 'r') as f:
        meta = json.load(f)

    img_norm_cfg = meta.get('img_norm_cfg', {
        'mean': [123.675, 116.28, 103.53],
        'std': [58.395, 57.12, 57.375],
        'to_rgb': True
    })

    # All cameras share the shape of the first entry; stored as (H, W, C).
    img_shape = meta.get('img_shape', [[480, 800, 3]] * 6)
    target_size = (img_shape[0][0], img_shape[0][1])  # (H, W)

    num_cams = meta.get('num_cams', 6)
    imgs = [
        preprocess_image(
            osp.join(scene_dir, f'cam_{cam_idx:02d}_{frame_idx:06d}.png'),
            img_norm_cfg,
            target_size)
        for cam_idx in range(num_cams)
    ]

    # Stack the cameras and add the leading batch dimension.
    img = np.stack(imgs, axis=0)[np.newaxis, ...]                                 # (1, N, C, H, W)
    lidar2img = np.asarray(meta['lidar2img'], dtype=np.float32)[np.newaxis, ...]  # (1, N, 4, 4)
    can_bus = np.asarray(meta['can_bus'], dtype=np.float32)[np.newaxis, ...]      # (1, 18)

    return img, lidar2img, can_bus, meta
128
-
129
# Per-class drawing colors (OpenCV BGR tuples), keyed by integer label id.
# NOTE(review): classes 2 and 8 both map to (0, 0, 255) — presumably
# intentional, but verify against the class list used during export.
CLASS_COLORS = {
    0: (0, 255, 0), 1: (255, 255, 0), 2: (0, 0, 255), 3: (0, 165, 255),
    4: (255, 0, 255), 5: (0, 255, 255), 6: (128, 0, 128), 7: (255, 165, 0),
    8: (0, 0, 255), 9: (128, 128, 128),
}
134
-
135
-
136
def denormalize_bbox_np(normalized_bboxes, pc_range):
    """Denormalize bbox predictions using numpy only.

    Args:
        normalized_bboxes: (..., 8) or (..., 10) array laid out as
            [cx, cy, w, l, cz, h, sin(rot), cos(rot), (vx, vy)].
        pc_range: point-cloud range; kept for interface compatibility but
            not used by this decoding.

    Returns:
        (..., 7) or (..., 9) array [cx, cy, cz, w, l, h, rot, (vx, vy)].
    """
    # rotation is encoded as (sin, cos); recover the angle
    rot_sine = normalized_bboxes[..., 6:7]
    rot_cosine = normalized_bboxes[..., 7:8]
    rot = np.arctan2(rot_sine, rot_cosine)

    # center in the bev
    cx = normalized_bboxes[..., 0:1]
    cy = normalized_bboxes[..., 1:2]
    cz = normalized_bboxes[..., 4:5]

    # size is predicted in log-space
    w = np.exp(normalized_bboxes[..., 2:3])
    l = np.exp(normalized_bboxes[..., 3:4])
    h = np.exp(normalized_bboxes[..., 5:6])

    if normalized_bboxes.shape[-1] > 8:
        # velocity; use `...` (not `:`) so inputs with extra leading
        # dimensions are handled consistently with the fields above
        vx = normalized_bboxes[..., 8:9]
        vy = normalized_bboxes[..., 9:10]
        return np.concatenate([cx, cy, cz, w, l, h, rot, vx, vy], axis=-1)
    return np.concatenate([cx, cy, cz, w, l, h, rot], axis=-1)
166
-
167
def decode_bboxes_custom_np(all_cls_scores, all_bbox_preds, pc_range, post_center_range, max_num=100, score_threshold=None, num_classes=10):
    """Custom bbox decode function.

    Sigmoid the class logits from the last decoder layer, take the global
    top-k over the flattened (query, class) scores, denormalize the matching
    box regressions, then filter by score and by BEV center range.

    Args:
        all_cls_scores: (num_layers, bs, num_query, num_classes) logits.
        all_bbox_preds: (num_layers, bs, num_query, 10) normalized boxes.
        pc_range: point-cloud range, forwarded to denormalize_bbox_np.
        post_center_range: [x_min, y_min, z_min, x_max, y_max, z_max] center
            filter, or None to skip it.
        max_num: number of top-scoring candidates kept before filtering.
        score_threshold: optional score cutoff; relaxed when it keeps nothing.
        num_classes: class count used to unflatten top-k indices.

    Returns:
        List of dicts (one per batch sample) with 'bboxes', 'scores', 'labels'.
    """
    # Use output from the last decoder layer
    all_cls_scores = all_cls_scores[-1]  # (bs, num_query, num_classes)
    all_bbox_preds = all_bbox_preds[-1]  # (bs, num_query, 10)

    batch_size = all_cls_scores.shape[0]
    predictions_list = []

    for i in range(batch_size):
        cls_scores = all_cls_scores[i]  # (num_query, num_classes)
        bbox_preds = all_bbox_preds[i]  # (num_query, 10)

        # Apply sigmoid
        cls_scores = 1.0 / (1.0 + np.exp(-cls_scores))

        # TopK selection over the flattened (query, class) scores
        cls_scores_flat = cls_scores.reshape(-1)
        topk_indices = np.argsort(cls_scores_flat)[::-1][:max_num]
        scores = cls_scores_flat[topk_indices]
        # Row-major flattening: remainder -> class id, quotient -> query index.
        labels = topk_indices % num_classes
        bbox_index = topk_indices // num_classes
        bbox_preds = bbox_preds[bbox_index]

        # Denormalize bbox
        final_box_preds = denormalize_bbox_np(bbox_preds, pc_range)  # (max_num, 9)
        final_scores = scores
        final_preds = labels

        # Apply score threshold. If nothing passes, relax the threshold by
        # 10% at a time (note the deliberate switch from strict > to >=);
        # below 0.01 give up and keep every candidate.
        if score_threshold is not None:
            thresh_mask = final_scores > score_threshold
            tmp_score = score_threshold
            while thresh_mask.sum() == 0:
                tmp_score *= 0.9
                if tmp_score < 0.01:
                    thresh_mask = np.ones(len(final_scores), dtype=bool)
                    break
                thresh_mask = final_scores >= tmp_score
        else:
            thresh_mask = np.ones(len(final_scores), dtype=bool)

        # Apply post processing range filtering on the decoded centers
        if post_center_range is not None:
            post_center_range_arr = np.array(post_center_range)
            mask = (final_box_preds[..., :3] >= post_center_range_arr[:3]).all(1)
            mask &= (final_box_preds[..., :3] <= post_center_range_arr[3:]).all(1)
            mask &= thresh_mask

            boxes3d = final_box_preds[mask]
            scores = final_scores[mask]
            labels = final_preds[mask]
        else:
            boxes3d = final_box_preds[thresh_mask]
            scores = final_scores[thresh_mask]
            labels = final_preds[thresh_mask]

        predictions_list.append({
            'bboxes': boxes3d,
            'scores': scores,
            'labels': labels
        })

    return predictions_list
231
-
232
-
233
def get_bboxes_custom_np(preds_dicts, pc_range, post_center_range, max_num=100, score_threshold=None, num_classes=10):
    """Decode predictions into per-sample (bboxes, scores, labels) tuples.

    Applies two post-decode adjustments: converts the gravity-center z into a
    bottom-center z, and shrinks box dimensions by a fixed 0.9 factor.
    """
    preds_list = decode_bboxes_custom_np(
        preds_dicts['all_cls_scores'],
        preds_dicts['all_bbox_preds'],
        pc_range,
        post_center_range,
        max_num,
        score_threshold,
        num_classes
    )

    ret_list = []
    for preds in preds_list:
        bboxes = preds['bboxes']

        if len(bboxes) == 0:
            # Keep the output structure stable even when nothing survived.
            ret_list.append((
                np.zeros((0, 9), dtype=np.float32),
                np.zeros((0,), dtype=np.float32),
                np.zeros((0,), dtype=np.int64)
            ))
            continue

        # Gravity center -> bottom center along z.
        bboxes[:, 2] -= bboxes[:, 5] * 0.5
        # Shrink w, l, h by 0.9 to counteract oversized predictions.
        bboxes[:, 3:6] *= 0.9

        ret_list.append((bboxes, preds['scores'], preds['labels']))

    return ret_list
273
-
274
-
275
def format_bbox_result_np(bboxes, scores, labels):
    """Pack detection arrays into the mmdet3d-style result dict."""
    result = dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
    return result
281
-
282
-
283
def rotation_3d_in_axis_np(points, angles, axis=2):
    """Rotate batched points around the z-axis by per-box angles.

    Args:
        points: (N, M, 3) array of point groups, one group per angle.
        angles: (N,) array of angles in radians.
        axis: rotation axis; only 2 / -1 (z-axis) is supported.

    Returns:
        (N, M, 3) rotated points (same dtype as `points`).

    Raises:
        ValueError: for any axis other than the z-axis.
    """
    if axis not in (2, -1):
        raise ValueError(f'Only axis=2 (z-axis) is supported for LiDAR boxes')

    sin_a = np.sin(angles)
    cos_a = np.cos(angles)
    one = np.ones_like(cos_a)
    zero = np.zeros_like(cos_a)

    # Per-angle z-rotation matrices, shape (N, 3, 3).
    rot_mat = np.stack([
        np.stack([cos_a, -sin_a, zero], axis=-1),
        np.stack([sin_a, cos_a, zero], axis=-1),
        np.stack([zero, zero, one], axis=-1),
    ], axis=-2).astype(points.dtype, copy=False)

    # Batched product: (N, M, 3) x (N, 3, 3) -> (N, M, 3).
    return np.einsum('aij,ajk->aik', points, rot_mat)
309
-
310
-
311
def compute_bbox_corners_np(bboxes):
    """Compute the 8 corners of each 3D bbox.

    Args:
        bboxes: (N, >=7) array [x, y, z, w, l, h, yaw, ...], where (x, y, z)
            is the BOTTOM center of the box.

    Returns:
        (N, 8, 3) corner coordinates in the same frame as the input.
    """
    if len(bboxes) == 0:
        return np.zeros((0, 8, 3), dtype=np.float32)

    dtype = bboxes.dtype

    # Extract bbox parameters
    centers = bboxes[:, :3]  # (N, 3) [x, y, z] - the bottom center
    w = bboxes[:, 3:4]  # width (y direction)
    l = bboxes[:, 4:5]  # length (x direction)
    h = bboxes[:, 5:6]  # height (z direction)
    dims = np.concatenate([l, w, h], axis=1)  # (N, 3) [x_size, y_size, z_size] = [l, w, h]
    yaws = bboxes[:, 6]  # (N,) yaw angle

    # Fix: offset yaw by -80 degrees (pi/2 - pi/18).
    # NOTE(review): this looks like an empirically tuned correction for the
    # exporter's yaw convention — confirm against the extraction pipeline.
    yaws = yaws - (np.pi / 2.0 - np.pi / 18.0)

    # Generate the unit-cube corners as all (0/1, 0/1, 0/1) combinations
    corners_norm = np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1).astype(dtype)

    # Rearrange to [0, 1, 3, 2, 4, 5, 7, 6] so wireframe edges connect
    # adjacent corners
    corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]

    # Use relative origin [0.5, 0.5, 0] (bottom center)
    corners_norm = corners_norm - np.array([0.5, 0.5, 0], dtype=dtype)

    # Scale corners: dims is [x_size, y_size, z_size]
    corners = dims[:, np.newaxis, :] * corners_norm[np.newaxis, :, :]  # (N, 8, 3)

    # Rotate around z-axis
    corners = rotation_3d_in_axis_np(corners, yaws, axis=2)

    # Translate to center point
    corners += centers[:, np.newaxis, :]

    return corners
348
-
349
-
350
def draw_bbox3d_on_img_custom_np(bboxes, raw_img, lidar2img_rt, color=(0, 255, 0), thickness=2):
    """Project 3D boxes through a lidar2img matrix and draw their wireframes.

    Args:
        bboxes: (N, >=7) boxes [x, y, z, w, l, h, yaw, ...] (bottom-center z).
        raw_img: (H, W, 3) image; a copy is drawn on and returned.
        lidar2img_rt: 4x4 projection matrix (anything reshapable to (4, 4)).
        color: BGR line color.
        thickness: line thickness in pixels.

    Returns:
        A new uint8 image with the box edges drawn.
    """
    img = raw_img.copy()

    if len(bboxes) == 0:
        return img

    if not isinstance(bboxes, np.ndarray):
        bboxes = np.array(bboxes)
    if not isinstance(lidar2img_rt, np.ndarray):
        lidar2img_rt = np.array(lidar2img_rt)

    lidar2img_rt = lidar2img_rt.reshape(4, 4)

    # 8 corners per box in LiDAR coordinates
    corners_3d = compute_bbox_corners_np(bboxes)  # (N, 8, 3)

    num_bbox = corners_3d.shape[0]

    # Homogeneous coordinates for projection
    corners_3d_flat = corners_3d.reshape(-1, 3)  # (N*8, 3)
    ones = np.ones((corners_3d_flat.shape[0], 1), dtype=np.float32)
    pts_4d = np.concatenate([corners_3d_flat, ones], axis=-1)  # (N*8, 4)

    # Project
    pts_2d = pts_4d @ lidar2img_rt.T  # (N*8, 4)

    # Perspective division.
    # NOTE(review): clipping depth to a small positive value means corners
    # behind the camera are still projected (mirrored); confirm acceptable.
    pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5)
    pts_2d[:, 0] /= pts_2d[:, 2]
    pts_2d[:, 1] /= pts_2d[:, 2]

    imgfov_pts_2d = pts_2d[:, :2].reshape(num_bbox, 8, 2)

    # Box edges as pairs of corner indices
    line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7),
                    (4, 5), (4, 7), (2, 6), (5, 6), (6, 7))

    # Image bounds are loop-invariant: hoisted out of the drawing loops
    # (the original recomputed img.shape[:2] once per edge).
    h, w = img.shape[:2]
    for i in range(num_bbox):
        corners = imgfov_pts_2d[i].astype(np.int32)
        for start, end in line_indices:
            pt1 = (int(corners[start, 0]), int(corners[start, 1]))
            pt2 = (int(corners[end, 0]), int(corners[end, 1]))
            # Draw only if at least one endpoint lands inside the image
            if (0 <= pt1[0] < w and 0 <= pt1[1] < h) or (0 <= pt2[0] < w and 0 <= pt2[1] < h):
                cv2.line(img, pt1, pt2, color, thickness, cv2.LINE_AA)

    return img.astype(np.uint8)
398
-
399
-
400
def post_process_outputs_np(all_cls_scores, all_bbox_preds, config, score_thr=0.1):
    """Decode raw head outputs and post-filter them into final detections.

    Args:
        all_cls_scores: (num_layers, bs, num_query, num_classes) logits.
        all_bbox_preds: (num_layers, bs, num_query, 10) box regressions.
        config: parsed JSON config; reads model.bbox_coder parameters.
        score_thr: fallback score threshold for classes not listed in the
            per-class table below.

    Returns:
        List (one per batch sample) of dicts with 'boxes_3d', 'scores_3d',
        'labels_3d'.
    """
    bbox_coder = config['model']['bbox_coder']
    pc_range = bbox_coder['pc_range']
    post_center_range = bbox_coder['post_center_range']
    max_num = bbox_coder['max_num']
    score_threshold = bbox_coder.get('score_threshold', None)
    num_classes = bbox_coder['num_classes']

    preds_dicts = {
        'all_cls_scores': all_cls_scores,
        'all_bbox_preds': all_bbox_preds
    }

    bbox_list = get_bboxes_custom_np(
        preds_dicts, pc_range, post_center_range,
        max_num, score_threshold, num_classes
    )

    # Per-class score thresholds (constant; hoisted out of the sample loop —
    # the original rebuilt this dict on every iteration).
    class_score_thrs = {
        0: 0.3,   # Car
        1: 0.3,   # Truck
        2: 0.3,   # Construction vehicle
        3: 0.3,   # Bus
        4: 0.3,   # Trailer
        5: 0.3,   # Barrier
        6: 0.3,   # Motorcycle
        7: 0.3,   # Bicycle
        8: 0.3,   # Pedestrian
        9: 0.3,   # Traffic cone
    }
    # Per-class circle-NMS radii (constant; also hoisted).
    dist_thrs = {
        0: 2.0, 1: 3.0, 2: 2.5, 3: 4.0, 4: 3.0,
        5: 1.0, 6: 1.5, 7: 1.0, 8: 0.5, 9: 0.3,
    }

    def _empty_result():
        # Uniform "no detections" payload.
        return format_bbox_result_np(
            np.zeros((0, 9), dtype=np.float32),
            np.zeros((0,), dtype=np.float32),
            np.zeros((0,), dtype=np.int64)
        )

    results = []
    for bboxes, scores, labels in bbox_list:
        # Per-class score filter (strict >, matching the original).
        keep_indices = [i for i in range(len(scores))
                        if scores[i] > class_score_thrs.get(int(labels[i]), score_thr)]

        if not keep_indices:
            results.append(_empty_result())
            continue

        keep_indices = np.array(keep_indices, dtype=np.int64)
        bboxes = bboxes[keep_indices]
        scores = scores[keep_indices]
        labels = labels[keep_indices]

        # Class-aware circle NMS on the surviving boxes.
        if len(scores) > 0:
            keep_nms = circle_nms_np(bboxes, scores, labels, dist_thrs)
            if len(keep_nms) > 0:
                bboxes = bboxes[keep_nms]
                scores = scores[keep_nms]
                labels = labels[keep_nms]
            else:
                results.append(_empty_result())
                continue

        results.append(format_bbox_result_np(bboxes, scores, labels))

    return results
478
-
479
-
480
def circle_nms_np(bboxes, scores, labels, dist_thrs):
    """Class-aware circle NMS on BEV box centers.

    Greedily keeps the highest-scoring box and suppresses lower-scoring boxes
    of the SAME class whose BEV center lies within a per-class radius.

    Args:
        bboxes: (N, >=2) boxes; only columns 0-1 (x, y) are used.
        scores: (N,) detection scores.
        labels: (N,) integer class ids.
        dist_thrs: {class_id: radius}; classes not listed default to 1.0.

    Returns:
        int64 array of kept indices into the ORIGINAL (unsorted) arrays,
        ordered by descending score.
    """
    n = len(bboxes)
    if n == 0:
        return np.array([], dtype=np.int64)

    # Process in descending score order; `order` maps back to input indices.
    order = np.argsort(scores)[::-1]
    centers = bboxes[order][:, :2]
    sorted_labels = labels[order]

    alive = np.ones(n, dtype=bool)
    kept = []

    for i in range(n):
        if not alive[i]:
            continue
        kept.append(order[i])

        cls = int(sorted_labels[i])
        radius = dist_thrs.get(cls, 1.0)

        if i + 1 < n:
            rest = slice(i + 1, n)
            dists = np.linalg.norm(centers[rest] - centers[i], axis=1)
            close_same_cls = (dists < radius) & (sorted_labels[rest] == cls)
            alive[rest] &= ~close_same_cls

    return np.array(kept, dtype=np.int64)
511
-
512
-
513
def denormalize_img_np(img_array, img_norm_cfg):
    """Undo mean/std normalization and return an (H, W, C) BGR uint8 image."""
    mean = np.array(img_norm_cfg.get('mean', [123.675, 116.28, 103.53]))
    std = np.array(img_norm_cfg.get('std', [58.395, 57.12, 57.375]))

    # Accept (C, H, W) input and move channels last; pass (H, W, C) through.
    img = img_array.transpose(1, 2, 0) if img_array.ndim == 3 else img_array

    restored = np.clip(img * std + mean, 0, 255).astype(np.uint8)
    # The normalized data is RGB; OpenCV drawing/writing expects BGR.
    return cv2.cvtColor(restored, cv2.COLOR_RGB2BGR)
527
-
528
-
529
def draw_bev_map(bboxes, labels, scores, pc_range, bev_size=(800, 800), score_thr=0.1):
    """Draw BEV (Bird's Eye View) map with detections

    Args:
        bboxes: (N, 9) numpy array, format: [x, y, z, w, l, h, yaw, vx, vy]
        labels: (N,) numpy array, class labels
        scores: (N,) numpy array, detection scores
        pc_range: [x_min, y_min, z_min, x_max, y_max, z_max]
        bev_size: (width, height) of BEV image
        score_thr: score threshold

    Returns:
        bev_img: (H, W, 3) numpy array, BEV visualization
    """
    bev_w, bev_h = bev_size  # BEV image size
    bev_img = np.ones((bev_h, bev_w, 3), dtype=np.uint8) * 255  # White background

    # Draw grid
    x_min, y_min, z_min, x_max, y_max, z_max = pc_range
    x_range = x_max - x_min
    y_range = y_max - y_min

    # Draw grid lines
    grid_color = (200, 200, 200)  # Light gray grid lines
    for i in range(-5, 6):
        x = x_min + (i + 5) * x_range / 10
        y = y_min + (i + 5) * y_range / 10
        # Vertical lines (y direction in LiDAR -> x direction in image)
        img_x = int((y - y_min) / y_range * bev_w)
        if 0 <= img_x < bev_w:
            cv2.line(bev_img, (img_x, 0), (img_x, bev_h), grid_color, 1)
        # Horizontal lines (x direction in LiDAR -> y direction in image, flipped)
        img_y = int((x_max - x) / x_range * bev_h)
        if 0 <= img_y < bev_h:
            cv2.line(bev_img, (0, img_y), (bev_w, img_y), grid_color, 1)

    # Draw center lines (ego vehicle position) - darker on white background
    center_x = int((0 - y_min) / y_range * bev_w)
    center_y = int((x_max - 0) / x_range * bev_h)
    cv2.line(bev_img, (center_x, 0), (center_x, bev_h), (150, 150, 150), 2)
    cv2.line(bev_img, (0, center_y), (bev_w, center_y), (150, 150, 150), 2)

    # Ego-vehicle footprint drawn in pixels, not meters.
    ego_length_px = 30  # pixels (representing ~4.5m, along x-axis rightward)
    ego_width_px = 12  # pixels (representing ~1.8m, along y-axis downward)

    ego_corners_local = np.array([
        [ego_length_px//2, -ego_width_px//2],  # front-top (head)
        [ego_length_px//2, ego_width_px//2],  # front-bottom
        [-ego_length_px//2, ego_width_px//2],  # back-bottom
        [-ego_length_px//2, -ego_width_px//2],  # back-top
    ], dtype=np.float32)

    # NOTE(review): the ego rectangle (and its arrow below) is rotated by
    # 90 degrees TWICE, i.e. a net 180-degree rotation — presumably a
    # deliberate orientation fix for this layout; confirm before simplifying.
    rotation_angle_90 = np.pi / 2  # 90 degrees in radians
    cos_rot_90 = np.cos(rotation_angle_90)
    sin_rot_90 = np.sin(rotation_angle_90)
    rot_mat_90 = np.array([[cos_rot_90, -sin_rot_90], [sin_rot_90, cos_rot_90]])

    ego_corners_rotated_90 = ego_corners_local @ rot_mat_90.T

    ego_corners_rotated = ego_corners_rotated_90 @ rot_mat_90.T

    # Translate to image coordinates (center position)
    ego_corners = []
    for corner in ego_corners_rotated:
        corner_img_x = int(center_x + corner[0])
        corner_img_y = int(center_y + corner[1])
        ego_corners.append([corner_img_x, corner_img_y])
    ego_corners = np.array(ego_corners, dtype=np.int32)

    # Draw filled rectangle
    cv2.fillPoly(bev_img, [ego_corners], (0, 0, 255))  # Red filled
    cv2.polylines(bev_img, [ego_corners], True, (0, 0, 0), 2)  # Black outline

    # Heading arrow for the ego vehicle (same double-90 rotation as above).
    arrow_length = ego_length_px // 2
    initial_direction = np.array([1.0, 0.0])
    arrow_dir_rotated_90 = initial_direction @ rot_mat_90.T
    arrow_dir_rotated = arrow_dir_rotated_90 @ rot_mat_90.T
    arrow_end_x = int(center_x + arrow_length * arrow_dir_rotated[0])
    arrow_end_y = int(center_y + arrow_length * arrow_dir_rotated[1])
    cv2.arrowedLine(bev_img, (center_x, center_y), (arrow_end_x, arrow_end_y),
                    (0, 0, 0), 3, tipLength=0.3)  # Black arrow

    # NOTE(review): these early returns skip the final 90-degree warp, the
    # horizontal flip and the 'BEV Map' label applied at the bottom of this
    # function, so an empty map is oriented differently — confirm intended.
    if len(bboxes) == 0:
        return bev_img

    if score_thr > 0:
        mask = scores > score_thr
        bboxes = bboxes[mask]
        labels = labels[mask]
        scores = scores[mask]

    if len(bboxes) == 0:
        return bev_img

    default_color = (255, 255, 255)

    for i in range(len(bboxes)):
        box = bboxes[i]
        label = int(labels[i])
        score = float(scores[i])  # NOTE(review): read but not used below
        color = CLASS_COLORS.get(label, default_color)

        x, y, z = box[0], box[1], box[2]  # center position (z unused in BEV)
        w, l, h = box[3], box[4], box[5]  # width, length, height (h unused)
        yaw = box[6]  # yaw angle

        yaw = yaw - np.pi / 2.0  # Subtract 90 degrees (counterclockwise)

        # Convert to image coordinates
        # Note: In LiDAR coordinate, x is forward, y is left, z is up
        # In BEV image (top-down view):
        #   - x (forward) -> image y (downward, flipped)
        #   - y (left) -> image x (rightward)
        # So: img_x = (y - y_min) / y_range * bev_w
        #     img_y = (x_max - x) / x_range * bev_h (flip x to get top-down view)
        img_x = int((y - y_min) / y_range * bev_w)
        img_y = int((x_max - x) / x_range * bev_h)  # Flip x for top-down view

        # Skip if outside image
        if not (0 <= img_x < bev_w and 0 <= img_y < bev_h):
            continue

        # Calculate box dimensions in image space
        box_w_px = int(w / x_range * bev_w)  # NOTE(review): unused below
        box_l_px = int(l / y_range * bev_h)

        # Draw rotated rectangle
        # Calculate 4 corners of the box in LiDAR coordinates
        cos_yaw = np.cos(yaw)
        sin_yaw = np.sin(yaw)

        # Box corners relative to center (in LiDAR frame: x forward, y left)
        corners_local = np.array([
            [l/2, w/2],  # front-right
            [l/2, -w/2],  # front-left
            [-l/2, -w/2],  # back-left
            [-l/2, w/2]  # back-right
        ])

        # Rotate corners
        rot_mat = np.array([[cos_yaw, -sin_yaw], [sin_yaw, cos_yaw]])
        corners_rotated = corners_local @ rot_mat.T

        # Translate to world coordinates and convert to image space
        corners_img = []
        for corner in corners_rotated:
            corner_x = x + corner[0]  # x in LiDAR (forward)
            corner_y = y + corner[1]  # y in LiDAR (left)
            corner_img_x = int((corner_y - y_min) / y_range * bev_w)  # y -> img_x
            corner_img_y = int((x_max - corner_x) / x_range * bev_h)  # x -> img_y (flipped)
            corners_img.append([corner_img_x, corner_img_y])

        corners_img = np.array(corners_img, dtype=np.int32)

        # Draw filled polygon (semi-transparent on white background)
        overlay = bev_img.copy()
        cv2.fillPoly(overlay, [corners_img], color)
        cv2.addWeighted(overlay, 0.5, bev_img, 0.5, 0, bev_img)
        # Draw outline (black on white background)
        cv2.polylines(bev_img, [corners_img], True, (0, 0, 0), 2)

        # Draw direction arrow (forward direction) - black on white
        # In LiDAR: forward is +x, left is +y
        # In BEV image: x -> img_y (flipped), y -> img_x
        # So rotation: img_x += sin(yaw) * length, img_y -= cos(yaw) * length
        arrow_length = max(box_l_px // 2, 10)
        arrow_end_x = int(img_x + arrow_length * sin_yaw)  # y component -> img_x
        arrow_end_y = int(img_y - arrow_length * cos_yaw)  # x component -> img_y (flipped)
        cv2.arrowedLine(bev_img, (img_x, img_y), (arrow_end_x, arrow_end_y),
                        (0, 0, 0), 2, tipLength=0.3)  # Black arrow

        # Draw center point
        cv2.circle(bev_img, (img_x, img_y), 3, (0, 0, 0), -1)  # Black center point

    # Rotate BEV map counterclockwise by 90 degrees (map only, not text)
    center = (bev_w // 2, bev_h // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, 90, 1.0)  # 90 degrees counterclockwise
    bev_img = cv2.warpAffine(bev_img, rotation_matrix, (bev_w, bev_h), borderValue=(255, 255, 255))

    # Flip horizontally to fix mirror effect
    bev_img = cv2.flip(bev_img, 1)  # 1 for horizontal flip

    # Corner label, drawn after the warp so the text stays upright.
    text = 'BEV Map'
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 1
    thickness = 2
    (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
    text_x = bev_w - text_width - 10
    text_y = text_height + 10
    cv2.putText(bev_img, text, (text_x, text_y), font, font_scale, (0, 0, 0), thickness)

    return bev_img
725
-
726
-
727
def visualize_results_np(img, result, lidar2img, img_norm_cfg, class_names, score_thr=0.3, pc_range=None):
    """Render a combined visualization: per-camera 3D boxes plus a BEV map.

    Args:
        img: (1, N, C, H, W) normalized camera tensor (N cameras).
        result: dict with 'boxes_3d', 'scores_3d', 'labels_3d'.
        lidar2img: (1, N, 4, 4) projection matrices.
        img_norm_cfg: normalization config used to restore the raw images.
        class_names: unused here; kept for interface compatibility.
        score_thr: score threshold applied before drawing.
        pc_range: BEV range; defaults to the 51.2 m square below.

    Returns:
        A single stitched uint8 image (camera grid + BEV panel).
    """
    num_cams = img.shape[1] if img.ndim == 5 else 1
    raw_imgs = [denormalize_img_np(img[0, cam_idx], img_norm_cfg) for cam_idx in range(num_cams)]
    boxes_3d = result.get('boxes_3d')
    scores_3d = result.get('scores_3d')
    labels_3d = result.get('labels_3d')
    vis_imgs = []
    boxes_3d_for_bev = labels_3d_for_bev = scores_3d_for_bev = None

    if boxes_3d is not None and len(boxes_3d) > 0:
        # NOTE(review): if scores_3d is None while boxes_3d is non-empty,
        # np.ones_like(None) raises — confirm callers always pass scores.
        mask = (scores_3d > score_thr) if (score_thr > 0 and scores_3d is not None) else np.ones_like(scores_3d, dtype=bool)
        if np.any(mask):
            boxes_3d = boxes_3d[mask]
            scores_3d = scores_3d[mask]
            labels_3d = labels_3d[mask]
            # Keep copies for the BEV panel before per-camera drawing.
            boxes_3d_for_bev = boxes_3d.copy()
            labels_3d_for_bev = labels_3d.copy()
            scores_3d_for_bev = scores_3d.copy()
            for cam_idx, vis_img in enumerate(raw_imgs):
                vis_img = vis_img.copy()
                if lidar2img.shape[1] > cam_idx:
                    cam_lidar2img = lidar2img[0, cam_idx]
                    for box, label in zip(boxes_3d, labels_3d):
                        color = CLASS_COLORS.get(int(label), (255, 255, 255))
                        # Best-effort: one bad projection must not abort the
                        # whole frame's visualization.
                        try:
                            vis_img = draw_bbox3d_on_img_custom_np(box[None], vis_img, cam_lidar2img, color=color, thickness=2)
                        except Exception:
                            pass
                vis_imgs.append(vis_img)
    else:
        vis_imgs = raw_imgs

    if pc_range is None:
        pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

    # Square BEV panel sized to the camera image height.
    if boxes_3d_for_bev is not None and len(boxes_3d_for_bev) > 0:
        bev_size = (vis_imgs[0].shape[0], vis_imgs[0].shape[0]) if vis_imgs else (800, 800)
        bev_img = draw_bev_map(boxes_3d_for_bev, labels_3d_for_bev, scores_3d_for_bev, pc_range, bev_size=bev_size, score_thr=score_thr)
    else:
        bev_size = (vis_imgs[0].shape[0], vis_imgs[0].shape[0]) if vis_imgs else (800, 800)
        bev_img = np.full((bev_size[1], bev_size[0], 3), 255, np.uint8)
        cv2.putText(bev_img, 'BEV Map (No Detections)', (10, bev_size[1]//2), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)

    if len(vis_imgs) == 6:
        # NOTE(review): `img` here shadows the function parameter of the
        # same name inside these comprehensions.
        target_height = max(img.shape[0] for img in vis_imgs)
        resized_imgs = [img if img.shape[0] == target_height else cv2.resize(img, (int(img.shape[1] * target_height / img.shape[0]), target_height)) for img in vis_imgs]

        # NOTE(review): hard-coded camera layout (2,0,1 on top; mirrored
        # 4,3,5 below) — presumably the nuScenes camera order; verify
        # against the extraction pipeline.
        reordered_imgs = [
            resized_imgs[2], resized_imgs[0], resized_imgs[1],
            cv2.flip(resized_imgs[4], 1), cv2.flip(resized_imgs[3], 1), cv2.flip(resized_imgs[5], 1)
        ]
        top_row = np.hstack(reordered_imgs[:3])
        bottom_row = np.hstack(reordered_imgs[3:])
        left_side = np.vstack([top_row, bottom_row])
        bev_img = cv2.resize(bev_img, (int(bev_img.shape[1] * left_side.shape[0] / bev_img.shape[0]), left_side.shape[0]))
        vis_img = np.hstack([left_side, bev_img])
    elif len(vis_imgs) > 1:
        target_height = max(img.shape[0] for img in vis_imgs)
        resized_imgs = [img if img.shape[0] == target_height else cv2.resize(img, (int(img.shape[1] * target_height / img.shape[0]), target_height)) for img in vis_imgs]
        if bev_img.shape[0] != target_height:
            bev_img = cv2.resize(bev_img, (int(bev_img.shape[1] * target_height / bev_img.shape[0]), target_height))
        vis_img = np.hstack([np.hstack(resized_imgs), bev_img])
    else:
        cam_img = vis_imgs[0] if vis_imgs else bev_img
        if bev_img.shape[0] != cam_img.shape[0]:
            bev_img = cv2.resize(bev_img, (int(bev_img.shape[1] * cam_img.shape[0] / bev_img.shape[0]), cam_img.shape[0]))
        vis_img = np.hstack([cam_img, bev_img]) if vis_imgs else bev_img

    return vis_img
796
-
797
-
798
def create_video_from_images(image_dir, output_video_path, fps=3):
    """Encode all images in a directory (sorted by filename) into a video.

    Silently returns if the directory has no images or the first frame
    cannot be read.

    Args:
        image_dir: directory containing .png/.jpg/.jpeg frames.
        output_video_path: output video file path.
        fps: frames per second of the resulting video.
    """
    # The unused `import subprocess` from the original has been removed.
    image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))])
    if len(image_files) == 0:
        return

    # The first frame fixes the output resolution.
    first_img = cv2.imread(osp.join(image_dir, image_files[0]))
    if first_img is None:
        return

    height, width = first_img.shape[:2]

    # Cap the resolution at 1080p, preserving aspect ratio.
    max_width, max_height = 1920, 1080
    if width > max_width or height > max_height:
        scale = min(max_width / width, max_height / height)
        width, height = int(width * scale), int(height * scale)

    # Prefer mp4v; fall back to XVID if that codec is unavailable.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
    if not video_writer.isOpened():
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    for img_file in tqdm(image_files, desc=f"Creating video: {osp.basename(output_video_path)}"):
        img_path = osp.join(image_dir, img_file)
        img = cv2.imread(img_path)
        if img is not None:
            # Every frame must match the writer's resolution exactly.
            if img.shape[:2] != (height, width):
                img = cv2.resize(img, (width, height))
            video_writer.write(img)

    video_writer.release()
831
-
832
def main():
    """Run BEVFormer inference with AXEngine over pre-extracted scene data.

    Pipeline:
      1. Parse CLI args and load the JSON model/dataset config.
      2. Load the compiled AXModel and read BEV grid parameters
         (``bev_h``, ``bev_w``, ``embed_dims``) from the config.
      3. Iterate scenes listed in ``scene_index.json``; for each frame build
         the model inputs (image tensor, lidar2img matrices, per-frame
         can_bus *delta*, previous BEV features), run inference,
         post-process detections, and render a visualization image.
      4. Write per-scene PNG frames and assemble each scene into an MP4.

    Side effects: reads from ``args.data_dir``, writes images/videos under
    ``args.output_dir``, prints progress to stdout.
    """
    args = parse_args()

    # Load configuration from JSON
    config = load_config_from_json(args.config_json)

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Load AXModel
    ax_session = load_axmodel(args.model)

    # Get model parameters from config
    transformer_cfg = config['model']['transformer']
    bev_h = transformer_cfg['bev_h']
    bev_w = transformer_cfg['bev_w']
    embed_dims = transformer_cfg['embed_dims']

    # Load scene index (maps scene name -> list of sample/frame indices)
    scene_index_path = osp.join(args.data_dir, 'scene_index.json')
    with open(scene_index_path, 'r') as f:
        scene_index_data = json.load(f)

    scenes_dict = scene_index_data['scenes']
    scene_names = list(scenes_dict.keys())

    # Clamp the requested scene range to what actually exists.
    end_scene = args.end_scene if args.end_scene is not None else len(scene_names)
    end_scene = min(end_scene, len(scene_names))

    # Temporal state carried across frames: previous BEV features plus the
    # ego pose (position + yaw angle) used to form per-frame can_bus deltas.
    prev_frame_info = {
        'prev_bev': None,
        'scene_token': None,
        'prev_pos': np.zeros(3, dtype=np.float32),
        'prev_angle': 0.0,
    }

    # Accumulates per-frame results for every scene; flushed to disk after
    # all scenes are processed.
    scene_results = defaultdict(list)

    # Process all scenes
    for scene_idx in range(args.start_scene, end_scene):
        scene_name = scene_names[scene_idx]
        scene_info = scenes_dict[scene_name]
        sample_indices = scene_info['samples']
        num_frames = len(sample_indices)

        print(f"Processing scene {scene_idx+1}/{len(scene_names)}: {scene_name} ({num_frames} frames)")

        # Reset prev_bev for new scene: temporal fusion must not leak BEV
        # features or pose deltas across scene boundaries.
        if scene_name != prev_frame_info['scene_token']:
            prev_frame_info['prev_bev'] = None
            prev_frame_info['prev_pos'] = np.zeros(3, dtype=np.float32)
            prev_frame_info['prev_angle'] = 0.0

        prev_frame_info['scene_token'] = scene_name

        # Process all frames in this scene
        for local_idx, frame_idx in enumerate(tqdm(sample_indices, desc=f"Scene {scene_name}")):
            # Load data
            img, lidar2img, can_bus, meta = load_data(args.data_dir, scene_name, frame_idx)

            # Process can_bus (compute delta). NOTE(review): assumes the
            # layout [x, y, z, ..., yaw] with position in [:3] and the
            # rotation angle as the last element — confirm against the
            # extraction script that produced the data.
            curr_can_bus_np = can_bus[0]  # (18,)

            # Snapshot the absolute pose BEFORE overwriting prev_* below.
            tmp_pos = curr_can_bus_np[:3].copy()
            tmp_angle = curr_can_bus_np[-1]

            delta_can_bus_np = curr_can_bus_np.copy()

            # The model consumes ego-motion deltas; on the first frame of a
            # scene (no prev_bev) the delta is zeroed.
            if prev_frame_info['prev_bev'] is not None and prev_frame_info['scene_token'] == scene_name:
                delta_can_bus_np[:3] -= prev_frame_info['prev_pos']
                delta_can_bus_np[-1] -= prev_frame_info['prev_angle']
            else:
                delta_can_bus_np[:3] = 0.0
                delta_can_bus_np[-1] = 0.0

            prev_frame_info['prev_pos'] = tmp_pos
            prev_frame_info['prev_angle'] = tmp_angle

            # Prepare prev_bev: resolve the expected input shape, preferring
            # the shape declared by the model; dynamic/unknown dims fall back
            # to (bev_h * bev_w, 1, embed_dims).
            prev_bev_input = next((inp for inp in ax_session.get_inputs() if inp.name == 'prev_bev'), None)
            expected_shape = (bev_h * bev_w, 1, embed_dims)
            if prev_bev_input is not None:
                expected_shape = list(prev_bev_input.shape)
                for i, dim in enumerate(expected_shape):
                    # Symbolic (str) or negative dims are dynamic placeholders.
                    if isinstance(dim, str) or dim < 0:
                        expected_shape[i] = (bev_h * bev_w, 1, embed_dims)[i] if i < 3 else 1
                expected_shape = tuple(expected_shape)

            if prev_frame_info['prev_bev'] is None:
                # First frame of a scene: feed zeros as the previous BEV.
                prev_bev = np.zeros(expected_shape, dtype=np.float32)
            else:
                prev_bev = prev_frame_info['prev_bev']
                if prev_bev.shape != expected_shape and len(prev_bev.shape) == 3:
                    prev_bev = prev_bev.reshape(expected_shape)

            # Prepare AXEngine inputs (cast to float32 as required)
            img_np = img.astype(np.float32)
            lidar2img_np = lidar2img.astype(np.float32)
            can_bus_np = delta_can_bus_np.reshape(1, -1).astype(np.float32)

            # Only feed the inputs this particular model actually declares.
            input_names = [inp.name for inp in ax_session.get_inputs()]
            ax_inputs = {}
            for name in input_names:
                if name == 'img':
                    ax_inputs['img'] = img_np
                elif name == 'can_bus':
                    ax_inputs['can_bus'] = can_bus_np
                elif name == 'lidar2img':
                    ax_inputs['lidar2img'] = lidar2img_np
                elif name == 'prev_bev':
                    ax_inputs['prev_bev'] = prev_bev

            # Run inference
            ax_outputs = ax_session.run(None, ax_inputs)
            bev_embed, all_cls_scores, all_bbox_preds = ax_outputs

            # Carry this frame's BEV features into the next frame.
            prev_frame_info['prev_bev'] = bev_embed

            # Post-process raw head outputs into per-sample detections.
            results = post_process_outputs_np(
                all_cls_scores, all_bbox_preds, config, args.score_thr
            )

            # Visualize detections projected onto the camera images.
            img_norm_cfg = config['img_norm']
            class_names = config['dataset']['class_names']
            pc_range = config['model']['bbox_coder']['pc_range']
            vis_img = visualize_results_np(
                img, results[0], lidar2img, img_norm_cfg, class_names, args.score_thr, pc_range=pc_range
            )

            scene_results[scene_name].append({
                'frame_idx': local_idx,
                'result': results[0],
                'vis_img': vis_img,
                'meta': meta
            })

    # Save results: per-scene PNG frames plus an assembled MP4.
    for scene_name, frames in tqdm(scene_results.items(), desc="Save scene results"):
        scene_dir = osp.join(args.output_dir, scene_name)
        os.makedirs(scene_dir, exist_ok=True)
        images_dir = osp.join(scene_dir, 'images')
        os.makedirs(images_dir, exist_ok=True)

        for local_idx, frame_data in enumerate(frames):
            vis_img = frame_data['vis_img']

            if vis_img is None:
                continue

            # Normalize whatever the visualizer returned into an HxWx3
            # uint8 image suitable for cv2.imwrite.
            if not isinstance(vis_img, np.ndarray):
                vis_img = np.array(vis_img)

            # Floats in [0, 1] are rescaled; anything else is truncated.
            if vis_img.dtype != np.uint8:
                vis_img = (vis_img * 255).astype(np.uint8) if vis_img.max() <= 1.0 else vis_img.astype(np.uint8)

            # CHW -> HWC if the first axis looks like a channel axis.
            if len(vis_img.shape) == 3 and vis_img.shape[0] in (1, 3):
                vis_img = vis_img.transpose(1, 2, 0)

            if vis_img.shape[0] > 0 and vis_img.shape[1] > 0:
                cv2.imwrite(osp.join(images_dir, f'frame_{local_idx:06d}.png'), vis_img)

        create_video_from_images(images_dir, osp.join(scene_dir, f'{scene_name}_result.mp4'), args.fps)
        print(f"✓ Scene {scene_name}: {len(frames)} frames, video: {osp.join(scene_dir, f'{scene_name}_result.mp4')}")
999
# Script entry point: run the inference pipeline only when executed directly.
if __name__ == '__main__':
    main()