fangmingguo committed
Commit 224aed4 · verified · 1 Parent(s): eb94ef6

Upload inference_axmodel.py

Files changed (1)
  1. inference_axmodel.py +1002 -0
inference_axmodel.py ADDED
@@ -0,0 +1,1002 @@
+ #!/usr/bin/env python3
+ import argparse
+ import json
+ import os
+ import os.path as osp
+ from collections import defaultdict
+
+ import cv2
+ import numpy as np
+ import axengine as axe
+ from tqdm import tqdm
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='BEVFormer AXEngine Inference from Extracted Data')
+     parser.add_argument('model', help='AXModel path')
+     parser.add_argument('config_json', help='JSON config file path')
+     parser.add_argument('data_dir', help='extracted data directory (extracted_data)')
+     parser.add_argument('--output-dir', default='./inference_results_extracted', help='output directory')
+     parser.add_argument('--score-thr', type=float, default=0.1, help='score threshold')
+     parser.add_argument('--fps', type=int, default=3, help='video fps')
+     parser.add_argument('--start-scene', type=int, default=0, help='start scene index')
+     parser.add_argument('--end-scene', type=int, default=None, help='end scene index (None for all)')
+     return parser.parse_args()
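+
+ # Example invocation (model and config file names are illustrative):
+ #   python3 inference_axmodel.py bevformer.axmodel config.json extracted_data \
+ #       --output-dir ./inference_results_extracted --score-thr 0.3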
+
+
+ def load_axmodel(axmodel_path):
+     """Load AXModel"""
+     # Try AxEngineExecutionProvider instead of AXCLRTExecutionProvider
+     providers = ['AxEngineExecutionProvider']
+     session = axe.InferenceSession(axmodel_path, providers=providers)
+     return session
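+
+ # Note: axengine mirrors the onnxruntime InferenceSession API. To our knowledge,
+ # AxEngineExecutionProvider runs on the on-board NPU, while
+ # AXCLRTExecutionProvider targets AXCL (PCIe accelerator card) deployments.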
+
+
+ def load_config_from_json(config_path):
+     """Load configuration from JSON file"""
+     with open(config_path, 'r') as f:
+         config = json.load(f)
+     return config
+
+
+ def preprocess_image(img_path, img_norm_cfg, target_size=(480, 800)):
+     """Preprocess image: load, resize, normalize
+
+     Args:
+         img_path: path to image file
+         img_norm_cfg: normalization config with 'mean', 'std', 'to_rgb'
+         target_size: (H, W) target size
+
+     Returns:
+         img: (C, H, W) normalized numpy array, float32
+     """
+     # Load image
+     img = cv2.imread(img_path)
+     if img is None:
+         raise ValueError(f"Cannot load image: {img_path}")
+
+     # Convert BGR to RGB if needed
+     if img_norm_cfg.get('to_rgb', True):
+         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+     # Resize if needed
+     if img.shape[:2] != target_size:
+         img = cv2.resize(img, (target_size[1], target_size[0]))  # (W, H)
+
+     # Convert to float and normalize
+     img = img.astype(np.float32)
+     mean = np.array(img_norm_cfg.get('mean', [123.675, 116.28, 103.53]), dtype=np.float32)
+     std = np.array(img_norm_cfg.get('std', [58.395, 57.12, 57.375]), dtype=np.float32)
+
+     img = (img - mean) / std
+     img = img.transpose(2, 0, 1)  # (H, W, C) -> (C, H, W)
+
+     return img
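+
+ # The default mean/std are the ImageNet statistics scaled to the 0-255 range
+ # ([0.485, 0.456, 0.406] * 255 and [0.229, 0.224, 0.225] * 255), i.e. the usual
+ # BEVFormer img_norm_cfg.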
+
+
+ def load_data(data_dir, scene_name, frame_idx):
+     """Load data
+
+     Args:
+         data_dir: data directory path
+         scene_name: scene name (scene token)
+         frame_idx: frame index (sample index)
+
+     Returns:
+         img: (1, N, C, H, W) numpy array
+         lidar2img: (1, N, 4, 4) numpy array
+         can_bus: (1, 18) numpy array
+         meta: dict with metadata
+     """
+     scene_dir = osp.join(data_dir, scene_name)
+
+     # Load meta
+     meta_path = osp.join(scene_dir, f'meta_{frame_idx:06d}.json')
+     with open(meta_path, 'r') as f:
+         meta = json.load(f)
+
+     # Get normalization config
+     img_norm_cfg = meta.get('img_norm_cfg', {
+         'mean': [123.675, 116.28, 103.53],
+         'std': [58.395, 57.12, 57.375],
+         'to_rgb': True
+     })
+
+     # Get image shape
+     img_shape = meta.get('img_shape', [[480, 800, 3]] * 6)
+     target_size = (img_shape[0][0], img_shape[0][1])  # (H, W)
+
+     # Load images for all cameras
+     num_cams = meta.get('num_cams', 6)
+     imgs = []
+     for cam_idx in range(num_cams):
+         img_path = osp.join(scene_dir, f'cam_{cam_idx:02d}_{frame_idx:06d}.png')
+         img = preprocess_image(img_path, img_norm_cfg, target_size)
+         imgs.append(img)
+
+     # Stack images: (N, C, H, W) -> (1, N, C, H, W)
+     img = np.stack(imgs, axis=0)  # (N, C, H, W)
+     img = img[np.newaxis, ...]  # (1, N, C, H, W)
+
+     # Load lidar2img: (N, 4, 4) -> (1, N, 4, 4)
+     lidar2img = np.array(meta['lidar2img'], dtype=np.float32)  # (N, 4, 4)
+     lidar2img = lidar2img[np.newaxis, ...]  # (1, N, 4, 4)
+
+     # Load can_bus: (18,) -> (1, 18)
+     can_bus = np.array(meta['can_bus'], dtype=np.float32)  # (18,)
+     can_bus = can_bus[np.newaxis, ...]  # (1, 18)
+
+     return img, lidar2img, can_bus, meta
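+
+ # Expected on-disk layout per scene, inferred from the paths above:
+ #   <data_dir>/<scene_name>/meta_000000.json
+ #   <data_dir>/<scene_name>/cam_00_000000.png ... cam_05_000000.png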
+
+ CLASS_COLORS = {
+     0: (0, 255, 0), 1: (255, 255, 0), 2: (0, 0, 255), 3: (0, 165, 255),
+     4: (255, 0, 255), 5: (0, 255, 255), 6: (128, 0, 128), 7: (255, 165, 0),
+     8: (0, 0, 255), 9: (128, 128, 128),
+ }
+
+
+ def denormalize_bbox_np(normalized_bboxes, pc_range):
+     """Denormalize bbox using numpy only"""
+     # rotation
+     rot_sine = normalized_bboxes[..., 6:7]
+     rot_cosine = normalized_bboxes[..., 7:8]
+
+     rot = np.arctan2(rot_sine, rot_cosine)
+
+     # center in the bev
+     cx = normalized_bboxes[..., 0:1]
+     cy = normalized_bboxes[..., 1:2]
+     cz = normalized_bboxes[..., 4:5]
+
+     # size
+     w = normalized_bboxes[..., 2:3]
+     l = normalized_bboxes[..., 3:4]
+     h = normalized_bboxes[..., 5:6]
+
+     w = np.exp(w)
+     l = np.exp(l)
+     h = np.exp(h)
+
+     if normalized_bboxes.shape[-1] > 8:
+         # velocity
+         vx = normalized_bboxes[:, 8:9]
+         vy = normalized_bboxes[:, 9:10]
+         denormalized_bboxes = np.concatenate([cx, cy, cz, w, l, h, rot, vx, vy], axis=-1)
+     else:
+         denormalized_bboxes = np.concatenate([cx, cy, cz, w, l, h, rot], axis=-1)
+     return denormalized_bboxes
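+
+ # Box encoding assumed here (the DETR3D/BEVFormer convention): the head emits
+ # (cx, cy, log w, log l, cz, log h, sin yaw, cos yaw[, vx, vy]); decoding applies
+ # exp() to the sizes and atan2(sin, cos) to recover yaw.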
+
+
+ def decode_bboxes_custom_np(all_cls_scores, all_bbox_preds, pc_range, post_center_range,
+                             max_num=100, score_threshold=None, num_classes=10):
+     """Custom bbox decode function"""
+     # Use output from the last decoder layer
+     all_cls_scores = all_cls_scores[-1]  # (bs, num_query, num_classes)
+     all_bbox_preds = all_bbox_preds[-1]  # (bs, num_query, 10)
+
+     batch_size = all_cls_scores.shape[0]
+     predictions_list = []
+
+     for i in range(batch_size):
+         cls_scores = all_cls_scores[i]  # (num_query, num_classes)
+         bbox_preds = all_bbox_preds[i]  # (num_query, 10)
+
+         # Apply sigmoid
+         cls_scores = 1.0 / (1.0 + np.exp(-cls_scores))
+
+         # TopK selection
+         cls_scores_flat = cls_scores.reshape(-1)
+         topk_indices = np.argsort(cls_scores_flat)[::-1][:max_num]
+         scores = cls_scores_flat[topk_indices]
+         labels = topk_indices % num_classes
+         bbox_index = topk_indices // num_classes
+         bbox_preds = bbox_preds[bbox_index]
+
+         # Denormalize bbox
+         final_box_preds = denormalize_bbox_np(bbox_preds, pc_range)  # (max_num, 9)
+         final_scores = scores
+         final_preds = labels
+
+         # Apply score threshold
+         if score_threshold is not None:
+             thresh_mask = final_scores > score_threshold
+             tmp_score = score_threshold
+             while thresh_mask.sum() == 0:
+                 tmp_score *= 0.9
+                 if tmp_score < 0.01:
+                     thresh_mask = np.ones(len(final_scores), dtype=bool)
+                     break
+                 thresh_mask = final_scores >= tmp_score
+         else:
+             thresh_mask = np.ones(len(final_scores), dtype=bool)
+
+         # Apply post processing range filtering
+         if post_center_range is not None:
+             post_center_range_arr = np.array(post_center_range)
+             mask = (final_box_preds[..., :3] >= post_center_range_arr[:3]).all(1)
+             mask &= (final_box_preds[..., :3] <= post_center_range_arr[3:]).all(1)
+             mask &= thresh_mask
+
+             boxes3d = final_box_preds[mask]
+             scores = final_scores[mask]
+             labels = final_preds[mask]
+         else:
+             boxes3d = final_box_preds[thresh_mask]
+             scores = final_scores[thresh_mask]
+             labels = final_preds[thresh_mask]
+
+         predictions_list.append({
+             'bboxes': boxes3d,
+             'scores': scores,
+             'labels': labels
+         })
+
+     return predictions_list
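+
+ # The top-k runs over the flattened (num_query * num_classes) score matrix, so a
+ # single flat index encodes both query and class: with num_classes=10, flat
+ # index 57 means query 57 // 10 = 5 and label 57 % 10 = 7.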
+
+
+ def get_bboxes_custom_np(preds_dicts, pc_range, post_center_range, max_num=100,
+                          score_threshold=None, num_classes=10):
+     """Custom get_bboxes function"""
+     # Decode bounding boxes
+     preds_list = decode_bboxes_custom_np(
+         preds_dicts['all_cls_scores'],
+         preds_dicts['all_bbox_preds'],
+         pc_range,
+         post_center_range,
+         max_num,
+         score_threshold,
+         num_classes
+     )
+
+     num_samples = len(preds_list)
+     ret_list = []
+
+     for i in range(num_samples):
+         preds = preds_list[i]
+         bboxes = preds['bboxes']
+
+         if len(bboxes) == 0:
+             ret_list.append((
+                 np.zeros((0, 9), dtype=np.float32),
+                 np.zeros((0,), dtype=np.float32),
+                 np.zeros((0,), dtype=np.int64)
+             ))
+             continue
+
+         # Adjust z coordinate: convert center z to bottom center z
+         bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+
+         # Shrink box dimensions: multiply w, l, h by 0.9 to fix oversized boxes
+         bboxes[:, 3:6] = bboxes[:, 3:6] * 0.9
+
+         scores = preds['scores']
+         labels = preds['labels']
+
+         ret_list.append((bboxes, scores, labels))
+
+     return ret_list
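+
+ # The z shift converts the predicted gravity-center z to a bottom-center z (the
+ # mmdet3d LiDAR box convention); the 0.9 shrink is an empirical visual tweak by
+ # the author, not part of the standard BEVFormer decoder.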
+
+
+ def format_bbox_result_np(bboxes, scores, labels):
+     return {
+         'boxes_3d': bboxes,
+         'scores_3d': scores,
+         'labels_3d': labels
+     }
+
+
+ def rotation_3d_in_axis_np(points, angles, axis=2):
+     """Rotate points by angles according to axis"""
+     rot_sin = np.sin(angles)
+     rot_cos = np.cos(angles)
+     ones = np.ones_like(rot_cos)
+     zeros = np.zeros_like(rot_cos)
+
+     if axis == 2 or axis == -1:
+         # Rotate around z-axis
+         # Build rotation matrix: (N, 3, 3)
+         N = len(angles)
+         rot_mat = np.zeros((N, 3, 3), dtype=points.dtype)
+         rot_mat[:, 0, 0] = rot_cos
+         rot_mat[:, 0, 1] = -rot_sin
+         rot_mat[:, 0, 2] = zeros
+         rot_mat[:, 1, 0] = rot_sin
+         rot_mat[:, 1, 1] = rot_cos
+         rot_mat[:, 1, 2] = zeros
+         rot_mat[:, 2, 0] = zeros
+         rot_mat[:, 2, 1] = zeros
+         rot_mat[:, 2, 2] = ones
+
+         # Rotation: (N, M, 3) @ (N, 3, 3) -> (N, M, 3)
+         return np.einsum('aij,ajk->aik', points, rot_mat)
+     else:
+         raise ValueError('Only axis=2 (z-axis) is supported for LiDAR boxes')
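+
+ # The einsum is a batched matrix multiply: for each box a it computes
+ # points[a] @ rot_mat[a], rotating that box's corners about the z-axis.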
+
+
+ def compute_bbox_corners_np(bboxes):
+     """Compute 8 corners of 3D bbox"""
+     if len(bboxes) == 0:
+         return np.zeros((0, 8, 3), dtype=np.float32)
+
+     dtype = bboxes.dtype
+
+     # Extract bbox parameters
+     centers = bboxes[:, :3]  # (N, 3) [x, y, z] - the bottom center
+     w = bboxes[:, 3:4]  # width (y direction)
+     l = bboxes[:, 4:5]  # length (x direction)
+     h = bboxes[:, 5:6]  # height (z direction)
+     dims = np.concatenate([l, w, h], axis=1)  # (N, 3) [x_size, y_size, z_size] = [l, w, h]
+     yaws = bboxes[:, 6]  # (N,) yaw angle
+
+     # Fix: offset yaw by -80 degrees
+     yaws = yaws - (np.pi / 2.0 - np.pi / 18.0)
+
+     # Generate corners
+     corners_norm = np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1).astype(dtype)
+
+     # Rearrange to [0, 1, 3, 2, 4, 5, 7, 6]
+     corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
+
+     # Use relative origin [0.5, 0.5, 0] (bottom center)
+     corners_norm = corners_norm - np.array([0.5, 0.5, 0], dtype=dtype)
+
+     # Scale corners: dims is [x_size, y_size, z_size]
+     corners = dims[:, np.newaxis, :] * corners_norm[np.newaxis, :, :]  # (N, 8, 3)
+
+     # Rotate around z-axis
+     corners = rotation_3d_in_axis_np(corners, yaws, axis=2)
+
+     # Translate to center point
+     corners += centers[:, np.newaxis, :]
+
+     return corners
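+
+ # With the (0.5, 0.5, 0) origin, the unit-cube corners are centered in x/y but
+ # anchored at z=0, so adding `centers` (bottom centers) places each box on its
+ # base; the [0, 1, 3, 2, 4, 5, 7, 6] reorder matches the edge table used in
+ # draw_bbox3d_on_img_custom_np below.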
+
+
+ def draw_bbox3d_on_img_custom_np(bboxes, raw_img, lidar2img_rt, color=(0, 255, 0), thickness=2):
+     """Custom 3D bbox drawing"""
+     img = raw_img.copy()
+
+     if len(bboxes) == 0:
+         return img
+
+     if not isinstance(bboxes, np.ndarray):
+         bboxes = np.array(bboxes)
+     if not isinstance(lidar2img_rt, np.ndarray):
+         lidar2img_rt = np.array(lidar2img_rt)
+
+     lidar2img_rt = lidar2img_rt.reshape(4, 4)
+
+     # Compute corners
+     corners_3d = compute_bbox_corners_np(bboxes)  # (N, 8, 3)
+
+     num_bbox = corners_3d.shape[0]
+
+     # Project to 2D
+     corners_3d_flat = corners_3d.reshape(-1, 3)  # (N*8, 3)
+     ones = np.ones((corners_3d_flat.shape[0], 1), dtype=np.float32)
+     pts_4d = np.concatenate([corners_3d_flat, ones], axis=-1)  # (N*8, 4)
+
+     # Project
+     pts_2d = pts_4d @ lidar2img_rt.T  # (N*8, 4)
+
+     # Perspective division
+     pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5)
+     pts_2d[:, 0] /= pts_2d[:, 2]
+     pts_2d[:, 1] /= pts_2d[:, 2]
+
+     imgfov_pts_2d = pts_2d[:, :2].reshape(num_bbox, 8, 2)
+
+     line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7),
+                     (4, 5), (4, 7), (2, 6), (5, 6), (6, 7))
+
+     h, w = img.shape[:2]
+     for i in range(num_bbox):
+         corners = imgfov_pts_2d[i].astype(np.int32)
+         for start, end in line_indices:
+             pt1 = (int(corners[start, 0]), int(corners[start, 1]))
+             pt2 = (int(corners[end, 0]), int(corners[end, 1]))
+             # Check if points are within image range
+             if (0 <= pt1[0] < w and 0 <= pt1[1] < h) or (0 <= pt2[0] < w and 0 <= pt2[1] < h):
+                 cv2.line(img, pt1, pt2, color, thickness, cv2.LINE_AA)
+
+     return img.astype(np.uint8)
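+
+ # Caveat: depth is clipped to >= 1e-5 rather than culling corners behind the
+ # camera, so such corners can project to extreme pixel coordinates; the
+ # on-screen check above suppresses lines whose endpoints both land off-screen.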
+
+
+ def post_process_outputs_np(all_cls_scores, all_bbox_preds, config, score_thr=0.1):
+     bbox_coder = config['model']['bbox_coder']
+     pc_range = bbox_coder['pc_range']
+     post_center_range = bbox_coder['post_center_range']
+     max_num = bbox_coder['max_num']
+     score_threshold = bbox_coder.get('score_threshold', None)
+     num_classes = bbox_coder['num_classes']
+
+     preds_dicts = {
+         'all_cls_scores': all_cls_scores,
+         'all_bbox_preds': all_bbox_preds
+     }
+
+     bbox_list = get_bboxes_custom_np(
+         preds_dicts, pc_range, post_center_range,
+         max_num, score_threshold, num_classes
+     )
+
+     results = []
+     for bboxes, scores, labels in bbox_list:
+         # Set class score thresholds
+         class_score_thrs = {
+             0: 0.3,  # Car
+             1: 0.3,  # Truck
+             2: 0.3,  # Construction vehicle
+             3: 0.3,  # Bus
+             4: 0.3,  # Trailer
+             5: 0.3,  # Barrier
+             6: 0.3,  # Motorcycle
+             7: 0.3,  # Bicycle
+             8: 0.3,  # Pedestrian
+             9: 0.3,  # Traffic cone
+         }
+         default_thr = score_thr
+
+         keep_indices = []
+         for i in range(len(scores)):
+             cls_id = int(labels[i])
+             thr = class_score_thrs.get(cls_id, default_thr)
+             if scores[i] > thr:
+                 keep_indices.append(i)
+
+         if len(keep_indices) == 0:
+             results.append(format_bbox_result_np(
+                 np.zeros((0, 9), dtype=np.float32),
+                 np.zeros((0,), dtype=np.float32),
+                 np.zeros((0,), dtype=np.int64)
+             ))
+             continue
+
+         keep_indices = np.array(keep_indices, dtype=np.int64)
+         bboxes = bboxes[keep_indices]
+         scores = scores[keep_indices]
+         labels = labels[keep_indices]
+
+         # Circle NMS
+         dist_thrs = {
+             0: 2.0, 1: 3.0, 2: 2.5, 3: 4.0, 4: 3.0,
+             5: 1.0, 6: 1.5, 7: 1.0, 8: 0.5, 9: 0.3,
+         }
+
+         if len(scores) > 0:
+             keep_nms = circle_nms_np(bboxes, scores, labels, dist_thrs)
+             if len(keep_nms) > 0:
+                 bboxes = bboxes[keep_nms]
+                 scores = scores[keep_nms]
+                 labels = labels[keep_nms]
+             else:
+                 results.append(format_bbox_result_np(
+                     np.zeros((0, 9), dtype=np.float32),
+                     np.zeros((0,), dtype=np.float32),
+                     np.zeros((0,), dtype=np.int64)
+                 ))
+                 continue
+
+         results.append(format_bbox_result_np(bboxes, scores, labels))
+
+     return results
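+
+ # Pipeline summary: decode -> per-class score gate (every class uses 0.3 here,
+ # so the table is effectively a uniform threshold) -> class-aware circle NMS
+ # with per-class radii in meters.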
+
+
+ def circle_nms_np(bboxes, scores, labels, dist_thrs):
+     if len(bboxes) == 0:
+         return np.array([], dtype=np.int64)
+
+     keep = []
+     order = np.argsort(scores)[::-1]
+     bboxes = bboxes[order]
+     scores = scores[order]
+     labels = labels[order]
+
+     pts = bboxes[:, :2]
+     labels_np = labels
+
+     suppressed = np.zeros(len(bboxes), dtype=bool)
+
+     for i in range(len(bboxes)):
+         if suppressed[i]:
+             continue
+         keep.append(order[i])
+
+         curr_cls = int(labels_np[i])
+         radius = dist_thrs.get(curr_cls, 1.0)
+
+         if i + 1 < len(bboxes):
+             dists = np.linalg.norm(pts[i+1:] - pts[i], axis=1)
+             idx_to_suppress = np.where(
+                 (dists < radius) & (labels_np[i+1:] == curr_cls)
+             )[0]
+             suppressed[i+1:][idx_to_suppress] = True
+
+     return np.array(keep, dtype=np.int64)
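+
+ # Circle NMS in the CenterPoint style: greedily keep the highest-scoring box,
+ # then suppress later boxes of the same class whose BEV centers fall within the
+ # class radius. `keep` holds indices into the caller's original (pre-sort) arrays.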
+
+
+ def denormalize_img_np(img_array, img_norm_cfg):
+     """Denormalize image array (C, H, W) to (H, W, C) BGR"""
+     mean = np.array(img_norm_cfg.get('mean', [123.675, 116.28, 103.53]))
+     std = np.array(img_norm_cfg.get('std', [58.395, 57.12, 57.375]))
+
+     # (C, H, W) RGB -> (H, W, C) RGB
+     if img_array.ndim == 3:
+         img = img_array.transpose(1, 2, 0)
+     else:
+         img = img_array
+     img = (img * std + mean)
+     img = np.clip(img, 0, 255).astype(np.uint8)
+     img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+     return img
+
+
+ def draw_bev_map(bboxes, labels, scores, pc_range, bev_size=(800, 800), score_thr=0.1):
+     """Draw BEV (Bird's Eye View) map with detections
+
+     Args:
+         bboxes: (N, 9) numpy array, format: [x, y, z, w, l, h, yaw, vx, vy]
+         labels: (N,) numpy array, class labels
+         scores: (N,) numpy array, detection scores
+         pc_range: [x_min, y_min, z_min, x_max, y_max, z_max]
+         bev_size: (width, height) of BEV image
+         score_thr: score threshold
+
+     Returns:
+         bev_img: (H, W, 3) numpy array, BEV visualization
+     """
+     bev_w, bev_h = bev_size  # BEV image size
+     bev_img = np.ones((bev_h, bev_w, 3), dtype=np.uint8) * 255  # White background
+
+     # Draw grid
+     x_min, y_min, z_min, x_max, y_max, z_max = pc_range
+     x_range = x_max - x_min
+     y_range = y_max - y_min
+
+     # Draw grid lines
+     grid_color = (200, 200, 200)  # Light gray grid lines
+     for i in range(-5, 6):
+         x = x_min + (i + 5) * x_range / 10
+         y = y_min + (i + 5) * y_range / 10
+         # Vertical lines (y direction in LiDAR -> x direction in image)
+         img_x = int((y - y_min) / y_range * bev_w)
+         if 0 <= img_x < bev_w:
+             cv2.line(bev_img, (img_x, 0), (img_x, bev_h), grid_color, 1)
+         # Horizontal lines (x direction in LiDAR -> y direction in image, flipped)
+         img_y = int((x_max - x) / x_range * bev_h)
+         if 0 <= img_y < bev_h:
+             cv2.line(bev_img, (0, img_y), (bev_w, img_y), grid_color, 1)
+
+     # Draw center lines (ego vehicle position) - darker on white background
+     center_x = int((0 - y_min) / y_range * bev_w)
+     center_y = int((x_max - 0) / x_range * bev_h)
+     cv2.line(bev_img, (center_x, 0), (center_x, bev_h), (150, 150, 150), 2)
+     cv2.line(bev_img, (0, center_y), (bev_w, center_y), (150, 150, 150), 2)
+
+     ego_length_px = 30  # pixels (representing ~4.5m, along x-axis rightward)
+     ego_width_px = 12   # pixels (representing ~1.8m, along y-axis downward)
+
+     ego_corners_local = np.array([
+         [ego_length_px // 2, -ego_width_px // 2],   # front-top (head)
+         [ego_length_px // 2, ego_width_px // 2],    # front-bottom
+         [-ego_length_px // 2, ego_width_px // 2],   # back-bottom
+         [-ego_length_px // 2, -ego_width_px // 2],  # back-top
+     ], dtype=np.float32)
+
+     rotation_angle_90 = np.pi / 2  # 90 degrees in radians
+     cos_rot_90 = np.cos(rotation_angle_90)
+     sin_rot_90 = np.sin(rotation_angle_90)
+     rot_mat_90 = np.array([[cos_rot_90, -sin_rot_90], [sin_rot_90, cos_rot_90]])
+
+     ego_corners_rotated_90 = ego_corners_local @ rot_mat_90.T
+     ego_corners_rotated = ego_corners_rotated_90 @ rot_mat_90.T
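+     # Two successive 90-degree rotations amount to rotating the ego marker by
+     # 180 degrees; presumably this pairs with the rotate-and-flip applied to the
+     # whole map at the end of this function.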
+
+     # Translate to image coordinates (center position)
+     ego_corners = []
+     for corner in ego_corners_rotated:
+         corner_img_x = int(center_x + corner[0])
+         corner_img_y = int(center_y + corner[1])
+         ego_corners.append([corner_img_x, corner_img_y])
+     ego_corners = np.array(ego_corners, dtype=np.int32)
+
+     # Draw filled rectangle
+     cv2.fillPoly(bev_img, [ego_corners], (0, 0, 255))  # Red filled
+     cv2.polylines(bev_img, [ego_corners], True, (0, 0, 0), 2)  # Black outline
+
+     arrow_length = ego_length_px // 2
+     initial_direction = np.array([1.0, 0.0])
+     arrow_dir_rotated_90 = initial_direction @ rot_mat_90.T
+     arrow_dir_rotated = arrow_dir_rotated_90 @ rot_mat_90.T
+     arrow_end_x = int(center_x + arrow_length * arrow_dir_rotated[0])
+     arrow_end_y = int(center_y + arrow_length * arrow_dir_rotated[1])
+     cv2.arrowedLine(bev_img, (center_x, center_y), (arrow_end_x, arrow_end_y),
+                     (0, 0, 0), 3, tipLength=0.3)  # Black arrow
+
+     if len(bboxes) == 0:
+         return bev_img
+
+     if score_thr > 0:
+         mask = scores > score_thr
+         bboxes = bboxes[mask]
+         labels = labels[mask]
+         scores = scores[mask]
+
+     if len(bboxes) == 0:
+         return bev_img
+
+     default_color = (255, 255, 255)
+
+     for i in range(len(bboxes)):
+         box = bboxes[i]
+         label = int(labels[i])
+         score = float(scores[i])
+         color = CLASS_COLORS.get(label, default_color)
+
+         x, y, z = box[0], box[1], box[2]  # center position
+         w, l, h = box[3], box[4], box[5]  # width, length, height
+         yaw = box[6]  # yaw angle
+
+         yaw = yaw - np.pi / 2.0  # Subtract 90 degrees (counterclockwise)
+
+         # Convert to image coordinates
+         # Note: In LiDAR coordinate, x is forward, y is left, z is up
+         # In BEV image (top-down view):
+         #   - x (forward) -> image y (downward, flipped)
+         #   - y (left) -> image x (rightward)
+         # So: img_x = (y - y_min) / y_range * bev_w
+         #     img_y = (x_max - x) / x_range * bev_h (flip x to get top-down view)
+         img_x = int((y - y_min) / y_range * bev_w)
+         img_y = int((x_max - x) / x_range * bev_h)  # Flip x for top-down view
+
+         # Skip if outside image
+         if not (0 <= img_x < bev_w and 0 <= img_y < bev_h):
+             continue
+
+         # Calculate box dimensions in image space
+         box_w_px = int(w / x_range * bev_w)
+         box_l_px = int(l / y_range * bev_h)
+
+         # Draw rotated rectangle
+         # Calculate 4 corners of the box in LiDAR coordinates
+         cos_yaw = np.cos(yaw)
+         sin_yaw = np.sin(yaw)
+
+         # Box corners relative to center (in LiDAR frame: x forward, y left)
+         corners_local = np.array([
+             [l / 2, w / 2],    # front-right
+             [l / 2, -w / 2],   # front-left
+             [-l / 2, -w / 2],  # back-left
+             [-l / 2, w / 2]    # back-right
+         ])
+
+         # Rotate corners
+         rot_mat = np.array([[cos_yaw, -sin_yaw], [sin_yaw, cos_yaw]])
+         corners_rotated = corners_local @ rot_mat.T
+
+         # Translate to world coordinates and convert to image space
+         corners_img = []
+         for corner in corners_rotated:
+             corner_x = x + corner[0]  # x in LiDAR (forward)
+             corner_y = y + corner[1]  # y in LiDAR (left)
+             corner_img_x = int((corner_y - y_min) / y_range * bev_w)  # y -> img_x
+             corner_img_y = int((x_max - corner_x) / x_range * bev_h)  # x -> img_y (flipped)
+             corners_img.append([corner_img_x, corner_img_y])
+
+         corners_img = np.array(corners_img, dtype=np.int32)
+
+         # Draw filled polygon (semi-transparent on white background)
+         overlay = bev_img.copy()
+         cv2.fillPoly(overlay, [corners_img], color)
+         cv2.addWeighted(overlay, 0.5, bev_img, 0.5, 0, bev_img)
+         # Draw outline (black on white background)
+         cv2.polylines(bev_img, [corners_img], True, (0, 0, 0), 2)
+
+         # Draw direction arrow (forward direction) - black on white
+         # In LiDAR: forward is +x, left is +y
+         # In BEV image: x -> img_y (flipped), y -> img_x
+         # So rotation: img_x += sin(yaw) * length, img_y -= cos(yaw) * length
+         arrow_length = max(box_l_px // 2, 10)
+         arrow_end_x = int(img_x + arrow_length * sin_yaw)  # y component -> img_x
+         arrow_end_y = int(img_y - arrow_length * cos_yaw)  # x component -> img_y (flipped)
+         cv2.arrowedLine(bev_img, (img_x, img_y), (arrow_end_x, arrow_end_y),
+                         (0, 0, 0), 2, tipLength=0.3)  # Black arrow
+
+         # Draw center point
+         cv2.circle(bev_img, (img_x, img_y), 3, (0, 0, 0), -1)  # Black center point
+
+     # Rotate BEV map counterclockwise by 90 degrees (map only, not text)
+     center = (bev_w // 2, bev_h // 2)
+     rotation_matrix = cv2.getRotationMatrix2D(center, 90, 1.0)  # 90 degrees counterclockwise
+     bev_img = cv2.warpAffine(bev_img, rotation_matrix, (bev_w, bev_h), borderValue=(255, 255, 255))
+
+     # Flip horizontally to fix mirror effect
+     bev_img = cv2.flip(bev_img, 1)  # 1 for horizontal flip
+
+     text = 'BEV Map'
+     font = cv2.FONT_HERSHEY_SIMPLEX
+     font_scale = 1
+     thickness = 2
+     (text_width, text_height), baseline = cv2.getTextSize(text, font, font_scale, thickness)
+     text_x = bev_w - text_width - 10
+     text_y = text_height + 10
+     cv2.putText(bev_img, text, (text_x, text_y), font, font_scale, (0, 0, 0), thickness)
+
+     return bev_img
+
+
+ def visualize_results_np(img, result, lidar2img, img_norm_cfg, class_names, score_thr=0.3, pc_range=None):
+     num_cams = img.shape[1] if img.ndim == 5 else 1
+     raw_imgs = [denormalize_img_np(img[0, cam_idx], img_norm_cfg) for cam_idx in range(num_cams)]
+     boxes_3d = result.get('boxes_3d')
+     scores_3d = result.get('scores_3d')
+     labels_3d = result.get('labels_3d')
+     vis_imgs = []
+     boxes_3d_for_bev = labels_3d_for_bev = scores_3d_for_bev = None
+
+     if boxes_3d is not None and len(boxes_3d) > 0:
+         mask = (scores_3d > score_thr) if (score_thr > 0 and scores_3d is not None) else np.ones_like(scores_3d, dtype=bool)
+         if np.any(mask):
+             boxes_3d = boxes_3d[mask]
+             scores_3d = scores_3d[mask]
+             labels_3d = labels_3d[mask]
+             boxes_3d_for_bev = boxes_3d.copy()
+             labels_3d_for_bev = labels_3d.copy()
+             scores_3d_for_bev = scores_3d.copy()
+             for cam_idx, vis_img in enumerate(raw_imgs):
+                 vis_img = vis_img.copy()
+                 if lidar2img.shape[1] > cam_idx:
+                     cam_lidar2img = lidar2img[0, cam_idx]
+                     for box, label in zip(boxes_3d, labels_3d):
+                         color = CLASS_COLORS.get(int(label), (255, 255, 255))
+                         try:
+                             vis_img = draw_bbox3d_on_img_custom_np(box[None], vis_img, cam_lidar2img, color=color, thickness=2)
+                         except Exception:
+                             pass
+                 vis_imgs.append(vis_img)
+         else:
+             vis_imgs = raw_imgs
+     else:
+         vis_imgs = raw_imgs
+
+     if pc_range is None:
+         pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+
+     if boxes_3d_for_bev is not None and len(boxes_3d_for_bev) > 0:
+         bev_size = (vis_imgs[0].shape[0], vis_imgs[0].shape[0]) if vis_imgs else (800, 800)
+         bev_img = draw_bev_map(boxes_3d_for_bev, labels_3d_for_bev, scores_3d_for_bev, pc_range, bev_size=bev_size, score_thr=score_thr)
+     else:
+         bev_size = (vis_imgs[0].shape[0], vis_imgs[0].shape[0]) if vis_imgs else (800, 800)
+         bev_img = np.full((bev_size[1], bev_size[0], 3), 255, np.uint8)
+         cv2.putText(bev_img, 'BEV Map (No Detections)', (10, bev_size[1] // 2), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
+
+     if len(vis_imgs) == 6:
+         target_height = max(im.shape[0] for im in vis_imgs)
+         resized_imgs = [im if im.shape[0] == target_height else cv2.resize(im, (int(im.shape[1] * target_height / im.shape[0]), target_height)) for im in vis_imgs]
+
+         reordered_imgs = [
+             resized_imgs[2], resized_imgs[0], resized_imgs[1],
+             cv2.flip(resized_imgs[4], 1), cv2.flip(resized_imgs[3], 1), cv2.flip(resized_imgs[5], 1)
+         ]
+         top_row = np.hstack(reordered_imgs[:3])
+         bottom_row = np.hstack(reordered_imgs[3:])
+         left_side = np.vstack([top_row, bottom_row])
+         bev_img = cv2.resize(bev_img, (int(bev_img.shape[1] * left_side.shape[0] / bev_img.shape[0]), left_side.shape[0]))
+         vis_img = np.hstack([left_side, bev_img])
+     elif len(vis_imgs) > 1:
+         target_height = max(im.shape[0] for im in vis_imgs)
+         resized_imgs = [im if im.shape[0] == target_height else cv2.resize(im, (int(im.shape[1] * target_height / im.shape[0]), target_height)) for im in vis_imgs]
+         if bev_img.shape[0] != target_height:
+             bev_img = cv2.resize(bev_img, (int(bev_img.shape[1] * target_height / bev_img.shape[0]), target_height))
+         vis_img = np.hstack([np.hstack(resized_imgs), bev_img])
+     else:
+         cam_img = vis_imgs[0] if vis_imgs else bev_img
+         if bev_img.shape[0] != cam_img.shape[0]:
+             bev_img = cv2.resize(bev_img, (int(bev_img.shape[1] * cam_img.shape[0] / bev_img.shape[0]), cam_img.shape[0]))
+         vis_img = np.hstack([cam_img, bev_img]) if vis_imgs else bev_img
+
+     return vis_img
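+
+ # The 3x2 mosaic above assumes the standard nuScenes camera ordering
+ # (0 FRONT, 1 FRONT_RIGHT, 2 FRONT_LEFT, 3 BACK, 4 BACK_LEFT, 5 BACK_RIGHT);
+ # the rear views are mirrored so the bottom row reads like a rear-view mirror.
+ # Verify this order against whatever produced extracted_data.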
+
+
+ def create_video_from_images(image_dir, output_video_path, fps=3):
+     image_files = sorted([f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))])
+     if len(image_files) == 0:
+         return
+
+     first_img = cv2.imread(osp.join(image_dir, image_files[0]))
+     if first_img is None:
+         return
+
+     height, width = first_img.shape[:2]
+
+     max_width, max_height = 1920, 1080
+     if width > max_width or height > max_height:
+         scale = min(max_width / width, max_height / height)
+         width, height = int(width * scale), int(height * scale)
+
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
+     if not video_writer.isOpened():
+         fourcc = cv2.VideoWriter_fourcc(*'XVID')
+         video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
+
+     for img_file in tqdm(image_files, desc=f"Creating video: {osp.basename(output_video_path)}"):
+         img_path = osp.join(image_dir, img_file)
+         img = cv2.imread(img_path)
+         if img is not None:
+             if img.shape[:2] != (height, width):
+                 img = cv2.resize(img, (width, height))
+             video_writer.write(img)
+
+     video_writer.release()
+
+
+ def main():
+     args = parse_args()
+
+     # Load configuration from JSON
+     config = load_config_from_json(args.config_json)
+
+     # Create output directory
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     # Load AXModel
+     ax_session = load_axmodel(args.model)
+
+     # Get model parameters from config
+     transformer_cfg = config['model']['transformer']
+     bev_h = transformer_cfg['bev_h']
+     bev_w = transformer_cfg['bev_w']
+     embed_dims = transformer_cfg['embed_dims']
+
+     # Load scene index
+     scene_index_path = osp.join(args.data_dir, 'scene_index.json')
+     with open(scene_index_path, 'r') as f:
+         scene_index_data = json.load(f)
+
+     scenes_dict = scene_index_data['scenes']
+     scene_names = list(scenes_dict.keys())
+
+     end_scene = args.end_scene if args.end_scene is not None else len(scene_names)
+     end_scene = min(end_scene, len(scene_names))
+
+     prev_frame_info = {
+         'prev_bev': None,
+         'scene_token': None,
+         'prev_pos': np.zeros(3, dtype=np.float32),
+         'prev_angle': 0.0,
+     }
+
+     scene_results = defaultdict(list)
+
+     # Process all scenes
+     for scene_idx in range(args.start_scene, end_scene):
+         scene_name = scene_names[scene_idx]
+         scene_info = scenes_dict[scene_name]
+         sample_indices = scene_info['samples']
+         num_frames = len(sample_indices)
+
+         print(f"Processing scene {scene_idx+1}/{len(scene_names)}: {scene_name} ({num_frames} frames)")
+
+         # Reset prev_bev for new scene
+         if scene_name != prev_frame_info['scene_token']:
+             prev_frame_info['prev_bev'] = None
+             prev_frame_info['prev_pos'] = np.zeros(3, dtype=np.float32)
+             prev_frame_info['prev_angle'] = 0.0
+
+         prev_frame_info['scene_token'] = scene_name
+
+         # Process all frames in this scene
+         for local_idx, frame_idx in enumerate(tqdm(sample_indices, desc=f"Scene {scene_name}")):
+             # Load data
+             img, lidar2img, can_bus, meta = load_data(args.data_dir, scene_name, frame_idx)
+
+             # Process can_bus (compute delta)
+             curr_can_bus_np = can_bus[0]  # (18,)
+
+             tmp_pos = curr_can_bus_np[:3].copy()
+             tmp_angle = curr_can_bus_np[-1]
+
+             delta_can_bus_np = curr_can_bus_np.copy()
+
+             if prev_frame_info['prev_bev'] is not None and prev_frame_info['scene_token'] == scene_name:
+                 delta_can_bus_np[:3] -= prev_frame_info['prev_pos']
+                 delta_can_bus_np[-1] -= prev_frame_info['prev_angle']
+             else:
+                 delta_can_bus_np[:3] = 0.0
+                 delta_can_bus_np[-1] = 0.0
+
+             prev_frame_info['prev_pos'] = tmp_pos
+             prev_frame_info['prev_angle'] = tmp_angle
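+
+             # BEVFormer-style temporal fusion: can_bus[:3] carries the ego
+             # translation delta and can_bus[-1] the yaw delta relative to the
+             # previous frame so prev_bev can be aligned; both are zeroed at the
+             # start of a scene, when prev_bev is still all zeros.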
+
+             # Prepare prev_bev
+             prev_bev_input = next((inp for inp in ax_session.get_inputs() if inp.name == 'prev_bev'), None)
+             expected_shape = (bev_h * bev_w, 1, embed_dims)
+             if prev_bev_input is not None:
+                 expected_shape = list(prev_bev_input.shape)
+                 for i, dim in enumerate(expected_shape):
+                     if isinstance(dim, str) or dim < 0:
+                         expected_shape[i] = (bev_h * bev_w, 1, embed_dims)[i] if i < 3 else 1
+                 expected_shape = tuple(expected_shape)
+
+             if prev_frame_info['prev_bev'] is None:
+                 prev_bev = np.zeros(expected_shape, dtype=np.float32)
+             else:
+                 prev_bev = prev_frame_info['prev_bev']
+                 if prev_bev.shape != expected_shape and len(prev_bev.shape) == 3:
+                     prev_bev = prev_bev.reshape(expected_shape)
+
+             # Prepare AXEngine inputs
+             img_np = img.astype(np.float32)
+             lidar2img_np = lidar2img.astype(np.float32)
+             can_bus_np = delta_can_bus_np.reshape(1, -1).astype(np.float32)
+
+             input_names = [inp.name for inp in ax_session.get_inputs()]
+             ax_inputs = {}
+             for name in input_names:
+                 if name == 'img':
+                     ax_inputs['img'] = img_np
+                 elif name == 'can_bus':
+                     ax_inputs['can_bus'] = can_bus_np
+                 elif name == 'lidar2img':
+                     ax_inputs['lidar2img'] = lidar2img_np
+                 elif name == 'prev_bev':
+                     ax_inputs['prev_bev'] = prev_bev
+
+             # Run inference
+             ax_outputs = ax_session.run(None, ax_inputs)
+             bev_embed, all_cls_scores, all_bbox_preds = ax_outputs
+
+             prev_frame_info['prev_bev'] = bev_embed
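+             # bev_embed should have shape (bev_h * bev_w, 1, embed_dims); feeding
+             # it back as prev_bev on the next frame is what makes this loop
+             # stateful across a scene.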
+
+             # Post-process
+             results = post_process_outputs_np(
+                 all_cls_scores, all_bbox_preds, config, args.score_thr
+             )
+
+             # Visualize
+             img_norm_cfg = config['img_norm']
+             class_names = config['dataset']['class_names']
+             pc_range = config['model']['bbox_coder']['pc_range']
+             vis_img = visualize_results_np(
+                 img, results[0], lidar2img, img_norm_cfg, class_names, args.score_thr, pc_range=pc_range
+             )
+
+             scene_results[scene_name].append({
+                 'frame_idx': local_idx,
+                 'result': results[0],
+                 'vis_img': vis_img,
+                 'meta': meta
+             })
+
+     # Save results
+     for scene_name, frames in tqdm(scene_results.items(), desc="Save scene results"):
+         scene_dir = osp.join(args.output_dir, scene_name)
+         os.makedirs(scene_dir, exist_ok=True)
+         images_dir = osp.join(scene_dir, 'images')
+         os.makedirs(images_dir, exist_ok=True)
+
+         for local_idx, frame_data in enumerate(frames):
+             vis_img = frame_data['vis_img']
+
+             if vis_img is None:
+                 continue
+
+             if not isinstance(vis_img, np.ndarray):
+                 vis_img = np.array(vis_img)
+
+             if vis_img.dtype != np.uint8:
+                 vis_img = (vis_img * 255).astype(np.uint8) if vis_img.max() <= 1.0 else vis_img.astype(np.uint8)
+
+             if len(vis_img.shape) == 3 and vis_img.shape[0] in (1, 3):
+                 vis_img = vis_img.transpose(1, 2, 0)
+
+             if vis_img.shape[0] > 0 and vis_img.shape[1] > 0:
+                 cv2.imwrite(osp.join(images_dir, f'frame_{local_idx:06d}.png'), vis_img)
+
+         create_video_from_images(images_dir, osp.join(scene_dir, f'{scene_name}_result.mp4'), args.fps)
+         print(f"✓ Scene {scene_name}: {len(frames)} frames, video: {osp.join(scene_dir, f'{scene_name}_result.mp4')}")
+
+
+ if __name__ == '__main__':
+     main()