guoyb0 commited on
Commit
9fe982a
·
verified ·
1 Parent(s): 17200d8

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +333 -0
  2. configs/REPRODUCTION.md +200 -0
  3. configs/ds_zero2.json +17 -0
  4. eval_atlas.py +1175 -0
  5. extract_streampetr_tokens.py +568 -0
  6. extract_topomlp_tokens.py +381 -0
  7. scripts/eval_checkpoint_offline.sh +44 -0
  8. scripts/gen_atlas_caption_dashscope.py +272 -0
  9. scripts/gen_atlas_caption_qa.py +274 -0
  10. scripts/gen_atlas_openlane_subsetB_lane_qa.py +251 -0
  11. scripts/gen_atlas_planning_qa.py +491 -0
  12. scripts/run_val_extraction.sh +56 -0
  13. scripts/train_no_caption_baseline.sh +50 -0
  14. scripts/train_no_caption_baseline_offline.sh +48 -0
  15. scripts/train_with_caption_balanced.sh +48 -0
  16. scripts/vis_atlas_lane_gt_pred.py +500 -0
  17. scripts/vis_atlas_planning_qualitative.py +800 -0
  18. scripts/vis_traffic_violation.py +516 -0
  19. src/__pycache__/__init__.cpython-310.pyc +0 -0
  20. src/__pycache__/prompting.cpython-310.pyc +0 -0
  21. src/__pycache__/prompting.cpython-38.pyc +0 -0
  22. src/audit/__pycache__/__init__.cpython-310.pyc +0 -0
  23. src/audit/__pycache__/audit_utils.cpython-310.pyc +0 -0
  24. src/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
  25. src/dataset/__pycache__/atlas_dataset.cpython-310.pyc +0 -0
  26. src/dataset/__pycache__/atlas_dataset.cpython-38.pyc +0 -0
  27. src/dataset/__pycache__/scene_sampler.cpython-310.pyc +0 -0
  28. src/dataset/__pycache__/scene_sampler.cpython-38.pyc +0 -0
  29. src/dataset/atlas_dataset.py +1416 -0
  30. src/dataset/scene_sampler.py +111 -0
  31. src/eval/__pycache__/__init__.cpython-310.pyc +0 -0
  32. src/eval/__pycache__/__init__.cpython-38.pyc +0 -0
  33. src/eval/__pycache__/metrics.cpython-310.pyc +0 -0
  34. src/eval/__pycache__/metrics.cpython-38.pyc +0 -0
  35. src/eval/metrics.py +852 -0
  36. src/model/__init__.py +28 -0
  37. src/model/__pycache__/__init__.cpython-310.pyc +0 -0
  38. src/model/__pycache__/__init__.cpython-38.pyc +0 -0
  39. src/model/__pycache__/configuration_atlas.cpython-310.pyc +0 -0
  40. src/model/__pycache__/modeling_atlas.cpython-310.pyc +0 -0
  41. src/model/__pycache__/modeling_atlas.cpython-38.pyc +0 -0
  42. src/model/__pycache__/streampetr_adapter.cpython-310.pyc +0 -0
  43. src/model/__pycache__/streampetr_adapter.cpython-38.pyc +0 -0
  44. src/model/__pycache__/topomlp_adapter.cpython-310.pyc +0 -0
  45. src/model/__pycache__/topomlp_adapter.cpython-38.pyc +0 -0
  46. src/model/modeling_atlas.py +549 -0
  47. src/model/streampetr_adapter.py +110 -0
  48. src/model/topomlp_adapter.py +88 -0
  49. src/prompting.py +277 -0
  50. train_atlas.py +1018 -0
README.md ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - autonomous-driving
5
+ - 3d-detection
6
+ - lane-detection
7
+ - planning
8
+ - multimodal
9
+ - vicuna
10
+ ---
11
+
12
+ # Atlas — 3D-Tokenized LLM for Autonomous Driving
13
+
14
+ 基于 [Atlas 论文](https://arxiv.org/abs/2405.18361) 的多模态自动驾驶大语言模型实现。将 **StreamPETR**(3D 目标检测)和 **TopoMLP**(车道线检测)提取的 3D visual tokens 注入 **Vicuna-7B** LLM,实现检测、车道线、规划等多任务统一生成。
15
+
16
+ ## 项目结构
17
+
18
+ ```
19
+ 3dtokenizer-atlas/
20
+ ├── train_atlas.py # Atlas LLM 训练入口
21
+ ├── eval_atlas.py # Atlas 评估入口
22
+ ├── extract_streampetr_tokens.py # 预提取 StreamPETR detection tokens
23
+ ├── extract_topomlp_tokens.py # 预提取 TopoMLP lane tokens
24
+ ├── train_streampetr.sh # StreamPETR 预训练启动脚本
25
+ ├── train_topomlp.sh # TopoMLP 预训练启动脚本
26
+
27
+ ├── configs/
28
+ │ ├── streampetr_atlas_aligned.py # StreamPETR 配置 (EVA-02 ViT-L, 800x1600)
29
+ │ ├── topomlp_atlas_aligned.py # TopoMLP 配置 (EVA-02 ViT-L, 800x1600)
30
+ │ ├── ds_zero2.json # DeepSpeed ZeRO-2 配置
31
+ │ └── REPRODUCTION.md # 复现文档
32
+
33
+ ├── src/
34
+ │ ├── model/
35
+ │ │ ├── modeling_atlas.py # AtlasForCausalLM 主模型
36
+ │ │ ├── streampetr_adapter.py # StreamPETR → 检测 token 适配器
37
+ │ │ └── topomlp_adapter.py # TopoMLP → 地图 token 适配器 (Top-K selection)
38
+ │ ├── dataset/
39
+ │ │ ├── atlas_dataset.py # AtlasDataset + Collate
40
+ │ │ └── scene_sampler.py # SceneSequentialSampler (时序采样)
41
+ │ ├── eval/
42
+ │ │ └── metrics.py # 评估指标 (F1/Chamfer/L2/Collision)
43
+ │ └── prompting.py # 多任务 Prompt 模板
44
+
45
+ ├── scripts/
46
+ │ ├── gen_atlas_full_data.py # nuScenes → 检测 QA JSON
47
+ │ ├── gen_atlas_openlane_subsetB_lane_qa.py # OpenLane-V2 → 车道线 QA JSON
48
+ │ ├── gen_atlas_planning_qa.py # nuScenes → 规划 QA JSON
49
+ │ ├── train_no_caption_baseline.sh # 无 caption 训练脚本
50
+ │ └── train_with_caption_balanced.sh # 含 caption 训练脚本
51
+
52
+ ├── data/ # 训练/验证数据 (JSON)
53
+ │ ├── atlas_nuscenes_train.json # 检测 (28,130 样本)
54
+ │ ├── atlas_nuscenes_val.json # 检测验证 (6,019 样本)
55
+ │ ├── openlane_subsetB_lane_train_4pt.json # 车道线 (27,968 样本, 4 点/lane)
56
+ │ ├── openlane_subsetB_lane_val_4pt.json # 车道线验证 (6,019 样本)
57
+ │ ├── atlas_planning_train_uniad_command.json # 规划 (23,541 样本, UniAD-style command)
58
+ │ ├── atlas_planning_val_uniad_command.json # 规划验证 (5,037 样本, UniAD-style command)
59
+ │ ├── atlas_caption_train.json # 环境描述 caption
60
+ │ └── atlas_caption_val.json # 环境描述 caption 验证
61
+
62
+ ├── pretrained/ # 预训练权重
63
+ │ ├── vicuna-7b-v1.5/ # Vicuna-7B-v1.5 LLM
64
+ │ ├── eva02_L_coco_det_sys_o365_remapped_fixed.pth
65
+ │ └── streampetr/
66
+ │ └── streampetr_eva02_ep24.pth
67
+
68
+ ├── work_dirs/
69
+ │ ├── _quick_eval_cpu.py # 快速检测评估 (CPU, micro-avg F1)
70
+ │ ├── _quick_eval_gpu.py # 快速检测评估 (GPU)
71
+ │ ├── _quick_eval_lane_gpu.py # 快速车道线评估 (GPU)
72
+ │ ├── _quick_eval_plan_gpu.py # 快速规划评估 (GPU, scene-sequential)
73
+ │ ├── precomputed_det_tokens_offline/ # 预提取的 StreamPETR tokens (offline 备选)
74
+ │ │ ├── train/ # 56,099 个 .pt 文件 (det + lane,planning 与 det 共享 ID)
75
+ │ │ └── val/ # 12,039 个 .pt 文件
76
+ │ ├── precomputed_map_tokens_offline/ # 预提取的 TopoMLP tokens (offline 备选)
77
+ │ │ ├── train/ # 51,510 个 .pt 文件 (lane + planning)
78
+ │ │ └── val/ # 11,057 个 .pt 文件
79
+ │ └── topomlp_atlas_aligned/ # TopoMLP 预训练权重
80
+ │ └── epoch_24.pth
81
+
82
+ └── external/ # 外部依赖
83
+ ├── StreamPETR/
84
+ ├── TopoMLP_Repo/
85
+ └── nuscenes-devkit/
86
+ ```
87
+
88
+ ## 模型架构
89
+
90
+ ```
91
+ ┌─────────────────────────────────────┐
92
+ 6x 环视相机图片 → │ StreamPETR (frozen, EVA-02 ViT-L) │→ det tokens [B, 256, 256]
93
+ │ TopoMLP (frozen, EVA-02 ViT-L) │→ one-to-one lane queries (300) → Top-K → map tokens [B, 256, 256]
94
+ └─────────────────────────────────────┘
95
+
96
+ AtlasUnifiedProjector
97
+ ┌────────────────────────────────┐
98
+ │ projector_det: Linear(256→4096) │ ← 单层线性投影
99
+ │ projector_map: Linear(256→4096) │
100
+ │ projector_rp: Linear(3→256) │ ← Reference Point, zero-init
101
+ │ features += projector_rp(ref) │
102
+ └────────────────────────────────┘
103
+
104
+ 注入到 <query> token 位置 (256 det + 256 map)
105
+
106
+ ┌───────────────────────────────────────┐
107
+ │ Vicuna-7B (当前运行: 全参数微调) │
108
+ │ Causal Language Modeling Loss │
109
+ └───────────────────────────────────────┘
110
+
111
+ 多任务文本输出
112
+ (3D 检测 / 车道线 / 规划轨迹)
113
+ ```
114
+
115
+ ## 训练配置(来源与优先级)
116
+
117
+ 为避免“论文描述 / 脚本默认 / 实际运行”混淆,本仓库统一按以下优先级解释配置:
118
+
119
+ 1. **论文描述**:以 Atlas 原论文(arXiv:2405.18361)正文 Section 3.2 与 Appendix B.2 为准。
120
+ 2. **代码默认**:以 `train_atlas.py` 的 argparse 默认值为准。
121
+ 3. **实际运行**:以 `work_dirs/<exp>/args.json` + `work_dirs/*train*.log` 为准(最高优先级)。
122
+
123
+ 若三者冲突,请按第 3 条解释实验结果,不以 README 中示例命令覆盖真实运行参数。
124
+
125
+ ### 论文原文(Atlas LLM,Section 3.2 + Appendix B.2)
126
+
127
+ | 项目 | 论文描述 |
128
+ |------|----------|
129
+ | Batch Size | 1 per GPU |
130
+ | Learning Rate | 2e-5 |
131
+ | Optimizer | AdamW (weight_decay=1e-4) |
132
+ | LR Schedule | 3% linear warmup + cosine decay |
133
+ | Max Length | 4096 |
134
+ | 硬件/时长 | 8x A100,约 100 小时 |
135
+ | LoRA | 论文附录未显式给出 LoRA 开关 |
136
+
137
+ ### 当前实际运行(示例:`work_dirs/atlas_no_caption_v3_linear_warmup`)
138
+
139
+ 来源:`work_dirs/atlas_no_caption_v3_linear_warmup/args.json` 与 `work_dirs/train_no_caption_v3_linear_warmup.log`
140
+
141
+ | 参数 | 实际值 |
142
+ |------|--------|
143
+ | LLM | Vicuna-7B-v1.5 |
144
+ | 微调方式 | **全参数微调** (`use_lora=false`) |
145
+ | 可训练参数 | 6,740,530,176 |
146
+ | Learning Rate | 2e-5 |
147
+ | Optimizer | AdamW (`weight_decay=1e-4`, `torch_adam`, `adam_w_mode`) |
148
+ | LR Schedule | WarmupCosineLR(warmup ratio=3%) |
149
+ | Epochs | 10 |
150
+ | Batch Size | 1 per GPU |
151
+ | Gradient Accumulation | 2 |
152
+ | Effective Batch Size | 8 (4 GPU x 1 x 2 accum) |
153
+ | Total Steps | 99,550 |
154
+ | Warmup Steps | 2,986 |
155
+ | Max Sequence Length | 4096 |
156
+ | 分布式 | DeepSpeed ZeRO-2 |
157
+ | GPU | 4x NVIDIA H100 80GB |
158
+ | 精度 | BF16(由 `configs/ds_zero2.json` 启用) |
159
+ | Visual Tokens | **在线** (live frozen StreamPETR + TopoMLP, temporal memory);离线预提取仅作为 fallback |
160
+
161
+ ### 训练数据
162
+
163
+ | 任务 | 数据文件 | 样本数 |
164
+ |------|---------|--------|
165
+ | 3D 目标检测 | `atlas_nuscenes_train.json` | 28,130 |
166
+ | 3D 车道线检测 | `openlane_subsetB_lane_train_4pt.json` | 27,968 |
167
+ | 轨迹规划 | `atlas_planning_train_uniad_command.json` | 23,541 |
168
+ | 环境描述 (可选) | `atlas_caption_train.json` | — |
169
+ | **总计 (无 caption)** | | **79,639** |
170
+
171
+ 车道线数据使用 4 个采样点/lane(论文 Appendix A.2 要求 four lane points,本仓库实现为均匀采样),不设 lane 数量上限(论文未指定上限),按 BEV 距离近→远排序。实际平均约 25 条 lane/样本,最多约 80 条。所有坐标使用 1000-bin 离散化。规划数据包含 `gt_boxes_3d_per_timestep` 字段用于 ST-P3 对齐的 per-timestep 碰撞评测。
172
+
173
+ 三类主任务的 question pool 统一采用“前 3 条按论文 Table 6 / 7 / 9 原话整理,后 2 条为仓库补充的同风格扩展模板”的策略;其中车道线 Table 7 的第 2 条按论文现有文本原样保留。
174
+
175
+ 为避免运行时再依赖 prompt 文本猜任务,四类样本的磁盘 JSON 统一显式写入 `task` 字段:`detection` / `lane` / `planning` / `caption`。这是仓库层面的工程化 schema,对论文中的 question-answer 文本格式不做额外解释。
176
+
177
+ caption 数据按论文 Appendix A.3 的单视角设定生成:Table 8 作为 GPT-4V 标注 prompt,human prompt 采用 Figure 5 风格的单模板,并注入具体 `camera_name`。
178
+
179
+ 当前仓库不再向 prompt 追加 bins-format hint;detection / lane / caption 默认以磁盘 JSON 中的 `human` 文本作为 prompt 主体语义来源。planning ��务运行时仍会按 `planning_table3_mode` 对磁盘 `human` prompt 做轻量重写,只负责插入/剥离 command 句和 ego-state 句,再统一做 `<query>` 展开、空白归一化和 `USER: ... / ASSISTANT:` 包装。
180
+
181
+ 当前 detection 的 canonical answer 格式为:`category: [x_bin, y_bin, z_bin], [x_bin, y_bin, z_bin]; category: [x_bin, y_bin, z_bin].`。当前 lane/map 的 canonical answer 格式为:`Lane: [x_bin, y_bin, z_bin], [x_bin, y_bin, z_bin], ...; [x_bin, y_bin, z_bin], ... .`。旧的 detection flat 文本和 `lane_centerline(id=...)` legacy 文本不再作为受支持协议。
182
+
183
+ planning 的 answer/output protocol 采用 Figure 5 风格表述,但保持论文 Table 9 的二维语义:`Ego car speed value:[vx_bin, vy_bin]. Ego car acceleration value:[ax_bin, ay_bin]. Based on the ego car speed and acceleration you predicted, requeset the ego car planning waypoint in 3-seconds: [x_bin, y_bin], ...`。当前实现不为 planning 引入第三维,也不使用固定 `z=500` 占位。
184
+
185
+ 当前 planning JSON 的顶层 `route_command` 采用 **UniAD-style future-GT-derived command**:根据 future planning GT / future waypoints 的最后一个有效 timestep 的横向位移离散为 `turn left` / `turn right` / `go straight`。它不是 raw nuScenes 原生字段,也不是独立导航命令;因此 `atlas_high_level*` 在本仓库中的含义更接近 UniAD 风格条件输入,而不是 Atlas 论文 Table 3 严格意义上的独立 route command。
186
+
187
+ ### 3D Tokenizer 预训练 (已完成)
188
+
189
+ | 参数 | StreamPETR | TopoMLP |
190
+ |------|-----------|---------|
191
+ | Backbone | EVA-02 ViT-L (embed_dim=1024) | EVA-02 ViT-L (embed_dim=1024) |
192
+ | Resolution | 800x1600 | 800x1600 |
193
+ | Queries | 256 (detection) | 256 (map, Top-K from 300 one-to-one queries) |
194
+ | Control Points | - | 4 per lane |
195
+ | Epochs | 24 | 24 |
196
+ | 数据集 | nuScenes trainval | OpenLane-V2 subset-B |
197
+
198
+ ## 快速开始
199
+
200
+ ### 1. 环境
201
+
202
+ ```bash
203
+ conda activate streampetr
204
+ # 主要依赖: PyTorch 2.0+, transformers, peft, flash-attn, mmcv 1.7, mmdet3d 1.0
205
+ # DeepSpeed (ZeRO-2): pip install deepspeed
206
+ ```
207
+
208
+ ### 2. 数据准备
209
+
210
+ ```bash
211
+ # nuScenes 数据根目录 (含 v1.0-trainval/ 和 samples/)
212
+ export DATA_ROOT=/path/to/nuscenes
213
+
214
+ # OpenLane-V2 subset-B
215
+ export OPENLANE_ROOT=/path/to/OpenLane-V2/subset_B
216
+
217
+ # 生成检测 QA 数据 (按类别分组, 论文 Figure 5 格式)
218
+ python scripts/gen_atlas_full_data.py \
219
+ --data-root $DATA_ROOT --split train \
220
+ --output data/atlas_nuscenes_train.json
221
+ python scripts/gen_atlas_full_data.py \
222
+ --data-root $DATA_ROOT --split val \
223
+ --output data/atlas_nuscenes_val.json
224
+
225
+ # 生成车道线 QA 数据 (4 点/lane, 无 lane 数量上限, BEV 距离排序)
226
+ python scripts/gen_atlas_openlane_subsetB_lane_qa.py \
227
+ --openlane_root $OPENLANE_ROOT \
228
+ --split train --out_json data/openlane_subsetB_lane_train_4pt.json
229
+
230
+ python scripts/gen_atlas_openlane_subsetB_lane_qa.py \
231
+ --openlane_root $OPENLANE_ROOT \
232
+ --split val --out_json data/openlane_subsetB_lane_val_4pt.json
233
+
234
+ # 生成规划 QA 数据
235
+ # 默认输出:
236
+ # data/atlas_planning_train_uniad_command.json
237
+ # data/atlas_planning_val_uniad_command.json
238
+ # 默认写顶层 route_command(UniAD-style future-GT-derived command)
239
+ # 默认 materialize atlas_high_level human prompt;运行时仍可通过
240
+ # --planning_table3_mode 改写为 atlas_base / atlas_high_level_ego
241
+ python scripts/gen_atlas_planning_qa.py \
242
+ --data-root $DATA_ROOT --split train
243
+ python scripts/gen_atlas_planning_qa.py \
244
+ --data-root $DATA_ROOT --split val
245
+ ```
246
+
247
+ ### 3. 训练
248
+
249
+ 默认使用 **在线模式**(`--visual_token_mode online`),训练时 live 前向 frozen StreamPETR(含 temporal memory)和 TopoMLP,无需预提取 token。
250
+
251
+ ```bash
252
+ # ===== 推荐:在线模式训练(默认)=====
253
+ # 无 caption: det + planning + lane
254
+ bash scripts/train_no_caption_baseline.sh
255
+
256
+ # 含 caption: det + planning + lane + caption
257
+ bash scripts/train_with_caption_balanced.sh
258
+ ```
259
+
260
+ 等效手动命令(以无 caption 为例):
261
+
262
+ ```bash
263
+ deepspeed --num_gpus 4 train_atlas.py \
264
+ --llm_model pretrained/vicuna-7b-v1.5 \
265
+ --data_json data/atlas_nuscenes_train.json,data/atlas_planning_train_uniad_command.json,data/openlane_subsetB_lane_train_4pt.json \
266
+ --data_root $DATA_ROOT \
267
+ --visual_token_mode online \
268
+ --streampetr_config configs/streampetr_atlas_aligned.py \
269
+ --streampetr_ckpt pretrained/streampetr/streampetr_eva02_ep24.pth \
270
+ --topomlp_config configs/topomlp_atlas_aligned.py \
271
+ --topomlp_ckpt work_dirs/topomlp_atlas_aligned/epoch_24.pth \
272
+ --deepspeed configs/ds_zero2.json \
273
+ --output_dir work_dirs/atlas_no_caption_online \
274
+ --lr 2e-5 --weight_decay 1e-4 \
275
+ --batch_size 1 --epochs 10 --gradient_accumulation_steps 2 \
276
+ --warmup_ratio 0.03 --max_grad_norm 1.0 \
277
+ --save_epochs 1 --log_steps 100 \
278
+ --seed 42 --num_workers 4
279
+ ```
280
+
281
+ > **离线 fallback**:如需使用预提取 token 训练(速度更快,但 det 无 temporal memory),
282
+ > 先运行预提取脚本,再使用 `bash scripts/train_no_caption_baseline_offline.sh`。
283
+ > 预提取 token 存放于 `work_dirs/precomputed_*_tokens_offline/`。
284
+
285
+ ### 4. 评估
286
+
287
+ `eval_atlas.py` 支持两种模式:**在线模式**(默认,live frozen encoder + temporal memory)和离线 fallback。
288
+
289
+ ```bash
290
+ # ===== 推荐:在线模式评估(默认)=====
291
+ # 检测
292
+ bash scripts/eval_checkpoint.sh <checkpoint> data/atlas_nuscenes_val.json
293
+
294
+ # 车道线
295
+ bash scripts/eval_checkpoint.sh <checkpoint> data/openlane_subsetB_lane_val_4pt.json
296
+
297
+ # 规划
298
+ bash scripts/eval_checkpoint.sh <checkpoint> data/atlas_planning_val_uniad_command.json
299
+ ```
300
+
301
+ 等效手动命令:
302
+
303
+ ```bash
304
+ python eval_atlas.py \
305
+ --checkpoint work_dirs/atlas_no_caption_online/final/checkpoint.pt \
306
+ --llm_model pretrained/vicuna-7b-v1.5 \
307
+ --visual_token_mode online \
308
+ --streampetr_config configs/streampetr_atlas_aligned.py \
309
+ --streampetr_ckpt pretrained/streampetr/streampetr_eva02_ep24.pth \
310
+ --topomlp_config configs/topomlp_atlas_aligned.py \
311
+ --topomlp_ckpt work_dirs/topomlp_atlas_aligned/epoch_24.pth \
312
+ --data_json data/atlas_nuscenes_val.json \
313
+ --data_root $DATA_ROOT \
314
+ --batch_size 1 --max_new_tokens 2700 --bf16
315
+ ```
316
+
317
+ > **离线 fallback**:使用预提取 token 评估(速度更快,但 det 无 temporal memory):
318
+ > `bash scripts/eval_checkpoint_offline.sh <checkpoint> <data_json>`
319
+ >
320
+ > **快速验证脚本**(`work_dirs/_quick_eval_*.py`)使用离线 token,仅用于开发调试,**不等价于主在线评测,不应用于正式结果**。其口径与主评测存在差异:planning 解析更宽松、不走 live encoders/temporal memory、不自动检测 LoRA。
321
+
322
+ > **评测协议说明**:
323
+ > - **检测**:micro-averaged F1 @ 0.5/1.0/2.0/4.0m,BEV 中心距离匹配。
324
+ > - **车道线**:使用 OpenLane-V2 官方 F-Score 评测器(`openlanev2` 为必需依赖,缺失时直接报错,不再退化为 Chamfer)。
325
+ > - **规划**:L2 误差 + 碰撞率。规划数据含 `gt_boxes_3d_per_timestep` 字段时使用 ST-P3 对齐的 per-timestep 碰撞检测;旧数据自动退化为静态 box 检测。
326
+ > - 在线主评测(`eval_atlas.py`)需要 `mmcv`、`mmdet3d`、`openlanev2` 三个关键依赖,缺失时会在启动前报错。
327
+
328
+ ## 参考
329
+
330
+ - [Atlas: Is a 3D-Tokenized LLM the Key to Reliable Autonomous Driving?](https://arxiv.org/abs/2405.18361)
331
+ - [StreamPETR](https://github.com/exiawsh/StreamPETR)
332
+ - [TopoMLP](https://github.com/wudongming97/TopoMLP)
333
+ - [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/)
configs/REPRODUCTION.md ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # StreamPETR Atlas-Aligned 复现指南
2
+
3
+ 本文档记录了复现 Atlas 论文中 3D Tokenizer (StreamPETR) 所需的完整环境和数据准备信息。
4
+
5
+ ## 1. 版本矩阵(官方 StreamPETR 要求)
6
+
7
+ | 依赖 | 版本 | 备注 |
8
+ |------|------|------|
9
+ | Python | >= 3.8 | 推荐 3.8 |
10
+ | CUDA | 11.2 | 或兼容版本 |
11
+ | PyTorch | 1.9.0 | `pip install torch==1.9.0+cu111` |
12
+ | torchvision | 0.10.0 | |
13
+ | mmcv-full | 1.6.0 | `pip install mmcv-full==1.6.0 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html` |
14
+ | mmdet | 2.28.2 | |
15
+ | mmsegmentation | 0.30.0 | |
16
+ | mmdetection3d | 1.0.0rc6 | `git checkout v1.0.0rc6` |
17
+ | flash-attn | 0.2.2 | 可选,但高分辨率训练需要 |
18
+
19
+ ### 注意事项
20
+ - 如果使用 PyTorch >= 1.13,需要对应 flash-attn 0.2.8
21
+ - Tesla V100 可能不兼容 flash-attn,需要注释相关代码
22
+
23
+ ## 2. StreamPETR 代码版本
24
+
25
+ ```bash
26
+ # 克隆 StreamPETR
27
+ git clone https://github.com/exiawsh/StreamPETR
28
+
29
+ # 克隆 mmdetection3d 并切换到指定版本
30
+ cd StreamPETR
31
+ git clone https://github.com/open-mmlab/mmdetection3d.git
32
+ cd mmdetection3d
33
+ git checkout v1.0.0rc6
34
+ pip install -e .
35
+ ```
36
+
37
+ **当前仓库状态**:`external/StreamPETR` 是直接复制的代码,非 git 仓库,无法校验具体 commit。
38
+
39
+ ## 3. EVA-02 预训练权重
40
+
41
+ | 文件 | 来源 | MD5 |
42
+ |------|------|-----|
43
+ | `eva02_L_coco_det_sys_o365_remapped.pth` | [GitHub Release](https://github.com/exiawsh/storage/releases/download/v1.0/eva02_L_coco_det_sys_o365_remapped.pth) | `15c389fe4e987275c3d08405ca9eeb92` |
44
+
45
+ ```bash
46
+ # 下载权重
47
+ mkdir -p /root/autodl-tmp
48
+ wget -O /root/autodl-tmp/eva02_L_coco_det_sys_o365_remapped.pth \
49
+ https://github.com/exiawsh/storage/releases/download/v1.0/eva02_L_coco_det_sys_o365_remapped.pth
50
+
51
+ # 验证 MD5
52
+ md5sum /root/autodl-tmp/eva02_L_coco_det_sys_o365_remapped.pth
53
+ # 应该输出: 15c389fe4e987275c3d08405ca9eeb92
54
+ ```
55
+
56
+ ## 4. nuScenes 数据准备
57
+
58
+ ### 4.1 下载数据集
59
+
60
+ 从 [nuScenes 官网](https://www.nuscenes.org/download) 下载:
61
+ - Full dataset (v1.0-trainval): ~330GB
62
+ - Mini dataset (v1.0-mini): ~4GB(调试用)
63
+
64
+ ### 4.2 生成 temporal infos pkl
65
+
66
+ ```bash
67
+ # 在本仓库中请使用 external/StreamPETR 目录
68
+ cd external/StreamPETR
69
+
70
+ # 生成 nuscenes2d_temporal_infos_{train,val}.pkl
71
+ python tools/create_data_nusc.py \
72
+ --root-path ./data/nuscenes \
73
+ --out-dir ./data/nuscenes \
74
+ --extra-tag nuscenes2d \
75
+ --version v1.0
76
+ ```
77
+
78
+ ### 4.3 或下载预处理好的 pkl
79
+
80
+ | 文件 | 下载链接 |
81
+ |------|----------|
82
+ | train.pkl | [nuscenes2d_temporal_infos_train.pkl](https://github.com/exiawsh/storage/releases/download/v1.0/nuscenes2d_temporal_infos_train.pkl) |
83
+ | val.pkl | [nuscenes2d_temporal_infos_val.pkl](https://github.com/exiawsh/storage/releases/download/v1.0/nuscenes2d_temporal_infos_val.pkl) |
84
+ | test.pkl | [nuscenes2d_temporal_infos_test.pkl](https://github.com/exiawsh/storage/releases/download/v1.0/nuscenes2d_temporal_infos_test.pkl) |
85
+
86
+ ### 4.4 目录结构(官方 StreamPETR 期望)
87
+
88
+ ```
89
+ external/StreamPETR/data/nuscenes/
90
+ ├── maps/
91
+ ├── samples/
92
+ ├── sweeps/
93
+ ├── v1.0-trainval/
94
+ ├── nuscenes2d_temporal_infos_train.pkl
95
+ ├── nuscenes2d_temporal_infos_val.pkl
96
+ └── nuscenes2d_temporal_infos_test.pkl
97
+ ```
98
+
99
+ ## 5. 训练配置
100
+
101
+ ### 5.1 3D Tokenizer 预训练设置(Appendix B.1)
102
+
103
+ | 参数 | 设置 | 备注 |
104
+ |------|-----------|------|
105
+ | Backbone | EVA-02 ViT-L | 1024 embed_dim |
106
+ | Resolution | 800×1600 | 高分辨率 |
107
+ | Detection queries | 256 | |
108
+ | Epochs | 24 | |
109
+ | Batch size | 1 per GPU | 高分辨率需要小 batch |
110
+ | GPUs | 8× A100 | 论文使用 |
111
+ | Base LR | 4e-4 | bs=16 (8 GPU × 2) |
112
+ | Backbone LR | 0.1× base | |
113
+
114
+ ### 5.2 Atlas LLM 配置(论文原文 Appendix B.1)
115
+
116
+ | 参数 | 论文描述 |
117
+ |------|----------|
118
+ | Batch size | 1 per GPU |
119
+ | Learning rate | 2e-5 |
120
+ | Optimizer | AdamW (weight_decay=1e-4) |
121
+ | LR schedule | 3% linear warmup + cosine |
122
+ | Max length | 4096 |
123
+ | 训练硬件/时长 | 8x A100,约 100 小时 |
124
+
125
+ > 说明:论文附录未显式给出 LoRA 开关。
126
+ > 如需解释某次实验结果,请以该实验的 `work_dirs/<exp>/args.json` 与训练日志为准。
127
+
128
+ ### 5.3 当前配置(A100 环境)
129
+
130
+ 当前配置使用线性缩放学习率(官方注释: "bs 8: 2e-4 || bs 16: 4e-4"):
131
+ - **8× A100 GPU**
132
+ - **batch_size = 1 per GPU**
133
+ - **Base LR = 2e-4** (effective bs=8,按官方注释使用 2e-4)
134
+ - **配置文件**:`configs/streampetr_atlas_aligned.py`
135
+
136
+ ### 5.4 有效 batch size 计算
137
+
138
+ ```
139
+ 有效 batch size = num_gpus × batch_size_per_gpu × gradient_accumulation
140
+ = 8 × 1 × 1 = 8
141
+ ```
142
+
143
+ 官方学习率参考: "bs 8: 2e-4 || bs 16: 4e-4"
144
+ 当前 effective bs=8,所以使用 lr=2e-4。
145
+
146
+ 如需匹配官方 bs=16,可设置 `gradient_accumulation_steps=2`,并将 lr 改为 4e-4。
147
+
148
+ ### 5.5 其他硬件适配
149
+
150
+ 如果 GPU 数量不同,需要线性缩放学习率:
151
+ ```
152
+ 调整后 LR = 2e-4 × (实际 effective batch size / 8)
153
+ ```
154
+
155
+ 例如 6×GPU、bs=1:`LR = 2e-4 × 6/8 = 1.5e-4`
156
+
157
+ ## 6. 训练命令(官方 StreamPETR)
158
+
159
+ ```bash
160
+ # 推荐:从本仓库根目录启动(已强制使用官方 StreamPETR 配置)
161
+ bash scripts/run_train_streampetr.sh
162
+
163
+ # 或直接使用官方 StreamPETR 工具
164
+ cd external/StreamPETR
165
+ bash tools/dist_train.sh projects/configs/Atlas/atlas_streampetr_eva02_800x1600_256q_24e.py 8
166
+ ```
167
+
168
+ > 注意:不要使用本仓库内的 `scripts/train_streampetr.py`,它是非官方实现,仅用于对比与调试。
169
+
170
+ ## 7. 验证环境
171
+
172
+ ```bash
173
+ # 检查 CUDA
174
+ python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}')"
175
+
176
+ # 检查 mmdet3d
177
+ python -c "import mmdet3d; print(f'mmdet3d: {mmdet3d.__version__}')"
178
+
179
+ # 检查 flash-attn
180
+ python -c "from flash_attn import flash_attn_func; print('flash-attn OK')"
181
+ ```
182
+
183
+ ## 8. 参考资料
184
+
185
+ - [StreamPETR 官方仓库](https://github.com/exiawsh/StreamPETR)
186
+ - [Atlas 论文](https://arxiv.org/abs/2405.18361)
187
+ - [nuScenes 数据集](https://www.nuscenes.org/)
188
+ - [EVA-02 论文](https://arxiv.org/abs/2303.11331)
189
+
190
+ ---
191
+
192
+ ## 缺失项清单(需补齐,否则无法严格复现)
193
+
194
+ 1. `external/StreamPETR` 的**准确 commit hash**(当前不是 git 仓库,无法校验版本)。
195
+ 2. Atlas 论文中 StreamPETR 预训练的**实际梯度累积与有效 batch size**(官方配置注释为 bs=16,但未显式开启 accum)。
196
+ 3. 论文实验所用的**完整依赖版本锁定**(除了版本矩阵以外,未提供 lockfile/requirements 固定依赖)。
197
+
198
+ ---
199
+
200
+ **最后更新**: 2026-01-29
configs/ds_zero2.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bf16": {"enabled": true},
3
+ "zero_optimization": {
4
+ "stage": 2,
5
+ "allgather_partitions": true,
6
+ "allgather_bucket_size": 50000000,
7
+ "overlap_comm": false,
8
+ "reduce_scatter": true,
9
+ "reduce_bucket_size": 50000000,
10
+ "contiguous_gradients": true
11
+ },
12
+ "gradient_accumulation_steps": 2,
13
+ "gradient_clipping": 1.0,
14
+ "train_batch_size": 8,
15
+ "train_micro_batch_size_per_gpu": 1,
16
+ "wall_clock_breakdown": false
17
+ }
eval_atlas.py ADDED
@@ -0,0 +1,1175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import sys
5
+ import json
6
+ import logging
7
+ import re
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional
10
+ from collections import Counter, defaultdict
11
+
12
+ import torch
13
+ import numpy as np
14
+
15
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
16
+
17
+ from src.model.modeling_atlas import AtlasForCausalLM
18
+ from src.model.topomlp_adapter import TopoMLPToAtlasMapTokens
19
+ from src.model.streampetr_adapter import extract_streampetr_topk_tokens
20
+ from src.dataset.atlas_dataset import AtlasDataset, infer_task_type, make_atlas_collate_fn, load_tokenizer
21
+ from src.dataset.scene_sampler import SceneSequentialSampler
22
+ from src.prompting import PLANNING_TABLE3_MODES
23
+ from src.eval.metrics import (
24
+ parse_atlas_output,
25
+ parse_planning_output,
26
+ normalize_ground_truths,
27
+ calculate_detection_f1,
28
+ calculate_multi_threshold_detection_f1,
29
+ calculate_lane_detection_metrics,
30
+ calculate_planning_metrics,
31
+ )
32
+
33
+ logger = logging.getLogger("eval_atlas")
34
+
35
# Matches one bracketed 3-number coordinate block, e.g. "[12, -3, 4.5]".
_DET_POINT_BLOCK_RE = re.compile(
    r"\[\s*[-+]?\d+(?:\.\d+)?\s*,\s*[-+]?\d+(?:\.\d+)?\s*,\s*[-+]?\d+(?:\.\d+)?\s*\]"
)


def summarize_detection_parse(gen_text: str, det_preds: List[Dict]) -> Dict[str, object]:
    """Audit how completely a generated detection answer was parsed.

    Compares the number of raw coordinate blocks visible in *gen_text*
    against the number of detections actually recovered in *det_preds*,
    and classifies the outcome (clean parse, explicit empty answer,
    suspected partial parse, or outright parse failure).

    Returns a flat dict of ``detection_*`` diagnostic fields.
    """
    trimmed = gen_text.strip().rstrip(". \t\n")
    n_parsed = int(len(det_preds))
    n_raw_points = int(len(_DET_POINT_BLOCK_RE.findall(gen_text)))
    # The canonical "nothing to report" answer is not a failure.
    empty_negative = trimmed.lower() == "no objects detected within range"
    # More raw coordinate blocks than parsed detections → parser likely
    # dropped part of the output.
    partial_suspected = bool(n_parsed > 0 and n_raw_points > n_parsed)

    failed = n_parsed == 0 and not empty_negative
    if failed:
        reason = (
            "coordinates_present_but_unparsed"
            if n_raw_points > 0
            else "no_detection_pattern"
        )
    elif partial_suspected:
        reason = "partial_parse_suspected"
    else:
        reason = ""

    return {
        "detection_parse_failed": failed,
        "detection_parse_failure_reason": reason,
        "detection_partial_parse_suspected": partial_suspected,
        "detection_raw_point_count": n_raw_points,
        "detection_parsed_count": n_parsed,
        "detection_is_empty_negative": empty_negative,
    }
67
+
68
+
69
def summarize_lane_parse(gen_text: str, lane_preds: List[Dict]) -> Dict[str, object]:
    """Audit how completely a generated lane answer was parsed.

    Counterpart of summarize_detection_parse() for the lane task: compares
    raw coordinate blocks in *gen_text* with the points recovered into
    *lane_preds* and classifies lane-specific failure modes (legacy
    ``lane_centerline(...)`` format, bare ``Lane:`` prefix, etc.).

    Returns a flat dict of ``lane_*`` diagnostic fields.
    """
    trimmed = gen_text.strip().rstrip(". \t\n")
    lowered = trimmed.lower()
    n_raw_points = int(len(_DET_POINT_BLOCK_RE.findall(gen_text)))
    n_lanes = int(len(lane_preds))
    n_points = int(sum(len(lane.get("points", [])) for lane in lane_preds))
    # The canonical "nothing to report" answer is not a failure.
    empty_negative = lowered == "no lane centerlines detected within range"
    # More raw coordinate blocks than parsed points → parser likely dropped
    # part of the output.
    partial_suspected = bool(n_lanes > 0 and n_raw_points > n_points)

    failed = n_lanes == 0 and not empty_negative
    if failed:
        if lowered.startswith("lane_centerline("):
            reason = "legacy_lane_format"
        elif n_raw_points > 0:
            reason = "coordinates_present_but_unparsed"
        elif "lane:" in lowered:
            reason = "lane_prefix_without_valid_points"
        else:
            reason = "no_lane_pattern"
    elif partial_suspected:
        reason = "partial_parse_suspected"
    else:
        reason = ""

    return {
        "lane_parse_failed": failed,
        "lane_parse_failure_reason": reason,
        "lane_partial_parse_suspected": partial_suspected,
        "lane_raw_point_count": n_raw_points,
        "lane_parsed_lane_count": n_lanes,
        "lane_parsed_point_count": n_points,
        "lane_is_empty_negative": empty_negative,
    }
104
+
105
+
106
def _audit_lane_gt_point_counts(dataset, max_samples: int = 100):
    """Spot-check lane GT point counts; warn if median != 4 (non-canonical format).

    Samples up to *max_samples* lane-task items from *dataset*, parses the
    first GT answer of each, and logs a warning when the median per-lane
    point count is not 4 (the canonical ``*_4pt.json`` layout).
    Purely diagnostic: no return value, no mutation of *dataset*.
    """
    lane_idx = [i for i, task in enumerate(dataset._task_types) if task == "lane"]
    if not lane_idx:
        return

    sampled = lane_idx[:max_samples]
    counts = []
    for i in sampled:
        conv = dataset.data[i].get("conversations", [])
        # First assistant/gpt turn carries the GT answer text.
        answer = next(
            (
                turn.get("value", "")
                for turn in conv
                if turn.get("from") in ("gpt", "assistant")
            ),
            "",
        )
        for lane in parse_atlas_output(answer):
            pts = lane.get("points", [])
            if pts:
                counts.append(len(pts))

    if not counts:
        return

    # Upper median of the sampled per-lane point counts.
    median_pts = int(sorted(counts)[len(counts) // 2])
    if median_pts == 4:
        logger.info(
            "Lane GT point-count check OK: median=%d, sampled %d lanes from %d samples",
            median_pts, len(counts), len(sampled),
        )
    else:
        logger.warning(
            "Lane GT point-count median is %d (expected 4 for canonical _4pt format). "
            "Sampled %d lanes from %d samples. "
            "Check that --data_json points to the correct *_4pt.json files.",
            median_pts, len(counts), len(sampled),
        )
141
+
142
+
143
def summarize_lane_gt_parse(gt_answer: str, gt_lanes: List[Dict]) -> Dict[str, object]:
    """Mirror of summarize_lane_parse() but for the GT side."""
    normalized = gt_answer.strip().rstrip(". \t\n")
    lowered = normalized.lower()

    raw_points = int(len(_DET_POINT_BLOCK_RE.findall(gt_answer)))
    n_lanes = int(len(gt_lanes))
    n_points = int(sum(len(lane.get("points", [])) for lane in gt_lanes))
    empty_negative = lowered == "no lane centerlines detected within range"
    # Some coordinates matched by regex never made it into parsed lanes.
    partial_suspected = bool(n_lanes > 0 and raw_points > n_points)

    failed = False
    reason = ""
    if n_lanes == 0 and not empty_negative:
        failed = True
        if lowered.startswith("lane_centerline("):
            reason = "legacy_lane_format"
        elif raw_points > 0:
            reason = "coordinates_present_but_unparsed"
        elif "lane:" in lowered:
            reason = "lane_prefix_without_valid_points"
        else:
            reason = "no_lane_pattern"
    elif partial_suspected:
        reason = "partial_parse_suspected"

    return {
        "gt_lane_parse_failed": failed,
        "gt_lane_parse_failure_reason": reason,
        "gt_lane_partial_parse_suspected": partial_suspected,
        "gt_lane_raw_point_count": raw_points,
        "gt_lane_parsed_lane_count": n_lanes,
        "gt_lane_parsed_point_count": n_points,
        "gt_lane_is_empty_negative": empty_negative,
    }
179
+
180
+
181
def parse_args():
    """Build and parse the command-line arguments for Atlas evaluation."""
    parser = argparse.ArgumentParser()
    # --- model / checkpoint ---
    parser.add_argument("--checkpoint", required=True)
    parser.add_argument("--llm_model", default="lmsys/vicuna-7b-v1.5")
    parser.add_argument("--visual_hidden_size", type=int, default=256)
    parser.add_argument("--num_det_queries", type=int, default=256)
    parser.add_argument("--num_map_queries", type=int, default=256)
    # --- frozen perception encoders (online mode) ---
    parser.add_argument("--streampetr_config", default=None)
    parser.add_argument("--streampetr_ckpt", default=None)
    parser.add_argument("--topomlp_config", default=None)
    parser.add_argument("--topomlp_ckpt", default=None)
    # --- data ---
    parser.add_argument("--data_json", required=True)
    parser.add_argument("--data_root", default="/mnt/data/nuscenes")
    parser.add_argument("--max_length", type=int, default=4096)
    parser.add_argument("--max_new_tokens", type=int, default=2700)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--num_workers", type=int, default=4)
    # --- LoRA / quantization ---
    parser.add_argument("--use_lora", action="store_true")
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=64)
    parser.add_argument("--load_in_4bit", action="store_true")
    # --- run control ---
    parser.add_argument("--output_json", default=None)
    parser.add_argument("--max_samples", type=int, default=0)
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--bf16", action="store_true")
    # --- offline precomputed tokens ---
    parser.add_argument("--precomputed_det_tokens", default=None,
                        help="[offline only] Dir with precomputed det tokens (.pt files)")
    parser.add_argument("--precomputed_map_tokens", default=None,
                        help="[offline only] Dir with precomputed TopoMLP map tokens (.pt files)")
    parser.add_argument("--visual_token_mode", choices=("online", "offline"), default="online",
                        help="Visual token source: online=live frozen encoders (default), offline=read *_offline dirs")
    parser.add_argument(
        "--planning_table3_mode",
        choices=PLANNING_TABLE3_MODES,
        default="atlas_base",
        help=(
            "Planning prompt variant matching Atlas Table 3: "
            "atlas_base=no command/no explicit ego state; "
            "atlas_high_level=requires top-level route_command "
            "(this repo uses a UniAD-style future-GT-derived command); "
            "atlas_high_level_ego=requires top-level route_command plus "
            "velocity/acceleration bins."
        ),
    )
    return parser.parse_args()
226
+
227
+
228
def infer_task(item: Dict) -> str:
    """Return the task type for a dataset item (e.g. "detection", "lane", "planning", "caption").

    Thin local alias for infer_task_type(), kept so call sites in this module
    read uniformly.
    """
    return infer_task_type(item)
230
+
231
+
232
def load_frozen_encoder(config_path, ckpt_path, model_type, device):
    """Build a frozen perception encoder (StreamPETR or TopoMLP) from an mmdet3d config.

    Args:
        config_path: Path to the mmcv config file, or None to skip loading.
        ckpt_path: Path to the encoder checkpoint (.pth), or None to skip loading.
        model_type: "streampetr" or "topomlp"; selects which external project
            plugin to import before building the model.
        device: Target torch device for the frozen model.

    Returns:
        The eval-mode model with all parameters frozen, or None when either
        config_path or ckpt_path was not provided.

    Raises:
        RuntimeError: When mmcv/mmdet3d or the external project plugin is
            missing even though encoder arguments were explicitly provided.
    """
    if config_path is None or ckpt_path is None:
        return None
    try:
        from mmcv import Config
        from mmdet3d.models import build_model
        from mmcv.runner import load_checkpoint
    except ImportError:
        raise RuntimeError(
            f"mmcv/mmdet3d not installed but --{model_type}_config and "
            f"--{model_type}_ckpt were explicitly provided. "
            f"Install mmcv/mmdet3d or remove these arguments."
        )

    project_root = Path(__file__).resolve().parent
    if model_type == "streampetr":
        sp_root = str(project_root / "external" / "StreamPETR")
        if sp_root not in sys.path:
            sys.path.insert(0, sp_root)
        try:
            import projects.mmdet3d_plugin  # noqa: F401
        except ImportError:
            raise RuntimeError(
                f"StreamPETR plugin not found under {sp_root}/projects/mmdet3d_plugin. "
                f"Ensure the submodule is checked out, or remove --streampetr_config/--streampetr_ckpt."
            )
    elif model_type == "topomlp":
        tp_root = str(project_root / "external" / "TopoMLP_Repo")
        if tp_root not in sys.path:
            sys.path.insert(0, tp_root)
        try:
            os.environ["ATLAS_TOPOMLP_MODELS_ONLY"] = "1"
            from mmcv.utils import registry as _reg
            # TopoMLP registers module names that a previously imported plugin
            # (e.g. StreamPETR) may already have claimed; temporarily make the
            # registry tolerate duplicates by forcing registration.
            _orig = _reg.Registry._register_module

            def _tolerant_register(self, module, module_name=None, force=False):
                return _orig(self, module, module_name=module_name, force=True)

            _reg.Registry._register_module = _tolerant_register
            try:
                import projects.topomlp  # noqa: F401
            finally:
                # BUGFIX: restore the original registry method even when the
                # plugin import fails. Previously a failed import left mmcv's
                # global Registry permanently in force-overwrite mode.
                _reg.Registry._register_module = _orig
        except ImportError:
            raise RuntimeError(
                f"TopoMLP plugin not found under {tp_root}/projects/topomlp. "
                f"Ensure the submodule is checked out, or remove --topomlp_config/--topomlp_ckpt."
            )

    cfg = Config.fromfile(config_path)
    model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"))
    load_checkpoint(model, ckpt_path, map_location="cpu")
    model.eval()
    model.to(device)
    for param in model.parameters():
        param.requires_grad_(False)
    logger.info("Loaded frozen %s from %s", model_type, ckpt_path)
    return model
286
+
287
+
288
def _run_streampetr_forward(model, imgs, img_metas, batch, device, prev_exists=None):
    """Run frozen StreamPETR forward. prev_exists controls temporal memory."""
    B, N = imgs.shape[:2]

    def _batched_eye4():
        # Identity fallback for missing camera geometry, shaped (B, N, 4, 4).
        eye = torch.eye(4, device=device)
        return eye.unsqueeze(0).unsqueeze(0).expand(B, N, -1, -1).contiguous()

    data = {
        "img": imgs,
        "img_feats": model.extract_img_feat(imgs, 1),
        "prev_exists": imgs.new_zeros(B) if prev_exists is None else prev_exists,
    }

    # Camera intrinsics: embed the provided 3x3 into a homogeneous 4x4.
    if "intrinsics_det" in batch:
        k33 = batch["intrinsics_det"].to(device)
        k44 = torch.zeros(B, N, 4, 4, device=device, dtype=k33.dtype)
        k44[:, :, :3, :3] = k33
        k44[:, :, 3, 3] = 1.0
        data["intrinsics"] = k44
    else:
        data["intrinsics"] = _batched_eye4()

    if "lidar2img_det" in batch:
        data["lidar2img"] = batch["lidar2img_det"].to(device)
    else:
        data["lidar2img"] = _batched_eye4()

    ego_pose = batch.get("ego_pose")
    if ego_pose is not None:
        data["ego_pose"] = ego_pose.to(device)
    else:
        data["ego_pose"] = torch.eye(4, device=device).unsqueeze(0).expand(B, -1, -1).contiguous()

    ego_pose_inv = batch.get("ego_pose_inv")
    if ego_pose_inv is not None:
        data["ego_pose_inv"] = ego_pose_inv.to(device)
    else:
        data["ego_pose_inv"] = torch.inverse(data["ego_pose"])

    ts = batch.get("timestamp")
    data["timestamp"] = ts.to(device) if ts is not None else torch.zeros(B, device=device)

    location = model.prepare_location(img_metas, **data)
    outs_roi = model.forward_roi_head(location, **data)
    return model.pts_bbox_head(location, img_metas, outs_roi["topk_indexes"], **data)
335
+
336
+
337
+ def _reconstruct_topomlp_outs(saved: dict, device, dtype):
338
+ """Convert precomputed .pt dict back to the format adapter.forward() expects."""
339
+ def _restore(t):
340
+ return t.to(device=device, dtype=dtype).unsqueeze(0)
341
+ return {
342
+ "lc_outs_dec_list": [_restore(saved["lc_outs_dec"])],
343
+ "all_lc_cls_scores_list": [_restore(saved["lc_cls_scores"])],
344
+ "all_lc_preds_list": [_restore(saved["lc_preds"])],
345
+ "lc_outs_dec_one2many_list": [_restore(saved["lc_outs_dec_o2m"])],
346
+ "all_lc_cls_scores_one2many_list": [_restore(saved["lc_cls_scores_o2m"])],
347
+ "all_lc_preds_one2many_list": [_restore(saved["lc_preds_o2m"])],
348
+ }
349
+
350
+
351
def extract_visual_tokens(
    streampetr_model, topomlp_model, topomlp_adapter,
    batch, device, num_det_queries, visual_hidden_size,
    visual_token_mode="online",
    streaming_state=None,
    query_token_id=None,
):
    """Extract det + map visual tokens (eval version).

    Mirrors train_atlas.extract_visual_tokens: in online mode with
    streaming_state, StreamPETR temporal memory is scene-aware and
    duplicate physical frames are protected. The needs_map gating
    skips TopoMLP when the current sample only requires det queries.

    Returns a dict with keys "detection" and "detection_ref_points", plus
    "map" and "map_ref_points" when a topomlp_adapter is supplied (either
    computed, reconstructed from precomputed tokens, or zero-filled).
    Raises RuntimeError in offline mode when required precomputed tokens
    are absent, and in online mode when the StreamPETR model is missing.
    """
    B = batch["pixel_values_det"].shape[0]
    N = batch["pixel_values_det"].shape[1]
    vis: Dict[str, torch.Tensor] = {}

    # A sample needs map tokens when its prompt contains more <query>
    # placeholders than the number of detection queries.
    needs_map = False
    if query_token_id is not None and "input_ids" in batch:
        n_queries = int((batch["input_ids"] == query_token_id).sum(dim=-1).max().item())
        needs_map = n_queries > num_det_queries

    # ---- Detection tokens ----
    if visual_token_mode == "offline" and "precomputed_det" in batch and "precomputed_det_ref" in batch:
        # Offline: tokens were precomputed and collated into the batch.
        vis["detection"] = batch["precomputed_det"].to(device)
        vis["detection_ref_points"] = batch["precomputed_det_ref"].to(device)
    elif visual_token_mode == "offline":
        raise RuntimeError(
            "visual_token_mode=offline but detection precomputed tokens are missing "
            "for the current batch. Refusing to zero-fill."
        )
    elif streampetr_model is not None:
        current_sample_id = batch.get("sample_id", [None])[0]
        current_scene = batch.get("scene_id", ["__atlas_eval__"])[0]
        reuse_cache = False

        if streaming_state is not None:
            # Streaming path: maintain StreamPETR temporal memory across a
            # scene-sequential dataloader (batch_size 1 assumed by caller).
            prev_scene = streaming_state.get("prev_scene_token")
            prev_sample_id = streaming_state.get("prev_sample_id")
            ts_tensor = batch.get("timestamp")
            current_ts = float(ts_tensor[0].item()) if ts_tensor is not None else None
            prev_ts = streaming_state.get("prev_timestamp")

            # A new segment starts on the first sample, on a scene change,
            # or when timestamps go backwards (non-monotonic ordering).
            is_new_segment = (
                prev_scene is None
                or current_scene != prev_scene
                or (current_ts is not None and prev_ts is not None and current_ts <= prev_ts)
            )

            # Same physical frame as the previous sample (e.g. multiple QA
            # items per frame): reuse the cached tokens instead of running
            # the encoder again, which would also corrupt temporal memory.
            if current_sample_id is not None and current_sample_id == prev_sample_id:
                cached = streaming_state.get("cached_det")
                if cached is not None:
                    reuse_cache = True
                    vis["detection"] = cached["detection"]
                    vis["detection_ref_points"] = cached["detection_ref_points"]

            if not reuse_cache:
                if is_new_segment:
                    # Clear temporal memory before the first frame of a segment.
                    streampetr_model.pts_bbox_head.reset_memory()
                prev_exists_val = 0.0 if is_new_segment else 1.0
                prev_exists = batch["pixel_values_det"].new_full((B,), prev_exists_val)

                imgs_det = batch["pixel_values_det"].to(device)
                # Fixed detection input resolution (H, W) used for img_metas.
                fH, fW = 800, 1600
                img_metas = [{
                    "pad_shape": [(fH, fW, 3)] * N,
                    "img_shape": [(fH, fW, 3)] * N,
                    "scene_token": current_scene,
                } for _ in range(B)]
                if "lidar2img_det" in batch:
                    for b in range(B):
                        img_metas[b]["lidar2img"] = batch["lidar2img_det"][b].cpu().numpy()
                with torch.no_grad():
                    _run_streampetr_forward(streampetr_model, imgs_det, img_metas, batch, device, prev_exists=prev_exists)
                ego_pose_for_ref = batch.get("ego_pose")
                if ego_pose_for_ref is not None:
                    ego_pose_for_ref = ego_pose_for_ref.to(device)
                # Pull top-k query embeddings + reference points from the head.
                det_out = extract_streampetr_topk_tokens(
                    streampetr_model.pts_bbox_head, topk=num_det_queries,
                    ego_pose=ego_pose_for_ref,
                )
                vis["detection"] = det_out["detection"]
                vis["detection_ref_points"] = det_out["detection_ref_points"]

                # Cache for possible duplicate-frame reuse on the next sample.
                streaming_state["cached_det"] = {
                    "detection": vis["detection"],
                    "detection_ref_points": vis["detection_ref_points"],
                }

            # Always advance the streaming bookkeeping, even on cache reuse.
            streaming_state["prev_scene_token"] = current_scene
            streaming_state["prev_sample_id"] = current_sample_id
            if batch.get("timestamp") is not None:
                streaming_state["prev_timestamp"] = float(batch["timestamp"][0].item())
        else:
            # Non-streaming online path: single-frame inference with the
            # temporal memory reset before every forward pass.
            imgs_det = batch["pixel_values_det"].to(device)
            fH, fW = 800, 1600
            img_metas = [{
                "pad_shape": [(fH, fW, 3)] * N,
                "img_shape": [(fH, fW, 3)] * N,
                "scene_token": current_scene,
            } for _ in range(B)]
            if "lidar2img_det" in batch:
                for b in range(B):
                    img_metas[b]["lidar2img"] = batch["lidar2img_det"][b].cpu().numpy()
            with torch.no_grad():
                streampetr_model.pts_bbox_head.reset_memory()
                _run_streampetr_forward(streampetr_model, imgs_det, img_metas, batch, device)
            ego_pose_for_ref = batch.get("ego_pose")
            if ego_pose_for_ref is not None:
                ego_pose_for_ref = ego_pose_for_ref.to(device)
            det_out = extract_streampetr_topk_tokens(
                streampetr_model.pts_bbox_head, topk=num_det_queries,
                ego_pose=ego_pose_for_ref,
            )
            vis["detection"] = det_out["detection"]
            vis["detection_ref_points"] = det_out["detection_ref_points"]
    elif visual_token_mode == "online":
        raise RuntimeError(
            "visual_token_mode=online but StreamPETR model is None. "
            "Provide --streampetr_config and --streampetr_ckpt."
        )
    else:
        # No encoder and not strictly online: zero-fill detection tokens.
        vis["detection"] = torch.zeros(B, num_det_queries, visual_hidden_size, device=device)
        vis["detection_ref_points"] = torch.zeros(B, num_det_queries, 3, device=device)

    # ---- Map tokens ----
    num_map_queries = num_det_queries
    if topomlp_adapter is not None:
        num_map_queries = topomlp_adapter.num_map_tokens

    map_filled = False
    if visual_token_mode == "offline" and topomlp_adapter is not None and "precomputed_map" in batch:
        # Infer the adapter's dtype from its parameters (or buffers as a
        # fallback) so reconstructed tensors match it.
        _params = list(topomlp_adapter.parameters())
        _bufs = list(topomlp_adapter.buffers())
        adapter_dtype = _params[0].dtype if _params else (_bufs[0].dtype if _bufs else torch.float32)
        if B == 1:
            outs = _reconstruct_topomlp_outs(batch["precomputed_map"][0], device, adapter_dtype)
        else:
            # Reconstruct each sample, then concatenate along the batch dim
            # list-entry by list-entry.
            per_sample = [_reconstruct_topomlp_outs(batch["precomputed_map"][b], device, adapter_dtype) for b in range(B)]
            outs = {}
            for k in per_sample[0]:
                outs[k] = [torch.cat([s[k][i] for s in per_sample], dim=0) for i in range(len(per_sample[0][k]))]
        with torch.no_grad():
            map_out = topomlp_adapter(outs)
        vis["map"] = map_out["map"]
        vis["map_ref_points"] = map_out["map_ref_points"]
        map_filled = True
    elif visual_token_mode == "offline" and topomlp_adapter is not None:
        raise RuntimeError(
            "visual_token_mode=offline but map precomputed tokens are missing "
            "for the current batch. Refusing to zero-fill."
        )
    elif needs_map and topomlp_model is not None and topomlp_adapter is not None:
        # Online TopoMLP forward, only when the prompt actually uses map tokens.
        imgs_map = batch["pixel_values_map"].to(device)
        img_metas = []
        for b in range(B):
            meta = {"img_shape": [(800, 1600, 3)] * N, "pad_shape": [(800, 1600, 3)] * N}
            meta["scale_factor"] = 1.0
            meta["te_yolov8"] = None
            if "lidar2img_map" in batch:
                meta["lidar2img"] = batch["lidar2img_map"][b].cpu().numpy()
            img_metas.append(meta)
        with torch.no_grad():
            outs = topomlp_model.simple_forward(imgs_map, img_metas)
            map_out = topomlp_adapter(outs)
        vis["map"] = map_out["map"]
        vis["map_ref_points"] = map_out["map_ref_points"]
        map_filled = True

    # An adapter exists but no map tokens were produced (det-only sample,
    # or no TopoMLP model): zero-fill so downstream shapes stay consistent.
    if topomlp_adapter is not None and not map_filled:
        vis["map"] = torch.zeros(B, num_map_queries, visual_hidden_size, device=device)
        vis["map_ref_points"] = torch.zeros(B, num_map_queries, 3, device=device)

    return vis
526
+
527
+
528
def parse_gt_from_item(item: Dict, task: str) -> Dict:
    """Extract the ground-truth payload for one dataset item, keyed by task."""
    gt: Dict = {}
    if task == "detection":
        annotations = item.get("gt_boxes_3d", item.get("annotations", []))
        dets = []
        for ann in annotations:
            if not isinstance(ann, dict):
                continue
            if "box" in ann:
                xyz = ann["box"][:3]
            elif "translation" in ann:
                xyz = ann["translation"][:3]
            else:
                continue  # no usable position for this annotation
            dets.append({
                "category": ann.get("category_name", ann.get("category", "unknown")),
                "world_coords": list(xyz),
            })
        gt["detections"] = normalize_ground_truths(dets)
    elif task == "lane":
        # GT lanes live in the first assistant/gpt conversation turn.
        answer = ""
        for turn in item.get("conversations", []):
            if turn.get("from") in ("gpt", "assistant"):
                answer = turn.get("value", "")
                break
        gt["lanes"] = parse_atlas_output(answer)
    elif task == "planning":
        gt["waypoints"] = item.get("ego_motion", {}).get("waypoints", [])
        gt["gt_boxes"] = item.get("gt_boxes_3d", [])
        if "gt_boxes_3d_per_timestep" in item:
            gt["gt_boxes_per_timestep"] = item["gt_boxes_3d_per_timestep"]
    return gt
562
+
563
+
564
+ def _check_task_dependencies(args, *, has_lane: bool, is_online: bool):
565
+ """Check dependencies based on actual task distribution in the dataset."""
566
+ missing = []
567
+ if is_online:
568
+ for mod in ("mmcv", "mmdet3d"):
569
+ try:
570
+ __import__(mod)
571
+ except ImportError:
572
+ missing.append(mod)
573
+ if has_lane:
574
+ try:
575
+ __import__("openlanev2")
576
+ except ImportError:
577
+ missing.append("openlanev2 (needed for lane F-Score)")
578
+ if missing:
579
+ raise RuntimeError(
580
+ f"Missing dependencies for this eval run: {', '.join(missing)}. "
581
+ f"Install them before running eval_atlas.py."
582
+ )
583
+
584
+
585
+ def main():
586
+ args = parse_args()
587
+ logging.basicConfig(
588
+ level=logging.INFO,
589
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
590
+ )
591
+
592
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
593
+
594
+ tokenizer = load_tokenizer(args.llm_model)
595
+ if "<query>" not in tokenizer.get_vocab():
596
+ tokenizer.add_tokens(["<query>"])
597
+
598
+ dtype = torch.float32
599
+ if args.bf16:
600
+ dtype = torch.bfloat16
601
+ elif args.fp16:
602
+ dtype = torch.float16
603
+
604
+ dm = "auto" if args.load_in_4bit else None
605
+ atlas = AtlasForCausalLM(
606
+ llm_model_name=args.llm_model,
607
+ visual_hidden_size=args.visual_hidden_size,
608
+ num_queries=args.num_det_queries,
609
+ num_map_queries=args.num_map_queries,
610
+ load_in_4bit=args.load_in_4bit,
611
+ use_flash_attention=True,
612
+ device_map=dm,
613
+ torch_dtype=dtype,
614
+ use_lora=args.use_lora,
615
+ lora_r=args.lora_r,
616
+ lora_alpha=args.lora_alpha,
617
+ )
618
+ atlas.resize_token_embeddings(len(tokenizer))
619
+ atlas.set_query_token_id(tokenizer.convert_tokens_to_ids("<query>"))
620
+ if dm is None:
621
+ atlas = atlas.to(device)
622
+
623
+ topomlp_adapter = None
624
+
625
+ ckpt = torch.load(args.checkpoint, map_location="cpu")
626
+
627
+ if "atlas_state_dict" not in ckpt:
628
+ raise RuntimeError(
629
+ f"Checkpoint missing 'atlas_state_dict'. "
630
+ f"Top-level keys: {sorted(ckpt.keys()) if isinstance(ckpt, dict) else type(ckpt).__name__}. "
631
+ f"Make sure --checkpoint points to an Atlas training checkpoint (checkpoint.pt), "
632
+ f"not a frozen encoder weight (.pth)."
633
+ )
634
+ atlas_sd = ckpt["atlas_state_dict"]
635
+ if not isinstance(atlas_sd, dict) or len(atlas_sd) == 0:
636
+ raise RuntimeError(
637
+ f"'atlas_state_dict' is empty or not a dict (type={type(atlas_sd).__name__}, "
638
+ f"len={len(atlas_sd) if isinstance(atlas_sd, dict) else 'N/A'}). "
639
+ f"Checkpoint is likely corrupted."
640
+ )
641
+
642
+ # Auto-detect LoRA: if checkpoint has LoRA keys but model doesn't have LoRA,
643
+ # rebuild the model with LoRA enabled to prevent silent degradation.
644
+ has_lora_keys = any("lora_" in k for k in atlas_sd)
645
+ if has_lora_keys and not args.use_lora:
646
+ logger.warning(
647
+ "Checkpoint contains LoRA weights but --use_lora was not set. "
648
+ "Auto-enabling LoRA to prevent silent degradation."
649
+ )
650
+ args.use_lora = True
651
+ atlas = AtlasForCausalLM(
652
+ llm_model_name=args.llm_model,
653
+ visual_hidden_size=args.visual_hidden_size,
654
+ num_queries=args.num_det_queries,
655
+ num_map_queries=args.num_map_queries,
656
+ load_in_4bit=args.load_in_4bit,
657
+ use_flash_attention=True,
658
+ device_map=dm,
659
+ torch_dtype=dtype,
660
+ use_lora=True,
661
+ lora_r=args.lora_r,
662
+ lora_alpha=args.lora_alpha,
663
+ )
664
+ atlas.resize_token_embeddings(len(tokenizer))
665
+ atlas.set_query_token_id(tokenizer.convert_tokens_to_ids("<query>"))
666
+ if dm is None:
667
+ atlas = atlas.to(device)
668
+
669
+ missing, unexpected = atlas.load_state_dict(atlas_sd, strict=False)
670
+ if missing:
671
+ raise RuntimeError(
672
+ f"Atlas checkpoint is incomplete: {len(missing)} missing keys "
673
+ f"(first 10: {missing[:10]}). This means the checkpoint does not "
674
+ f"match the current model architecture. Refusing to evaluate with "
675
+ f"partially-initialized weights."
676
+ )
677
+ if unexpected:
678
+ logger.warning("Unexpected keys in checkpoint (possibly ignored): %d keys, first 5: %s",
679
+ len(unexpected), unexpected[:5])
680
+ logger.info("Loaded Atlas weights from %s (%d keys, 0 missing)", args.checkpoint, len(atlas_sd))
681
+ _tp_bev_range = (-51.2, -25.6, -8.0, 51.2, 25.6, 4.0)
682
+ if args.topomlp_config:
683
+ try:
684
+ from mmcv import Config as _Cfg
685
+ _tp_cfg = _Cfg.fromfile(args.topomlp_config)
686
+ if hasattr(_tp_cfg, "point_cloud_range"):
687
+ _tp_bev_range = tuple(float(v) for v in _tp_cfg.point_cloud_range)
688
+ logger.info("TopoMLP bev_range from config: %s", _tp_bev_range)
689
+ except Exception as e:
690
+ logger.warning("Failed to read point_cloud_range from TopoMLP config: %s. Using default: %s", e, _tp_bev_range)
691
+
692
+ if args.topomlp_config or args.topomlp_ckpt or args.precomputed_map_tokens:
693
+ topomlp_adapter = TopoMLPToAtlasMapTokens(
694
+ num_map_tokens=args.num_map_queries,
695
+ hidden_size=args.visual_hidden_size,
696
+ bev_range=_tp_bev_range,
697
+ ).to(device)
698
+ if "adapter_state_dict" in ckpt:
699
+ topomlp_adapter.load_state_dict(ckpt["adapter_state_dict"], strict=False)
700
+ topomlp_adapter.eval()
701
+
702
+ atlas.eval()
703
+
704
+ is_online = args.visual_token_mode == "online"
705
+ _precomp_det = args.precomputed_det_tokens if not is_online else None
706
+ _precomp_map = args.precomputed_map_tokens if not is_online else None
707
+
708
+ dataset = AtlasDataset(
709
+ json_file=args.data_json,
710
+ image_root=args.data_root,
711
+ tokenizer=tokenizer,
712
+ max_length=args.max_length,
713
+ is_training=False,
714
+ planning_table3_mode=args.planning_table3_mode,
715
+ precomputed_det_tokens=_precomp_det,
716
+ precomputed_map_tokens=_precomp_map,
717
+ )
718
+
719
+ _task_counts = Counter(dataset._task_types)
720
+ has_lane = "lane" in _task_counts
721
+ has_planning = "planning" in _task_counts
722
+ needs_map_tokens = has_lane or has_planning
723
+ logger.info("Task needs: has_lane=%s, has_planning=%s, needs_map=%s",
724
+ has_lane, has_planning, needs_map_tokens)
725
+
726
+ if has_lane:
727
+ _audit_lane_gt_point_counts(dataset, max_samples=100)
728
+
729
+ _check_task_dependencies(args, has_lane=has_lane, is_online=is_online)
730
+
731
+ if is_online:
732
+ if args.precomputed_det_tokens or args.precomputed_map_tokens:
733
+ raise RuntimeError(
734
+ "visual_token_mode=online forbids --precomputed_* arguments."
735
+ )
736
+ if args.batch_size != 1:
737
+ raise RuntimeError(
738
+ "visual_token_mode=online with temporal memory requires "
739
+ "--batch_size 1. Got: %d" % args.batch_size
740
+ )
741
+ if not args.streampetr_config or not args.streampetr_ckpt:
742
+ raise RuntimeError(
743
+ "online mode requires --streampetr_config and --streampetr_ckpt"
744
+ )
745
+ for p in (args.streampetr_config, args.streampetr_ckpt):
746
+ if not os.path.exists(p):
747
+ raise RuntimeError(f"Required online asset does not exist: {p}")
748
+ if needs_map_tokens:
749
+ if not args.topomlp_config or not args.topomlp_ckpt:
750
+ raise RuntimeError(
751
+ "online mode with lane/planning tasks requires "
752
+ "--topomlp_config and --topomlp_ckpt"
753
+ )
754
+ for p in (args.topomlp_config, args.topomlp_ckpt):
755
+ if not os.path.exists(p):
756
+ raise RuntimeError(f"Required online asset does not exist: {p}")
757
+ else:
758
+ if not _precomp_det:
759
+ raise RuntimeError(
760
+ "offline mode requires --precomputed_det_tokens"
761
+ )
762
+ if needs_map_tokens and not _precomp_map:
763
+ raise RuntimeError(
764
+ "offline mode with lane/planning tasks requires "
765
+ "--precomputed_map_tokens"
766
+ )
767
+ for p in (_precomp_det, _precomp_map):
768
+ if p and not os.path.isdir(p):
769
+ raise RuntimeError(f"Offline token directory does not exist: {p}")
770
+
771
+ streampetr_model = load_frozen_encoder(
772
+ args.streampetr_config, args.streampetr_ckpt, "streampetr", device,
773
+ )
774
+ topomlp_model = load_frozen_encoder(
775
+ args.topomlp_config, args.topomlp_ckpt, "topomlp", device,
776
+ )
777
+
778
+ if is_online:
779
+ scene_groups = dataset.get_scene_groups()
780
+ sampler = SceneSequentialSampler(scene_groups, shuffle_scenes=False)
781
+ logger.info("Online eval: scene-sequential sampler (%d scenes)", len(scene_groups))
782
+ else:
783
+ sampler = None
784
+
785
+ collate_fn = make_atlas_collate_fn(tokenizer.pad_token_id)
786
+ dataloader = torch.utils.data.DataLoader(
787
+ dataset,
788
+ batch_size=1 if is_online else args.batch_size,
789
+ shuffle=False,
790
+ sampler=sampler,
791
+ num_workers=args.num_workers,
792
+ collate_fn=collate_fn,
793
+ pin_memory=True,
794
+ )
795
+
796
+ streaming_state = {} if is_online else None
797
+
798
+ task_preds: Dict[str, List] = defaultdict(list)
799
+ task_gts: Dict[str, List] = defaultdict(list)
800
+ all_outputs: List[Dict] = []
801
+ sample_count = 0
802
+
803
+ logger.info("Starting evaluation on %d samples (mode=%s)...", len(dataset), args.visual_token_mode)
804
+
805
+ if is_online and streampetr_model is not None:
806
+ streampetr_model.pts_bbox_head.reset_memory()
807
+
808
+ for batch_idx, batch in enumerate(dataloader):
809
+ if args.max_samples > 0 and sample_count >= args.max_samples:
810
+ break
811
+
812
+ B = batch["input_ids"].shape[0]
813
+ input_ids = batch["input_ids"].to(device)
814
+ attention_mask = batch["attention_mask"].to(device)
815
+
816
+ visual_features = extract_visual_tokens(
817
+ streampetr_model, topomlp_model, topomlp_adapter,
818
+ batch, device, args.num_det_queries, args.visual_hidden_size,
819
+ visual_token_mode=args.visual_token_mode,
820
+ streaming_state=streaming_state,
821
+ query_token_id=atlas.query_token_id,
822
+ )
823
+
824
+ with torch.no_grad():
825
+ generated_ids = atlas.generate(
826
+ input_ids=input_ids,
827
+ attention_mask=attention_mask,
828
+ visual_features=visual_features,
829
+ max_new_tokens=args.max_new_tokens,
830
+ do_sample=False,
831
+ )
832
+
833
+ for b in range(B):
834
+ if args.max_samples > 0 and sample_count >= args.max_samples:
835
+ break
836
+
837
+ gen_text_full = tokenizer.decode(generated_ids[b], skip_special_tokens=True)
838
+ # Extract assistant response: take content after the last "ASSISTANT:" tag
839
+ _split_tag = "ASSISTANT:"
840
+ if _split_tag in gen_text_full:
841
+ gen_text = gen_text_full.split(_split_tag)[-1].strip()
842
+ else:
843
+ gen_text = gen_text_full.strip()
844
+ sample_id = batch["sample_id"][b] if "sample_id" in batch else str(sample_count)
845
+ item_idx = int(batch["dataset_idx"][b].item()) if "dataset_idx" in batch else (batch_idx * B + b)
846
+ item = dataset.data[item_idx]
847
+ task = infer_task(item)
848
+ gt = parse_gt_from_item(item, task)
849
+
850
+ record = {
851
+ "sample_id": sample_id,
852
+ "task": task,
853
+ "generated_text": gen_text,
854
+ }
855
+
856
+ if task == "detection":
857
+ preds = parse_atlas_output(gen_text)
858
+ det_preds = [p for p in preds if p.get("type") == "detection"]
859
+ gt_dets = gt.get("detections", [])
860
+ task_preds["detection"].append(det_preds)
861
+ task_gts["detection"].append(gt_dets)
862
+ record.update(summarize_detection_parse(gen_text, det_preds))
863
+ record["num_preds"] = len(det_preds)
864
+ record["num_gt"] = len(gt_dets)
865
+
866
+ elif task == "lane":
867
+ preds = parse_atlas_output(gen_text)
868
+ lane_preds = [p for p in preds if p.get("type") == "lane"]
869
+ gt_lanes = gt.get("lanes", [])
870
+ task_preds["lane"].append(lane_preds)
871
+ task_gts["lane"].append(gt_lanes)
872
+ record.update(summarize_lane_parse(gen_text, lane_preds))
873
+ record["num_preds"] = len(lane_preds)
874
+ record["num_gt"] = len(gt_lanes)
875
+
876
+ gt_answer_text = ""
877
+ for _turn in item.get("conversations", []):
878
+ if _turn.get("from") in ("gpt", "assistant"):
879
+ gt_answer_text = _turn.get("value", "")
880
+ break
881
+ record.update(summarize_lane_gt_parse(gt_answer_text, gt_lanes))
882
+
883
+ elif task == "planning":
884
+ loose_plan_pred = parse_planning_output(gen_text, require_full_vap=False)
885
+ strict_plan_pred = parse_planning_output(gen_text, require_full_vap=True)
886
+ gt_wps = gt.get("waypoints", [])
887
+ gt_boxes = gt.get("gt_boxes", [])
888
+ has_waypoints = bool(
889
+ loose_plan_pred is not None and "waypoints" in loose_plan_pred
890
+ )
891
+ has_velocity = bool(
892
+ loose_plan_pred is not None and "velocity_bins" in loose_plan_pred
893
+ )
894
+ has_acceleration = bool(
895
+ loose_plan_pred is not None and "acceleration_bins" in loose_plan_pred
896
+ )
897
+ vap_complete = bool(strict_plan_pred is not None)
898
+
899
+ require_strict = args.planning_table3_mode == "atlas_high_level_ego"
900
+ accepted_pred = strict_plan_pred if require_strict else loose_plan_pred
901
+ parse_ok = accepted_pred is not None and "waypoints" in accepted_pred
902
+
903
+ if parse_ok:
904
+ plan_pred = accepted_pred
905
+ record["planning_parse_failed"] = False
906
+ record["planning_vap_complete"] = vap_complete
907
+ record["planning_has_velocity"] = has_velocity
908
+ record["planning_has_acceleration"] = has_acceleration
909
+ record["planning_has_waypoints"] = True
910
+ if require_strict:
911
+ record["planning_parse_failure_reason"] = ""
912
+ else:
913
+ record["planning_parse_failure_reason"] = (
914
+ "" if vap_complete else "missing_velocity_or_acceleration"
915
+ )
916
+ else:
917
+ plan_pred = {"waypoints": [[0.0, 0.0]] * max(len(gt_wps), 6)}
918
+ record["planning_parse_failed"] = True
919
+ record["planning_vap_complete"] = False
920
+ record["planning_has_velocity"] = has_velocity
921
+ record["planning_has_acceleration"] = has_acceleration
922
+ record["planning_has_waypoints"] = has_waypoints
923
+ if require_strict and has_waypoints and not vap_complete:
924
+ record["planning_parse_failure_reason"] = (
925
+ "strict_mode_missing_velocity_or_acceleration"
926
+ )
927
+ elif has_waypoints:
928
+ record["planning_parse_failure_reason"] = "missing_velocity_or_acceleration"
929
+ else:
930
+ record["planning_parse_failure_reason"] = "unparseable_waypoints"
931
+ task_preds["planning"].append(plan_pred)
932
+ plan_gt_entry = {
933
+ "waypoints": gt_wps,
934
+ "gt_boxes": gt_boxes,
935
+ }
936
+ if "gt_boxes_per_timestep" in gt:
937
+ plan_gt_entry["gt_boxes_per_timestep"] = gt["gt_boxes_per_timestep"]
938
+ task_gts["planning"].append(plan_gt_entry)
939
+ record["has_plan"] = has_waypoints
940
+
941
+ elif task == "caption":
942
+ record["skipped"] = True
943
+
944
+ else:
945
+ logger.warning("Unknown task %r for sample %s — skipping metrics", task, sample_id)
946
+ record["skipped"] = True
947
+
948
+ all_outputs.append(record)
949
+ sample_count += 1
950
+
951
+ if (batch_idx + 1) % 50 == 0:
952
+ logger.info("Processed %d / %d samples", sample_count, len(dataset))
953
+
954
+ logger.info("Evaluation complete. Total samples: %d", sample_count)
955
+
956
+ _skipped = sum(1 for r in all_outputs if r.get("skipped"))
957
+ if _skipped:
958
+ logger.warning(
959
+ "%d samples were not scored (caption or unknown task). "
960
+ "These consumed GPU time but produced no metrics.",
961
+ _skipped,
962
+ )
963
+
964
+ results = {}
965
+
966
+ if task_preds["detection"] and task_gts["detection"]:
967
+ thresholds = (0.5, 1.0, 2.0, 4.0)
968
+ global_counts = {t: {"tp": 0, "fp": 0, "fn": 0} for t in thresholds}
969
+ for s_preds, s_gts in zip(task_preds["detection"], task_gts["detection"]):
970
+ for t in thresholds:
971
+ m = calculate_detection_f1(s_preds, s_gts, threshold=t)
972
+ global_counts[t]["tp"] += m["tp"]
973
+ global_counts[t]["fp"] += m["fp"]
974
+ global_counts[t]["fn"] += m["fn"]
975
+ det_results = {}
976
+ for t in thresholds:
977
+ tp, fp, fn = global_counts[t]["tp"], global_counts[t]["fp"], global_counts[t]["fn"]
978
+ p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
979
+ r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
980
+ f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
981
+ det_results[f"F1@{t}m"] = round(f1, 4)
982
+ det_results[f"P@{t}m"] = round(p, 4)
983
+ det_results[f"R@{t}m"] = round(r, 4)
984
+ n_det_total = len(task_preds["detection"])
985
+ n_det_failed = sum(
986
+ 1
987
+ for r in all_outputs
988
+ if r.get("task") == "detection" and r.get("detection_parse_failed", False)
989
+ )
990
+ n_det_partial = sum(
991
+ 1
992
+ for r in all_outputs
993
+ if r.get("task") == "detection" and r.get("detection_partial_parse_suspected", False)
994
+ )
995
+ n_det_empty_negative = sum(
996
+ 1
997
+ for r in all_outputs
998
+ if r.get("task") == "detection" and r.get("detection_is_empty_negative", False)
999
+ )
1000
+ det_results["num_samples"] = n_det_total
1001
+ det_results["parse_fail_count"] = n_det_failed
1002
+ det_results["parse_fail_rate"] = n_det_failed / max(n_det_total, 1)
1003
+ det_results["partial_parse_count"] = n_det_partial
1004
+ det_results["partial_parse_rate"] = n_det_partial / max(n_det_total, 1)
1005
+ det_results["empty_negative_count"] = n_det_empty_negative
1006
+ det_results["empty_negative_rate"] = n_det_empty_negative / max(n_det_total, 1)
1007
+ results["detection"] = det_results
1008
+ logger.info("Detection results (micro-averaged):")
1009
+ for k, v in sorted(results["detection"].items()):
1010
+ if isinstance(v, float):
1011
+ logger.info(" %s: %.4f", k, v)
1012
+
1013
+ if task_preds["lane"] and task_gts["lane"]:
1014
+ try:
1015
+ from openlanev2.evaluation.f_score import LaneEval
1016
+ except ImportError:
1017
+ raise RuntimeError(
1018
+ "openlanev2 is required for lane evaluation but could not be imported. "
1019
+ "Install it with: pip install openlanev2"
1020
+ )
1021
+ _lane_evaluator = LaneEval()
1022
+
1023
+ def _lanes_to_ndarray_list(lanes):
1024
+ out = []
1025
+ for lane in lanes:
1026
+ pts = lane.get("points", [])
1027
+ if not pts:
1028
+ continue
1029
+ rows = []
1030
+ for pt in pts:
1031
+ if isinstance(pt, dict):
1032
+ rows.append(pt.get("world_coords", [0, 0, 0])[:3])
1033
+ else:
1034
+ rows.append(list(pt)[:3])
1035
+ arr = np.array(rows, dtype=np.float64)
1036
+ if arr.shape[0] >= 2:
1037
+ out.append(arr)
1038
+ return out
1039
+
1040
+ stats = []
1041
+ for pl, gl in zip(task_preds["lane"], task_gts["lane"]):
1042
+ pa = _lanes_to_ndarray_list(pl)
1043
+ ga = _lanes_to_ndarray_list(gl)
1044
+ pc = [np.int8(1)] * len(pa)
1045
+ gc = [np.int8(1)] * len(ga)
1046
+ r, p, c, ng, np_, mn = _lane_evaluator.bench(pa, pc, ga, gc)
1047
+ stats.append(np.array([r, p, c, ng, np_, mn]))
1048
+ if stats:
1049
+ s = np.array(stats)
1050
+ tg = np.sum(s[:, 3])
1051
+ tp_sum = np.sum(s[:, 4])
1052
+ lane_r = float(np.sum(s[:, 0]) / max(tg, 1e-6))
1053
+ lane_p = float(np.sum(s[:, 1]) / max(tp_sum, 1e-6))
1054
+ lane_f1 = 2 * lane_p * lane_r / (lane_p + lane_r) if (lane_p + lane_r) > 0 else 0.0
1055
+ else:
1056
+ lane_p = lane_r = lane_f1 = 0.0
1057
+ results["lane"] = {
1058
+ "lane_precision": round(lane_p, 4),
1059
+ "lane_recall": round(lane_r, 4),
1060
+ "lane_f1": round(lane_f1, 4),
1061
+ "method": "openlanev2_f_score",
1062
+ }
1063
+ n_lane_total = len(task_preds["lane"])
1064
+ n_lane_failed = sum(
1065
+ 1
1066
+ for r in all_outputs
1067
+ if r.get("task") == "lane" and r.get("lane_parse_failed", False)
1068
+ )
1069
+ n_lane_partial = sum(
1070
+ 1
1071
+ for r in all_outputs
1072
+ if r.get("task") == "lane" and r.get("lane_partial_parse_suspected", False)
1073
+ )
1074
+ n_lane_empty_negative = sum(
1075
+ 1
1076
+ for r in all_outputs
1077
+ if r.get("task") == "lane" and r.get("lane_is_empty_negative", False)
1078
+ )
1079
+ results["lane"]["num_samples"] = n_lane_total
1080
+ results["lane"]["parse_fail_count"] = n_lane_failed
1081
+ results["lane"]["parse_fail_rate"] = n_lane_failed / max(n_lane_total, 1)
1082
+ results["lane"]["partial_parse_count"] = n_lane_partial
1083
+ results["lane"]["partial_parse_rate"] = n_lane_partial / max(n_lane_total, 1)
1084
+ results["lane"]["empty_negative_count"] = n_lane_empty_negative
1085
+ results["lane"]["empty_negative_rate"] = n_lane_empty_negative / max(n_lane_total, 1)
1086
+
1087
+ n_gt_lane_failed = sum(
1088
+ 1
1089
+ for r in all_outputs
1090
+ if r.get("task") == "lane" and r.get("gt_lane_parse_failed", False)
1091
+ )
1092
+ n_gt_lane_partial = sum(
1093
+ 1
1094
+ for r in all_outputs
1095
+ if r.get("task") == "lane" and r.get("gt_lane_partial_parse_suspected", False)
1096
+ )
1097
+ n_gt_lane_empty_negative = sum(
1098
+ 1
1099
+ for r in all_outputs
1100
+ if r.get("task") == "lane" and r.get("gt_lane_is_empty_negative", False)
1101
+ )
1102
+ results["lane"]["gt_parse_fail_count"] = n_gt_lane_failed
1103
+ results["lane"]["gt_parse_fail_rate"] = n_gt_lane_failed / max(n_lane_total, 1)
1104
+ results["lane"]["gt_partial_parse_count"] = n_gt_lane_partial
1105
+ results["lane"]["gt_partial_parse_rate"] = n_gt_lane_partial / max(n_lane_total, 1)
1106
+ results["lane"]["gt_empty_negative_count"] = n_gt_lane_empty_negative
1107
+ results["lane"]["gt_empty_negative_rate"] = n_gt_lane_empty_negative / max(n_lane_total, 1)
1108
+
1109
+ logger.info("Lane results (OpenLane-V2 official F-Score):")
1110
+ for k, v in sorted(results["lane"].items()):
1111
+ if isinstance(v, float):
1112
+ logger.info(" %s: %.4f", k, v)
1113
+ else:
1114
+ logger.info(" %s: %s", k, v)
1115
+
1116
+ if task_preds["planning"] and task_gts["planning"]:
1117
+ results["planning"] = calculate_planning_metrics(
1118
+ task_preds["planning"], task_gts["planning"],
1119
+ )
1120
+ n_plan_total = len(task_preds["planning"])
1121
+ n_plan_failed = sum(
1122
+ 1
1123
+ for r in all_outputs
1124
+ if r.get("task") == "planning" and r.get("planning_parse_failed", False)
1125
+ )
1126
+ n_plan_vap_complete = sum(
1127
+ 1
1128
+ for r in all_outputs
1129
+ if r.get("task") == "planning" and r.get("planning_vap_complete", False)
1130
+ )
1131
+ n_plan_missing_va = sum(
1132
+ 1
1133
+ for r in all_outputs
1134
+ if r.get("task") == "planning"
1135
+ and (not r.get("planning_parse_failed", False))
1136
+ and (not r.get("planning_vap_complete", True))
1137
+ )
1138
+ results["planning"]["num_samples"] = n_plan_total
1139
+ results["planning"]["parse_fail_count"] = n_plan_failed
1140
+ results["planning"]["parse_fail_rate"] = (
1141
+ n_plan_failed / max(n_plan_total, 1)
1142
+ )
1143
+ results["planning"]["vap_complete_count"] = n_plan_vap_complete
1144
+ results["planning"]["vap_complete_rate"] = (
1145
+ n_plan_vap_complete / max(n_plan_total, 1)
1146
+ )
1147
+ results["planning"]["missing_velocity_or_acceleration_count"] = n_plan_missing_va
1148
+ results["planning"]["missing_velocity_or_acceleration_rate"] = (
1149
+ n_plan_missing_va / max(n_plan_total, 1)
1150
+ )
1151
+ logger.info("Planning results:")
1152
+ for k, v in sorted(results["planning"].items()):
1153
+ if isinstance(v, float):
1154
+ logger.info(" %s: %.4f", k, v)
1155
+ else:
1156
+ logger.info(" %s: %s", k, v)
1157
+
1158
+ output_path = args.output_json
1159
+ if output_path is None:
1160
+ ckpt_dir = Path(args.checkpoint).parent
1161
+ output_path = str(ckpt_dir / "eval_results.json")
1162
+
1163
+ with open(output_path, "w") as f:
1164
+ json.dump({
1165
+ "metrics": results,
1166
+ "num_samples": sample_count,
1167
+ "args": vars(args),
1168
+ "predictions": all_outputs[:100],
1169
+ }, f, indent=2, ensure_ascii=False)
1170
+ logger.info("Results saved to %s", output_path)
1171
+
1172
+
1173
# Script entry point: run the full evaluation when invoked directly.
if __name__ == "__main__":
    main()
1175
+
extract_streampetr_tokens.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Pre-extract frozen StreamPETR detection tokens for offline Atlas training.
3
+
4
+ Isolated by default. Set ATLAS_ALLOW_OFFLINE=1 to run.
5
+ For online training (default), use: bash scripts/train_no_caption_baseline.sh
6
+ """
7
+
8
+ import os
9
+ import sys
10
+
11
+ if os.environ.get("ATLAS_ALLOW_OFFLINE", "").lower() not in ("1", "true", "yes"):
12
+ print(
13
+ "ERROR: This is an OFFLINE token extraction script.\n"
14
+ "It is isolated by default to prevent accidental use.\n"
15
+ "If you really need it, set: ATLAS_ALLOW_OFFLINE=1\n"
16
+ "For online training use: bash scripts/train_no_caption_baseline.sh",
17
+ file=sys.stderr,
18
+ )
19
+ sys.exit(1)
20
+
21
+ import argparse
22
+ import json
23
+ import re
24
+ import time
25
+ from collections import defaultdict
26
+ from pathlib import Path
27
+
28
+ import torch
29
+ import numpy as np
30
+
31
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
32
+
33
+ from src.model.streampetr_adapter import extract_streampetr_topk_tokens
34
+
35
+
36
def parse_args():
    """Parse command-line arguments for offline StreamPETR token extraction.

    Returns:
        argparse.Namespace with the extraction configuration.
    """
    p = argparse.ArgumentParser(
        description="Pre-extract frozen StreamPETR detection tokens for offline Atlas training."
    )
    p.add_argument("--streampetr_config", required=True,
                   help="Path to the StreamPETR mmcv config file.")
    p.add_argument("--streampetr_ckpt", required=True,
                   help="Path to the StreamPETR checkpoint (.pth).")
    p.add_argument("--data_json", required=True,
                   help="Comma-separated list of Atlas data JSON files.")
    p.add_argument("--data_root", default="/home/guoyuanbo/autodl-tmp/data/nuscenes",
                   help="nuScenes dataset root used for calibration lookup and "
                        "resolving relative image paths.")
    p.add_argument("--output_dir", required=True,
                   help="Directory where per-sample token .pt files are written.")
    p.add_argument("--topk", type=int, default=256,
                   help="Number of top-K detection tokens to keep per sample.")
    p.add_argument("--image_path_remap", default=None,
                   help="Comma-separated old=new path-prefix remappings applied "
                        "to image paths before loading.")
    p.add_argument("--shard_id", type=int, default=0,
                   help="Index of this shard for multi-process extraction.")
    p.add_argument("--num_shards", type=int, default=1,
                   help="Total number of parallel shards.")
    return p.parse_args()
48
+
49
+
50
def load_streampetr(config_path, ckpt_path, device):
    """Build a frozen StreamPETR model from a config and checkpoint.

    The vendored StreamPETR repo under external/StreamPETR is added to
    sys.path so its mmdet3d plugin modules are importable. The returned
    model is in eval mode on *device* with every parameter frozen.
    """
    vendored_repo = str(Path(__file__).resolve().parent / "external" / "StreamPETR")
    if vendored_repo not in sys.path:
        sys.path.insert(0, vendored_repo)
    # Importing the plugin registers StreamPETR's custom modules.
    import projects.mmdet3d_plugin  # noqa: F401
    from mmcv import Config
    from mmdet3d.models import build_model
    from mmcv.runner import load_checkpoint

    cfg = Config.fromfile(config_path)
    net = build_model(cfg.model, test_cfg=cfg.get("test_cfg"))
    load_checkpoint(net, ckpt_path, map_location="cpu")
    net.eval()
    net.to(device)
    # Freeze: this model is used purely as a frozen feature extractor.
    for weight in net.parameters():
        weight.requires_grad_(False)
    return net
67
+
68
+
69
+ def _parse_openlane_scene_timestamp(item):
70
+ sample_id = str(item.get("id", ""))
71
+ m = re.match(r"openlane_subsetB_(?:train|val)_(.+?)_(\d+)$", sample_id)
72
+ if m is not None:
73
+ return f"openlane_{m.group(1)}", int(m.group(2))
74
+
75
+ image_paths = item.get("image_paths", [])
76
+ if image_paths:
77
+ p0 = str(image_paths[0]).replace("\\", "/")
78
+ m2 = re.search(r"/(?:train|val)/([^/]+)/image/[^/]+/(\d+)\.(?:jpg|jpeg|png)$", p0, flags=re.IGNORECASE)
79
+ if m2 is not None:
80
+ return f"openlane_{m2.group(1)}", int(m2.group(2))
81
+
82
+ return None, None
83
+
84
+
85
def build_scene_order(data_items, data_root):
    """Group dataset items into temporally ordered scenes.

    nuScenes items are grouped via sample.json metadata under *data_root*;
    OpenLane items via their id / image path; anything else becomes a
    singleton scene. Returns a list of scenes, each a list of indices into
    *data_items* sorted by timestamp.
    """
    root = Path(data_root)
    sample_file = next(
        (root / version / "sample.json"
         for version in ("v1.0-trainval", "v1.0-mini", "v1.0-test")
         if (root / version / "sample.json").exists()),
        None,
    )

    samples_meta = {}
    if sample_file is None:
        print("WARNING: sample.json not found — using OpenLane/id-based scene ordering where possible")
    else:
        with open(sample_file) as f:
            samples_meta = {s["token"]: s for s in json.load(f)}

    counts = {"nuscenes": 0, "openlane": 0, "unknown": 0}
    scene_map = defaultdict(list)
    for idx, item in enumerate(data_items):
        meta = samples_meta.get(str(item.get("id", "")))
        if meta is not None:
            token = meta.get("scene_token", f"_nus_unknown_{idx}")
            scene_map[token].append((int(meta.get("timestamp", 0)), idx))
            counts["nuscenes"] += 1
            continue

        token, ts = _parse_openlane_scene_timestamp(item)
        if token is not None:
            scene_map[token].append((int(ts), idx))
            counts["openlane"] += 1
        else:
            # No grouping information at all: isolate as its own scene.
            scene_map[f"_unknown_{idx}"].append((0, idx))
            counts["unknown"] += 1

    scenes = [
        [idx for _, idx in sorted(scene_map[token], key=lambda pair: pair[0])]
        for token in sorted(scene_map)
    ]

    total = sum(len(s) for s in scenes)
    print(
        "Scene grouping: %d scenes, %d samples (nuScenes=%d, OpenLane=%d, unknown=%d)"
        % (len(scenes), total, counts["nuscenes"], counts["openlane"], counts["unknown"])
    )
    return scenes
135
+
136
+
137
def load_and_preprocess_images(item, data_root, image_path_remap, streampetr_conf, image_transform):
    """Load, resize/crop, and calibrate all camera images for one sample.

    Args:
        item: Sample dict from the data JSON; must contain "image_paths",
            and may carry "sensor", "pose", and "timestamp" fallbacks for
            samples with no nuScenes calibration records.
        data_root: Root directory used to resolve relative image paths.
        image_path_remap: Dict of old-prefix -> new-prefix path rewrites.
        streampetr_conf: Dict with "final_dim" = (fH, fW) target image size
            and an optional "_calibration" bundle from
            load_nuscenes_calibration().
        image_transform: Callable applied to each cropped PIL image
            (presumably ToTensor + Normalize — see caller).

    Returns:
        Dict with batch dimension 1 prepended:
            "pixel_values_det": stacked transformed images,
            "intrinsics_det": per-camera 3x3 intrinsics adjusted for
                resize/crop,
            "lidar2img_det": per-camera 4x4 projection matrices,
            "ego_pose" / "ego_pose_inv" / "timestamp": tensors, or None
                when no source provided them.

    Raises:
        RuntimeError: if neither nuScenes calibration nor the item's own
            "sensor" block yields camera parameters for some image.
    """
    from PIL import Image
    import torchvision.transforms as transforms

    fH, fW = streampetr_conf["final_dim"]
    images = []
    intrinsics_list = []
    # NOTE(review): extrinsics_list is accumulated but never returned —
    # confirm whether it can be dropped.
    extrinsics_list = []
    lidar2img_list = []

    calibration = streampetr_conf.get("_calibration")
    cam_names = [
        'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
        'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT',
    ]

    # Sample-level outputs, filled from the first camera that provides them.
    ego_pose_out = None
    ego_pose_inv_out = None
    timestamp_out = None

    for i, img_path in enumerate(item["image_paths"]):
        # Guess camera name from index, then prefer a name embedded in the
        # path (longest match first so e.g. CAM_FRONT_RIGHT beats CAM_FRONT).
        camera_name = cam_names[i] if i < len(cam_names) else f"CAM_{i}"
        for cam in sorted(cam_names, key=len, reverse=True):
            if cam in str(img_path):
                camera_name = cam
                break

        # Apply at most one prefix remapping (first match wins).
        remapped = img_path
        for old_prefix, new_prefix in image_path_remap.items():
            if remapped.startswith(old_prefix):
                remapped = new_prefix + remapped[len(old_prefix):]
                break

        if os.path.isabs(remapped):
            full_path = remapped
        else:
            full_path = os.path.normpath(os.path.join(data_root, remapped))

        img = Image.open(full_path).convert("RGB")
        W_orig, H_orig = img.size

        # Scale so the image covers (fH, fW), then crop: horizontally
        # centered, vertically keeping the bottom (crop box starts at
        # rH - fH).
        resize = max(fH / H_orig, fW / W_orig)
        rW, rH = int(W_orig * resize), int(H_orig * resize)
        crop_h = rH - fH
        crop_w = max(0, rW - fW) // 2
        if resize != 1.0:
            img = img.resize((rW, rH), Image.BILINEAR)
        img = img.crop((crop_w, crop_h, crop_w + fW, crop_h + fH))

        K = None
        E = None
        ep_rec = None
        sd_rec = None

        if calibration is not None:
            # Locate the nuScenes sample_data record for this image by
            # filename, trying the raw path, a normalized path, and the
            # path suffix starting at samples/ or sweeps/.
            norm_path = str(img_path).replace("\\", "/").lstrip("./")
            sd_rec = None
            for cand in [img_path, norm_path]:
                sd_rec = calibration["sample_data_by_filename"].get(cand)
                if sd_rec:
                    break
            if sd_rec is None:
                for key in ("samples/", "sweeps/"):
                    if key in norm_path:
                        sd_rec = calibration["sample_data_by_filename"].get(norm_path[norm_path.index(key):])
                        if sd_rec:
                            break

            if sd_rec is not None:
                cs = calibration["calibrated_sensor_by_token"].get(sd_rec.get("calibrated_sensor_token"))
                ep_rec = calibration["ego_pose_by_token"].get(sd_rec.get("ego_pose_token"))
                if cs is not None:
                    K = np.array(cs["camera_intrinsic"], dtype=np.float32)
                    q = cs["rotation"]
                    t = cs["translation"]
                    # Quaternion (w, x, y, z) -> 3x3 rotation matrix.
                    w, x, y, z = q
                    R = np.array([
                        [1 - 2*(y*y + z*z), 2*(x*y - z*w), 2*(x*z + y*w)],
                        [2*(x*y + z*w), 1 - 2*(x*x + z*z), 2*(y*z - x*w)],
                        [2*(x*z - y*w), 2*(y*z + x*w), 1 - 2*(x*x + y*y)],
                    ], dtype=np.float32)
                    # Camera-to-ego extrinsic as a 4x4 homogeneous matrix.
                    E = np.eye(4, dtype=np.float32)
                    E[:3, :3] = R
                    E[:3, 3] = np.array(t, dtype=np.float32)

                # First camera with an ego_pose record supplies the
                # sample-level ego pose (and its inverse).
                if ego_pose_out is None and ep_rec is not None:
                    q_ep = ep_rec.get("rotation")
                    t_ep = ep_rec.get("translation")
                    if q_ep is not None and t_ep is not None:
                        w, x, y, z = q_ep
                        R_ep = np.array([
                            [1 - 2*(y*y + z*z), 2*(x*y - z*w), 2*(x*z + y*w)],
                            [2*(x*y + z*w), 1 - 2*(x*x + z*z), 2*(y*z - x*w)],
                            [2*(x*z - y*w), 2*(y*z + x*w), 1 - 2*(x*x + y*y)],
                        ], dtype=np.float32)
                        ego_m = np.eye(4, dtype=np.float32)
                        ego_m[:3, :3] = R_ep
                        ego_m[:3, 3] = np.array(t_ep, dtype=np.float32)
                        ego_pose_out = torch.tensor(ego_m, dtype=torch.float32)
                        try:
                            ego_pose_inv_out = torch.tensor(np.linalg.inv(ego_m), dtype=torch.float32)
                        except Exception:
                            # Singular pose matrix: leave the inverse unset.
                            ego_pose_inv_out = None

                # nuScenes timestamps are in microseconds; convert to
                # seconds (* 1e-6).
                if timestamp_out is None and sd_rec is not None:
                    ts = sd_rec.get("timestamp")
                    if ts is not None:
                        timestamp_out = torch.tensor(float(ts) * 1e-6, dtype=torch.float32)

        # Fallback 1: camera parameters embedded in the item itself
        # (used when nuScenes calibration lookup found nothing).
        if K is None or E is None:
            sensor = (item or {}).get("sensor", None) if isinstance(item, dict) else None
            if isinstance(sensor, dict) and camera_name in sensor:
                cam_s = sensor[camera_name]
                K = np.array(cam_s["intrinsic"]["K"], dtype=np.float32)
                R = np.array(cam_s["extrinsic"]["rotation"], dtype=np.float32)
                t = np.array(cam_s["extrinsic"]["translation"], dtype=np.float32)
                E = np.eye(4, dtype=np.float32)
                E[:3, :3] = R
                E[:3, 3] = t

        if K is None or E is None:
            raise RuntimeError(f"no camera params for {img_path} (camera={camera_name})")

        # Adjust intrinsics for the resize and crop applied above.
        K_adj = K.copy()
        K_adj[0, 0] *= resize
        K_adj[1, 1] *= resize
        K_adj[0, 2] = K_adj[0, 2] * resize - crop_w
        K_adj[1, 2] = K_adj[1, 2] * resize - crop_h

        images.append(image_transform(img))
        intrinsics_list.append(torch.tensor(K_adj, dtype=torch.float32))
        extrinsics_list.append(torch.tensor(E, dtype=torch.float32))

        cam2ego = E.astype(np.float32)
        ego2cam = np.linalg.inv(cam2ego)
        K4 = np.eye(4, dtype=np.float32)
        K4[:3, :3] = K_adj.astype(np.float32)

        # Quaternion (w, x, y, z) -> rotation matrix (local helper).
        def _qR(qwxyz):
            ww, xx, yy, zz = qwxyz
            return np.array([
                [1-2*(yy*yy+zz*zz), 2*(xx*yy-zz*ww), 2*(xx*zz+yy*ww)],
                [2*(xx*yy+zz*ww), 1-2*(xx*xx+zz*zz), 2*(yy*zz-xx*ww)],
                [2*(xx*zz-yy*ww), 2*(yy*zz+xx*ww), 1-2*(xx*xx+yy*yy)],
            ], dtype=np.float32)

        # Rotation + translation -> 4x4 homogeneous transform.
        def _T(Rm, tv):
            T = np.eye(4, dtype=np.float32)
            T[:3, :3] = Rm
            T[:3, 3] = np.array(tv, dtype=np.float32)
            return T

        # Default projection assumes points already live in the camera's
        # ego frame; refined below when full lidar calibration exists.
        lidar2img_mat = K4 @ ego2cam
        if calibration is not None and sd_rec is not None and ep_rec is not None:
            sample_tk = sd_rec.get("sample_token")
            if sample_tk:
                # Chain lidar -> lidar ego -> global -> camera ego -> camera,
                # accounting for the ego motion between the lidar and camera
                # capture times.
                ego2global_c = _T(_qR(ep_rec["rotation"]), ep_rec["translation"])
                global2ego_c = np.linalg.inv(ego2global_c)
                lidar_sd = calibration.get("lidar_sd_by_sample_token", {}).get(str(sample_tk))
                if lidar_sd is not None:
                    lidar_cs = calibration["calibrated_sensor_by_token"].get(lidar_sd.get("calibrated_sensor_token"))
                    lidar_ep = calibration["ego_pose_by_token"].get(lidar_sd.get("ego_pose_token"))
                    if lidar_cs is not None and lidar_ep is not None:
                        lidar2ego = _T(_qR(lidar_cs["rotation"]), lidar_cs["translation"])
                        ego2global_l = _T(_qR(lidar_ep["rotation"]), lidar_ep["translation"])
                        lidar2cam = ego2cam @ global2ego_c @ ego2global_l @ lidar2ego
                        lidar2img_mat = K4 @ lidar2cam

        lidar2img_list.append(torch.tensor(lidar2img_mat, dtype=torch.float32))

    # Fallback: if nuScenes calibration lookup failed (e.g. OpenLane samples),
    # recover ego_pose from item["pose"] and timestamp from item["timestamp"].
    if ego_pose_out is None and isinstance(item, dict):
        pose_data = item.get("pose", None)
        if isinstance(pose_data, dict):
            try:
                rot_raw = pose_data.get("rotation", None)
                t_p = pose_data.get("translation", None)
                if rot_raw is not None and t_p is not None:
                    # Rotation may be a 3x3 matrix or a (w, x, y, z) quaternion.
                    arr = np.array(rot_raw, dtype=np.float32)
                    if arr.shape == (3, 3):
                        R_p = arr
                    elif arr.shape == (4,):
                        w, x, y, z = arr
                        R_p = np.array([
                            [1-2*(y*y+z*z), 2*(x*y-z*w), 2*(x*z+y*w)],
                            [2*(x*y+z*w), 1-2*(x*x+z*z), 2*(y*z-x*w)],
                            [2*(x*z-y*w), 2*(y*z+x*w), 1-2*(x*x+y*y)],
                        ], dtype=np.float32)
                    else:
                        raise ValueError(f"Unsupported rotation shape: {arr.shape}")
                    T_p = np.eye(4, dtype=np.float32)
                    T_p[:3, :3] = R_p
                    T_p[:3, 3] = np.array(t_p, dtype=np.float32)
                    ego_pose_out = torch.tensor(T_p, dtype=torch.float32)
                    try:
                        ego_pose_inv_out = torch.tensor(np.linalg.inv(T_p), dtype=torch.float32)
                    except Exception:
                        ego_pose_inv_out = None
            except Exception as e:
                # Best effort: a malformed pose block only loses the ego pose.
                print(f"WARNING: Failed to parse item['pose']: {e}")

    if timestamp_out is None and isinstance(item, dict):
        ts_raw = item.get("timestamp", None)
        if ts_raw is not None:
            try:
                # assumes item["timestamp"] is in microseconds like nuScenes
                # — TODO confirm for OpenLane sources.
                timestamp_out = torch.tensor(float(ts_raw) * 1e-6, dtype=torch.float32)
            except Exception:
                pass

    # unsqueeze(0) prepends the batch dimension expected downstream.
    return {
        "pixel_values_det": torch.stack(images).unsqueeze(0),
        "intrinsics_det": torch.stack(intrinsics_list).unsqueeze(0),
        "lidar2img_det": torch.stack(lidar2img_list).unsqueeze(0),
        "ego_pose": ego_pose_out.unsqueeze(0) if ego_pose_out is not None else None,
        "ego_pose_inv": ego_pose_inv_out.unsqueeze(0) if ego_pose_inv_out is not None else None,
        "timestamp": timestamp_out.unsqueeze(0) if timestamp_out is not None else None,
    }
355
+
356
+
357
def load_nuscenes_calibration(data_root):
    """Load nuScenes calibration tables into fast lookup dicts.

    Probes the known metadata versions under *data_root*; if one exists
    and contains all required JSON tables, returns a dict of lookup maps
    keyed by filename/token, plus the keyframe LIDAR_TOP sample_data per
    sample token. Returns None when metadata is absent or incomplete.
    """
    root = Path(data_root)
    version_dir = next(
        (root / version for version in ("v1.0-trainval", "v1.0-mini", "v1.0-test")
         if (root / version).exists()),
        None,
    )
    if version_dir is None:
        return None

    required = ("sample_data.json", "calibrated_sensor.json", "ego_pose.json", "sample.json")
    if any(not (version_dir / name).exists() for name in required):
        return None

    def _read(name):
        with open(version_dir / name) as f:
            return json.load(f)

    sample_data = _read("sample_data.json")
    calibrated_sensor = _read("calibrated_sensor.json")
    ego_pose = _read("ego_pose.json")

    sd_by_fn = {rec["filename"]: rec for rec in sample_data if "filename" in rec}
    cs_by_tok = {rec["token"]: rec for rec in calibrated_sensor}
    ep_by_tok = {rec["token"]: rec for rec in ego_pose}

    # Keyframe LIDAR_TOP record per sample token (first occurrence wins).
    lidar_sd_by_sample = {}
    for rec in sample_data:
        fn = str(rec.get("filename", "")).replace("\\", "/")
        if "/LIDAR_TOP/" in fn and fn.startswith("samples/") and rec.get("is_key_frame"):
            st = rec.get("sample_token")
            if st:
                lidar_sd_by_sample.setdefault(str(st), rec)

    print(f"Calibration loaded: {len(sd_by_fn)} sample_data, {len(cs_by_tok)} cal_sensor, {len(ep_by_tok)} ego_pose, {len(lidar_sd_by_sample)} lidar_kf")
    return {
        "sample_data_by_filename": sd_by_fn,
        "calibrated_sensor_by_token": cs_by_tok,
        "ego_pose_by_token": ep_by_tok,
        "lidar_sd_by_sample_token": lidar_sd_by_sample,
    }
398
+
399
+
400
@torch.no_grad()
def run_streampetr_forward_temporal(model, batch, device, prev_exists_val, scene_token="__extract__"):
    """Run one temporal StreamPETR forward pass for a single frame.

    Builds the per-camera metadata and input dict (substituting identity
    matrices / zeros for any calibration tensors missing from *batch*)
    and drives the model's location / ROI-head / bbox-head pipeline.
    The head updates its internal memory as a side effect; nothing is
    returned. *prev_exists_val* should be 0.0 for the first frame of a
    scene and 1.0 otherwise.
    """
    imgs = batch["pixel_values_det"].to(device)
    B, N = imgs.shape[:2]
    fH, fW = 800, 1600

    # Identity-matrix fallbacks for missing calibration inputs.
    def _eye_per_cam():
        return torch.eye(4, device=device).unsqueeze(0).unsqueeze(0).expand(B, N, -1, -1).contiguous()

    def _eye_per_sample():
        return torch.eye(4, device=device).unsqueeze(0).expand(B, -1, -1).contiguous()

    img_metas = [{
        "pad_shape": [(fH, fW, 3)] * N,
        "img_shape": [(fH, fW, 3)] * N,
        "scene_token": str(scene_token),
    } for _ in range(B)]

    lidar2img_src = batch.get("lidar2img_det")
    if lidar2img_src is not None:
        for b, meta in enumerate(img_metas):
            meta["lidar2img"] = lidar2img_src[b].cpu().numpy()

    data = {
        "img": imgs,
        "img_feats": model.extract_img_feat(imgs, 1),
        "prev_exists": imgs.new_tensor([prev_exists_val]),
    }

    intrinsics_src = batch.get("intrinsics_det")
    if intrinsics_src is not None:
        # Embed the 3x3 intrinsics into homogeneous 4x4 matrices.
        K3 = intrinsics_src.to(device)
        K4 = torch.zeros(B, N, 4, 4, device=device, dtype=K3.dtype)
        K4[:, :, :3, :3] = K3
        K4[:, :, 3, 3] = 1.0
        data["intrinsics"] = K4
    else:
        data["intrinsics"] = _eye_per_cam()

    data["lidar2img"] = lidar2img_src.to(device) if lidar2img_src is not None else _eye_per_cam()

    for key in ("ego_pose", "ego_pose_inv"):
        src = batch.get(key)
        data[key] = src.to(device) if src is not None else _eye_per_sample()

    ts_src = batch.get("timestamp")
    data["timestamp"] = ts_src.to(device) if ts_src is not None else torch.zeros(B, device=device)

    location = model.prepare_location(img_metas, **data)
    roi_outs = model.forward_roi_head(location, **data)
    model.pts_bbox_head(location, img_metas, roi_outs["topk_indexes"], **data)
457
+
458
+
459
def main():
    """Extract frozen StreamPETR top-K detection tokens for every sample.

    Pipeline: parse args -> load & dedup data JSONs -> group samples into
    temporally ordered scenes -> run the frozen model scene-by-scene
    (resetting its temporal memory per scene) -> save per-sample token
    tensors as fp16 .pt files, then write an index.json.
    """
    args = parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Parse "old=new,old2=new2" into a prefix-remap dict.
    image_path_remap = {}
    if args.image_path_remap:
        for pair in args.image_path_remap.split(","):
            if "=" in pair:
                old, new = pair.split("=", 1)
                image_path_remap[old] = new

    # Concatenate all input JSONs, then deduplicate by sample id
    # (first occurrence wins).
    paths = [p.strip() for p in args.data_json.split(",") if p.strip()]
    data_items_raw = []
    for p in paths:
        with open(p) as f:
            chunk = json.load(f)
        data_items_raw.extend(chunk)

    data_items = []
    seen_ids = set()
    for i, item in enumerate(data_items_raw):
        sid = str(item.get("id", f"__idx_{i}"))
        if sid in seen_ids:
            continue
        seen_ids.add(sid)
        data_items.append(item)
    print(f"Loaded {len(data_items)} unique samples from {len(paths)} file(s) (raw={len(data_items_raw)})")

    calibration = load_nuscenes_calibration(args.data_root)
    all_scenes = build_scene_order(data_items, args.data_root)
    # Shard by whole scenes (not samples) so temporal memory stays valid.
    if args.num_shards > 1:
        scenes = [s for i, s in enumerate(all_scenes) if i % args.num_shards == args.shard_id]
        print(f"Shard {args.shard_id}/{args.num_shards}: {len(scenes)}/{len(all_scenes)} scenes")
    else:
        scenes = all_scenes
    model = load_streampetr(args.streampetr_config, args.streampetr_ckpt, device)

    import torchvision.transforms as transforms
    # ImageNet normalization; final_dim must match the model config
    # — TODO confirm 800x1600 matches the chosen StreamPETR config.
    image_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    sp_conf = {"final_dim": (800, 1600), "_calibration": calibration}

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    total_samples = sum(len(s) for s in scenes)
    num_saved = 0
    num_existed = 0
    t0 = time.time()

    for scene_idx, scene_indices in enumerate(scenes):
        # Fresh temporal memory at every scene boundary.
        model.pts_bbox_head.reset_memory()
        scene_token = f"scene_{scene_idx}"

        for frame_idx, data_idx in enumerate(scene_indices):
            item = data_items[data_idx]
            sample_id = str(item.get("id", data_idx))
            out_path = output_dir / f"{sample_id}.pt"

            already_done = out_path.exists()

            batch = load_and_preprocess_images(
                item, args.data_root, image_path_remap, sp_conf, image_transform
            )

            # The forward pass runs even for already-saved samples so the
            # temporal memory stays consistent for later frames.
            prev_exists_val = 0.0 if frame_idx == 0 else 1.0
            run_streampetr_forward_temporal(
                model, batch, device, prev_exists_val, scene_token=scene_token
            )

            if not already_done:
                ego_pose_for_ref = batch.get("ego_pose")
                if ego_pose_for_ref is not None:
                    ego_pose_for_ref = ego_pose_for_ref.to(device)
                det_out = extract_streampetr_topk_tokens(
                    model.pts_bbox_head, topk=args.topk, ego_pose=ego_pose_for_ref,
                )
                # Saved in fp16 to halve disk usage.
                torch.save({
                    "detection": det_out["detection"][0].cpu().half(),
                    "detection_ref_points": det_out["detection_ref_points"][0].cpu().half(),
                }, out_path)
                num_saved += 1
            else:
                num_existed += 1

            done = num_saved + num_existed
            if done % 200 == 0:
                elapsed = time.time() - t0
                rate = done / max(elapsed, 1)
                eta = (total_samples - done) / max(rate, 0.01)
                print(f" [{done}/{total_samples}] saved={num_saved} existed={num_existed} "
                      f"{elapsed:.0f}s elapsed, ETA {eta:.0f}s")

    elapsed = time.time() - t0
    print(f"Done. saved={num_saved}, existed={num_existed}, total={total_samples}, time={elapsed:.0f}s")
    print(f"Output: {output_dir}")

    # Index every .pt present in output_dir, including files written by
    # earlier runs or other shards.
    index = {}
    for pt in output_dir.glob("*.pt"):
        index[pt.stem] = pt.name
    with open(output_dir / "index.json", "w") as f:
        json.dump(index, f)
    print(f"Index written: {len(index)} entries")
564
+
565
+
566
# Script entry point: run the extraction when invoked directly.
if __name__ == "__main__":
    main()
568
+
extract_topomlp_tokens.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Pre-extract frozen TopoMLP raw outputs for offline Atlas training.
3
+
4
+ Isolated by default. Set ATLAS_ALLOW_OFFLINE=1 to run.
5
+ For online training (default), use: bash scripts/train_no_caption_baseline.sh
6
+
7
+ Saves the 6 tensors that TopoMLPToAtlasMapTokens.forward() needs from the
8
+ last decoder layer. The adapter performs Top-K selection online during
9
+ training; only the frozen TopoMLP forward pass is pre-computed here.
10
+
11
+ Usage (4-GPU parallel, requires ATLAS_ALLOW_OFFLINE=1):
12
+ for i in 0 1 2 3; do
13
+ ATLAS_ALLOW_OFFLINE=1 CUDA_VISIBLE_DEVICES=$i python extract_topomlp_tokens.py \
14
+ --topomlp_config configs/topomlp_atlas_aligned.py \
15
+ --topomlp_ckpt work_dirs/topomlp_atlas_aligned/epoch_24.pth \
16
+ --data_json data/openlane_subsetB_lane_train_4pt.json \
17
+ --output_dir work_dirs/precomputed_map_tokens_offline/train \
18
+ --shard_id $i --num_shards 4 &
19
+ done; wait
20
+ """
21
+
22
+ import os
23
+ import sys
24
+
25
+ if os.environ.get("ATLAS_ALLOW_OFFLINE", "").lower() not in ("1", "true", "yes"):
26
+ print(
27
+ "ERROR: This is an OFFLINE token extraction script.\n"
28
+ "It is isolated by default to prevent accidental use.\n"
29
+ "If you really need it, set: ATLAS_ALLOW_OFFLINE=1\n"
30
+ "For online training use: bash scripts/train_no_caption_baseline.sh",
31
+ file=sys.stderr,
32
+ )
33
+ sys.exit(1)
34
+
35
+ import argparse
36
+ import json
37
+ import time
38
+ from pathlib import Path
39
+
40
+ import numpy as np
41
+ import torch
42
+ from PIL import Image
43
+ import torchvision.transforms as transforms
44
+
45
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
46
+
47
+
48
def _load_nuscenes_calibration(data_root):
    """Index the nuScenes calibration tables needed to build lidar2img matrices.

    Looks for a v1.0-* metadata directory under *data_root* and returns a dict
    of lookup tables, or None when the directory or any required table is
    missing.
    """
    root = Path(data_root)
    version_dir = next(
        (root / v for v in ("v1.0-trainval", "v1.0-mini", "v1.0-test")
         if (root / v).exists()),
        None,
    )
    if version_dir is None:
        return None
    required = ("sample_data.json", "calibrated_sensor.json", "ego_pose.json")
    if any(not (version_dir / name).exists() for name in required):
        return None

    def _read(name):
        with open(version_dir / name) as fh:
            return json.load(fh)

    sample_data = _read("sample_data.json")
    calibrated_sensor = _read("calibrated_sensor.json")
    ego_pose_data = _read("ego_pose.json")

    sd_by_fn = {rec["filename"]: rec for rec in sample_data if "filename" in rec}
    cs_by_tok = {rec["token"]: rec for rec in calibrated_sensor}
    ep_by_tok = {rec["token"]: rec for rec in ego_pose_data}

    # Keep the first keyframe LIDAR_TOP record per sample token; setdefault
    # preserves the earliest occurrence in scan order.
    lidar_sd_by_sample = {}
    for rec in sample_data:
        filename = str(rec.get("filename", "")).replace("\\", "/")
        is_lidar_keyframe = (
            "/LIDAR_TOP/" in filename
            and filename.startswith("samples/")
            and rec.get("is_key_frame")
        )
        if is_lidar_keyframe and rec.get("sample_token"):
            lidar_sd_by_sample.setdefault(str(rec.get("sample_token")), rec)

    print(f"nuScenes calibration: {len(sd_by_fn)} sample_data, {len(cs_by_tok)} cal_sensor, "
          f"{len(ep_by_tok)} ego_pose, {len(lidar_sd_by_sample)} lidar_kf")
    return {
        "sample_data_by_filename": sd_by_fn,
        "calibrated_sensor_by_token": cs_by_tok,
        "ego_pose_by_token": ep_by_tok,
        "lidar_sd_by_sample_token": lidar_sd_by_sample,
    }
85
+
86
+
87
def parse_args():
    """Parse the CLI options for the offline TopoMLP token extraction run."""
    parser = argparse.ArgumentParser()
    # Required model/data inputs.
    for flag in ("--topomlp_config", "--topomlp_ckpt", "--data_json"):
        parser.add_argument(flag, required=True)
    parser.add_argument("--data_root", default="")
    parser.add_argument("--output_dir", required=True)
    parser.add_argument("--image_path_remap", default=None)
    # Sharding controls for multi-GPU parallel extraction.
    for flag, default in (("--shard_id", 0), ("--num_shards", 1)):
        parser.add_argument(flag, type=int, default=default)
    return parser.parse_args()
98
+
99
+
100
def load_topomlp(config_path, ckpt_path, device):
    """Build TopoMLP from an mmcv config, load weights, freeze all parameters,
    and move the model to *device*.

    Adds the vendored TopoMLP and StreamPETR repos to sys.path so their mmcv
    registry plugins can be imported.
    """
    tp_root = str(Path(__file__).resolve().parent / "external" / "TopoMLP_Repo")
    if tp_root not in sys.path:
        sys.path.insert(0, tp_root)

    sp_root = str(Path(__file__).resolve().parent / "external" / "StreamPETR")
    if sp_root not in sys.path:
        sys.path.insert(0, sp_root)
    # StreamPETR plugin is optional here; TopoMLP can load without it.
    try:
        import projects.mmdet3d_plugin  # noqa: F401
    except ImportError:
        pass

    os.environ["ATLAS_TOPOMLP_MODELS_ONLY"] = "1"
    # Temporarily force mmcv registry registration with force=True so that
    # TopoMLP modules whose names collide with already-registered ones (e.g.
    # from the StreamPETR plugin) do not raise KeyError during import.
    from mmcv.utils import registry as _reg
    _orig = _reg.Registry._register_module
    def _tolerant_register(self, module, module_name=None, force=False):
        return _orig(self, module, module_name=module_name, force=True)
    _reg.Registry._register_module = _tolerant_register
    import projects.topomlp  # noqa: F401
    # Restore the original registration behavior immediately after import.
    _reg.Registry._register_module = _orig

    from mmcv import Config
    from mmdet3d.models import build_model
    from mmcv.runner import load_checkpoint

    cfg = Config.fromfile(config_path)
    model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"))
    load_checkpoint(model, ckpt_path, map_location="cpu")
    model.eval()
    model.to(device)
    # Freeze: this model is used purely as a feature extractor.
    for param in model.parameters():
        param.requires_grad_(False)
    print(f"Loaded frozen TopoMLP from {ckpt_path}")
    return model
135
+
136
+
137
def _quat_to_rot(q):
    """Convert a (w, x, y, z) quaternion to a 3x3 float32 rotation matrix."""
    w, x, y, z = q
    row0 = [1 - 2*(y*y + z*z), 2*(x*y - z*w), 2*(x*z + y*w)]
    row1 = [2*(x*y + z*w), 1 - 2*(x*x + z*z), 2*(y*z - x*w)]
    row2 = [2*(x*z - y*w), 2*(y*z + x*w), 1 - 2*(x*x + y*y)]
    return np.array([row0, row1, row2], dtype=np.float32)
144
+
145
+
146
def load_and_preprocess_images(item, data_root, image_path_remap, image_transform,
                               calibration=None):
    """Load the sample's six camera images and build per-camera lidar2img matrices.

    Returns (pixel_values, lidar2img):
      - pixel_values: stacked, transformed images with a leading batch dim.
      - lidar2img: (1, N, 4, 4) projection matrices; identity for cameras
        whose calibration cannot be resolved.

    Calibration is taken from the sample's own "sensor" dict when present,
    otherwise from the preloaded nuScenes tables in *calibration*.
    """
    # Target resize resolution (width, height) expected by TopoMLP.
    tW, tH = 1600, 800
    cam_names = [
        "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_FRONT_LEFT",
        "CAM_BACK", "CAM_BACK_LEFT", "CAM_BACK_RIGHT",
    ]

    images = []
    lidar2img_list = []
    sensor = item.get("sensor", {})

    for i, img_path in enumerate(item["image_paths"]):
        # Infer the camera from the path; longest-name-first so e.g.
        # CAM_FRONT_RIGHT is matched before CAM_FRONT.
        camera_name = cam_names[i] if i < len(cam_names) else f"CAM_{i}"
        for cam in sorted(cam_names, key=len, reverse=True):
            if cam in str(img_path):
                camera_name = cam
                break

        # Optional prefix remapping for relocated datasets.
        remapped = str(img_path)
        if image_path_remap:
            for old_prefix, new_prefix in image_path_remap.items():
                if remapped.startswith(old_prefix):
                    remapped = new_prefix + remapped[len(old_prefix):]
                    break

        if os.path.isabs(remapped):
            full_path = remapped
        elif data_root:
            full_path = os.path.normpath(os.path.join(data_root, remapped))
        else:
            full_path = remapped

        img = Image.open(full_path).convert("RGB")
        W_orig, H_orig = img.size
        # Scale factors used below to adjust the intrinsics to the resize.
        w_scale = tW / W_orig
        h_scale = tH / H_orig
        img = img.resize((tW, tH), Image.BILINEAR)
        images.append(image_transform(img))

        # Preferred calibration source: per-sample "sensor" dict
        # (ego-frame extrinsics; K is the 3x3 intrinsic matrix).
        K, E = None, None
        if isinstance(sensor, dict) and camera_name in sensor:
            cam_s = sensor[camera_name]
            K = np.array(cam_s["intrinsic"]["K"], dtype=np.float32)
            R = np.array(cam_s["extrinsic"]["rotation"], dtype=np.float32)
            t = np.array(cam_s["extrinsic"]["translation"], dtype=np.float32)
            E = np.eye(4, dtype=np.float32)
            E[:3, :3] = R
            E[:3, 3] = t

        # Fallback: look the image up in the preloaded nuScenes tables.
        sd_rec = None
        ep_rec = None
        if K is None and calibration is not None:
            norm_path = str(img_path).replace("\\", "/").lstrip("./")
            for cand in [img_path, norm_path]:
                sd_rec = calibration["sample_data_by_filename"].get(cand)
                if sd_rec:
                    break
            if sd_rec is None:
                # Last resort: match on the samples/ or sweeps/ suffix only.
                for prefix in ("samples/", "sweeps/"):
                    if prefix in norm_path:
                        sd_rec = calibration["sample_data_by_filename"].get(
                            norm_path[norm_path.index(prefix):]
                        )
                        if sd_rec:
                            break
            if sd_rec is not None:
                cs = calibration["calibrated_sensor_by_token"].get(
                    sd_rec.get("calibrated_sensor_token")
                )
                ep_rec = calibration["ego_pose_by_token"].get(
                    sd_rec.get("ego_pose_token")
                )
                if cs is not None:
                    K = np.array(cs["camera_intrinsic"], dtype=np.float32)
                    E = np.eye(4, dtype=np.float32)
                    E[:3, :3] = _quat_to_rot(cs["rotation"])
                    E[:3, 3] = np.array(cs["translation"], dtype=np.float32)

        if K is not None and E is not None:
            # Rescale the intrinsics to the resized image resolution.
            K_adj = K.copy()
            K_adj[0, 0] *= w_scale
            K_adj[0, 2] *= w_scale
            K_adj[1, 1] *= h_scale
            K_adj[1, 2] *= h_scale
            ego2cam = np.linalg.inv(E.astype(np.float32))
            K4 = np.eye(4, dtype=np.float32)
            K4[:3, :3] = K_adj

            # Default: project from the ego frame (no lidar pose available).
            lidar2img_mat = K4 @ ego2cam
            if calibration is not None and sd_rec is not None and ep_rec is not None:
                sample_tk = sd_rec.get("sample_token")
                if sample_tk:
                    ego2global_c = np.eye(4, dtype=np.float32)
                    ego2global_c[:3, :3] = _quat_to_rot(ep_rec["rotation"])
                    ego2global_c[:3, 3] = np.array(ep_rec["translation"], dtype=np.float32)
                    global2ego_c = np.linalg.inv(ego2global_c)

                    lidar_sd = calibration.get("lidar_sd_by_sample_token", {}).get(str(sample_tk))
                    if lidar_sd is not None:
                        lidar_cs = calibration["calibrated_sensor_by_token"].get(
                            lidar_sd.get("calibrated_sensor_token"))
                        lidar_ep = calibration["ego_pose_by_token"].get(
                            lidar_sd.get("ego_pose_token"))
                        if lidar_cs is not None and lidar_ep is not None:
                            # Full chain lidar -> ego(lidar ts) -> global ->
                            # ego(camera ts) -> camera, then apply intrinsics.
                            lidar2ego = np.eye(4, dtype=np.float32)
                            lidar2ego[:3, :3] = _quat_to_rot(lidar_cs["rotation"])
                            lidar2ego[:3, 3] = np.array(lidar_cs["translation"], dtype=np.float32)
                            ego2global_l = np.eye(4, dtype=np.float32)
                            ego2global_l[:3, :3] = _quat_to_rot(lidar_ep["rotation"])
                            ego2global_l[:3, 3] = np.array(lidar_ep["translation"], dtype=np.float32)
                            lidar2cam = ego2cam @ global2ego_c @ ego2global_l @ lidar2ego
                            lidar2img_mat = K4 @ lidar2cam

            lidar2img_list.append(torch.tensor(lidar2img_mat, dtype=torch.float32))
        else:
            # No calibration found for this camera: identity placeholder.
            lidar2img_list.append(torch.eye(4, dtype=torch.float32))

    return torch.stack(images).unsqueeze(0), torch.stack(lidar2img_list).unsqueeze(0)
265
+
266
+
267
@torch.no_grad()
def run_topomlp_forward(model, pixel_values, lidar2img, device):
    """Run the frozen TopoMLP forward pass and return its raw output dict.

    Builds one img_metas dict per batch element with the fixed 800x1600
    input shape, attaching lidar2img matrices when provided.
    """
    imgs = pixel_values.to(device)
    batch_size, num_cams = imgs.shape[:2]
    height, width = 800, 1600
    per_cam_shape = tuple([(height, width, 3)] * num_cams)

    img_metas = []
    for b in range(batch_size):
        meta = {
            "img_shape": per_cam_shape,
            "pad_shape": per_cam_shape,
            "scale_factor": 1.0,
            "te_yolov8": None,
        }
        if lidar2img is not None:
            meta["lidar2img"] = lidar2img[b].cpu().numpy()
        img_metas.append(meta)

    return model.simple_forward(imgs, img_metas)
284
+
285
+
286
def extract_adapter_inputs(outs):
    """Collect the six last-decoder-layer tensors the Atlas map-token adapter
    consumes, as half-precision CPU tensors for batch element 0."""
    key_map = {
        "lc_outs_dec": "lc_outs_dec_list",
        "lc_cls_scores": "all_lc_cls_scores_list",
        "lc_preds": "all_lc_preds_list",
        "lc_outs_dec_o2m": "lc_outs_dec_one2many_list",
        "lc_cls_scores_o2m": "all_lc_cls_scores_one2many_list",
        "lc_preds_o2m": "all_lc_preds_one2many_list",
    }
    # [-1] selects the last decoder layer, [0] the first batch element.
    return {dst: outs[src][-1][0].cpu().half() for dst, src in key_map.items()}
295
+
296
+
297
def main():
    """Shardable extraction loop: run frozen TopoMLP on every sample and save
    the adapter-input tensors to <output_dir>/<sample_id>.pt, then write an
    index.json mapping sample ids to filenames."""
    args = parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Parse "old=new,old2=new2" prefix remap pairs for relocated image roots.
    image_path_remap = {}
    if args.image_path_remap:
        for pair in args.image_path_remap.split(","):
            if "=" in pair:
                old, new = pair.split("=", 1)
                image_path_remap[old] = new

    # --data_json may be a comma-separated list of files; concatenate them.
    paths = [p.strip() for p in args.data_json.split(",") if p.strip()]
    data_items_raw = []
    for p in paths:
        with open(p) as f:
            data_items_raw.extend(json.load(f))

    # Deduplicate by sample id (first occurrence wins); items without an id
    # get a synthetic per-index key so they are all kept.
    data_items = []
    seen_ids = set()
    for i, item in enumerate(data_items_raw):
        sid = str(item.get("id", f"__idx_{i}"))
        if sid in seen_ids:
            continue
        seen_ids.add(sid)
        data_items.append(item)
    print(f"Loaded {len(data_items)} unique samples from {len(paths)} file(s)")

    # Round-robin sharding for multi-GPU parallel runs.
    if args.num_shards > 1:
        data_items = [item for i, item in enumerate(data_items) if i % args.num_shards == args.shard_id]
        print(f"Shard {args.shard_id}/{args.num_shards}: {len(data_items)} samples")

    model = load_topomlp(args.topomlp_config, args.topomlp_ckpt, device)

    calibration = _load_nuscenes_calibration(args.data_root) if args.data_root else None

    # ImageNet normalization applied after resize in load_and_preprocess_images.
    image_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    total = len(data_items)
    num_saved = 0
    num_existed = 0
    t0 = time.time()

    for idx, item in enumerate(data_items):
        sample_id = str(item.get("id", idx))
        out_path = output_dir / f"{sample_id}.pt"

        # Resume support: skip samples whose output file already exists.
        if out_path.exists():
            num_existed += 1
        else:
            pixel_values, lidar2img = load_and_preprocess_images(
                item, args.data_root, image_path_remap, image_transform,
                calibration=calibration,
            )
            outs = run_topomlp_forward(model, pixel_values, lidar2img, device)
            torch.save(extract_adapter_inputs(outs), out_path)
            num_saved += 1

        # Periodic progress report with a simple throughput-based ETA.
        done = num_saved + num_existed
        if done % 200 == 0:
            elapsed = time.time() - t0
            rate = done / max(elapsed, 1)
            eta = (total - done) / max(rate, 0.01)
            print(f" [{done}/{total}] saved={num_saved} existed={num_existed} "
                  f"{elapsed:.0f}s elapsed, ETA {eta:.0f}s")

    elapsed = time.time() - t0
    print(f"Done. saved={num_saved}, existed={num_existed}, total={total}, time={elapsed:.0f}s")
    print(f"Output: {output_dir}")

    # The index covers every .pt in the directory, so with shards it reflects
    # whatever other shards have written so far as well.
    index = {}
    for pt in output_dir.glob("*.pt"):
        index[pt.stem] = pt.name
    with open(output_dir / "index.json", "w") as f:
        json.dump(index, f)
    print(f"Index written: {len(index)} entries")
378
+
379
+
380
# Script entry point: run the shard's extraction loop when executed directly.
if __name__ == "__main__":
    main()
scripts/eval_checkpoint_offline.sh ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# [OFFLINE fallback] Evaluate using precomputed *_offline visual tokens.
# This is NOT the default. Use scripts/eval_checkpoint.sh for online mode.
# Isolated by default. Set ATLAS_ALLOW_OFFLINE=1 to run.
set -e

if [ "${ATLAS_ALLOW_OFFLINE}" != "1" ]; then
  echo "ERROR: This is an OFFLINE fallback script, not the primary online evaluation." >&2
  echo "It is isolated by default to prevent accidental use in experiments." >&2
  echo "If you really need it, set: ATLAS_ALLOW_OFFLINE=1" >&2
  echo "For production evaluation use: bash scripts/eval_checkpoint.sh" >&2
  exit 1
fi

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$PROJECT_ROOT"

# Positional args: checkpoint and data_json are required; the rest optional.
CHECKPOINT="${1:?Usage: $0 <checkpoint> <data_json> [output_json] [max_samples]}"
DATA_JSON="${2:?Usage: $0 <checkpoint> <data_json> [output_json] [max_samples]}"
OUTPUT_JSON="${3:-}"
MAX_SAMPLES="${4:-0}"
PLANNING_TABLE3_MODE="${PLANNING_TABLE3_MODE:-atlas_base}"

# Build optional flags as a bash array so values containing spaces survive
# quoting (a plain string with unquoted expansion would word-split them).
EXTRA_ARGS=()
if [ -n "$OUTPUT_JSON" ]; then
  EXTRA_ARGS+=(--output_json "$OUTPUT_JSON")
fi
if [ "$MAX_SAMPLES" -gt 0 ] 2>/dev/null; then
  EXTRA_ARGS+=(--max_samples "$MAX_SAMPLES")
fi

python eval_atlas.py \
  --checkpoint "$CHECKPOINT" \
  --llm_model pretrained/vicuna-7b-v1.5 \
  --data_json "$DATA_JSON" \
  --data_root /home/guoyuanbo/autodl-tmp/data/nuscenes \
  --visual_token_mode offline \
  --planning_table3_mode "$PLANNING_TABLE3_MODE" \
  --precomputed_det_tokens work_dirs/precomputed_det_tokens_offline/val \
  --precomputed_map_tokens work_dirs/precomputed_map_tokens_offline/val \
  --bf16 \
  --batch_size 1 \
  --num_workers 2 \
  "${EXTRA_ARGS[@]}"
scripts/gen_atlas_caption_dashscope.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Atlas Caption 数据生成脚本 - Dashscope 版
3
+
4
+ 与 gen_atlas_caption_qa.py 完全相同的输出格式,
5
+ 支持 --start/--end 指定 keyframe 范围,写入独立文件,最终合并。
6
+
7
+ 模型: qwen-vl-max-latest (Dashscope)
8
+ """
9
+ import asyncio
10
+ import json
11
+ import base64
12
+ import os
13
+ import sys
14
+ import time
15
+ import signal
16
+ from io import BytesIO
17
+ from pathlib import Path
18
+
19
+ try:
20
+ import httpx
21
+ from PIL import Image
22
+ except ImportError:
23
+ print("pip install httpx Pillow")
24
+ sys.exit(1)
25
+
26
+ NUSCENES_ROOT = "/home/guoyuanbo/autodl-tmp/data/nuscenes"
27
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
28
+
29
+ CAMERAS = [
30
+ "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_FRONT_LEFT",
31
+ "CAM_BACK", "CAM_BACK_LEFT", "CAM_BACK_RIGHT",
32
+ ]
33
+
34
+ GPT4V_PROMPT = (
35
+ "Describe the current traffic conditions. "
36
+ "If there are traffic lights in the image, describe the status of all the traffic lights, "
37
+ "including any countdowns; if there are none, please do not respond. "
38
+ "If there are traffic signs in the picture, identify and explain each one; "
39
+ "if there are none, no explanation is necessary. "
40
+ "If there are other vehicles in the picture, describe them in more detail. "
41
+ "Please ensure the answer does not exceed 600 words. Answers must be in English."
42
+ )
43
+
44
+ TRAIN_PROMPTS = [
45
+ (
46
+ "There are six images captured by the surround view cameras in driving vehicle. "
47
+ "They are uniformly represented as queries embeddings<query>. "
48
+ "Communicate a narrative of the setting within {camera_name} view image."
49
+ ),
50
+ ]
51
+
52
+ API_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
53
+ MODEL = "qwen-vl-max-latest"
54
+
55
+ MAX_CONCURRENCY = 50
56
+ MAX_RETRIES = 3
57
+ RETRY_DELAY = 3
58
+ TIMEOUT = 60
59
+ CHECKPOINT_INTERVAL = 100
60
+
61
+
62
def image_to_base64(path):
    """Load an image and return it as a base64-encoded JPEG string.

    Converts to RGB first so palette/RGBA inputs (e.g. PNG screenshots) do
    not make the JPEG encoder raise "cannot write mode RGBA as JPEG".
    """
    img = Image.open(path).convert("RGB")
    buf = BytesIO()
    img.save(buf, format="JPEG", quality=80)
    return base64.b64encode(buf.getvalue()).decode()
67
+
68
+
69
async def call_api(client, api_key, image_b64, camera_name):
    """POST one captioning request to the Dashscope-compatible endpoint.

    Returns (caption, prompt_tokens, completion_tokens).
    """
    request_body = {
        "model": MODEL,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": f"[{camera_name}] {GPT4V_PROMPT}"},
                {"type": "image_url", "image_url": {
                    "url": f"data:image/jpeg;base64,{image_b64}",
                }},
            ],
        }],
        "max_tokens": 800,
        "temperature": 0.3,
    }
    auth_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    response = await client.post(API_URL, json=request_body, headers=auth_headers, timeout=TIMEOUT)
    response.raise_for_status()
    body = response.json()
    caption = body["choices"][0]["message"]["content"].strip()
    token_usage = body.get("usage", {})
    return caption, token_usage.get("prompt_tokens", 0), token_usage.get("completion_tokens", 0)
92
+
93
+
94
async def process_one_view(client, api_key, sample, cam_idx, sem, stats):
    """Caption one camera view of one keyframe; return a QA sample dict or None.

    Retries up to MAX_RETRIES times with escalating delays for timeouts and
    HTTP 429; other HTTP errors fail fast on the last fallback branch.
    Mutates *stats* counters in place.
    """
    cam = CAMERAS[cam_idx]
    img_path = os.path.join(NUSCENES_ROOT, sample["image_paths"][cam_idx])
    if not os.path.exists(img_path):
        stats["skipped"] += 1
        return None

    img_b64 = image_to_base64(img_path)
    train_prompt = TRAIN_PROMPTS[0].format(camera_name=cam)

    for attempt in range(MAX_RETRIES):
        # NOTE(review): the retry sleeps below run while the semaphore is
        # still held, so a backing-off task blocks one concurrency slot for
        # the whole delay — confirm this throttling is intended.
        async with sem:
            try:
                caption, in_tok, out_tok = await call_api(client, api_key, img_b64, cam)
                stats["success"] += 1
                stats["total_in"] += in_tok
                stats["total_out"] += out_tok
                return {
                    "id": sample["id"],
                    "image_paths": sample["image_paths"],
                    "num_map_queries": 0,
                    "task": "caption",
                    "camera": cam,
                    "conversations": [
                        {"from": "human", "value": train_prompt},
                        {"from": "gpt", "value": caption},
                    ],
                }
            except httpx.TimeoutException:
                stats["retries"] += 1
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY * (attempt + 1))
            except httpx.HTTPStatusError as e:
                stats["retries"] += 1
                # 429: rate limited — always back off and retry.
                if e.response.status_code == 429:
                    await asyncio.sleep(RETRY_DELAY * (attempt + 2))
                elif attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    stats["failed"] += 1
                    return None
            except Exception:
                stats["retries"] += 1
                if attempt < MAX_RETRIES - 1:
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    stats["failed"] += 1
                    return None

    # All retry attempts exhausted without a successful response.
    stats["failed"] += 1
    return None
145
+
146
+
147
def make_ckpt_key(sample_id, cam_idx):
    """Build the checkpoint key for one (sample, camera) pair."""
    return "_".join((str(sample_id), str(cam_idx)))
149
+
150
+
151
def load_checkpoint(path):
    """Return the set of completed keys stored at *path* (empty if absent)."""
    if not os.path.exists(path):
        return set()
    with open(path) as fh:
        return set(json.load(fh))
156
+
157
+
158
def save_checkpoint(path, done_keys):
    """Persist *done_keys* to *path* as a sorted JSON list."""
    payload = json.dumps(sorted(done_keys))
    with open(path, "w") as fh:
        fh.write(payload)
161
+
162
+
163
async def run(split, start, end, dry_run=False, tag="dashscope"):
    """Caption the [start:end) keyframe range of *split* via the Dashscope API.

    Resumable: previously completed (sample, camera) keys are read from the
    checkpoint file and skipped; results and the checkpoint are rewritten
    after every batch of CHECKPOINT_INTERVAL views.
    """
    api_key = os.environ.get("DASHSCOPE_KEY", "")
    if not api_key:
        print("ERROR: set DASHSCOPE_KEY env var", flush=True)
        sys.exit(1)

    data_file = PROJECT_ROOT / f"data/atlas_nuscenes_{split}.json"
    out_file = PROJECT_ROOT / f"data/atlas_caption_{split}_{tag}.json"
    ckpt_file = PROJECT_ROOT / f"data/.caption_{split}_{tag}_checkpoint.json"

    with open(data_file) as f:
        all_samples = json.load(f)

    # Restrict to the requested keyframe range (supports parallel workers).
    all_samples = all_samples[start:end]
    print(f"Range: [{start}:{end}] = {len(all_samples)} keyframes", flush=True)

    done_keys = load_checkpoint(ckpt_file)
    # Reload prior results only when a checkpoint exists, so a fresh run
    # does not pick up stale output from an earlier, unrelated invocation.
    existing_results = []
    if os.path.exists(out_file) and done_keys:
        with open(out_file) as f:
            existing_results = json.load(f)

    # Expand to (sample, camera) work units, skipping completed ones.
    todo = []
    for s in all_samples:
        for cam_idx in range(6):
            key = make_ckpt_key(s["id"], cam_idx)
            if key not in done_keys:
                todo.append((s, cam_idx))

    total = len(todo)
    print(f"Split: {split}, Tag: {tag}", flush=True)
    print(f"Total keyframes: {len(all_samples)}", flush=True)
    print(f"Total views to caption: {len(all_samples) * 6}", flush=True)
    print(f"Already done: {len(done_keys)}", flush=True)
    print(f"To process: {total}", flush=True)
    print(f"Model: {MODEL}", flush=True)
    print(f"Concurrency: {MAX_CONCURRENCY}", flush=True)
    if dry_run:
        print("DRY RUN", flush=True)
        return

    stats = {"success": 0, "failed": 0, "skipped": 0, "retries": 0,
             "total_in": 0, "total_out": 0}
    results = list(existing_results)
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    client = httpx.AsyncClient()

    # SIGINT finishes the current batch and then stops, so the checkpoint
    # written at the end of the batch stays consistent with the results file.
    shutdown = False
    def handle_signal(sig, frame):
        nonlocal shutdown
        shutdown = True
        print("\nGraceful shutdown...", flush=True)
    signal.signal(signal.SIGINT, handle_signal)

    t0 = time.time()
    batch_size = CHECKPOINT_INTERVAL
    for batch_start in range(0, total, batch_size):
        if shutdown:
            break
        batch = todo[batch_start:batch_start + batch_size]
        tasks = [process_one_view(client, api_key, s, ci, sem, stats) for s, ci in batch]
        batch_results = await asyncio.gather(*tasks)

        # Only successful views are recorded; failures stay in todo for a rerun.
        for (s, ci), r in zip(batch, batch_results):
            if r is not None:
                results.append(r)
                done_keys.add(make_ckpt_key(s["id"], ci))

        # Rewrite the full results file plus the checkpoint after each batch.
        with open(out_file, "w") as f:
            json.dump(results, f, ensure_ascii=False)
        save_checkpoint(ckpt_file, done_keys)

        elapsed = time.time() - t0
        done_n = batch_start + len(batch)
        rps = stats["success"] / elapsed if elapsed > 0 else 0
        eta = (total - done_n) / rps / 3600 if rps > 0 else 0
        pct = done_n / total * 100

        print(
            f" [{pct:5.1f}%] {done_n}/{total} | "
            f"ok={stats['success']} fail={stats['failed']} retry={stats['retries']} | "
            f"{rps:.2f} rps | ETA {eta:.1f}h | "
            f"tok: {stats['total_in']/1e6:.1f}M in + {stats['total_out']/1e6:.1f}M out",
            flush=True,
        )

    await client.aclose()

    elapsed = time.time() - t0
    print(f"\nDone in {elapsed:.0f}s ({elapsed/60:.1f}min)", flush=True)
    print(f"Results: {len(results)} captions saved to {out_file}", flush=True)
    print(f"Stats: {json.dumps(stats)}", flush=True)
    total_tok = stats["total_in"] + stats["total_out"]
    # Cost estimate uses hard-coded per-1K-token prices (CNY) — presumably the
    # qwen-vl-max rate card at the time of writing; verify before relying on it.
    cost_in = stats["total_in"] / 1000 * 0.003
    cost_out = stats["total_out"] / 1000 * 0.009
    print(f"Total tokens: {total_tok:,} | Cost: ¥{cost_in + cost_out:.1f}", flush=True)
259
+
260
+
261
# CLI entry point: caption a keyframe range of the chosen split.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", default="train", choices=["train", "val"])
    parser.add_argument("--start", type=int, required=True, help="Start keyframe index (inclusive)")
    parser.add_argument("--end", type=int, required=True, help="End keyframe index (exclusive)")
    parser.add_argument("--tag", default="dashscope", help="Output file tag")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--concurrency", type=int, default=50)
    args = parser.parse_args()
    # Rebinds the module-level MAX_CONCURRENCY read by run(); this works
    # without a `global` declaration because it executes at module scope.
    MAX_CONCURRENCY = args.concurrency
    asyncio.run(run(args.split, args.start, args.end, args.dry_run, args.tag))
scripts/gen_atlas_caption_qa.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Atlas Caption 数据生成脚本 (论文对齐版)
3
+
4
+ 与论文 Appendix A.3 完全对齐:
5
+ - 每个 keyframe 的 6 个摄像头各自独立生成 caption
6
+ - 使用论文 Table 8 中的 GPT-4V prompt
7
+ - human prompt 使用论文 Figure 5 风格的单视角模板
8
+ - 输出样本显式写入 `task="caption"` 与 `camera`
9
+ - 每个 keyframe 产出 6 条 QA,总计 ~204K 条 (34K x 6)
10
+ - 训练 prompt 与 src/prompting.py 中的 CAPTION_PROMPTS 保持一致
11
+
12
+ 支持: 异步并发、断点续传、自动重试
13
+ """
14
+ import asyncio
15
+ import json
16
+ import base64
17
+ import os
18
+ import sys
19
+ import time
20
+ import signal
21
+ from io import BytesIO
22
+ from pathlib import Path
23
+
24
+ try:
25
+ import httpx
26
+ from PIL import Image
27
+ except ImportError:
28
+ print("pip install httpx Pillow")
29
+ sys.exit(1)
30
+
31
+ NUSCENES_ROOT = "/home/guoyuanbo/autodl-tmp/data/nuscenes"
32
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
33
+
34
+ CAMERAS = [
35
+ "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_FRONT_LEFT",
36
+ "CAM_BACK", "CAM_BACK_LEFT", "CAM_BACK_RIGHT",
37
+ ]
38
+
39
+ GPT4V_PROMPT = (
40
+ "Describe the current traffic conditions. "
41
+ "If there are traffic lights in the image, describe the status of all the traffic lights, "
42
+ "including any countdowns; if there are none, please do not respond. "
43
+ "If there are traffic signs in the picture, identify and explain each one; "
44
+ "if there are none, no explanation is necessary. "
45
+ "If there are other vehicles in the picture, describe them in more detail. "
46
+ "Please ensure the answer does not exceed 600 words. Answers must be in English."
47
+ )
48
+
49
+ TRAIN_PROMPTS = [
50
+ (
51
+ "There are six images captured by the surround view cameras in driving vehicle. "
52
+ "They are uniformly represented as queries embeddings<query>. "
53
+ "Communicate a narrative of the setting within {camera_name} view image."
54
+ ),
55
+ ]
56
+
57
+ API_URL = "https://openrouter.fans/v1/chat/completions"
58
+ MODEL = "Qwen/Qwen3-VL-235B-A22B-Instruct"
59
+
60
+ MAX_CONCURRENCY = 30
61
+ MAX_RETRIES = 3
62
+ RETRY_DELAY = 5
63
+ TIMEOUT = 90
64
+ CHECKPOINT_INTERVAL = 100
65
+
66
+
67
+ def image_to_base64(path):
68
+ img = Image.open(path)
69
+ buf = BytesIO()
70
+ img.save(buf, format="JPEG", quality=80)
71
+ return base64.b64encode(buf.getvalue()).decode()
72
+
73
+
74
+ async def call_api(client, api_key, image_b64, camera_name):
75
+ content = [
76
+ {"type": "text", "text": f"[{camera_name}] {GPT4V_PROMPT}"},
77
+ {"type": "image_url", "image_url": {
78
+ "url": f"data:image/jpeg;base64,{image_b64}",
79
+ }},
80
+ ]
81
+ payload = {
82
+ "model": MODEL,
83
+ "messages": [{"role": "user", "content": content}],
84
+ "max_tokens": 800,
85
+ "temperature": 0.3,
86
+ }
87
+ headers = {
88
+ "Authorization": f"Bearer {api_key}",
89
+ "Content-Type": "application/json",
90
+ }
91
+ resp = await client.post(API_URL, json=payload, headers=headers, timeout=TIMEOUT)
92
+ resp.raise_for_status()
93
+ data = resp.json()
94
+ msg = data["choices"][0]["message"]["content"].strip()
95
+ usage = data.get("usage", {})
96
+ return msg, usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0)
97
+
98
+
99
+ async def process_one_view(client, api_key, sample, cam_idx, sem, stats):
100
+ cam = CAMERAS[cam_idx]
101
+ img_path = os.path.join(NUSCENES_ROOT, sample["image_paths"][cam_idx])
102
+ if not os.path.exists(img_path):
103
+ stats["skipped"] += 1
104
+ return None
105
+
106
+ img_b64 = image_to_base64(img_path)
107
+ train_prompt = TRAIN_PROMPTS[0].format(camera_name=cam)
108
+
109
+ for attempt in range(MAX_RETRIES):
110
+ async with sem:
111
+ try:
112
+ caption, in_tok, out_tok = await call_api(client, api_key, img_b64, cam)
113
+ stats["success"] += 1
114
+ stats["total_in"] += in_tok
115
+ stats["total_out"] += out_tok
116
+ return {
117
+ "id": sample["id"],
118
+ "image_paths": sample["image_paths"],
119
+ "num_map_queries": 0,
120
+ "task": "caption",
121
+ "camera": cam,
122
+ "segment_id": sample.get("segment_id", ""),
123
+ "timestamp": sample.get("timestamp", None),
124
+ "conversations": [
125
+ {"from": "human", "value": train_prompt},
126
+ {"from": "gpt", "value": caption},
127
+ ],
128
+ }
129
+ except httpx.TimeoutException:
130
+ stats["retries"] += 1
131
+ if attempt < MAX_RETRIES - 1:
132
+ await asyncio.sleep(RETRY_DELAY * (attempt + 1))
133
+ except httpx.HTTPStatusError as e:
134
+ stats["retries"] += 1
135
+ if e.response.status_code == 429:
136
+ await asyncio.sleep(RETRY_DELAY * (attempt + 2))
137
+ elif attempt < MAX_RETRIES - 1:
138
+ await asyncio.sleep(RETRY_DELAY)
139
+ else:
140
+ stats["failed"] += 1
141
+ return None
142
+ except Exception:
143
+ stats["retries"] += 1
144
+ if attempt < MAX_RETRIES - 1:
145
+ await asyncio.sleep(RETRY_DELAY)
146
+ else:
147
+ stats["failed"] += 1
148
+ return None
149
+
150
+ stats["failed"] += 1
151
+ return None
152
+
153
+
154
def make_ckpt_key(sample_id, cam_idx):
    """Build the checkpoint key identifying one (sample, camera) pair."""
    return "_".join((str(sample_id), str(cam_idx)))
156
+
157
+
158
def load_checkpoint(path):
    """Return the set of completed keys stored at *path* (empty if absent)."""
    if not os.path.exists(path):
        return set()
    with open(path) as fh:
        return set(json.load(fh))
163
+
164
+
165
def save_checkpoint(path, done_keys):
    """Persist *done_keys* to *path* as a sorted JSON array."""
    ordered = sorted(done_keys)
    with open(path, "w") as fh:
        json.dump(ordered, fh)
168
+
169
+
170
async def run(split, dry_run=False, limit=None):
    """Caption every camera view of every keyframe in *split*, resumably.

    Reads the keyframe list, skips (sample, camera) pairs already recorded in
    the checkpoint file, processes the rest in batches of CHECKPOINT_INTERVAL,
    and rewrites the full results JSON plus the checkpoint after each batch so
    the job can be interrupted (SIGINT) and resumed.
    """
    api_key = os.environ.get("OPENROUTER_KEY", "")
    if not api_key:
        print("ERROR: set OPENROUTER_KEY env var", flush=True)
        sys.exit(1)

    data_file = PROJECT_ROOT / f"data/atlas_nuscenes_{split}.json"
    out_file = PROJECT_ROOT / f"data/atlas_caption_{split}.json"
    ckpt_file = PROJECT_ROOT / f"data/.caption_{split}_checkpoint.json"

    with open(data_file) as f:
        all_samples = json.load(f)

    if limit:
        all_samples = all_samples[:limit]

    # Resume support: previously finished keys and previously saved captions.
    # NOTE(review): if out_file was deleted but the checkpoint kept, done keys
    # will be skipped without their captions being present — verify on resume.
    done_keys = load_checkpoint(ckpt_file)
    existing_results = []
    if os.path.exists(out_file) and done_keys:
        with open(out_file) as f:
            existing_results = json.load(f)

    # Work list: every (keyframe, camera index) pair not yet completed.
    todo = []
    for s in all_samples:
        for cam_idx in range(6):
            key = make_ckpt_key(s["id"], cam_idx)
            if key not in done_keys:
                todo.append((s, cam_idx))

    total = len(todo)
    print(f"Split: {split}", flush=True)
    print(f"Total keyframes: {len(all_samples)}", flush=True)
    print(f"Total views to caption: {len(all_samples) * 6}", flush=True)
    print(f"Already done: {len(done_keys)}", flush=True)
    print(f"To process: {total}", flush=True)
    if dry_run:
        print("DRY RUN", flush=True)
        return

    stats = {"success": 0, "failed": 0, "skipped": 0, "retries": 0,
             "total_in": 0, "total_out": 0}
    results = list(existing_results)
    # MAX_CONCURRENCY is a module-level knob (overridable via --concurrency).
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    client = httpx.AsyncClient()

    # Graceful shutdown: first Ctrl-C finishes the current batch, then stops.
    shutdown = False
    def handle_signal(sig, frame):
        nonlocal shutdown
        shutdown = True
        print("\nGraceful shutdown...", flush=True)
    signal.signal(signal.SIGINT, handle_signal)

    t0 = time.time()
    batch_size = CHECKPOINT_INTERVAL
    for batch_start in range(0, total, batch_size):
        if shutdown:
            break
        batch = todo[batch_start:batch_start + batch_size]
        tasks = [process_one_view(client, api_key, s, ci, sem, stats) for s, ci in batch]
        batch_results = await asyncio.gather(*tasks)

        # Only successful captions advance the checkpoint; failures retry
        # automatically on the next run.
        for (s, ci), r in zip(batch, batch_results):
            if r is not None:
                results.append(r)
                done_keys.add(make_ckpt_key(s["id"], ci))

        # Persist results + checkpoint after every batch (full rewrite).
        with open(out_file, "w") as f:
            json.dump(results, f, ensure_ascii=False)
        save_checkpoint(ckpt_file, done_keys)

        # Progress / throughput / ETA report.
        elapsed = time.time() - t0
        done_n = batch_start + len(batch)
        rps = stats["success"] / elapsed if elapsed > 0 else 0
        eta = (total - done_n) / rps / 3600 if rps > 0 else 0
        pct = done_n / total * 100

        print(
            f"  [{pct:5.1f}%] {done_n}/{total} | "
            f"ok={stats['success']} fail={stats['failed']} retry={stats['retries']} | "
            f"{rps:.2f} rps | ETA {eta:.1f}h | "
            f"tok: {stats['total_in']/1e6:.1f}M in + {stats['total_out']/1e6:.1f}M out",
            flush=True,
        )

    await client.aclose()

    elapsed = time.time() - t0
    print(f"\nDone in {elapsed:.0f}s ({elapsed/60:.1f}min)", flush=True)
    print(f"Results: {len(results)} captions saved to {out_file}", flush=True)
    print(f"Stats: {json.dumps(stats)}", flush=True)
    total_tok = stats["total_in"] + stats["total_out"]
    # Rough cost estimate: 40 RMB per 50M tokens (provider pricing assumption).
    cost_rmb = total_tok / 50e6 * 40
    print(f"Total tokens: {total_tok:,} | Est cost: ¥{cost_rmb:.1f}", flush=True)
263
+
264
+
265
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", default="train", choices=["train", "val"])
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--concurrency", type=int, default=30)
    args = parser.parse_args()
    # Rebind the module-level concurrency knob before run() builds its semaphore.
    MAX_CONCURRENCY = args.concurrency
    asyncio.run(run(args.split, args.dry_run, args.limit))
scripts/gen_atlas_openlane_subsetB_lane_qa.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Convert OpenLane-V2 subset-B data into Atlas-style lane QA JSON."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Dict, List, Tuple, Optional, Iterable
11
+
12
+
13
# Fixed ordering of the six surround-view cameras; image_paths lists below
# follow this order.
CAM_ORDER = [
    "CAM_FRONT",
    "CAM_FRONT_RIGHT",
    "CAM_FRONT_LEFT",
    "CAM_BACK",
    "CAM_BACK_LEFT",
    "CAM_BACK_RIGHT",
]
21
+
22
+
23
+ def _val_to_bin(value: float, min_val: float, max_val: float, num_bins: int = 1000) -> int:
24
+ v = float(value)
25
+ if v < min_val:
26
+ v = min_val
27
+ if v > max_val:
28
+ v = max_val
29
+ t = (v - min_val) / (max_val - min_val)
30
+ idx = int(round(t * (num_bins - 1)))
31
+ if idx < 0:
32
+ idx = 0
33
+ if idx > (num_bins - 1):
34
+ idx = num_bins - 1
35
+ return idx
36
+
37
+
38
+ def _openlane_to_paper_xy(x_fwd: float, y_left: float) -> Tuple[float, float]:
39
+ return (-float(y_left), float(x_fwd))
40
+
41
+
42
+ def _uniform_sample_points(pts: List[List[float]], k: int) -> List[List[float]]:
43
+ if k <= 0 or len(pts) <= k:
44
+ return pts
45
+ if k == 1:
46
+ # 避免 k-1 = 0 导致的除零错误,返回中点
47
+ mid_idx = len(pts) // 2
48
+ return [pts[mid_idx]]
49
+ n = len(pts)
50
+ out = []
51
+ for i in range(k):
52
+ j = int(round(i * (n - 1) / float(k - 1)))
53
+ out.append(pts[j])
54
+ return out
55
+
56
+
57
+ def _lane_bev_distance(lane: Dict) -> float:
58
+ """Compute BEV distance from ego (origin) to lane centroid for sorting."""
59
+ pts = lane.get("points", [])
60
+ if not pts:
61
+ return float('inf')
62
+ xs, ys = [], []
63
+ for p in pts:
64
+ if isinstance(p, (list, tuple)) and len(p) >= 2:
65
+ xs.append(float(p[0]))
66
+ ys.append(float(p[1]))
67
+ if not xs:
68
+ return float('inf')
69
+ cx, cy = sum(xs) / len(xs), sum(ys) / len(ys)
70
+ return cx * cx + cy * cy
71
+
72
+
73
def _lane_answer_from_centerlines(
    lane_centerline: List[Dict],
    max_lanes: int,
    points_per_lane: int,
    xy_range_m: float = 51.2,
    z_min: float = -5.0,
    z_max: float = 3.0,
) -> str:
    """Serialize lane centerlines into the binned Atlas answer string.

    Lanes are sorted near-to-far by squared centroid distance, optionally
    capped at *max_lanes*, down-sampled to *points_per_lane* points each,
    converted to the paper frame, range-filtered, and quantized to 1000
    bins per axis.
    """
    candidates = sorted(lane_centerline or [], key=_lane_bev_distance)
    if max_lanes > 0:
        candidates = candidates[:max_lanes]

    lane_strs: List[str] = []
    for lane in candidates:
        raw_pts = lane.get("points", [])
        if not isinstance(raw_pts, list):
            continue
        triples = [
            list(map(float, p[:3]))
            for p in raw_pts
            if isinstance(p, (list, tuple)) and len(p) >= 3
        ]
        if not triples:
            continue
        triples = _uniform_sample_points(triples, points_per_lane)

        binned: List[str] = []
        for x, y, z in triples:
            x_p, y_p = _openlane_to_paper_xy(x, y)
            if abs(x_p) > xy_range_m or abs(y_p) > xy_range_m:
                continue  # outside the BEV square
            if z < z_min or z > z_max:
                continue  # outside the height band
            xb = _val_to_bin(x_p, -xy_range_m, xy_range_m, 1000)
            yb = _val_to_bin(y_p, -xy_range_m, xy_range_m, 1000)
            zb = _val_to_bin(z, z_min, z_max, 1000)
            binned.append(f"[{xb}, {yb}, {zb}]")

        if binned:
            lane_strs.append(", ".join(binned))

    if not lane_strs:
        return "No lane centerlines detected within range."
    return "Lane: " + "; ".join(lane_strs) + "."
114
+
115
+
116
def _prompt_lane_qa() -> str:
    """Return the human prompt for the lane QA task.

    Prefers the project prompt sampler (src.prompting); falls back to a
    fixed prompt string when that module cannot be imported.
    """
    # NOTE(review): this prepends to sys.path on every call, so repeated
    # calls accumulate duplicate entries.
    import sys, os
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
    try:
        from src.prompting import sample_prompt
        return sample_prompt("lane")
    except ImportError:
        # Static fallback prompt matching the project's lane-task phrasing.
        return (
            "There are six images captured by the surround view cameras in driving vehicle. "
            "They are uniformly represented as queries embeddings<query>. "
            "Please complete the centerline detection task under the Bird's Eye View (BEV) perspective. "
            "Ensure that the detection range does not exceed 50 meters."
        )
129
+
130
+
131
def iter_info_jsons(root: Path, split: str) -> Iterable[Path]:
    """Yield every per-frame info JSON under <root>/<split>/*/info/.

    Returns an empty list when the split directory does not exist.
    """
    split_dir = root / split
    if not split_dir.exists():
        return []
    return split_dir.glob("*/info/*.json")
136
+
137
+
138
def write_json_list_stream(out_path: Path, items: Iterable[Dict]) -> None:
    """Stream *items* to *out_path* as one JSON array without materializing it.

    Elements are separated by ",\\n" so the file stays line-oriented; parent
    directories are created as needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write("[\n")
        for idx, item in enumerate(items):
            if idx:
                fh.write(",\n")
            json.dump(item, fh, ensure_ascii=False)
        fh.write("\n]\n")
149
+
150
+
151
def main() -> None:
    """CLI entry point: convert OpenLane-V2 subset-B info JSONs to lane QA items.

    Streams one Atlas-format sample per frame into --out_json; frames whose
    images are missing on disk are skipped.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--openlane_root", type=str, default="/home/guoyuanbo/autodl-tmp/OpenLane-V2")
    ap.add_argument(
        "--no_absolute_image_paths",
        action="store_true",
        default=False,
    )
    ap.add_argument("--split", type=str, default="train", choices=["train", "val", "test"])
    ap.add_argument("--out_json", type=str, required=True)
    ap.add_argument("--max_samples", type=int, default=0, help="0 means all")
    ap.add_argument("--max_lanes", type=int, default=0,
                    help="Max lanes per sample (0=no limit, paper does not specify a cap)")
    ap.add_argument("--points_per_lane", type=int, default=4)
    ap.add_argument("--include_raw_lane", action="store_true", default=False)
    ap.add_argument("--num_map_queries", type=int, default=256)
    args = ap.parse_args()

    root = Path(args.openlane_root)
    out_path = Path(args.out_json)

    def _gen():
        # Generator of output samples, consumed by write_json_list_stream so
        # the whole dataset is never held in memory.
        n = 0
        for p in iter_info_jsons(root, args.split):
            if args.max_samples and n >= int(args.max_samples):
                break
            try:
                d = json.loads(p.read_text(encoding="utf-8", errors="replace"))
            except Exception:
                # Unparseable info file: skip the frame.
                continue

            sensor = d.get("sensor", None)
            ann = d.get("annotation", {})
            lane_centerline = ann.get("lane_centerline", [])

            # Resolve the six camera image paths, either from the sensor block
            # or, as a fallback, from the directory layout convention.
            image_paths: List[str] = []
            use_absolute = not args.no_absolute_image_paths
            if isinstance(sensor, dict) and all(cam in sensor for cam in CAM_ORDER):
                for cam in CAM_ORDER:
                    rp = str(sensor[cam].get("image_path"))
                    if use_absolute:
                        image_paths.append(str((root / rp).resolve()))
                    else:
                        image_paths.append(rp)
            else:
                # Fallback layout: <split>/<segment>/image/<CAM>/<timestamp>.jpg
                seq_dir = p.parent.parent
                ts = p.stem
                for cam in CAM_ORDER:
                    rp = str((Path(args.split) / seq_dir.name / "image" / cam / f"{ts}.jpg").as_posix())
                    if use_absolute:
                        image_paths.append(str((root / rp).resolve()))
                    else:
                        image_paths.append(rp)

            # Drop the frame if any of the six images is missing on disk.
            if use_absolute:
                missing = [ip for ip in image_paths if not Path(ip).exists()]
            else:
                missing = [ip for ip in image_paths if not (root / ip).exists()]
            if missing:
                continue

            answer = _lane_answer_from_centerlines(
                lane_centerline=lane_centerline,
                max_lanes=int(args.max_lanes),
                points_per_lane=int(args.points_per_lane),
            )

            prompt = _prompt_lane_qa()

            sample_id = f"openlane_subsetB_{args.split}_{d.get('segment_id','seg')}_{d.get('timestamp','ts')}"
            it: Dict = {
                "id": sample_id,
                "image_paths": image_paths,
                "num_map_queries": int(args.num_map_queries),
                "task": "lane",
                "sensor": sensor,
                "pose": d.get("pose", None),
                "timestamp": d.get("timestamp", None),
                "segment_id": d.get("segment_id", None),
                "meta_data": d.get("meta_data", None),
                "conversations": [
                    {"from": "human", "value": prompt},
                    {"from": "gpt", "value": answer},
                ],
            }
            if args.include_raw_lane:
                # Optionally keep the raw centerlines for debugging/eval.
                it["openlane_lane_centerline"] = lane_centerline
            n += 1
            if n % 1000 == 0:
                print(f"[progress] wrote_samples={n}")
            yield it

    print(f"[start] openlane_root={root} split={args.split} out_json={out_path}")
    write_json_list_stream(out_path, _gen())
    print("[done]")
246
+
247
+
248
# Script entry point.
if __name__ == "__main__":
    main()
250
+
251
+
scripts/gen_atlas_planning_qa.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import math
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+ from collections import Counter
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Tuple
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
15
+ from src.prompting import PLANNING_TABLE3_MODES, rewrite_planning_prompt_for_table3
16
+
17
# Height band (metres) kept when binning z values.
Z_MIN, Z_MAX = -5.0, 3.0
# Quantization range for velocity / acceleration components.
VEL_ACC_RANGE = (-50.0, 50.0)
# BEV quantization range for x/y waypoints (metres).
XY_RANGE = (-51.2, 51.2)
NUM_BINS = 1000
# Planning horizon: NUM_WAYPOINTS waypoints at WAYPOINT_DT-second spacing (3 s).
WAYPOINT_DT = 0.5
NUM_WAYPOINTS = 6
# Official UniAD get_sdc_planning_label() uses the terminal lateral offset
# (RIGHT if x >= 2, LEFT if x <= -2, else FORWARD). Our waypoints are already
# in Atlas paper frame, where x is lateral-right and y is forward.
UNIAD_COMMAND_X_THRESHOLD = 2.0

# nuScenes surround cameras in the order expected by the Atlas data format.
CAMERA_NAMES = [
    'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
    'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'
]
32
+
33
+
34
def _val_to_bin(value: float, min_val: float, max_val: float, num_bins: int = NUM_BINS) -> int:
    """Quantize *value* into [0, num_bins - 1] uniformly over [min_val, max_val].

    Out-of-range inputs are clipped to the bounds before binning.
    """
    clipped = float(np.clip(value, min_val, max_val))
    frac = (clipped - min_val) / (max_val - min_val)
    raw_idx = int(round(frac * (num_bins - 1)))
    return int(np.clip(raw_idx, 0, num_bins - 1))
39
+
40
+
41
+ def _nuscenes_to_paper_xy(x_fwd: float, y_left: float) -> Tuple[float, float]:
42
+ return (-float(y_left), float(x_fwd))
43
+
44
+
45
def _derive_uniad_style_command(
    waypoints: List[List[float]],
    lateral_threshold: float = UNIAD_COMMAND_X_THRESHOLD,
) -> str:
    """Derive a 3-way planning command from future GT waypoints.

    Mirrors the semantics of UniAD's `get_sdc_planning_label()`: the
    lateral (x) offset of the last finite waypoint decides between
    "turn right" / "turn left" / "go straight". Falls back to
    "go straight" when no waypoint is usable.
    """
    finite_xy: List[Tuple[float, float]] = []
    for wp in waypoints:
        if not isinstance(wp, (list, tuple)) or len(wp) < 2:
            continue
        wx, wy = float(wp[0]), float(wp[1])
        if np.isfinite(wx) and np.isfinite(wy):
            finite_xy.append((wx, wy))

    if not finite_xy:
        return "go straight"

    terminal_x = float(finite_xy[-1][0])
    if terminal_x >= lateral_threshold:
        return "turn right"
    if terminal_x <= -lateral_threshold:
        return "turn left"
    return "go straight"
73
+
74
+
75
def _compute_velocity(nusc, sample) -> Tuple[float, float]:
    """Estimate ego velocity (vx, vy) in the ego frame at *sample*.

    Finite difference between the current LIDAR_TOP ego pose and the previous
    sweep's ego pose, rotated into the current ego frame (nuScenes axes).
    Returns (0.0, 0.0) when there is no previous sweep or any lookup fails.
    """
    try:
        lidar_token = sample['data']['LIDAR_TOP']
        lidar_data = nusc.get('sample_data', lidar_token)
        from pyquaternion import Quaternion

        ego_pose = nusc.get('ego_pose', lidar_data['ego_pose_token'])
        ego_t = np.array(ego_pose['translation'])
        ego_q = Quaternion(ego_pose['rotation'])

        prev_token = lidar_data.get('prev', '')
        if prev_token:
            prev_data = nusc.get('sample_data', prev_token)
            prev_ego = nusc.get('ego_pose', prev_data['ego_pose_token'])
            prev_t = np.array(prev_ego['translation'])
            # Timestamps are microseconds; convert the delta to seconds.
            dt = (lidar_data['timestamp'] - prev_data['timestamp']) * 1e-6
            if dt > 0:
                vel_global = (ego_t - prev_t) / dt
                # Rotate the global-frame velocity into the ego frame.
                vel_ego = ego_q.inverse.rotate(vel_global)
                return float(vel_ego[0]), float(vel_ego[1])
    except Exception:
        # Best-effort: any malformed record falls back to zero velocity.
        pass
    return 0.0, 0.0
98
+
99
+
100
def _compute_acceleration(nusc, sample) -> Tuple[float, float]:
    """Estimate ego acceleration (ax, ay) in the ego frame at *sample*.

    Second finite difference using the two preceding LIDAR_TOP sweeps:
    velocity over [prev, now] minus velocity over [prev2, prev], divided by
    the most recent interval, rotated into the current ego frame. Returns
    (0.0, 0.0) when fewer than two previous sweeps exist or on any error.
    """
    try:
        lidar_token = sample['data']['LIDAR_TOP']
        lidar_data = nusc.get('sample_data', lidar_token)
        from pyquaternion import Quaternion

        ego_pose = nusc.get('ego_pose', lidar_data['ego_pose_token'])
        ego_q = Quaternion(ego_pose['rotation'])

        prev_token = lidar_data.get('prev', '')
        if not prev_token:
            return 0.0, 0.0
        prev_data = nusc.get('sample_data', prev_token)
        # Microseconds -> seconds for both intervals.
        dt1 = (lidar_data['timestamp'] - prev_data['timestamp']) * 1e-6
        if dt1 <= 0:
            return 0.0, 0.0

        prev2_token = prev_data.get('prev', '')
        if not prev2_token:
            return 0.0, 0.0
        prev2_data = nusc.get('sample_data', prev2_token)
        dt2 = (prev_data['timestamp'] - prev2_data['timestamp']) * 1e-6
        if dt2 <= 0:
            return 0.0, 0.0

        def _ego_vel(sd1, sd2, dt_val):
            # Global-frame velocity between two sample_data records.
            e1 = nusc.get('ego_pose', sd1['ego_pose_token'])
            e2 = nusc.get('ego_pose', sd2['ego_pose_token'])
            t1 = np.array(e1['translation'])
            t2 = np.array(e2['translation'])
            return (t1 - t2) / dt_val

        v1_global = _ego_vel(lidar_data, prev_data, dt1)
        v0_global = _ego_vel(prev_data, prev2_data, dt2)
        acc_global = (v1_global - v0_global) / dt1
        acc_ego = ego_q.inverse.rotate(acc_global)
        return float(acc_ego[0]), float(acc_ego[1])
    except Exception:
        # Best-effort: fall back to zero acceleration on any lookup failure.
        return 0.0, 0.0
139
+
140
+
141
def _get_future_waypoints(nusc, sample) -> Optional[List[List[float]]]:
    """Interpolate NUM_WAYPOINTS future ego positions at WAYPOINT_DT spacing.

    Walks the LIDAR_TOP sweep chain forward, linearly interpolates the global
    ego translation at each target timestamp, transforms into the current ego
    frame, and converts to the paper frame (x right, y forward). Returns None
    when the recorded future does not cover the full horizon or on any error.
    """
    try:
        from pyquaternion import Quaternion

        lidar_token = sample['data']['LIDAR_TOP']
        lidar_data = nusc.get('sample_data', lidar_token)
        ego_pose = nusc.get('ego_pose', lidar_data['ego_pose_token'])
        ego_t = np.array(ego_pose['translation'])
        ego_q = Quaternion(ego_pose['rotation'])

        current_ts = lidar_data['timestamp']
        # Target timestamps (microseconds): t0 + 0.5s, 1.0s, ..., 3.0s.
        target_times = [current_ts + int(WAYPOINT_DT * (i + 1) * 1e6) for i in range(NUM_WAYPOINTS)]

        # Collect sweeps until ~1 s past the last target (or chain end).
        all_sd = []
        sd_token = lidar_token
        while sd_token:
            sd = nusc.get('sample_data', sd_token)
            all_sd.append(sd)
            sd_token = sd.get('next', '')
            if not sd_token:
                break
            if sd['timestamp'] > target_times[-1] + 1e6:
                break

        if len(all_sd) < 2:
            return None

        timestamps = np.array([s['timestamp'] for s in all_sd])
        poses = []
        for s in all_sd:
            ep = nusc.get('ego_pose', s['ego_pose_token'])
            poses.append(np.array(ep['translation']))
        poses = np.array(poses)

        waypoints = []
        for tt in target_times:
            if tt > timestamps[-1] or tt < timestamps[0]:
                # Horizon not fully recorded -> reject the whole sample.
                return None
            # Index of the segment [idx, idx+1] bracketing tt, clamped valid.
            idx = np.searchsorted(timestamps, tt, side='right') - 1
            idx = max(0, min(idx, len(timestamps) - 2))
            dt_seg = timestamps[idx + 1] - timestamps[idx]
            if dt_seg <= 0:
                return None
            alpha = (tt - timestamps[idx]) / dt_seg
            # Linear interpolation of the global translation at tt.
            pos_global = poses[idx] * (1 - alpha) + poses[idx + 1] * alpha
            pos_ego = ego_q.inverse.rotate(pos_global - ego_t)
            x_p, y_p = _nuscenes_to_paper_xy(pos_ego[0], pos_ego[1])
            waypoints.append([float(x_p), float(y_p)])

        return waypoints
    except Exception:
        return None
193
+
194
+
195
def _format_planning_answer(
    vx: float, vy: float, ax: float, ay: float,
    waypoints: List[List[float]],
) -> str:
    """Format the GT planning answer: binned speed, acceleration, waypoints."""
    speed_bins = [_val_to_bin(c, *VEL_ACC_RANGE) for c in (vx, vy)]
    accel_bins = [_val_to_bin(c, *VEL_ACC_RANGE) for c in (ax, ay)]
    waypoint_text = ", ".join(
        f"[{_val_to_bin(wp[0], *XY_RANGE)}, {_val_to_bin(wp[1], *XY_RANGE)}]"
        for wp in waypoints
    )
    return (
        f"Ego car speed value:[{speed_bins[0]}, {speed_bins[1]}]. "
        f"Ego car acceleration value:[{accel_bins[0]}, {accel_bins[1]}]. "
        "Based on the ego car speed and acceleration you predicted, "
        f"request the ego car planning waypoint in 3-seconds: {waypoint_text}"
    )
215
+
216
+
217
def _collect_gt_boxes_ego(nusc, sample) -> List[Dict]:
    """Collect all GT annotation boxes at *sample* in the paper frame.

    Box centers are rotated from global into the current ego frame and then
    into the paper frame (x right, y forward); yaw gets the matching +pi/2
    offset so headings stay consistent with the rotated axes.
    """
    from pyquaternion import Quaternion

    lidar_token = sample['data']['LIDAR_TOP']
    lidar_data = nusc.get('sample_data', lidar_token)
    ego_pose = nusc.get('ego_pose', lidar_data['ego_pose_token'])
    ego_t = np.array(ego_pose['translation'])
    ego_q = Quaternion(ego_pose['rotation'])

    # NOTE(review): the calibrated-sensor transform below is fetched but never
    # applied — boxes stay in the ego frame, not the lidar frame. Confirm
    # whether that is intentional.
    cs_record = nusc.get('calibrated_sensor', lidar_data['calibrated_sensor_token'])
    cs_t = np.array(cs_record['translation'])
    cs_q = Quaternion(cs_record['rotation'])

    boxes = []
    for ann_token in sample['anns']:
        ann = nusc.get('sample_annotation', ann_token)
        center_global = np.array(ann['translation'])
        center_ego = ego_q.inverse.rotate(center_global - ego_t)
        x_p, y_p = _nuscenes_to_paper_xy(center_ego[0], center_ego[1])
        yaw_global = Quaternion(ann['rotation'])
        yaw_ego = ego_q.inverse * yaw_global
        # _nuscenes_to_paper_xy applies a 90° CCW rotation:
        #   x_paper = -y_ego, y_paper = x_ego
        # Yaw must be rotated by the same +π/2 to stay consistent.
        yaw_angle = float(yaw_ego.yaw_pitch_roll[0]) + math.pi / 2.0
        # size indices follow the nuScenes annotation convention assumed here:
        # [0]=width, [1]=length, [2]=height.
        w = float(ann['size'][0])
        l = float(ann['size'][1])
        h = float(ann['size'][2])
        boxes.append({
            "world_coords": [float(x_p), float(y_p), float(center_ego[2])],
            "w": w,
            "l": l,
            "h": h,
            "yaw": yaw_angle,
            "category": ann['category_name'],
        })
    return boxes
254
+
255
+
256
def _collect_gt_boxes_per_timestep(nusc, sample, num_timesteps=NUM_WAYPOINTS) -> List[List[Dict]]:
    """Collect GT boxes for each future keyframe, transformed to current ego frame.

    ST-P3 protocol: at each future timestep t, collision is checked against
    the actual positions of other agents at time t, not their positions at t=0.
    nuScenes keyframes are ~0.5s apart, matching the waypoint interval.
    """
    from pyquaternion import Quaternion

    # Reference pose: all future boxes are expressed relative to the CURRENT
    # ego pose, so they are directly comparable to predicted waypoints.
    lidar_token = sample['data']['LIDAR_TOP']
    lidar_data = nusc.get('sample_data', lidar_token)
    ego_pose = nusc.get('ego_pose', lidar_data['ego_pose_token'])
    ref_ego_t = np.array(ego_pose['translation'])
    ref_ego_q = Quaternion(ego_pose['rotation'])

    per_timestep_boxes: List[List[Dict]] = []
    cur_sample = sample
    for _ in range(num_timesteps):
        next_token = cur_sample.get('next', '')
        if not next_token:
            # Past the end of the scene: repeat the previous timestep's boxes
            # (NOTE: this appends the same list object, not a copy — callers
            # must not mutate entries in place).
            per_timestep_boxes.append(per_timestep_boxes[-1] if per_timestep_boxes else [])
            continue

        cur_sample = nusc.get('sample', next_token)
        boxes = []
        for ann_token in cur_sample['anns']:
            ann = nusc.get('sample_annotation', ann_token)
            center_global = np.array(ann['translation'])
            # Transform the future box into the CURRENT ego frame.
            center_ego = ref_ego_q.inverse.rotate(center_global - ref_ego_t)
            x_p, y_p = _nuscenes_to_paper_xy(center_ego[0], center_ego[1])

            yaw_global = Quaternion(ann['rotation'])
            yaw_ego = ref_ego_q.inverse * yaw_global
            # +π/2 matches the paper-frame axis rotation (see _collect_gt_boxes_ego).
            yaw_angle = float(yaw_ego.yaw_pitch_roll[0]) + math.pi / 2.0

            w = float(ann['size'][0])
            l = float(ann['size'][1])
            h = float(ann['size'][2])
            boxes.append({
                "world_coords": [float(x_p), float(y_p), float(center_ego[2])],
                "w": w, "l": l, "h": h,
                "yaw": yaw_angle,
                "category": ann['category_name'],
            })
        per_timestep_boxes.append(boxes)

    return per_timestep_boxes
303
+
304
+
305
def process_sample(
    nusc,
    sample_token: str,
    data_root: Path,
    planning_table3_mode: str,
) -> Optional[Dict]:
    """Build one Atlas planning-QA item for the keyframe *sample_token*.

    Gathers the six camera paths, ego velocity/acceleration, 3-second GT
    waypoints, a UniAD-style route command, the prompt/answer pair, and GT
    boxes (current frame and per future timestep). Returns None when any
    camera or the waypoint horizon is missing.

    NOTE(review): the bare `except Exception: return None` silently drops
    samples on any error — consider at least counting/logging failures.
    """
    try:
        from pyquaternion import Quaternion
        from src.prompting import sample_prompt

        sample = nusc.get('sample', sample_token)

        # Relative image paths for the six surround cameras, normalized to '/'.
        image_paths = []
        for cam_name in CAMERA_NAMES:
            if cam_name in sample['data']:
                cam_token = sample['data'][cam_name]
                cam_data = nusc.get('sample_data', cam_token)
                image_paths.append(cam_data['filename'].replace('\\', '/'))

        if len(image_paths) != 6:
            return None

        # Ego dynamics in nuScenes axes, then rotated to the paper frame.
        vx_n, vy_n = _compute_velocity(nusc, sample)
        ax_n, ay_n = _compute_acceleration(nusc, sample)

        vx_p, vy_p = _nuscenes_to_paper_xy(vx_n, vy_n)
        ax_p, ay_p = _nuscenes_to_paper_xy(ax_n, ay_n)

        waypoints = _get_future_waypoints(nusc, sample)
        if waypoints is None:
            return None

        vx_bin = _val_to_bin(vx_p, *VEL_ACC_RANGE)
        vy_bin = _val_to_bin(vy_p, *VEL_ACC_RANGE)
        ax_bin = _val_to_bin(ax_p, *VEL_ACC_RANGE)
        ay_bin = _val_to_bin(ay_p, *VEL_ACC_RANGE)
        route_command = _derive_uniad_style_command(waypoints)

        # Base planning prompt, then rewritten per the requested Table-3 mode.
        prompt = sample_prompt(
            "planning",
            vx_bin=vx_bin, vy_bin=vy_bin,
            ax_bin=ax_bin, ay_bin=ay_bin,
            command=route_command,
        )
        prompt = rewrite_planning_prompt_for_table3(
            prompt,
            mode=planning_table3_mode,
            command=route_command,
            velocity_bins=(vx_bin, vy_bin),
            acceleration_bins=(ax_bin, ay_bin),
        )
        answer = _format_planning_answer(vx_p, vy_p, ax_p, ay_p, waypoints)

        gt_boxes = _collect_gt_boxes_ego(nusc, sample)
        gt_boxes_per_ts = _collect_gt_boxes_per_timestep(nusc, sample)

        item = {
            "id": sample_token,
            "image_paths": image_paths,
            "num_map_queries": 256,
            "task": "planning",
            "segment_id": sample.get("scene_token", ""),
            "timestamp": sample.get("timestamp", None),
            "ego_motion": {
                "velocity": [vx_p, vy_p],
                "acceleration": [ax_p, ay_p],
                "waypoints": waypoints,
            },
            "gt_boxes_3d": gt_boxes,
            "gt_boxes_3d_per_timestep": gt_boxes_per_ts,
            "conversations": [
                {"from": "human", "value": prompt},
                {"from": "gpt", "value": answer},
            ],
            "route_command": route_command,
        }
        return item
    except Exception:
        return None
384
+
385
+
386
def _audit_results(results: List[Dict], planning_table3_mode: str) -> None:
    """Print coverage/distribution audits over the generated planning items.

    Reports route-command coverage and distribution, legacy ego_motion
    command usage, and how many prompts embed the command / ego-state text.
    Purely informational (stdout only).
    """
    total = int(len(results))
    if total == 0:
        print("[AUDIT] No planning samples were generated.")
        return

    route_commands = [item.get("route_command") for item in results]
    # Coverage: non-empty string commands only.
    route_command_coverage = sum(isinstance(cmd, str) and bool(cmd) for cmd in route_commands)
    route_command_dist = Counter(route_commands)
    # Items still carrying the deprecated ego_motion["command"] field.
    legacy_ego_motion_command = sum(
        1
        for item in results
        if isinstance(item.get("ego_motion"), dict) and "command" in item["ego_motion"]
    )

    # Count which prompt variants actually materialized in the output.
    prompt_with_command = 0
    prompt_with_state = 0
    for item in results:
        conv = item.get("conversations", [])
        if not conv:
            continue
        prompt_text = str(conv[0].get("value", ""))
        if "The ego car will " in prompt_text:
            prompt_with_command += 1
        if "The current speed value of the ego car is [" in prompt_text:
            prompt_with_state += 1

    print(
        "[AUDIT] planning route_command "
        f"mode={planning_table3_mode} "
        f"coverage={route_command_coverage}/{total} "
        f"legacy_ego_motion_command={legacy_ego_motion_command}/{total} "
        f"prompt_with_command={prompt_with_command}/{total} "
        f"prompt_with_state={prompt_with_state}/{total}"
    )
    print(f"[AUDIT] planning route_command distribution={dict(route_command_dist)}")
    print(
        "[AUDIT] route_command semantics: UniAD-style future-GT-derived "
        f"(terminal lateral x threshold={UNIAD_COMMAND_X_THRESHOLD:.1f}m)."
    )
426
+
427
+
428
def main():
    """CLI entry point: generate Atlas planning-QA JSON from nuScenes.

    Iterates the requested split's keyframes, builds one planning item per
    usable sample, writes the JSON list, and prints audit statistics.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--version', type=str, default='v1.0-trainval')
    parser.add_argument('--split', type=str, default='train', choices=['train', 'val'])
    parser.add_argument('--data-root', type=str, default='/mnt/data/nuscenes')
    parser.add_argument('--output', type=str, default=None)
    parser.add_argument(
        '--planning-table3-mode',
        type=str,
        choices=PLANNING_TABLE3_MODES,
        default='atlas_high_level',
        help=(
            'Human prompt variant to materialize in the generated JSON. '
            'route_command is always written as a top-level UniAD-style '
            'future-GT-derived command.'
        ),
    )
    args = parser.parse_args()

    data_root = Path(args.data_root)
    script_dir = Path(__file__).parent.absolute()
    project_root = script_dir.parent

    if args.output:
        output_file = Path(args.output)
    else:
        # Default output location inside the project's data/ directory.
        output_file = project_root / "data" / f"atlas_planning_{args.split}_uniad_command.json"

    # Deferred heavy imports: only needed when actually generating data.
    from nuscenes.nuscenes import NuScenes
    from nuscenes.utils.splits import create_splits_scenes

    nusc = NuScenes(version=args.version, dataroot=str(data_root), verbose=True)

    # Restrict to the scenes belonging to the requested official split.
    splits = create_splits_scenes()
    split_scenes = set(splits[args.split])
    scene_tokens = set()
    for scene in nusc.scene:
        if scene['name'] in split_scenes:
            scene_tokens.add(scene['token'])
    samples_to_process = [s for s in nusc.sample if s['scene_token'] in scene_tokens]

    print(f"Processing {len(samples_to_process)} samples for planning...")
    results = []
    for sample in tqdm(samples_to_process):
        item = process_sample(
            nusc,
            sample['token'],
            data_root,
            planning_table3_mode=args.planning_table3_mode,
        )
        if item is not None:
            results.append(item)

    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    _audit_results(results, args.planning_table3_mode)
    print(f"Saved {len(results)} planning samples to {output_file}")
487
+
488
+
489
# Script entry point.
if __name__ == "__main__":
    main()
491
+
scripts/run_val_extraction.sh ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Offline token extraction orchestrator.
3
+ # Isolated by default. Set ATLAS_ALLOW_OFFLINE=1 to run.
4
+ set -e
5
+
6
+ if [ "${ATLAS_ALLOW_OFFLINE}" != "1" ]; then
7
+ echo "ERROR: This is an OFFLINE extraction orchestrator." >&2
8
+ echo "It is isolated by default to prevent accidental use." >&2
9
+ echo "If you really need it, set: ATLAS_ALLOW_OFFLINE=1" >&2
10
+ echo "For online training use: bash scripts/train_no_caption_baseline.sh" >&2
11
+ exit 1
12
+ fi
13
+
14
+ cd /home/guoyuanbo/3dtokenizer-atlas
15
+ export LD_LIBRARY_PATH=/home/guoyuanbo/3dtokenizer/envs/streampetr/lib:${LD_LIBRARY_PATH:-}
16
+ PY=/home/guoyuanbo/3dtokenizer/envs/streampetr/bin/python
17
+
18
+ LOG_DIR=work_dirs
19
+ DET_OUT=work_dirs/precomputed_det_tokens_offline/val
20
+ MAP_OUT=work_dirs/precomputed_map_tokens_offline/val
21
+
22
+ echo "[$(date)] === Waiting for Phase 1 (StreamPETR det) to finish ==="
23
+
24
+ # Wait for all extract_streampetr_tokens processes to finish
25
+ while pgrep -f "extract_streampetr_tokens.py" > /dev/null 2>&1; do
26
+ DET_COUNT=$(find "$DET_OUT" -name "*.pt" 2>/dev/null | wc -l)
27
+ echo "[$(date)] Phase 1 running... det files: $DET_COUNT / ~12038"
28
+ sleep 300
29
+ done
30
+
31
+ DET_FINAL=$(find "$DET_OUT" -name "*.pt" 2>/dev/null | wc -l)
32
+ echo "[$(date)] Phase 1 DONE. Total det files: $DET_FINAL"
33
+
34
+ echo "[$(date)] === Starting Phase 2 (TopoMLP map) ==="
35
+ mkdir -p "$MAP_OUT"
36
+
37
+ for i in 0 1 2 3; do
38
+ CUDA_VISIBLE_DEVICES=$i $PY extract_topomlp_tokens.py \
39
+ --topomlp_config configs/topomlp_atlas_aligned.py \
40
+ --topomlp_ckpt work_dirs/topomlp_atlas_aligned/epoch_24.pth \
41
+ --data_json "data/atlas_planning_val_uniad_command.json,data/openlane_subsetB_lane_val_4pt.json" \
42
+ --data_root /home/guoyuanbo/autodl-tmp/data/nuscenes \
43
+ --output_dir "$MAP_OUT" \
44
+ --shard_id $i --num_shards 4 \
45
+ > "$LOG_DIR/extract_map_val_shard_${i}.log" 2>&1 &
46
+ echo "[$(date)] Phase 2 shard $i launched (PID=$!)"
47
+ done
48
+
49
+ echo "[$(date)] Waiting for Phase 2 to complete..."
50
+ wait
51
+
52
+ MAP_FINAL=$(find "$MAP_OUT" -name "*.pt" 2>/dev/null | wc -l)
53
+ echo "[$(date)] Phase 2 DONE. Total map files: $MAP_FINAL"
54
+ echo "[$(date)] === All extraction complete ==="
55
+ echo " Det tokens: $DET_FINAL files in $DET_OUT"
56
+ echo " Map tokens: $MAP_FINAL files in $MAP_OUT"
scripts/train_no_caption_baseline.sh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Atlas online training: detection + planning + lane (no caption).
# Default: visual_token_mode=online, live frozen StreamPETR + TopoMLP.
# Data: 28k det + 24k plan + 28k lane = ~80k samples/epoch
#
# Usage:
#   bash scripts/train_no_caption_baseline.sh
#   RESUME_CKPT=work_dirs/atlas_no_caption/epoch-4/checkpoint.pt bash scripts/train_no_caption_baseline.sh
#   NUM_GPUS=8 bash scripts/train_no_caption_baseline.sh
#
# For offline mode (read precomputed *_offline token dirs):
#   bash scripts/train_no_caption_baseline_offline.sh
set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$PROJECT_ROOT"

NUM_GPUS=${NUM_GPUS:-4}
PLANNING_TABLE3_MODE=${PLANNING_TABLE3_MODE:-atlas_high_level_ego}

# Collect optional flags in an array so a checkpoint path containing spaces
# survives intact (an unquoted $EXTRA_ARGS string would be word-split).
EXTRA_ARGS=()
if [ -n "${RESUME_CKPT:-}" ]; then
    EXTRA_ARGS+=(--resume "$RESUME_CKPT")
fi

# ${EXTRA_ARGS[@]+...} guards against "unbound variable" on an empty array
# under `set -u` with bash < 4.4.
deepspeed --num_gpus "$NUM_GPUS" train_atlas.py \
    --llm_model pretrained/vicuna-7b-v1.5 \
    --data_json data/atlas_nuscenes_train.json,data/atlas_planning_train_uniad_command.json,data/openlane_subsetB_lane_train_4pt.json \
    --data_root /home/guoyuanbo/autodl-tmp/data/nuscenes \
    --visual_token_mode online \
    --planning_table3_mode "$PLANNING_TABLE3_MODE" \
    --streampetr_config configs/streampetr_atlas_aligned.py \
    --streampetr_ckpt pretrained/streampetr/streampetr_eva02_ep24.pth \
    --topomlp_config configs/topomlp_atlas_aligned.py \
    --topomlp_ckpt work_dirs/topomlp_atlas_aligned/epoch_24.pth \
    --deepspeed configs/ds_zero2.json \
    --output_dir work_dirs/atlas_no_caption_online \
    --epochs 10 \
    --lr 2e-5 \
    --weight_decay 1e-4 \
    --batch_size 1 \
    --gradient_accumulation_steps 2 \
    --warmup_ratio 0.03 \
    --max_grad_norm 1.0 \
    --log_steps 100 \
    --save_epochs 1 \
    --keep_last_n_ckpts 3 \
    --seed 42 \
    --num_workers 4 \
    ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}
scripts/train_no_caption_baseline_offline.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# [OFFLINE fallback] Atlas training using precomputed *_offline visual tokens.
# This is NOT the default. Use scripts/train_no_caption_baseline.sh for online mode.
# Isolated by default. Set ATLAS_ALLOW_OFFLINE=1 to run.
set -euo pipefail

if [ "${ATLAS_ALLOW_OFFLINE:-}" != "1" ]; then
    echo "ERROR: This is an OFFLINE fallback script, not the primary online training." >&2
    echo "It is isolated by default to prevent accidental use in experiments." >&2
    echo "If you really need it, set: ATLAS_ALLOW_OFFLINE=1" >&2
    echo "For production training use: bash scripts/train_no_caption_baseline.sh" >&2
    exit 1
fi

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$PROJECT_ROOT"

NUM_GPUS=${NUM_GPUS:-4}
PLANNING_TABLE3_MODE=${PLANNING_TABLE3_MODE:-atlas_base}

# Collect optional flags in an array so values containing spaces are passed
# through intact (an unquoted $EXTRA_ARGS string would be word-split).
EXTRA_ARGS=()
if [ -n "${RESUME_CKPT:-}" ]; then
    EXTRA_ARGS+=(--resume "$RESUME_CKPT")
fi

# ${EXTRA_ARGS[@]+...} guards against "unbound variable" on an empty array
# under `set -u` with bash < 4.4.
deepspeed --num_gpus "$NUM_GPUS" train_atlas.py \
    --llm_model pretrained/vicuna-7b-v1.5 \
    --data_json data/atlas_nuscenes_train.json,data/atlas_planning_train_uniad_command.json,data/openlane_subsetB_lane_train_4pt.json \
    --data_root /home/guoyuanbo/autodl-tmp/data/nuscenes \
    --visual_token_mode offline \
    --planning_table3_mode "$PLANNING_TABLE3_MODE" \
    --precomputed_det_tokens work_dirs/precomputed_det_tokens_offline/train \
    --precomputed_map_tokens work_dirs/precomputed_map_tokens_offline/train \
    --deepspeed configs/ds_zero2.json \
    --output_dir work_dirs/atlas_no_caption_offline \
    --epochs 10 \
    --lr 2e-5 \
    --weight_decay 1e-4 \
    --batch_size 1 \
    --gradient_accumulation_steps 2 \
    --warmup_ratio 0.03 \
    --max_grad_norm 1.0 \
    --log_steps 100 \
    --save_epochs 1 \
    --keep_last_n_ckpts 3 \
    --seed 42 \
    --num_workers 4 \
    ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}
scripts/train_with_caption_balanced.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Atlas online training: detection + planning + lane + caption.
# Default: visual_token_mode=online, live frozen StreamPETR + TopoMLP.
# Data: 28k det + 24k plan + 28k lane + 169k caption = ~249k samples/epoch
# WARNING: caption data is 6x larger than other tasks — consider downsampling.
#
# Usage:
#   bash scripts/train_with_caption_balanced.sh
#   RESUME_CKPT=work_dirs/atlas_with_caption/epoch-4/checkpoint.pt bash scripts/train_with_caption_balanced.sh
#   NUM_GPUS=8 bash scripts/train_with_caption_balanced.sh
set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$PROJECT_ROOT"

NUM_GPUS=${NUM_GPUS:-4}
PLANNING_TABLE3_MODE=${PLANNING_TABLE3_MODE:-atlas_high_level_ego}

# Collect optional flags in an array so a checkpoint path containing spaces
# survives intact (an unquoted $EXTRA_ARGS string would be word-split).
EXTRA_ARGS=()
if [ -n "${RESUME_CKPT:-}" ]; then
    EXTRA_ARGS+=(--resume "$RESUME_CKPT")
fi

# ${EXTRA_ARGS[@]+...} guards against "unbound variable" on an empty array
# under `set -u` with bash < 4.4.
deepspeed --num_gpus "$NUM_GPUS" train_atlas.py \
    --llm_model pretrained/vicuna-7b-v1.5 \
    --data_json data/atlas_nuscenes_train.json,data/atlas_planning_train_uniad_command.json,data/openlane_subsetB_lane_train_4pt.json,data/atlas_caption_train.json \
    --data_root /home/guoyuanbo/autodl-tmp/data/nuscenes \
    --visual_token_mode online \
    --planning_table3_mode "$PLANNING_TABLE3_MODE" \
    --streampetr_config configs/streampetr_atlas_aligned.py \
    --streampetr_ckpt pretrained/streampetr/streampetr_eva02_ep24.pth \
    --topomlp_config configs/topomlp_atlas_aligned.py \
    --topomlp_ckpt work_dirs/topomlp_atlas_aligned/epoch_24.pth \
    --deepspeed configs/ds_zero2.json \
    --output_dir work_dirs/atlas_with_caption_online \
    --epochs 8 \
    --lr 2e-5 \
    --weight_decay 1e-4 \
    --batch_size 1 \
    --gradient_accumulation_steps 2 \
    --warmup_ratio 0.03 \
    --max_grad_norm 1.0 \
    --log_steps 100 \
    --save_epochs 1 \
    --keep_last_n_ckpts 3 \
    --seed 42 \
    --num_workers 4 \
    ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}
scripts/vis_atlas_lane_gt_pred.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Visualize one Atlas lane sample with denser diagnostics.
4
+
5
+ Compared with the earlier plot, this version:
6
+ - keeps the original 4-point supervision visible,
7
+ - densifies those 4 points into easier-to-read curves,
8
+ - optionally overlays raw OpenLane GT centerlines when available,
9
+ - keeps all BEV views on the same axes for fair visual comparison.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import sys
17
+ from pathlib import Path
18
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
19
+
20
+ import numpy as np
21
+
22
+ try:
23
+ from scipy.interpolate import interp1d
24
+
25
+ SCIPY_AVAILABLE = True
26
+ except Exception:
27
+ interp1d = None
28
+ SCIPY_AVAILABLE = False
29
+
30
+ _REPO_ROOT = Path(__file__).resolve().parent.parent
31
+ if str(_REPO_ROOT) not in sys.path:
32
+ sys.path.insert(0, str(_REPO_ROOT))
33
+
34
+
35
+ def _load_json(path: Path):
36
+ with path.open("r", encoding="utf-8") as f:
37
+ return json.load(f)
38
+
39
+
40
+ def _find_eval_record(eval_obj: Dict, sample_id: str) -> Dict:
41
+ for rec in eval_obj.get("predictions", []):
42
+ if rec.get("sample_id") == sample_id:
43
+ return rec
44
+ raise KeyError(f"sample_id not found in eval_json: {sample_id}")
45
+
46
+
47
+ def _find_data_item(data_list: List[Dict], sample_id: str) -> Dict:
48
+ for it in data_list:
49
+ if it.get("id") == sample_id:
50
+ return it
51
+ raise KeyError(f"sample_id not found in data_json: {sample_id}")
52
+
53
+
54
+ def _extract_gt_answer(item: Dict) -> str:
55
+ for turn in item.get("conversations", []) or []:
56
+ if isinstance(turn, dict) and turn.get("from") in ("gpt", "assistant"):
57
+ return str(turn.get("value", ""))
58
+ return ""
59
+
60
+
61
+ def _openlane_to_paper_xy(x_fwd: float, y_left: float) -> Tuple[float, float]:
62
+ return (-float(y_left), float(x_fwd))
63
+
64
+
65
+ def _extract_xy_rows(points: Sequence) -> np.ndarray:
66
+ rows: List[List[float]] = []
67
+ for pt in points or []:
68
+ if isinstance(pt, dict):
69
+ wc = pt.get("world_coords", None)
70
+ if wc is None:
71
+ continue
72
+ rows.append([float(wc[0]), float(wc[1])])
73
+ elif isinstance(pt, (list, tuple)) and len(pt) >= 2:
74
+ rows.append([float(pt[0]), float(pt[1])])
75
+ if not rows:
76
+ return np.zeros((0, 2), dtype=np.float64)
77
+ return np.asarray(rows, dtype=np.float64)
78
+
79
+
80
def _lanes_as_arrays(parsed: List[Dict]) -> List[np.ndarray]:
    """Extract a non-empty (N, 2) point array for every parsed object of type 'lane'."""
    out: List[np.ndarray] = []
    for entry in parsed:
        if entry.get("type") != "lane":
            continue
        pts = _extract_xy_rows(entry.get("points", []) or [])
        if pts.shape[0] >= 1:
            out.append(pts)
    return out
89
+
90
+
91
+ def _bounds_xy(lanes: Iterable[np.ndarray]) -> Optional[Tuple[np.ndarray, np.ndarray]]:
92
+ lanes = [ln for ln in lanes if len(ln) > 0]
93
+ if not lanes:
94
+ return None
95
+ allp = np.concatenate(lanes, axis=0)
96
+ return allp.min(axis=0), allp.max(axis=0)
97
+
98
+
99
+ def _lane_bev_distance(lane: np.ndarray) -> float:
100
+ if len(lane) == 0:
101
+ return float("inf")
102
+ centroid = lane.mean(axis=0)
103
+ return float(centroid[0] ** 2 + centroid[1] ** 2)
104
+
105
+
106
def _select_closest_lanes(lanes: List[np.ndarray], max_lanes: int) -> List[np.ndarray]:
    """Keep at most max_lanes lanes, nearest centroid first; max_lanes <= 0 keeps all."""
    if max_lanes <= 0 or len(lanes) <= max_lanes:
        return list(lanes)
    ranked = sorted(lanes, key=_lane_bev_distance)
    return ranked[:max_lanes]
111
+
112
+
113
+ def _candidate_openlane_paths(openlane_root: Path, item: Dict) -> List[Path]:
114
+ segment_id = str(item.get("segment_id", "")).strip()
115
+ timestamp = str(item.get("timestamp", "")).strip()
116
+ if not segment_id or not timestamp:
117
+ return []
118
+ rel = Path("val") / segment_id / "info" / f"{timestamp}.json"
119
+ return [
120
+ openlane_root / "subset_B" / rel,
121
+ openlane_root / rel,
122
+ ]
123
+
124
+
125
def _load_raw_openlane_gt_lanes(openlane_root: Path, item: Dict) -> Tuple[List[np.ndarray], Optional[Path]]:
    """Load raw OpenLane GT centerlines (converted to paper BEV coords).

    Reads the first existing candidate info file and returns (lanes, source_path);
    returns ([], None) when no candidate exists on disk.
    """
    for candidate in _candidate_openlane_paths(openlane_root, item):
        if not candidate.exists():
            continue
        payload = _load_json(candidate)
        annotation = payload.get("annotation", {}) or {}
        lanes: List[np.ndarray] = []
        for centerline in annotation.get("lane_centerline", []) or []:
            pts = [
                list(_openlane_to_paper_xy(float(p[0]), float(p[1])))
                for p in centerline.get("points", []) or []
                if isinstance(p, (list, tuple)) and len(p) >= 2
            ]
            # A centerline needs at least two points to be drawable.
            if len(pts) >= 2:
                lanes.append(np.asarray(pts, dtype=np.float64))
        return lanes, candidate
    return [], None
142
+
143
+
144
+ def _resample_lane(lane: np.ndarray, num: int = 41) -> np.ndarray:
145
+ if len(lane) <= 1 or num <= len(lane):
146
+ return lane.copy()
147
+
148
+ diffs = np.diff(lane, axis=0)
149
+ seg_len = np.linalg.norm(diffs, axis=1)
150
+ keep = np.concatenate(([True], seg_len > 1e-8))
151
+ lane = lane[keep]
152
+ if len(lane) <= 1:
153
+ return lane.copy()
154
+
155
+ diffs = np.diff(lane, axis=0)
156
+ seg_len = np.linalg.norm(diffs, axis=1)
157
+ arc = np.concatenate(([0.0], np.cumsum(seg_len)))
158
+ total = float(arc[-1])
159
+ if total <= 1e-8:
160
+ return lane.copy()
161
+
162
+ target = np.linspace(0.0, total, num=num)
163
+ if SCIPY_AVAILABLE and len(lane) >= 4:
164
+ try:
165
+ fx = interp1d(arc, lane[:, 0], kind="cubic")
166
+ fy = interp1d(arc, lane[:, 1], kind="cubic")
167
+ dense = np.stack([fx(target), fy(target)], axis=1)
168
+ return dense.astype(np.float64)
169
+ except Exception:
170
+ pass
171
+
172
+ x = np.interp(target, arc, lane[:, 0])
173
+ y = np.interp(target, arc, lane[:, 1])
174
+ return np.stack([x, y], axis=1).astype(np.float64)
175
+
176
+
177
def _draw_ego(ax, *, color: str = "k"):
    """Draw the ego-vehicle footprint, heading arrow, and label at the BEV origin."""
    import matplotlib.patches as patches

    width, length = 1.85, 4.084
    footprint = patches.Rectangle(
        (-width / 2.0, -length / 2.0),
        width,
        length,
        linewidth=1.2,
        edgecolor=color,
        facecolor="none",
        zorder=8,
    )
    ax.add_patch(footprint)
    ax.arrow(0.0, 0.0, 0.0, length * 0.7, head_width=0.6, head_length=0.8, fc=color, ec=color, zorder=9)
    ax.scatter([0.0], [0.0], s=18, c=color, zorder=10)
    ax.text(0.2, -0.5, "EGO", color=color, fontsize=8, zorder=10)
194
+
195
+
196
def _set_bev_axes(ax, *, title: str, xlim, ylim):
    """Apply the shared BEV axis styling: equal aspect, grid, labels, and limits."""
    ax.set_aspect("equal", adjustable="box")
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.grid(True, linewidth=0.4, alpha=0.4)
    ax.set_title(title, fontsize=10)
    ax.set_xlabel("X (m) — right", fontsize=8)
    ax.set_ylabel("Y (m) — forward", fontsize=8)
204
+
205
+
206
def _plot_colorful_panel(
    ax,
    lanes: List[np.ndarray],
    *,
    title: str,
    xlim,
    ylim,
    raw_reference_lanes: Optional[List[np.ndarray]] = None,
    show_control_points: bool = True,
    dense_points: int = 41,
):
    """Draw one BEV panel with each lane in its own color.

    Each lane's control points are densified via ``_resample_lane`` before
    plotting; the original control points are optionally overlaid as white
    circles. ``raw_reference_lanes`` (if given) are drawn underneath in gray
    dashes for visual comparison. Draw order is encoded in zorder:
    raw GT (1) < dense curves (3) < control points (4) < ego marker (8-10).
    """
    import matplotlib.pyplot as plt
    from matplotlib.lines import Line2D

    _set_bev_axes(ax, title=title, xlim=xlim, ylim=ylim)
    cmap = plt.get_cmap("tab20")

    # Reference lanes first so they sit below everything else.
    if raw_reference_lanes:
        for ln in raw_reference_lanes:
            ax.plot(ln[:, 0], ln[:, 1], "--", linewidth=1.0, color="0.60", alpha=0.50, zorder=1)

    for i, lane in enumerate(lanes):
        # tab20 provides 20 distinct colors; wrap around for more lanes.
        color = cmap(i % 20)
        dense = _resample_lane(lane, num=dense_points)
        ax.plot(dense[:, 0], dense[:, 1], "-", linewidth=2.0, color=color, alpha=0.95, zorder=3)
        if show_control_points:
            ax.scatter(
                lane[:, 0],
                lane[:, 1],
                s=20,
                facecolors="white",
                edgecolors=[color],
                linewidths=0.9,
                zorder=4,
            )

    # Legend uses proxy artists since the actual lanes are multi-colored.
    handles = []
    if raw_reference_lanes:
        handles.append(Line2D([0], [0], color="0.60", linestyle="--", linewidth=1.2, label="Raw GT"))
    handles.append(Line2D([0], [0], color="black", linewidth=2.0, label="Densified 4pt"))
    if show_control_points:
        handles.append(
            Line2D(
                [0],
                [0],
                marker="o",
                color="black",
                markerfacecolor="white",
                markersize=5,
                linewidth=0.0,
                label="Control points",
            )
        )
    ax.legend(handles=handles, loc="upper left", fontsize=7, frameon=True)
    _draw_ego(ax, color="k")
261
+
262
+
263
def _plot_overlay_panel(
    ax,
    *,
    gt_lanes: List[np.ndarray],
    pred_lanes: List[np.ndarray],
    raw_gt_lanes: Optional[List[np.ndarray]],
    xlim,
    ylim,
    dense_points: int = 41,
):
    """Draw the GT-vs-Pred overlay BEV panel.

    Raw GT centerlines appear as gray dashes (bottom), densified 4pt GT as
    green curves with white-circle control points, and densified 4pt
    predictions as red curves with 'x' control points (top). zorder encodes
    that stacking (1 < 3/4 < 5/6).
    """
    from matplotlib.lines import Line2D

    _set_bev_axes(
        ax,
        title="Overlay: raw GT(gray), 4pt GT(green), Pred(red)",
        xlim=xlim,
        ylim=ylim,
    )

    if raw_gt_lanes:
        for lane in raw_gt_lanes:
            ax.plot(lane[:, 0], lane[:, 1], "--", linewidth=1.1, color="0.55", alpha=0.45, zorder=1)

    for lane in gt_lanes:
        dense = _resample_lane(lane, num=dense_points)
        ax.plot(dense[:, 0], dense[:, 1], "-", linewidth=2.0, color="green", alpha=0.78, zorder=3)
        ax.scatter(lane[:, 0], lane[:, 1], s=18, facecolors="white", edgecolors="green", linewidths=0.8, zorder=4)

    for lane in pred_lanes:
        dense = _resample_lane(lane, num=dense_points)
        ax.plot(dense[:, 0], dense[:, 1], "-", linewidth=2.0, color="red", alpha=0.78, zorder=5)
        ax.scatter(lane[:, 0], lane[:, 1], s=18, marker="x", color="red", linewidths=0.9, zorder=6)

    # Proxy-artist legend describing all three layers.
    handles = [
        Line2D([0], [0], color="0.55", linestyle="--", linewidth=1.1, label="Raw GT"),
        Line2D([0], [0], color="green", linewidth=2.0, label="Densified GT (4pt)"),
        Line2D([0], [0], marker="o", color="green", markerfacecolor="white", markersize=5, linewidth=0.0, label="GT control"),
        Line2D([0], [0], color="red", linewidth=2.0, label="Densified Pred (4pt)"),
        Line2D([0], [0], marker="x", color="red", markersize=5, linewidth=0.0, label="Pred control"),
    ]
    ax.legend(handles=handles, loc="upper left", fontsize=7, frameon=True)
    _draw_ego(ax, color="k")
305
+
306
+
307
def _load_images(data_root: Path, image_paths: List[str]):
    """Open each image as RGB, resolving relative paths against data_root."""
    from PIL import Image

    loaded = []
    for raw_path in image_paths:
        candidate = Path(raw_path)
        if not candidate.is_absolute():
            candidate = data_root / raw_path
        loaded.append(Image.open(candidate).convert("RGB"))
    return loaded
317
+
318
+
319
def parse_args():
    """CLI arguments for the lane-diagnostics renderer."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add("--eval_json", type=str, default="work_dirs/eval_nocap_ep2_lane50.json")
    add("--data_json", type=str, default="data/openlane_subsetB_lane_val_4pt.json")
    add("--data_root", type=str, default="/home/guoyuanbo/autodl-tmp/data/nuscenes")
    add("--openlane_root", type=str, default="/home/guoyuanbo/autodl-tmp/OpenLane-V2")
    add("--sample_id", type=str, required=True)
    add("--out_png", type=str, default="work_dirs/atlas_lane_real_prediction_dense.png")
    add("--xlim", type=float, nargs=2, default=[-30.0, 30.0])
    add("--ylim", type=float, nargs=2, default=[-55.0, 55.0])
    add("--dpi", type=int, default=170)
    add("--dense_points", type=int, default=41)
    return parser.parse_args()
332
+
333
+
334
def main():
    """Render the full lane-diagnostics figure for one sample and save it as PNG."""
    args = parse_args()
    # All JSON/output paths are resolved relative to the repo root; data_root
    # and openlane_root are absolute dataset locations.
    repo = Path(__file__).resolve().parent.parent
    eval_path = (repo / args.eval_json).resolve()
    data_path = (repo / args.data_json).resolve()
    data_root = Path(args.data_root).resolve()
    openlane_root = Path(args.openlane_root).resolve()
    out_png = (repo / args.out_png).resolve()

    eval_obj = _load_json(eval_path)
    data_list = _load_json(data_path)

    # Pair the model's eval record with the matching dataset item.
    rec = _find_eval_record(eval_obj, args.sample_id)
    item = _find_data_item(data_list, args.sample_id)

    pred_text = str(rec.get("generated_text", ""))
    gt_text = _extract_gt_answer(item)

    # Imported lazily: pulls in the project's eval stack only when actually run.
    from src.eval.metrics import parse_atlas_output

    pred_parsed = parse_atlas_output(pred_text)
    gt_parsed = parse_atlas_output(gt_text)

    pred_lanes = _lanes_as_arrays(pred_parsed)
    gt_lanes_all = _lanes_as_arrays(gt_parsed)
    raw_gt_lanes_all, raw_gt_path = _load_raw_openlane_gt_lanes(openlane_root, item)

    # num_gt from the eval record caps how many GT lanes are visualized so the
    # panel matches what the metric actually scored; 0 (or unparsable) keeps all.
    expected_gt_count = rec.get("num_gt", 0)
    try:
        expected_gt_count = int(expected_gt_count)
    except Exception:
        expected_gt_count = 0
    gt_lanes = _select_closest_lanes(gt_lanes_all, expected_gt_count)
    raw_gt_lanes = _select_closest_lanes(raw_gt_lanes_all, expected_gt_count)

    xlim = (float(args.xlim[0]), float(args.xlim[1]))
    ylim = (float(args.ylim[0]), float(args.ylim[1]))

    # Mean centroid shift between prediction and GT point clouds (reported in the
    # text panel); only computable when both sides have lanes.
    delta = None
    if pred_lanes and gt_lanes:
        p_all = np.concatenate(pred_lanes, axis=0)
        g_all = np.concatenate(gt_lanes, axis=0)
        delta = (p_all.mean(axis=0) - g_all.mean(axis=0)).tolist()

    imgs = _load_images(data_root, item.get("image_paths", []) or [])
    if len(imgs) != 6:
        raise RuntimeError(f"expected 6 images, got {len(imgs)}")

    # Reorder from the data_json camera ordering into the paper's 2x3 layout.
    order = [2, 0, 1, 4, 3, 5]
    cam_titles = ["FRONT_LEFT", "FRONT", "FRONT_RIGHT", "BACK_LEFT", "BACK", "BACK_RIGHT"]
    imgs = [imgs[i] for i in order]

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec

    fig = plt.figure(figsize=(15.5, 9.4), dpi=args.dpi)
    fig.suptitle("Atlas Lane Diagnostics — Control Points, Densified Curves, and Raw GT", fontsize=14, y=0.985)

    # 3x4 grid: rows 0-1 hold the six camera tiles (cols 0-2) plus the two BEV
    # panels (col 3); row 2 holds the overlay (cols 0-2) and the text panel (col 3).
    gs = GridSpec(
        3,
        4,
        figure=fig,
        width_ratios=[1.0, 1.0, 1.0, 1.10],
        height_ratios=[1.0, 1.0, 1.08],
        wspace=0.24,
        hspace=0.33,
    )

    for i in range(6):
        r = 0 if i < 3 else 1
        c = i % 3
        ax = fig.add_subplot(gs[r, c])
        ax.imshow(imgs[i])
        ax.set_title(cam_titles[i], fontsize=9)
        ax.axis("off")

    ax_gt = fig.add_subplot(gs[0, 3])
    _plot_colorful_panel(
        ax_gt,
        gt_lanes,
        title=f"GT: 4pt densified ({len(gt_lanes)} lanes)",
        xlim=xlim,
        ylim=ylim,
        raw_reference_lanes=raw_gt_lanes or None,
        dense_points=int(args.dense_points),
    )

    ax_pr = fig.add_subplot(gs[1, 3])
    _plot_colorful_panel(
        ax_pr,
        pred_lanes,
        title=f"Pred: 4pt densified ({len(pred_lanes)} lanes)",
        xlim=xlim,
        ylim=ylim,
        raw_reference_lanes=None,
        dense_points=int(args.dense_points),
    )

    ax_ov = fig.add_subplot(gs[2, 0:3])
    _plot_overlay_panel(
        ax_ov,
        gt_lanes=gt_lanes,
        pred_lanes=pred_lanes,
        raw_gt_lanes=raw_gt_lanes or None,
        xlim=xlim,
        ylim=ylim,
        dense_points=int(args.dense_points),
    )

    # Text panel: a monospace summary of counts, metrics, and bounds.
    ax_txt = fig.add_subplot(gs[2, 3])
    ax_txt.axis("off")

    lane_metrics = eval_obj.get("metrics", {}).get("lane", {})
    model_name = Path(str(eval_obj.get("args", {}).get("checkpoint", ""))).parent.name or "atlas"
    lines = [
        "Atlas Lane Diagnostics",
        f"Model: {model_name}",
        f"Sample: {args.sample_id}",
        "",
        f"GT 4pt lanes: {len(gt_lanes)} vis / {len(gt_lanes_all)} full, {sum(len(x) for x in gt_lanes)} ctrl pts",
        f"Pred 4pt lanes: {len(pred_lanes)} lanes, {sum(len(x) for x in pred_lanes)} ctrl pts",
        f"Raw OpenLane GT: {len(raw_gt_lanes)} vis / {len(raw_gt_lanes_all)} full, {sum(len(x) for x in raw_gt_lanes)} pts",
        f"Curve densifier: {'cubic spline' if SCIPY_AVAILABLE else 'linear'} ({args.dense_points} pts/lane)",
        "",
        "Interpretation:",
        " - white circles: GT 4pt control points",
        " - red x: pred 4pt control points",
        " - gray dashed: raw OpenLane GT centerlines",
        " - green/red solid: densified 4pt curves",
        f" - vis GT subset matches eval num_gt={expected_gt_count}" if expected_gt_count > 0 else " - vis GT subset uses full GT",
        "",
        "Eval metrics (from eval_json):",
    ]
    for k in ("lane_f1", "method", "num_samples"):
        if k in lane_metrics:
            lines.append(f" {k}: {lane_metrics[k]}")

    lines.append("")
    if delta is not None:
        lines.append("Mean shift (Pred - GT 4pt):")
        lines.append(f" dx={delta[0]:+.3f} m, dy={delta[1]:+.3f} m")
        lines.append("")

    gb = _bounds_xy(gt_lanes)
    pb = _bounds_xy(pred_lanes)
    rb = _bounds_xy(raw_gt_lanes)
    if gb is not None:
        lines.append(f"GT4 bounds: x[{gb[0][0]:+.1f},{gb[1][0]:+.1f}] y[{gb[0][1]:+.1f},{gb[1][1]:+.1f}]")
    if pb is not None:
        lines.append(f"Pred bounds: x[{pb[0][0]:+.1f},{pb[1][0]:+.1f}] y[{pb[0][1]:+.1f},{pb[1][1]:+.1f}]")
    if rb is not None:
        lines.append(f"Raw bounds: x[{rb[0][0]:+.1f},{rb[1][0]:+.1f}] y[{rb[0][1]:+.1f},{rb[1][1]:+.1f}]")
    if raw_gt_path is not None:
        lines.append("")
        lines.append(f"Raw GT source: {raw_gt_path}")

    ax_txt.text(0.0, 1.0, "\n".join(lines), va="top", ha="left", fontsize=8, family="monospace")

    out_png.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_png, bbox_inches="tight")
    plt.close(fig)

    print(f"[saved] {out_png}")
497
+
498
+
499
+ if __name__ == "__main__":
500
+ main()
scripts/vis_atlas_planning_qualitative.py ADDED
@@ -0,0 +1,800 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Paper-style qualitative planning visualization (Figure-4-like):
4
+
5
+ - Left: 6-camera mosaic (2x3)
6
+ - Right: BEV panel with planned trajectory
7
+
8
+ Optionally overlays the planned trajectory onto CAM_FRONT using *fixed* nuScenes
9
+ camera calibration (loaded from v1.0-trainval sensor/calibrated_sensor tables).
10
+ This avoids scanning the 1.3GB sample_data.json.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import math
18
+ import mmap
19
+ import pickle
20
+ import sys
21
+ from pathlib import Path
22
+ from typing import Dict, Iterable, List, Optional, Tuple
23
+
24
+ import numpy as np
25
+
26
+ _REPO_ROOT = Path(__file__).resolve().parent.parent
27
+ if str(_REPO_ROOT) not in sys.path:
28
+ sys.path.insert(0, str(_REPO_ROOT))
29
+
30
+
31
+ CAM_ORDER_DATAJSON = [
32
+ "CAM_FRONT",
33
+ "CAM_FRONT_RIGHT",
34
+ "CAM_FRONT_LEFT",
35
+ "CAM_BACK",
36
+ "CAM_BACK_LEFT",
37
+ "CAM_BACK_RIGHT",
38
+ ]
39
+
40
+ CAM_ORDER_PAPER = [
41
+ "CAM_FRONT_LEFT",
42
+ "CAM_FRONT",
43
+ "CAM_FRONT_RIGHT",
44
+ "CAM_BACK_LEFT",
45
+ "CAM_BACK",
46
+ "CAM_BACK_RIGHT",
47
+ ]
48
+
49
+ _IDX_REORDER = [2, 0, 1, 4, 3, 5] # data_json -> paper layout
50
+
51
+
52
+ def _load_json(path: Path):
53
+ with path.open("r", encoding="utf-8") as f:
54
+ return json.load(f)
55
+
56
+
57
+ def _load_pickle(path: Path):
58
+ with path.open("rb") as f:
59
+ return pickle.load(f)
60
+
61
+
62
def _extract_results_list_for_token(mm: "mmap.mmap", token: str) -> List[Dict]:
    """
    Extract `results[token]` list from a nuScenes detection results JSON mmap.

    The file is a minified JSON like:
        {"meta": {...}, "results": {"<token>": [ {...}, ... ], ...}}

    Works by locating the key byte pattern, then scanning forward with a small
    bracket/string state machine to find the matching closing ']' of the value
    array, and json-parsing only that slice. Avoids loading the whole (GB-scale)
    file into memory.
    """
    # NOTE: assumes minified JSON — the pattern has no space between the quoted
    # key and the ':' (a pretty-printed file would not match).
    pat = (f"\"{token}\":").encode("utf-8")
    idx = mm.find(pat)
    if idx < 0:
        return []
    j = idx + len(pat)
    # Skip whitespace
    while j < len(mm) and mm[j] in b" \t\r\n":
        j += 1
    # Value must be a JSON array, otherwise bail out.
    if j >= len(mm) or mm[j : j + 1] != b"[":
        return []

    start = j
    depth = 0          # current [ ] nesting depth
    in_str = False     # inside a JSON string literal
    esc = False        # previous char was a backslash inside a string
    k = start
    end = None
    while k < len(mm):
        c = mm[k]
        if in_str:
            # Inside a string: only track escapes and the closing quote so
            # brackets within strings are ignored.
            if esc:
                esc = False
            elif c == 0x5C:  # backslash
                esc = True
            elif c == 0x22:  # quote
                in_str = False
        else:
            if c == 0x22:
                in_str = True
            elif c == 0x5B:  # [
                depth += 1
            elif c == 0x5D:  # ]
                depth -= 1
                if depth == 0:
                    # Matched the array that opened at `start`.
                    end = k + 1
                    break
        k += 1

    if end is None:
        return []
    try:
        return json.loads(mm[start:end].decode("utf-8"))
    except Exception:
        # Malformed slice (e.g. truncated file) — treat as "token absent".
        return []
113
+
114
+
115
def _load_ego_pose_map(nuscenes_root: Path) -> Dict[str, Tuple[np.ndarray, np.ndarray]]:
    """
    Map: sample_token -> (R_ego2global(3x3), t_ego2global(3,))

    Built from nuscenes_infos_val.pkl; entries without a token or with a
    malformed quaternion are skipped.
    """
    pkl = nuscenes_root / "nuscenes_infos_val.pkl"
    if not pkl.exists():
        raise FileNotFoundError(f"Missing {pkl} (required to transform detector global boxes to ego frame).")
    payload = _load_pickle(pkl)
    infos = payload["infos"] if isinstance(payload, dict) and "infos" in payload else payload
    poses: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
    for info in infos:
        token = str(info.get("token", ""))
        if not token:
            continue
        quat = info.get("ego2global_rotation", [1.0, 0.0, 0.0, 0.0])
        if not (isinstance(quat, (list, tuple)) and len(quat) == 4):
            continue
        trans = np.asarray(info.get("ego2global_translation", [0.0, 0.0, 0.0]), dtype=np.float64).reshape(3)
        rot = _quat_to_rotmat(float(quat[0]), float(quat[1]), float(quat[2]), float(quat[3]))
        poses[token] = (rot, trans)
    return poses
136
+
137
+
138
+ def _quat_to_rotmat(qw: float, qx: float, qy: float, qz: float) -> np.ndarray:
139
+ # nuScenes quaternions are in (w, x, y, z)
140
+ n = math.sqrt(qw * qw + qx * qx + qy * qy + qz * qz)
141
+ if n < 1e-12:
142
+ return np.eye(3, dtype=np.float64)
143
+ qw, qx, qy, qz = qw / n, qx / n, qy / n, qz / n
144
+ # Standard quaternion -> rotation matrix
145
+ xx, yy, zz = qx * qx, qy * qy, qz * qz
146
+ xy, xz, yz = qx * qy, qx * qz, qy * qz
147
+ wx, wy, wz = qw * qx, qw * qy, qw * qz
148
+ return np.array(
149
+ [
150
+ [1.0 - 2.0 * (yy + zz), 2.0 * (xy - wz), 2.0 * (xz + wy)],
151
+ [2.0 * (xy + wz), 1.0 - 2.0 * (xx + zz), 2.0 * (yz - wx)],
152
+ [2.0 * (xz - wy), 2.0 * (yz + wx), 1.0 - 2.0 * (xx + yy)],
153
+ ],
154
+ dtype=np.float64,
155
+ )
156
+
157
+
158
def _load_fixed_cam_front_calib(nuscenes_root: Path) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Returns (R_c2e, t_c2e, K) for CAM_FRONT.
    Uses the first calibrated_sensor record matching CAM_FRONT.
    """
    meta_root = nuscenes_root / "v1.0-trainval"
    sensor = _load_json(meta_root / "sensor.json")
    calib = _load_json(meta_root / "calibrated_sensor.json")

    # sensor.json is a list of dicts; take the first CAM_FRONT entry.
    sensor_token = next((rec.get("token") for rec in sensor if rec.get("channel") == "CAM_FRONT"), None)
    if sensor_token is None:
        raise RuntimeError("CAM_FRONT not found in sensor.json")

    calib_rec = next((rec for rec in calib if rec.get("sensor_token") == sensor_token), None)
    if calib_rec is None:
        raise RuntimeError("No calibrated_sensor record found for CAM_FRONT")

    t = np.asarray(calib_rec.get("translation", [0.0, 0.0, 0.0]), dtype=np.float64).reshape(3)
    q = calib_rec.get("rotation", [1.0, 0.0, 0.0, 0.0])
    if not (isinstance(q, (list, tuple)) and len(q) == 4):
        raise RuntimeError(f"Unexpected CAM_FRONT rotation quaternion: {q}")
    R = _quat_to_rotmat(float(q[0]), float(q[1]), float(q[2]), float(q[3]))

    K = np.asarray(calib_rec.get("camera_intrinsic", np.eye(3).tolist()), dtype=np.float64)
    if K.shape != (3, 3):
        raise RuntimeError(f"Unexpected CAM_FRONT camera_intrinsic shape: {K.shape}")
    return R, t, K
194
+
195
+
196
def _load_fixed_cam_calibs(nuscenes_root: Path) -> Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """
    Load fixed (R_c2e, t_c2e, K) for all nuScenes cameras by channel name.
    """
    meta_root = nuscenes_root / "v1.0-trainval"
    sensors = _load_json(meta_root / "sensor.json")
    calibs = _load_json(meta_root / "calibrated_sensor.json")

    # channel -> sensor token (well-formed string entries only; last one wins).
    token_of_channel = {
        rec.get("channel"): rec.get("token")
        for rec in sensors
        if isinstance(rec.get("channel"), str) and isinstance(rec.get("token"), str)
    }
    # sensor token -> calibrated_sensor record (last one wins).
    calib_of_token = {
        rec.get("sensor_token"): rec
        for rec in calibs
        if isinstance(rec.get("sensor_token"), str)
    }

    result: Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarray]] = {}
    for channel, token in token_of_channel.items():
        rec = calib_of_token.get(token)
        if not rec or "camera_intrinsic" not in rec:
            # Non-camera sensors (no intrinsics) are skipped.
            continue
        q = rec.get("rotation", [1.0, 0.0, 0.0, 0.0])
        if not (isinstance(q, (list, tuple)) and len(q) == 4):
            continue
        K = np.asarray(rec.get("camera_intrinsic", np.eye(3).tolist()), dtype=np.float64)
        if K.shape != (3, 3):
            continue
        t = np.asarray(rec.get("translation", [0.0, 0.0, 0.0]), dtype=np.float64).reshape(3)
        R = _quat_to_rotmat(float(q[0]), float(q[1]), float(q[2]), float(q[3]))
        result[channel] = (R, t, K)
    return result
232
+
233
+
234
+ def _paper_xy_to_nuscenes_ego_xyz(x_right: float, y_fwd: float, z_up: float = 0.0) -> np.ndarray:
235
+ # nuScenes ego: x forward, y left, z up
236
+ x_fwd = float(y_fwd)
237
+ y_left = float(-x_right)
238
+ return np.array([x_fwd, y_left, float(z_up)], dtype=np.float64)
239
+
240
+
241
+ def _project_ego_points_to_cam(
242
+ pts_ego_xyz: np.ndarray, # (N,3) in nuScenes ego
243
+ R_c2e: np.ndarray,
244
+ t_c2e: np.ndarray,
245
+ K: np.ndarray,
246
+ ) -> np.ndarray:
247
+ """
248
+ Project nuScenes ego-frame 3D points to pixel coords in CAM_FRONT.
249
+ Returns (M,2) pixels for points with z_cam > 0.
250
+ """
251
+ # ego -> cam: p_cam = R_c2e^T (p_ego - t_c2e)
252
+ R_e2c = R_c2e.T
253
+ pts_cam = (R_e2c @ (pts_ego_xyz - t_c2e[None, :]).T).T # (N,3)
254
+ z = pts_cam[:, 2]
255
+ keep = z > 1e-3
256
+ pts_cam = pts_cam[keep]
257
+ if pts_cam.shape[0] == 0:
258
+ return np.zeros((0, 2), dtype=np.float64)
259
+ x = pts_cam[:, 0] / pts_cam[:, 2]
260
+ y = pts_cam[:, 1] / pts_cam[:, 2]
261
+ fx, fy = float(K[0, 0]), float(K[1, 1])
262
+ cx, cy = float(K[0, 2]), float(K[1, 2])
263
+ u = fx * x + cx
264
+ v = fy * y + cy
265
+ return np.stack([u, v], axis=1)
266
+
267
+
268
+ def _project_one_ego_point_to_cam(
269
+ pt_ego_xyz: np.ndarray, # (3,)
270
+ R_c2e: np.ndarray,
271
+ t_c2e: np.ndarray,
272
+ K: np.ndarray,
273
+ ) -> Optional[Tuple[float, float, float]]:
274
+ """Return (u, v, z_cam) or None if behind camera."""
275
+ pt_ego_xyz = np.asarray(pt_ego_xyz, dtype=np.float64).reshape(3)
276
+ R_e2c = R_c2e.T
277
+ pt_cam = R_e2c @ (pt_ego_xyz - t_c2e)
278
+ zc = float(pt_cam[2])
279
+ if zc <= 1e-3:
280
+ return None
281
+ fx, fy = float(K[0, 0]), float(K[1, 1])
282
+ cx, cy = float(K[0, 2]), float(K[1, 2])
283
+ u = fx * float(pt_cam[0] / zc) + cx
284
+ v = fy * float(pt_cam[1] / zc) + cy
285
+ return float(u), float(v), zc
286
+
287
+
288
def _draw_ego_bev(ax, *, color: str = "k"):
    """Draw the ego-vehicle footprint, a forward heading arrow, and an 'EGO' label on a BEV axis."""
    import matplotlib.patches as patches

    width, length = 1.85, 4.084  # footprint in meters, centered at the origin
    footprint = patches.Rectangle(
        (-width / 2.0, -length / 2.0),
        width,
        length,
        linewidth=1.2,
        edgecolor=color,
        facecolor="none",
        zorder=10,
    )
    ax.add_patch(footprint)
    # Heading arrow along +Y (forward in paper BEV coords).
    ax.arrow(0.0, 0.0, 0.0, length * 0.8, head_width=0.6, head_length=0.8, fc=color, ec=color, zorder=11)
    ax.text(0.2, -0.6, "EGO", color=color, fontsize=8, zorder=12)
304
+
305
+
306
+ def _box_corners_xy(cx: float, cy: float, w: float, l: float, yaw: float) -> np.ndarray:
307
+ """
308
+ Oriented rectangle corners in XY (paper coords).
309
+ yaw is treated as radians in the same XY frame (best-effort).
310
+
311
+ IMPORTANT: In our planning eval JSON, `yaw` behaves like a standard heading
312
+ angle measured from +X (right) axis (yaw-from-x). So:
313
+ - yaw = 0 => vehicle length points to +X (right)
314
+ - yaw = +pi/2 => vehicle length points to +Y (forward)
315
+ """
316
+ c, s = math.cos(yaw), math.sin(yaw)
317
+ center = np.array([cx, cy], dtype=np.float64)
318
+
319
+ # Length axis (heading) and width axis (perpendicular), yaw-from-x
320
+ d_len = np.array([c, s], dtype=np.float64) * (l / 2.0)
321
+ d_wid = np.array([-s, c], dtype=np.float64) * (w / 2.0)
322
+
323
+ corners = np.stack(
324
+ [
325
+ center + d_len + d_wid,
326
+ center + d_len - d_wid,
327
+ center - d_len - d_wid,
328
+ center - d_len + d_wid,
329
+ ],
330
+ axis=0,
331
+ )
332
+ return corners
333
+
334
+
335
+ def _title_case_command(cmd: str) -> str:
336
+ c = (cmd or "").strip().lower()
337
+ if c == "turn left":
338
+ return "Turn Left"
339
+ if c == "turn right":
340
+ return "Turn Right"
341
+ if c == "go straight":
342
+ return "Go Straight"
343
+ return cmd
344
+
345
+
346
+ def _short_cat(cat: str) -> str:
347
+ c = (cat or "").strip()
348
+ if not c:
349
+ return "obj"
350
+ tail = c.split(".")[-1]
351
+ mapping = {
352
+ "car": "car",
353
+ "truck": "truck",
354
+ "trailer": "trailer",
355
+ "bus": "bus",
356
+ "construction": "cveh",
357
+ "construction_vehicle": "cveh",
358
+ "pedestrian": "ped",
359
+ "trafficcone": "cone",
360
+ "traffic_cone": "cone",
361
+ "barrier": "barrier",
362
+ "motorcycle": "moto",
363
+ "bicycle": "bike",
364
+ }
365
+ return mapping.get(tail, tail[:10])
366
+
367
+
368
+ def _select_default_samples(data_items: List[Dict]) -> List[str]:
369
+ # prefer (turn right, go straight) like the paper figure
370
+ by_cmd: Dict[str, str] = {}
371
+ for it in data_items:
372
+ cmd = (it.get("ego_motion", {}) or {}).get("command")
373
+ if cmd and cmd not in by_cmd and it.get("id"):
374
+ by_cmd[str(cmd)] = str(it["id"])
375
+ out = []
376
+ if "turn right" in by_cmd:
377
+ out.append(by_cmd["turn right"])
378
+ if "go straight" in by_cmd:
379
+ out.append(by_cmd["go straight"])
380
+ if not out:
381
+ out = [str(it.get("id")) for it in data_items[:2] if it.get("id")]
382
+ return out[:2]
383
+
384
+
385
def parse_args():
    """CLI for the paper-style planning qualitative figure (6-camera mosaic + BEV)."""
    ap = argparse.ArgumentParser()
    # Inputs: model predictions, the eval data items, and the nuScenes root.
    ap.add_argument("--eval_json", type=str, default="work_dirs/eval_nocap_ep2_plan50.json")
    ap.add_argument("--data_json", type=str, default="data/_eval_ep2_plan50.json")
    ap.add_argument("--data_root", type=str, default="/home/guoyuanbo/autodl-tmp/data/nuscenes")
    ap.add_argument("--out_png", type=str, default="work_dirs/atlas_planning_qualitative.png")
    ap.add_argument("--sample_ids", type=str, nargs="*", default=None, help="Provide 1-2 sample ids; default picks turn right & go straight.")
    # BEV window in paper coords (x right, y forward), meters.
    ap.add_argument("--bev_xlim", type=float, nargs=2, default=[-30.0, 30.0])
    ap.add_argument("--bev_ylim", type=float, nargs=2, default=[-10.0, 55.0])
    ap.add_argument("--dpi", type=int, default=200)
    # Paired on/off flags: store_true defaults to True, so only the no_* variant changes anything.
    ap.add_argument("--draw_gt_boxes", action="store_true", default=True)
    ap.add_argument("--no_draw_gt_boxes", action="store_false", dest="draw_gt_boxes")
    ap.add_argument("--overlay_on_front_cam", action="store_true", default=True)
    ap.add_argument("--no_overlay_on_front_cam", action="store_false", dest="overlay_on_front_cam")
    ap.add_argument("--highlight_front_visible_boxes", action="store_true", default=True)
    ap.add_argument("--no_highlight_front_visible_boxes", action="store_false", dest="highlight_front_visible_boxes")
    ap.add_argument("--max_front_labels", type=int, default=8, help="Max GT box labels shown on CAM_FRONT and BEV.")
    ap.add_argument("--max_cam_labels", type=int, default=4, help="Max GT labels per camera view (all 6 views).")
    ap.add_argument("--bev_only_visible", action="store_true", default=True, help="If set, BEV draws only FRONT* visible GT boxes.")
    ap.add_argument("--bev_show_all", action="store_false", dest="bev_only_visible", help="Draw all GT boxes faintly in BEV (plus highlighted).")
    # Optional detector overlay (StreamPETR results in nuScenes submission format).
    ap.add_argument(
        "--det_results_json",
        type=str,
        default="external/StreamPETR/val/work_dirs/streampetr_atlas_aligned/Tue_Feb_10_23_13_58_2026/pts_bbox/results_nusc.json",
        help="nuScenes detection results JSON (StreamPETR).",
    )
    ap.add_argument("--draw_det_pred_boxes", action="store_true", default=True)
    ap.add_argument("--no_draw_det_pred_boxes", action="store_false", dest="draw_det_pred_boxes")
    ap.add_argument("--det_score_thresh", type=float, default=0.3)
    ap.add_argument("--det_max_boxes", type=int, default=30)
    ap.add_argument("--det_label_topk", type=int, default=6, help="Label top-k detector boxes in BEV.")
    return ap.parse_args()
417
+
418
+
419
def main():
    """Render the paper-style planning figure.

    Per selected sample (one row each, max two rows): a 2x3 camera mosaic on
    the left with the predicted/GT trajectory projected onto CAM_FRONT, and a
    BEV panel on the right showing trajectories, GT boxes, and (optionally)
    StreamPETR detector boxes transformed from global to ego/paper coords.
    """
    args = parse_args()
    repo = _REPO_ROOT
    eval_path = (repo / args.eval_json).resolve()
    data_path = (repo / args.data_json).resolve()
    nuscenes_root = Path(args.data_root).resolve()
    out_png = (repo / args.out_png).resolve()

    eval_obj = _load_json(eval_path)
    data_items = _load_json(data_path)

    # Build lookup maps
    pred_text_by_id: Dict[str, str] = {}
    for rec in eval_obj.get("predictions", []):
        sid = str(rec.get("sample_id", ""))
        if not sid:
            continue
        pred_text_by_id[sid] = str(rec.get("generated_text", ""))

    item_by_id: Dict[str, Dict] = {str(it.get("id")): it for it in data_items if it.get("id")}

    sample_ids = list(args.sample_ids) if args.sample_ids else _select_default_samples(data_items)
    sample_ids = [sid for sid in sample_ids if sid in item_by_id]
    if not sample_ids:
        raise RuntimeError("No valid sample_ids found. Provide --sample_ids explicitly.")
    if len(sample_ids) > 2:
        # Figure layout supports at most two rows.
        sample_ids = sample_ids[:2]

    # Calibration for projecting onto CAM_FRONT
    cam_calibs: Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarray]] = {}
    if args.overlay_on_front_cam:
        cam_calibs = _load_fixed_cam_calibs(nuscenes_root)

    from src.eval.metrics import parse_planning_output

    # Detector predictions (StreamPETR nuScenes results) are in GLOBAL coords.
    # We need ego pose to transform them into ego/paper coords for BEV plotting.
    det_mm = None
    det_f = None
    ego_pose_map = None
    det_results_path = (repo / args.det_results_json).resolve()
    if args.draw_det_pred_boxes and det_results_path.exists():
        try:
            ego_pose_map = _load_ego_pose_map(nuscenes_root)
            # mmap the (potentially huge) results JSON instead of json-loading it whole.
            det_f = det_results_path.open("rb")
            det_mm = mmap.mmap(det_f.fileno(), 0, access=mmap.ACCESS_READ)
        except Exception:
            # Best effort: a missing/unreadable results file just disables the overlay.
            det_mm = None
            if det_f is not None:
                det_f.close()
                det_f = None

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec

    nrows = len(sample_ids)
    fig = plt.figure(figsize=(12.5, 6.5 * nrows), dpi=args.dpi)
    fig.suptitle("Atlas Planning — 6-Camera + BEV (Paper-style)", fontsize=16, y=0.99)

    outer = GridSpec(nrows, 2, figure=fig, width_ratios=[3.2, 1.4], wspace=0.10, hspace=0.18)

    from PIL import Image

    for r, sid in enumerate(sample_ids):
        item = item_by_id[sid]
        pred_text = pred_text_by_id.get(sid, "")
        plan = parse_planning_output(pred_text) if pred_text else None
        pred_wps = (plan or {}).get("waypoints", []) if plan else []
        gt_wps = (item.get("ego_motion", {}) or {}).get("waypoints", []) or []
        cmd = (item.get("ego_motion", {}) or {}).get("command", "")

        # Load images
        rel_paths: List[str] = list(item.get("image_paths", []) or [])
        if len(rel_paths) != 6:
            raise RuntimeError(f"sample {sid}: expected 6 image_paths, got {len(rel_paths)}")
        imgs = []
        for rp in rel_paths:
            p = Path(rp)
            if not p.is_absolute():
                p = nuscenes_root / rp
            imgs.append(Image.open(p).convert("RGB"))
        # Permute dataset image order into the paper panel order (CAM_ORDER_PAPER).
        imgs = [imgs[i] for i in _IDX_REORDER]
        front_w, front_h = imgs[1].size  # CAM_FRONT in paper order

        # Left: 2x3 image mosaic
        left = GridSpecFromSubplotSpec(2, 3, subplot_spec=outer[r, 0], wspace=0.02, hspace=0.06)
        ax_imgs = []
        for i in range(6):
            ax = fig.add_subplot(left[i // 3, i % 3])
            ax.imshow(imgs[i])
            # Lock image axis limits so later overlays don't autoscale-shrink the image.
            w_i, h_i = imgs[i].size
            ax.set_xlim(0, w_i)
            ax.set_ylim(h_i, 0)
            ax.set_title(CAM_ORDER_PAPER[i], fontsize=9)
            ax.axis("off")
            ax_imgs.append(ax)

        # Collect GT boxes that are visible in each FRONT* camera (by projected centers).
        boxes = item.get("gt_boxes_3d", []) or []

        def _visible_for_channel(channel: str, img_w: int, img_h: int):
            # Returns (range, box_idx, category, x_r, y_f, z_u, u, v) tuples, nearest first.
            if not (args.highlight_front_visible_boxes and args.overlay_on_front_cam):
                return []
            if channel not in cam_calibs:
                return []
            R_c2e, t_c2e, K = cam_calibs[channel]
            out = []
            for bi, b in enumerate(boxes):
                if not isinstance(b, dict) or "world_coords" not in b:
                    continue
                wc = b.get("world_coords", [0.0, 0.0, 0.0])
                if not (isinstance(wc, (list, tuple)) and len(wc) >= 3):
                    continue
                x_r, y_f, z_u = float(wc[0]), float(wc[1]), float(wc[2])
                pt_ego = _paper_xy_to_nuscenes_ego_xyz(x_r, y_f, z_u)
                uv = _project_one_ego_point_to_cam(pt_ego, R_c2e, t_c2e, K)
                if uv is None:
                    continue
                u, v, _zc = uv
                if 0.0 <= u <= float(img_w) and 0.0 <= v <= float(img_h):
                    d = float(math.hypot(x_r, y_f))
                    out.append((d, bi, str(b.get("category", "")), x_r, y_f, z_u, u, v))
            out.sort(key=lambda t: t[0])
            return out

        vis_by_ch = {
            "CAM_FRONT_LEFT": _visible_for_channel("CAM_FRONT_LEFT", imgs[0].size[0], imgs[0].size[1]),
            "CAM_FRONT": _visible_for_channel("CAM_FRONT", imgs[1].size[0], imgs[1].size[1]),
            "CAM_FRONT_RIGHT": _visible_for_channel("CAM_FRONT_RIGHT", imgs[2].size[0], imgs[2].size[1]),
            "CAM_BACK_LEFT": _visible_for_channel("CAM_BACK_LEFT", imgs[3].size[0], imgs[3].size[1]),
            "CAM_BACK": _visible_for_channel("CAM_BACK", imgs[4].size[0], imgs[4].size[1]),
            "CAM_BACK_RIGHT": _visible_for_channel("CAM_BACK_RIGHT", imgs[5].size[0], imgs[5].size[1]),
        }
        # Union over cameras with per-box de-duplication (first camera hit wins).
        visible_union_all = []
        if any(vis_by_ch.get(ch) for ch in vis_by_ch.keys()):
            seen = set()
            for ch in ("CAM_FRONT_LEFT", "CAM_FRONT", "CAM_FRONT_RIGHT", "CAM_BACK_LEFT", "CAM_BACK", "CAM_BACK_RIGHT"):
                for tup in vis_by_ch.get(ch, []):
                    bi = tup[1]
                    if bi in seen:
                        continue
                    seen.add(bi)
                    visible_union_all.append(tup)
        visible_union_all.sort(key=lambda t: t[0])
        visible_union = visible_union_all[: max(int(args.max_front_labels), 0)]

        # Overlay trajectory on CAM_FRONT (middle of top row in paper order)
        if args.overlay_on_front_cam and pred_wps and "CAM_FRONT" in cam_calibs:
            R_c2e, t_c2e, K = cam_calibs["CAM_FRONT"]
            ax_front = ax_imgs[1]
            # Pred
            pts_pred = np.array([_paper_xy_to_nuscenes_ego_xyz(x, y, 0.0) for x, y in pred_wps], dtype=np.float64)
            uv_pred = _project_ego_points_to_cam(pts_pred, R_c2e, t_c2e, K)
            if uv_pred.shape[0] >= 2:
                ax_front.plot(uv_pred[:, 0], uv_pred[:, 1], "-o", color="#00C2FF", linewidth=2.0, markersize=4.0, alpha=0.95)
            # GT (dashed)
            if gt_wps:
                pts_gt = np.array([_paper_xy_to_nuscenes_ego_xyz(x, y, 0.0) for x, y in gt_wps], dtype=np.float64)
                uv_gt = _project_ego_points_to_cam(pts_gt, R_c2e, t_c2e, K)
                if uv_gt.shape[0] >= 2:
                    ax_front.plot(uv_gt[:, 0], uv_gt[:, 1], "--o", color="white", linewidth=2.0, markersize=3.5, alpha=0.85)

        # Highlight GT centers on the three FRONT* cameras (helps validate BEV↔camera consistency).
        if args.highlight_front_visible_boxes and args.overlay_on_front_cam:
            cam_axes = {
                "CAM_FRONT_LEFT": ax_imgs[0],
                "CAM_FRONT": ax_imgs[1],
                "CAM_FRONT_RIGHT": ax_imgs[2],
                "CAM_BACK_LEFT": ax_imgs[3],
                "CAM_BACK": ax_imgs[4],
                "CAM_BACK_RIGHT": ax_imgs[5],
            }
            for ch, ax in cam_axes.items():
                vis = (vis_by_ch.get(ch, []) or [])[: max(int(args.max_cam_labels), 0)]
                if not vis:
                    continue
                for _d, _bi, cat, _x, _y, _z, u, v in vis:
                    ax.scatter([u], [v], s=26, c="#FF4D4D", edgecolors="black", linewidths=0.6, zorder=20)
                    ax.text(
                        u + 6.0,
                        v - 6.0,
                        _short_cat(cat),
                        color="white",
                        fontsize=8,
                        fontweight="bold",
                        ha="left",
                        va="bottom",
                        zorder=21,
                        bbox=dict(boxstyle="round,pad=0.15", facecolor="black", edgecolor="none", alpha=0.6),
                    )

        # Right: BEV
        ax_bev = fig.add_subplot(outer[r, 1])
        ax_bev.set_title("BEV", fontsize=12)
        ax_bev.set_xlabel("X (m) — right", fontsize=9)
        ax_bev.set_ylabel("Y (m) — forward", fontsize=9)
        ax_bev.set_aspect("equal", adjustable="box")
        ax_bev.grid(True, linewidth=0.4, alpha=0.35)
        ax_bev.set_xlim(float(args.bev_xlim[0]), float(args.bev_xlim[1]))
        ax_bev.set_ylim(float(args.bev_ylim[0]), float(args.bev_ylim[1]))

        _draw_ego_bev(ax_bev, color="black")

        # Draw GT boxes (paper-style context)
        if args.draw_gt_boxes:
            boxes = item.get("gt_boxes_3d", []) or []

            # When debugging camera↔BEV consistency, draw only the boxes that are visible
            # in the shown camera views to avoid confusing off-screen objects.
            if args.bev_only_visible and visible_union_all:
                boxes_to_draw = [boxes[bi] for _d, bi, _cat, *_rest in visible_union_all if 0 <= bi < len(boxes)]
            else:
                boxes_to_draw = boxes

            for b in boxes_to_draw:
                if not isinstance(b, dict) or "world_coords" not in b:
                    continue
                wc = b.get("world_coords", [0.0, 0.0, 0.0])
                cx, cy = float(wc[0]), float(wc[1])
                w = float(b.get("w", 1.8))
                l = float(b.get("l", 4.0))
                yaw = float(b.get("yaw", 0.0))
                corners = _box_corners_xy(cx, cy, w, l, yaw)
                poly = np.vstack([corners, corners[0:1]])
                ax_bev.plot(poly[:, 0], poly[:, 1], color="#FF5A5A", linewidth=0.8, alpha=0.50)

            # Re-draw FRONT* visible boxes thicker + label them in BEV.
            if args.highlight_front_visible_boxes and visible_union:
                for _d, bi, cat, x_r, y_f, _z, _u, _v in visible_union:
                    if bi >= len(boxes):
                        continue
                    b = boxes[bi]
                    wc = b.get("world_coords", [0.0, 0.0, 0.0])
                    cx, cy = float(wc[0]), float(wc[1])
                    w = float(b.get("w", 1.8))
                    l = float(b.get("l", 4.0))
                    yaw = float(b.get("yaw", 0.0))
                    corners = _box_corners_xy(cx, cy, w, l, yaw)
                    poly = np.vstack([corners, corners[0:1]])
                    ax_bev.plot(poly[:, 0], poly[:, 1], color="#FF2E2E", linewidth=1.8, alpha=0.95)
                    ax_bev.text(
                        cx,
                        cy,
                        _short_cat(cat),
                        fontsize=7.5,
                        color="#FF2E2E",
                        ha="center",
                        va="center",
                        zorder=30,
                        bbox=dict(boxstyle="round,pad=0.12", facecolor="white", edgecolor="none", alpha=0.55),
                    )

        # Draw detector predicted boxes (StreamPETR) in BEV.
        det_handle = None
        if args.draw_det_pred_boxes and det_mm is not None and ego_pose_map is not None:
            R_e2g_t = None
            t_e2g_t = None
            if sid in ego_pose_map:
                R_e2g_t, t_e2g_t = ego_pose_map[sid]
            if R_e2g_t is not None and t_e2g_t is not None:
                dets = _extract_results_list_for_token(det_mm, sid)
                # Filter & transform
                det_plot = []
                xlo, xhi = float(args.bev_xlim[0]), float(args.bev_xlim[1])
                ylo, yhi = float(args.bev_ylim[0]), float(args.bev_ylim[1])
                for d in dets:
                    try:
                        score = float(d.get("detection_score", 0.0))
                    except Exception:
                        score = 0.0
                    if score < float(args.det_score_thresh):
                        continue
                    tr = d.get("translation", None)
                    sz = d.get("size", None)
                    rot = d.get("rotation", None)
                    if not (isinstance(tr, (list, tuple)) and len(tr) >= 3):
                        continue
                    if not (isinstance(sz, (list, tuple)) and len(sz) >= 3):
                        continue
                    if not (isinstance(rot, (list, tuple)) and len(rot) == 4):
                        continue

                    p_g = np.asarray([float(tr[0]), float(tr[1]), float(tr[2])], dtype=np.float64)
                    # global -> ego
                    p_e = (R_e2g_t.T @ (p_g - t_e2g_t)).reshape(3)
                    # ego -> paper
                    x_p = float(-p_e[1])
                    y_p = float(p_e[0])
                    z_p = float(p_e[2])
                    if not (xlo <= x_p <= xhi and ylo <= y_p <= yhi):
                        continue

                    # orientation: global -> ego -> paper yaw
                    R_box_g = _quat_to_rotmat(float(rot[0]), float(rot[1]), float(rot[2]), float(rot[3]))
                    R_box_e = R_e2g_t.T @ R_box_g
                    yaw_e = float(math.atan2(R_box_e[1, 0], R_box_e[0, 0]))  # yaw-from-x in ego
                    yaw_p = yaw_e + math.pi / 2.0  # convert to paper yaw-from-x

                    # NOTE(review): size is read as (w, l) = (sz[0], sz[1]) — assumes the
                    # nuScenes (w, l, h) size convention; verify against the results file.
                    w = float(sz[0])
                    l = float(sz[1])
                    cat = str(d.get("detection_name", "obj"))
                    det_plot.append((score, cat, x_p, y_p, z_p, w, l, yaw_p))

                # Keep the highest-scoring boxes only.
                det_plot.sort(key=lambda t: -t[0])
                det_plot = det_plot[: max(int(args.det_max_boxes), 0)]

                det_color = "#00A65A"  # green
                for score, cat, x_p, y_p, _z, w, l, yaw_p in det_plot:
                    corners = _box_corners_xy(x_p, y_p, w, l, yaw_p)
                    poly = np.vstack([corners, corners[0:1]])
                    ax_bev.plot(poly[:, 0], poly[:, 1], color=det_color, linewidth=1.1, alpha=0.65, linestyle="--")

                # Label top-k predictions to show "front vehicles"
                for score, cat, x_p, y_p, _z, w, l, yaw_p in det_plot[: max(int(args.det_label_topk), 0)]:
                    ax_bev.text(
                        x_p,
                        y_p,
                        _short_cat(cat),
                        fontsize=7.2,
                        color=det_color,
                        ha="center",
                        va="center",
                        zorder=31,
                        bbox=dict(boxstyle="round,pad=0.12", facecolor="white", edgecolor="none", alpha=0.55),
                    )

                # Legend handle for detector boxes
                from matplotlib.lines import Line2D
                det_handle = Line2D([0], [0], color=det_color, lw=1.5, linestyle="--", label="Det Pred boxes")

        # Plot trajectories
        if gt_wps:
            gt_arr = np.asarray(gt_wps, dtype=np.float64)
            ax_bev.plot(gt_arr[:, 0], gt_arr[:, 1], "--o", color="black", linewidth=1.8, markersize=4.0, alpha=0.85, label="GT traj")
        if pred_wps:
            pr_arr = np.asarray(pred_wps, dtype=np.float64)
            ax_bev.plot(pr_arr[:, 0], pr_arr[:, 1], "-o", color="#00C2FF", linewidth=2.4, markersize=4.5, alpha=0.95, label="Pred traj")

        # Legend: add GT boxes handle to avoid confusion (boxes are NOT predicted here).
        handles, _labels = ax_bev.get_legend_handles_labels()
        if args.draw_gt_boxes:
            import matplotlib.patches as mpatches
            handles.append(mpatches.Patch(facecolor="none", edgecolor="#FF5A5A", linewidth=1.2, label="GT boxes"))
        if det_handle is not None:
            handles.append(det_handle)
        ax_bev.legend(handles=handles, loc="upper left", fontsize=9, frameon=True)

        # Command label (like paper: bottom-left)
        ax_bev.text(
            0.02,
            0.02,
            _title_case_command(str(cmd)),
            transform=ax_bev.transAxes,
            fontsize=12,
            fontweight="bold",
            ha="left",
            va="bottom",
            color="black",
            bbox=dict(boxstyle="round,pad=0.25", facecolor="white", edgecolor="none", alpha=0.65),
        )

    out_png.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_png, bbox_inches="tight")
    plt.close(fig)
    print(f"[saved] {out_png}")

    # Release the mmap'd detector results file, if it was opened.
    if det_mm is not None:
        try:
            det_mm.close()
        except Exception:
            pass
    if det_f is not None:
        try:
            det_f.close()
        except Exception:
            pass
796
+
797
+
798
# Script entry point.
if __name__ == "__main__":
    main()
800
+
scripts/vis_traffic_violation.py ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Figure 11 — Violation of traffic regulations.
4
+
5
+ This script focuses on the construction-blocking case shown in the paper:
6
+ the road ahead is fenced by barriers / traffic cones, but Atlas still outputs
7
+ a "go straight" trajectory that cuts through the blocked area.
8
+
9
+ Style:
10
+ - Left: 6 camera views, only CAM_FRONT overlays the trajectory
11
+ - Right: clean BEV with black road boundaries, red construction boxes,
12
+ green/blue trajectory, and "Go Straight" label
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import math
20
+ import pickle
21
+ import sys
22
+ from pathlib import Path
23
+ from typing import Dict, List, Optional, Tuple
24
+
25
+ import numpy as np
26
+
27
# Make the repository root importable when the script is run directly.
_REPO = Path(__file__).resolve().parent.parent
if str(_REPO) not in sys.path:
    sys.path.insert(0, str(_REPO))

# Camera panel layout used by the paper figure:
# top row FRONT_LEFT / FRONT / FRONT_RIGHT, bottom row BACK_LEFT / BACK / BACK_RIGHT.
CAM_ORDER_PAPER = [
    "CAM_FRONT_LEFT",
    "CAM_FRONT",
    "CAM_FRONT_RIGHT",
    "CAM_BACK_LEFT",
    "CAM_BACK",
    "CAM_BACK_RIGHT",
]
# Permutation applied to the dataset's image list to obtain CAM_ORDER_PAPER
# (presumably the loader's native camera order differs — TODO confirm against the dataset).
_IDX_REORDER = [2, 0, 1, 4, 3, 5]

# Rasterized nuScenes map image per log location (files live under <root>/maps/).
LOCATION_TO_MAP = {
    "singapore-onenorth": "53992ee3023e5494b90c316c183be829.png",
    "boston-seaport": "36092f0b03a857c6a3403e25b4b7aab3.png",
    "singapore-queenstown": "93406b464a165eaba6d9de76ca09f5da.png",
    "singapore-hollandvillage": "37819e65e09e5547b8a3ceaefba56bb2.png",
}
MAP_RES = 0.1  # meters per pixel
# Sample token for the paper's Figure 11 construction-blocking case.
DEFAULT_FIG11_SAMPLE = "856ccc626a4a4c0aaac1e62335050ac0"
49
+
50
+
51
+ def _load_json(path: Path):
52
+ with path.open("r", encoding="utf-8") as f:
53
+ return json.load(f)
54
+
55
+
56
+ def _load_pickle(path: Path):
57
+ with path.open("rb") as f:
58
+ return pickle.load(f)
59
+
60
+
61
+ def _quat_to_rotmat(qw, qx, qy, qz):
62
+ n = math.sqrt(qw * qw + qx * qx + qy * qy + qz * qz)
63
+ if n < 1e-12:
64
+ return np.eye(3, dtype=np.float64)
65
+ qw, qx, qy, qz = qw / n, qx / n, qy / n, qz / n
66
+ xx, yy, zz = qx * qx, qy * qy, qz * qz
67
+ xy, xz, yz = qx * qy, qx * qz, qy * qz
68
+ wx, wy, wz = qw * qx, qw * qy, qw * qz
69
+ return np.array(
70
+ [
71
+ [1 - 2 * (yy + zz), 2 * (xy - wz), 2 * (xz + wy)],
72
+ [2 * (xy + wz), 1 - 2 * (xx + zz), 2 * (yz - wx)],
73
+ [2 * (xz - wy), 2 * (yz + wx), 1 - 2 * (xx + yy)],
74
+ ],
75
+ dtype=np.float64,
76
+ )
77
+
78
+
79
+ def _quat_to_yaw(q):
80
+ w, x, y, z = [float(v) for v in q]
81
+ return math.atan2(2 * (w * z + x * y), 1 - 2 * (y * y + z * z))
82
+
83
+
84
+ def _paper_xy_to_ego(x_right: float, y_fwd: float, z_up: float = 0.0) -> np.ndarray:
85
+ return np.array([float(y_fwd), float(-x_right), float(z_up)], dtype=np.float64)
86
+
87
+
88
+ def _project_batch(pts_ego: np.ndarray, R_c2e: np.ndarray, t_c2e: np.ndarray, K: np.ndarray) -> np.ndarray:
89
+ R_e2c = R_c2e.T
90
+ pts_cam = (R_e2c @ (pts_ego - t_c2e[None, :]).T).T
91
+ z = pts_cam[:, 2]
92
+ keep = z > 1e-3
93
+ pts_cam = pts_cam[keep]
94
+ if pts_cam.shape[0] == 0:
95
+ return np.zeros((0, 2), dtype=np.float64)
96
+ x = pts_cam[:, 0] / pts_cam[:, 2]
97
+ y = pts_cam[:, 1] / pts_cam[:, 2]
98
+ fx, fy = float(K[0, 0]), float(K[1, 1])
99
+ cx, cy = float(K[0, 2]), float(K[1, 2])
100
+ return np.stack([fx * x + cx, fy * y + cy], axis=1)
101
+
102
+
103
def _load_cam_calibs(nusc_root: Path) -> Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Load fixed (R_c2e, t_c2e, K) per camera channel from the nuScenes metadata tables."""
    meta = nusc_root / "v1.0-trainval"
    sensors = _load_json(meta / "sensor.json")
    calibs = _load_json(meta / "calibrated_sensor.json")

    # channel -> sensor token (string entries only; last record wins).
    token_of_channel = {
        rec.get("channel"): rec.get("token")
        for rec in sensors
        if isinstance(rec.get("channel"), str) and isinstance(rec.get("token"), str)
    }
    # sensor token -> calibrated_sensor record (last record wins).
    calib_of_token = {
        rec.get("sensor_token"): rec
        for rec in calibs
        if isinstance(rec.get("sensor_token"), str)
    }

    result: Dict[str, Tuple[np.ndarray, np.ndarray, np.ndarray]] = {}
    for channel, token in token_of_channel.items():
        rec = calib_of_token.get(token)
        if not rec or "camera_intrinsic" not in rec:
            # Non-camera sensors carry no intrinsics; skip them.
            continue
        q = rec.get("rotation", [1, 0, 0, 0])
        if not (isinstance(q, (list, tuple)) and len(q) == 4):
            continue
        K = np.asarray(rec.get("camera_intrinsic", np.eye(3).tolist()), dtype=np.float64)
        if K.shape != (3, 3):
            continue
        t = np.asarray(rec.get("translation", [0, 0, 0]), dtype=np.float64).reshape(3)
        result[channel] = (_quat_to_rotmat(*[float(x) for x in q]), t, K)
    return result
136
+
137
+
138
def _load_ego_poses(nusc_root: Path) -> Dict[str, Tuple[np.ndarray, np.ndarray, float]]:
    """Map sample token -> (R_ego2global, t_ego2global, ego yaw) from the val info pickle."""
    obj = _load_pickle(nusc_root / "nuscenes_infos_val.pkl")
    # The pickle is either {"infos": [...]} or a bare list of info dicts.
    infos = obj["infos"] if isinstance(obj, dict) and "infos" in obj else obj
    poses: Dict[str, Tuple[np.ndarray, np.ndarray, float]] = {}
    for info in infos:
        token = str(info.get("token", ""))
        if not token:
            continue
        q = info.get("ego2global_rotation", [1, 0, 0, 0])
        if not (isinstance(q, (list, tuple)) and len(q) == 4):
            continue
        t = np.asarray(info.get("ego2global_translation", [0, 0, 0]), dtype=np.float64).reshape(3)
        poses[token] = (_quat_to_rotmat(*[float(x) for x in q]), t, _quat_to_yaw(q))
    return poses
152
+
153
+
154
def _get_sample_location(nusc_root: Path, sample_token: str) -> str:
    """Resolve the map location (e.g. 'boston-seaport') for a sample token; '' when unknown."""
    meta = nusc_root / "v1.0-trainval"
    samples = _load_json(meta / "sample.json")
    scenes = _load_json(meta / "scene.json")
    logs = _load_json(meta / "log.json")
    scene_by_token = {rec["token"]: rec for rec in scenes}
    log_by_token = {rec["token"]: rec for rec in logs}
    # sample -> scene -> log -> location.
    for sample in samples:
        if sample["token"] != sample_token:
            continue
        scene = scene_by_token.get(sample.get("scene_token", ""), {})
        log = log_by_token.get(scene.get("log_token", ""), {})
        return str(log.get("location", ""))
    return ""
166
+
167
+
168
+ def _smooth_map(bev_map: np.ndarray) -> np.ndarray:
169
+ acc = bev_map.copy()
170
+ acc += np.roll(bev_map, 1, axis=0)
171
+ acc += np.roll(bev_map, -1, axis=0)
172
+ acc += np.roll(bev_map, 1, axis=1)
173
+ acc += np.roll(bev_map, -1, axis=1)
174
+ acc += np.roll(np.roll(bev_map, 1, axis=0), 1, axis=1)
175
+ acc += np.roll(np.roll(bev_map, 1, axis=0), -1, axis=1)
176
+ acc += np.roll(np.roll(bev_map, -1, axis=0), 1, axis=1)
177
+ acc += np.roll(np.roll(bev_map, -1, axis=0), -1, axis=1)
178
+ return acc / 9.0
179
+
180
+
181
def _build_bev_map(
    nusc_root: Path,
    location: str,
    ego_xy: np.ndarray,
    ego_yaw: float,
    bev_xlim: Tuple[float, float],
    bev_ylim: Tuple[float, float],
    bev_res: float = 0.1,
) -> Optional[np.ndarray]:
    """
    Crop an ego-centered, ego-aligned window out of the rasterized nuScenes map.

    ego_xy / ego_yaw are the ego pose in GLOBAL map coordinates; bev_xlim /
    bev_ylim define the window in paper BEV coords (x right, y forward, in
    meters) at bev_res meters per output pixel. Returns a (ny, nx) float32
    array in [0, 1], smoothed with a 3x3 blur, or None when no map image is
    available for `location`.
    """
    import PIL.Image
    # Map rasters exceed PIL's default decompression-bomb limit; disable it.
    PIL.Image.MAX_IMAGE_PIXELS = None
    from PIL import Image

    map_fn = LOCATION_TO_MAP.get(location)
    if not map_fn:
        return None
    map_path = nusc_root / "maps" / map_fn
    if not map_path.exists():
        return None

    map_img = Image.open(map_path)
    mw, mh = map_img.size
    # The PNG's row 0 corresponds to the map's maximum y (image y is flipped).
    map_max_y = mh * MAP_RES
    map_arr = np.asarray(map_img, dtype=np.float32) / 255.0

    ex, ey = float(ego_xy[0]), float(ego_xy[1])
    c_yaw, s_yaw = math.cos(ego_yaw), math.sin(ego_yaw)

    x0, x1 = bev_xlim
    y0, y1 = bev_ylim
    nx = int((x1 - x0) / bev_res)
    ny = int((y1 - y0) / bev_res)
    bev = np.zeros((ny, nx), dtype=np.float32)

    # Output grid in paper coords; row 0 is the far (max y) edge.
    px_arr = np.linspace(x0, x1, nx)
    py_arr = np.linspace(y1, y0, ny)
    PX, PY = np.meshgrid(px_arr, py_arr)

    # Rotate paper-frame offsets by ego yaw into global map coordinates
    # (paper forward = ego heading, paper right = heading rotated -90°).
    GX = ex + PY * c_yaw + PX * s_yaw
    GY = ey + PY * s_yaw - PX * c_yaw

    # Nearest-neighbor sample from the map raster; out-of-bounds stays 0.
    MX = (GX / MAP_RES).astype(np.int32)
    MY = ((map_max_y - GY) / MAP_RES).astype(np.int32)
    valid = (MX >= 0) & (MX < mw) & (MY >= 0) & (MY < mh)
    bev[valid] = map_arr[MY[valid], MX[valid]]
    return _smooth_map(bev)
227
+
228
+
229
+ def _box_corners(cx: float, cy: float, w: float, l: float, yaw: float) -> np.ndarray:
230
+ c, s = math.cos(yaw), math.sin(yaw)
231
+ center = np.array([cx, cy], dtype=np.float64)
232
+ d_len = np.array([c, s], dtype=np.float64) * (l / 2.0)
233
+ d_wid = np.array([-s, c], dtype=np.float64) * (w / 2.0)
234
+ return np.stack(
235
+ [
236
+ center + d_len + d_wid,
237
+ center + d_len - d_wid,
238
+ center - d_len - d_wid,
239
+ center - d_len + d_wid,
240
+ ],
241
+ axis=0,
242
+ )
243
+
244
+
245
+ def _is_barrier_like(cat: str) -> bool:
246
+ return ("barrier" in cat) or ("traffic_cone" in cat) or ("trafficcone" in cat)
247
+
248
+
249
+ def _is_context_like(cat: str) -> bool:
250
+ return ("construction" in cat) or ("pedestrian" in cat) or ("vehicle.car" in cat)
251
+
252
+
253
+ def _title_cmd(cmd: str) -> str:
254
+ c = (cmd or "").strip().lower()
255
+ return {
256
+ "turn left": "Turn Left",
257
+ "turn right": "Turn Right",
258
+ "go straight": "Go Straight",
259
+ }.get(c, cmd)
260
+
261
+
262
def _blocked_score(item: Dict) -> float:
    """Heuristic score of how blocked a 'go straight' sample's corridor is.

    Non-straight samples are pushed to -1e9 so they never win selection.
    Weights: obstacles in the narrow center corridor count 8x, the wider
    frontal zone 3x, GT waypoints that cross the center corridor 4x,
    plus 1.5 per barrier and 1 per cone.
    NOTE(review): the corridor counters include boxes of every category,
    not only barrier-like ones — presumably intentional; confirm.
    """
    if str(item.get("route_command", "")).strip().lower() != "go straight":
        return -1e9
    boxes = item.get("gt_boxes_3d", []) or []
    waypoints = (item.get("ego_motion", {}) or {}).get("waypoints", []) or []
    front_hits = 0
    center_hits = 0
    barriers = 0
    cones = 0
    for box in boxes:
        coords = box.get("world_coords", [0, 0, 0])
        if not (isinstance(coords, (list, tuple)) and len(coords) >= 2):
            continue
        x, y = float(coords[0]), float(coords[1])
        category = str(box.get("category", ""))
        if _is_barrier_like(category):
            if "barrier" in category:
                barriers += 1
            else:
                cones += 1
        if 0 < y < 25 and abs(x) < 10:
            front_hits += 1
        if 2 < y < 20 and abs(x) < 4:
            center_hits += 1
    wp_center = sum(
        1 for wx, wy in waypoints if 2 < float(wy) < 20 and abs(float(wx)) < 4
    )
    return center_hits * 8 + front_hits * 3 + wp_center * 4 + barriers * 1.5 + cones
292
+
293
+
294
def parse_args():
    """Parse CLI options for the traffic-violation qualitative figure."""
    ap = argparse.ArgumentParser()
    # Evaluation output (JSON with a "predictions" list) and the planning dataset JSON.
    ap.add_argument("--eval_json", default="work_dirs/eval_final_plan100.json")
    ap.add_argument("--data_json", default="data/atlas_planning_val_uniad_command.json")
    # nuScenes dataset root (images, maps, metadata tables).
    ap.add_argument("--data_root", default="/home/guoyuanbo/autodl-tmp/data/nuscenes")
    # Explicit sample token; when omitted, main() auto-selects a blocked sample.
    ap.add_argument("--sample_id", default=None)
    ap.add_argument("--out_png", default="work_dirs/atlas_traffic_violation.png")
    ap.add_argument("--dpi", type=int, default=200)
    # BEV plot extent in paper-ego coordinates (meters): x lateral, y forward.
    ap.add_argument("--bev_xlim", type=float, nargs=2, default=[-7.5, 10.5])
    ap.add_argument("--bev_ylim", type=float, nargs=2, default=[-1.5, 30.5])
    return ap.parse_args()
305
+
306
+
307
def main():
    """Render the qualitative figure: six camera views plus a BEV panel for a
    planning sample whose 'go straight' route is blocked by construction.

    Sample selection: --sample_id when given; otherwise DEFAULT_FIG11_SAMPLE if
    it has a parseable prediction, else the highest ``_blocked_score`` candidate.
    """
    args = parse_args()
    repo = _REPO
    nusc_root = Path(args.data_root).resolve()
    out_png = (repo / args.out_png).resolve()

    # Load predictions and dataset items, then index both by sample id.
    eval_obj = _load_json((repo / args.eval_json).resolve())
    data_items = _load_json((repo / args.data_json).resolve())
    pred_by_id = {
        str(r.get("sample_id", "")): str(r.get("generated_text", ""))
        for r in eval_obj.get("predictions", [])
        if r.get("sample_id")
    }
    item_by_id = {str(it["id"]): it for it in data_items if it.get("id")}

    from src.eval.metrics import parse_planning_output

    # --- Sample selection ---
    sid = args.sample_id
    if not sid:
        if DEFAULT_FIG11_SAMPLE in item_by_id and parse_planning_output(pred_by_id.get(DEFAULT_FIG11_SAMPLE, "")):
            sid = DEFAULT_FIG11_SAMPLE
        else:
            # Rank samples by how strongly the straight-ahead corridor is blocked.
            candidates = []
            for item in data_items:
                item_sid = str(item.get("id", ""))
                pred_text = pred_by_id.get(item_sid, "")
                if not pred_text:
                    continue
                plan = parse_planning_output(pred_text)
                if not plan or not plan.get("waypoints"):
                    continue
                candidates.append((_blocked_score(item), item_sid))
            candidates.sort(reverse=True)
            if not candidates:
                raise RuntimeError("No valid construction-blocked sample found.")
            sid = candidates[0][1]
    if sid not in item_by_id:
        raise RuntimeError(f"sample_id {sid} not found in data_json")

    # --- Gather GT / prediction / pose data for the chosen sample ---
    item = item_by_id[sid]
    pred_text = pred_by_id.get(sid, "")
    plan = parse_planning_output(pred_text) if pred_text else None
    if not plan or not plan.get("waypoints"):
        raise RuntimeError(f"sample_id {sid} has no parseable planning output in {args.eval_json}")

    pred_wps = np.asarray(plan["waypoints"], dtype=np.float64)
    gt_wps = np.asarray((item.get("ego_motion", {}) or {}).get("waypoints", []) or [], dtype=np.float64)
    cmd = str(item.get("route_command", ""))
    boxes = item.get("gt_boxes_3d", []) or []
    location = _get_sample_location(nusc_root, sid)
    ego_poses = _load_ego_poses(nusc_root)
    ego_info = ego_poses.get(sid)
    if ego_info is None:
        raise RuntimeError(f"Missing ego pose for sample {sid}")
    ego_xy = ego_info[1][:2]
    ego_yaw = ego_info[2]

    print(f"[sample] {sid}")
    print(f"  location: {location}")
    print(f"  pred: {pred_text}")

    # Ego-aligned drivable-area raster for the BEV panel (None when unavailable).
    bev_map = _build_bev_map(
        nusc_root,
        location,
        ego_xy,
        ego_yaw,
        tuple(args.bev_xlim),
        tuple(args.bev_ylim),
        bev_res=0.1,
    )

    from PIL import Image

    # --- Load and reorder the six camera images to the paper's 2x3 layout ---
    rel_paths = list(item.get("image_paths", []) or [])
    if len(rel_paths) != 6:
        raise RuntimeError(f"Expected 6 images, got {len(rel_paths)}")
    imgs = []
    for rp in rel_paths:
        p = Path(rp)
        if not p.is_absolute():
            p = nusc_root / rp
        imgs.append(Image.open(p).convert("RGB"))
    imgs = [imgs[i] for i in _IDX_REORDER]

    cam_calibs = _load_cam_calibs(nusc_root)

    import matplotlib
    matplotlib.use("Agg")  # headless rendering
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec

    # Left: 2x3 camera grid; right: BEV panel.
    fig = plt.figure(figsize=(14.8, 4.1), dpi=args.dpi)
    gs = GridSpec(1, 2, figure=fig, width_ratios=[3.0, 1.1], wspace=0.03)
    gs_cam = GridSpecFromSubplotSpec(2, 3, subplot_spec=gs[0, 0], wspace=0.01, hspace=0.01)

    ax_imgs = []
    for i in range(6):
        ax = fig.add_subplot(gs_cam[i // 3, i % 3])
        ax.imshow(imgs[i])
        w_i, h_i = imgs[i].size
        ax.set_xlim(0, w_i)
        ax.set_ylim(h_i, 0)  # image coordinates: y grows downward
        ax.axis("off")
        # Camera-name tag in the top-left corner of each view.
        ax.text(
            6,
            14,
            CAM_ORDER_PAPER[i],
            color="white",
            fontsize=7,
            ha="left",
            va="top",
            bbox=dict(boxstyle="square,pad=0.12", facecolor="black", edgecolor="none", alpha=0.55),
        )
        ax_imgs.append(ax)

    # --- Project GT (green) and predicted (blue) waypoints onto the front view ---
    # NOTE(review): ax_imgs[1] is assumed to be CAM_FRONT after _IDX_REORDER;
    # confirm against CAM_ORDER_PAPER.
    if "CAM_FRONT" in cam_calibs:
        R_c2e, t_c2e, K = cam_calibs["CAM_FRONT"]
        ax_front = ax_imgs[1]
        if gt_wps.shape[0] >= 2:
            uv_gt = _project_batch(
                np.array([_paper_xy_to_ego(x, y) for x, y in gt_wps], dtype=np.float64),
                R_c2e,
                t_c2e,
                K,
            )
            if uv_gt.shape[0] >= 2:
                ax_front.plot(uv_gt[:, 0], uv_gt[:, 1], color="#34c759", linewidth=4.0, alpha=0.95, zorder=18)
        uv_pred = _project_batch(
            np.array([_paper_xy_to_ego(x, y) for x, y in pred_wps], dtype=np.float64),
            R_c2e,
            t_c2e,
            K,
        )
        if uv_pred.shape[0] >= 2:
            ax_front.plot(uv_pred[:, 0], uv_pred[:, 1], color="#1f5cff", linewidth=2.2, alpha=0.98, zorder=20)
            ax_front.scatter(uv_pred[:, 0], uv_pred[:, 1], color="#1f5cff", s=8, zorder=21)

    # --- BEV panel setup ---
    ax_bev = fig.add_subplot(gs[0, 1])
    ax_bev.set_facecolor("white")
    ax_bev.set_xlim(*args.bev_xlim)
    ax_bev.set_ylim(*args.bev_ylim)
    ax_bev.set_aspect("equal", adjustable="box")
    ax_bev.set_xticks([])
    ax_bev.set_yticks([])
    for spine in ax_bev.spines.values():
        spine.set_linewidth(1.3)
        spine.set_color("black")

    # Drivable-area boundary as a 0.5-level contour of the smoothed map.
    if bev_map is not None:
        ny, nx = bev_map.shape
        xs = np.linspace(args.bev_xlim[0], args.bev_xlim[1], nx)
        ys = np.linspace(args.bev_ylim[0], args.bev_ylim[1], ny)
        ax_bev.contour(xs, ys, bev_map, levels=[0.5], colors="black", linewidths=0.8, zorder=1)

    # Collect barrier-like boxes within (a 2 m margin of) the BEV extent.
    construction_boxes = []
    for b in boxes:
        cat = str(b.get("category", ""))
        wc = b.get("world_coords", [0, 0, 0])
        if not (isinstance(wc, (list, tuple)) and len(wc) >= 2):
            continue
        cx, cy = float(wc[0]), float(wc[1])
        if not (args.bev_xlim[0] - 2 <= cx <= args.bev_xlim[1] + 2 and args.bev_ylim[0] - 2 <= cy <= args.bev_ylim[1] + 2):
            continue
        box_item = (cx, cy, float(b.get("w", 1.8)), float(b.get("l", 4.0)), float(b.get("yaw", 0.0)))
        if _is_barrier_like(cat):
            construction_boxes.append(box_item)

    # Draw each obstacle as a closed polygon outline.
    for cx, cy, w, l, yaw in construction_boxes:
        poly = _box_corners(cx, cy, w, l, yaw)
        poly = np.vstack([poly, poly[0:1]])
        ax_bev.plot(poly[:, 0], poly[:, 1], color="#cf7a6b", linewidth=0.9, alpha=0.95, zorder=3)

    # GT trajectory (green) under the predicted trajectory (blue).
    if gt_wps.shape[0] >= 2:
        ax_bev.plot(gt_wps[:, 0], gt_wps[:, 1], color="#34c759", linewidth=3.4, alpha=0.95, zorder=5)
    ax_bev.plot(pred_wps[:, 0], pred_wps[:, 1], color="#1f5cff", linewidth=1.8, alpha=0.98, zorder=6)

    # Ego marker at the origin of the paper-ego frame.
    ego_rect = patches.Rectangle(
        (-0.45, -0.45),
        0.9,
        0.9,
        linewidth=1.4,
        edgecolor="#34c759",
        facecolor="none",
        zorder=7,
    )
    ax_bev.add_patch(ego_rect)

    ax_bev.text(0.03, 0.98, "BEV", transform=ax_bev.transAxes, ha="left", va="top", fontsize=9, fontweight="bold")
    ax_bev.text(
        0.03,
        0.03,
        _title_cmd(cmd),
        transform=ax_bev.transAxes,
        ha="left",
        va="bottom",
        fontsize=9,
        fontweight="bold",
    )

    out_png.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_png, bbox_inches="tight", facecolor="white", pad_inches=0.02)
    plt.close(fig)

    print(f"[saved] {out_png}")
    print(f"  construction boxes: {len(construction_boxes)}")
513
+
514
+
515
# Script entry point.
if __name__ == "__main__":
    main()
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (177 Bytes). View file
 
src/__pycache__/prompting.cpython-310.pyc ADDED
Binary file (10.6 kB). View file
 
src/__pycache__/prompting.cpython-38.pyc ADDED
Binary file (10.6 kB). View file
 
src/audit/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (240 Bytes). View file
 
src/audit/__pycache__/audit_utils.cpython-310.pyc ADDED
Binary file (998 Bytes). View file
 
src/dataset/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (316 Bytes). View file
 
src/dataset/__pycache__/atlas_dataset.cpython-310.pyc ADDED
Binary file (40 kB). View file
 
src/dataset/__pycache__/atlas_dataset.cpython-38.pyc ADDED
Binary file (40.6 kB). View file
 
src/dataset/__pycache__/scene_sampler.cpython-310.pyc ADDED
Binary file (3.88 kB). View file
 
src/dataset/__pycache__/scene_sampler.cpython-38.pyc ADDED
Binary file (3.89 kB). View file
 
src/dataset/atlas_dataset.py ADDED
@@ -0,0 +1,1416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+ import torch
5
+ import numpy as np
6
+ from torch.utils.data import Dataset
7
+ from PIL import Image
8
+ from typing import Dict, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import torchvision.transforms as transforms
11
# Prefer torchvision's transforms v2 API when available; fall back to the
# classic transforms module otherwise.
try:
    import torchvision.transforms.v2 as _v2
    _HAS_V2 = hasattr(_v2, "ToImage") and hasattr(_v2, "ToDtype")
except (ImportError, AttributeError):
    _HAS_V2 = False
if _HAS_V2:
    transforms = _v2

from src.prompting import (
    PLANNING_TABLE3_MODES,
    build_prompt,
    rewrite_planning_prompt_for_table3,
)
from src.audit.audit_utils import audit_enabled, audit_check

# Default numbers of detection / map query tokens exposed to the LLM prompt.
NUM_DETECTION_QUERIES = 256
NUM_MAP_QUERIES = 256
# Planning state discretization: values are clipped to this range and mapped
# onto PLANNING_NUM_BINS uniform bins (see planning_state_to_bin).
PLANNING_STATE_RANGE = (-50.0, 50.0)
PLANNING_NUM_BINS = 1000

# Canonical high-level route commands (see normalize_route_command).
AVAILABLE_COMMANDS = ["turn left", "turn right", "go straight"]

# Z range aligned with StreamPETR point_cloud_range [-5, 3]
Z_MIN, Z_MAX = -5.0, 3.0

# nuScenes 10-class detection category mapping
NUSCENES_CLASSES = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# Full nuScenes category names mapped to the 10 base detection classes
NUSCENES_CATEGORY_MAP = {
    # Base class names
    'car': 0, 'truck': 1, 'construction_vehicle': 2, 'bus': 3, 'trailer': 4,
    'barrier': 5, 'motorcycle': 6, 'bicycle': 7, 'pedestrian': 8, 'traffic_cone': 9,
    # Full nuScenes category names - vehicles
    'vehicle.car': 0, 'vehicle.truck': 1, 'vehicle.construction': 2,
    'vehicle.bus.bendy': 3, 'vehicle.bus.rigid': 3, 'vehicle.trailer': 4,
    'vehicle.motorcycle': 6, 'vehicle.bicycle': 7,
    # Full nuScenes category names - pedestrians
    'human.pedestrian.adult': 8, 'human.pedestrian.child': 8,
    'human.pedestrian.construction_worker': 8, 'human.pedestrian.police_officer': 8,
    'human.pedestrian.wheelchair': 8, 'human.pedestrian.stroller': 8,
    'human.pedestrian.personal_mobility': 8,
    # Full nuScenes category names - movable objects
    'movable_object.barrier': 5, 'movable_object.trafficcone': 9,
    'movable_object.traffic_cone': 9,
}
60
+
61
+
62
def nuscenes_to_paper_coords(x_nuscenes: float, y_nuscenes: float) -> Tuple[float, float]:
    """Convert nuScenes ego coordinates to the paper's convention.

    The paper's frame uses x_paper = -y_nuscenes (right positive) and
    y_paper = x_nuscenes (forward positive).
    """
    paper_x = -y_nuscenes
    paper_y = x_nuscenes
    return paper_x, paper_y
64
+
65
+
66
def planning_state_to_bin(
    value: float,
    min_val: float = PLANNING_STATE_RANGE[0],
    max_val: float = PLANNING_STATE_RANGE[1],
    num_bins: int = PLANNING_NUM_BINS,
) -> int:
    """Quantize a planning state value into one of ``num_bins`` uniform bins.

    The value is clipped to [min_val, max_val], normalized to [0, 1], and
    rounded (banker's rounding, matching Python ``round``) onto the bin grid.
    """
    clipped = float(np.clip(value, min_val, max_val))
    fraction = (clipped - min_val) / (max_val - min_val)
    raw_idx = int(round(fraction * (num_bins - 1)))
    # Defensive clamp; rounding should already keep the index in range.
    return max(0, min(raw_idx, num_bins - 1))
76
+
77
+
78
def normalize_route_command(command: object) -> Optional[str]:
    """Canonicalize a route command to 'turn left' / 'turn right' / 'go straight'.

    Returns None for non-string input; raises ValueError for an unrecognized
    string.
    """
    if not isinstance(command, str):
        return None
    aliases = {
        "turn left": "turn left",
        "left": "turn left",
        "turn right": "turn right",
        "right": "turn right",
        "go straight": "go straight",
        "straight": "go straight",
        "keep straight": "go straight",
        "forward": "go straight",
    }
    key = command.strip().lower()
    if key not in aliases:
        raise ValueError(f"Unsupported route command: {command!r}")
    return aliases[key]
95
+
96
+
97
# Camera channel order expected within each sample's image_paths.
# NOTE(review): front cameras first, then back — presumably matching the
# token-extraction pipeline ordering; confirm against the JSON generators
# before changing.
CAMERA_NAMES = [
    'CAM_FRONT',
    'CAM_FRONT_RIGHT',
    'CAM_FRONT_LEFT',
    'CAM_BACK',
    'CAM_BACK_LEFT',
    'CAM_BACK_RIGHT',
]


# Task labels accepted in an item's explicit "task" field.
VALID_TASK_TYPES = {"detection", "lane", "planning", "caption"}
108
+
109
+
110
def _normalize_task_type(task: object) -> Optional[str]:
    """Validate an explicit task label.

    Returns the lower-cased task name, None for non-string input, and raises
    ValueError for an unknown string.
    """
    if not isinstance(task, str):
        return None
    normalized = task.strip().lower()
    if normalized not in VALID_TASK_TYPES:
        raise ValueError(f"Unsupported task type: {task!r}")
    return normalized
117
+
118
+
119
+ def _infer_task_type_from_structure(item: dict) -> Optional[str]:
120
+ if not isinstance(item, dict):
121
+ return None
122
+
123
+ try:
124
+ num_map_queries = int(item.get("num_map_queries", -1))
125
+ except Exception:
126
+ num_map_queries = -1
127
+
128
+ if "ego_motion" in item or "gt_boxes_3d_per_timestep" in item:
129
+ return "planning"
130
+ if isinstance(item.get("camera"), str) and item.get("camera") and num_map_queries == 0:
131
+ return "caption"
132
+ if item.get("sensor") is not None or "openlane_lane_centerline" in item:
133
+ return "lane"
134
+ if "gt_boxes_3d" in item or num_map_queries == 0:
135
+ return "detection"
136
+ return None
137
+
138
+
139
def infer_task_type(item: dict) -> str:
    """Resolve a sample's task type.

    Prefers the explicit 'task' field; otherwise falls back to structural
    inference. Raises ValueError when neither yields a task.
    """
    explicit = _normalize_task_type(item.get("task"))
    if explicit:
        return explicit
    structural = _infer_task_type_from_structure(item)
    if structural:
        return structural
    sample_id = item.get("id", "<unknown>")
    raise ValueError(
        f"Unable to infer task type for sample {sample_id!r}. "
        "Please add an explicit 'task' field to the dataset JSON."
    )
151
+
152
+
153
+
154
def load_tokenizer(model_name: str = "lmsys/vicuna-7b-v1.5"):
    """Load the slow HF tokenizer for ``model_name``.

    On failure, retries once through the hf-mirror endpoint (best-effort
    fallback for restricted networks). Ensures a pad token exists by reusing
    EOS when absent.
    """
    from transformers import AutoTokenizer

    print(f"Loading tokenizer: {model_name}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            use_fast=False,
            trust_remote_code=True,
        )
    except Exception as e:
        # Switch to the mirror endpoint and retry once; a second failure
        # propagates to the caller.
        print(f"Failed to load from {model_name}: {e}")
        os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            use_fast=False,
            trust_remote_code=True,
        )

    if tokenizer.pad_token is None:
        # Downstream batching requires a pad token.
        tokenizer.pad_token = tokenizer.eos_token

    return tokenizer
178
+
179
+
180
+ class AtlasDataset(Dataset):
181
    def __init__(
        self,
        json_file: str,
        image_root: str,
        tokenizer,
        max_length: int = 4096,
        image_size: Tuple[int, int] = (800, 1600),
        use_nuscenes_calibration: bool = True,
        is_training: bool = True,
        num_detection_queries: int = NUM_DETECTION_QUERIES,
        num_map_queries: int = NUM_MAP_QUERIES,
        planning_table3_mode: str = "atlas_base",
        image_path_remap: Optional[str] = None,
        precomputed_det_tokens: Optional[str] = None,
        precomputed_map_tokens: Optional[str] = None,
    ):
        """Multi-task dataset (detection / lane / planning / caption) over one
        or more JSON files.

        Args:
            json_file: comma-separated list of JSON files, each a list of items.
            image_root: dataset root used to resolve relative image paths and
                locate nuScenes metadata.
            tokenizer: must already contain the "<query>" special token and a
                pad token (validated below).
            max_length: token-length cap applied to prompt(+answer).
            image_size: (H, W); a bare int is promoted to a square size.
            image_path_remap: optional "old=new" prefix substitution applied to
                every image path.
            precomputed_det_tokens / precomputed_map_tokens: optional dirs with
                cached query tokens.

        Raises:
            ValueError: unknown planning_table3_mode.
            RuntimeError: empty json_file, non-list JSON, or tokenizer missing
                the "<query>" / pad token.
        """
        self.json_file = json_file
        self.image_root = image_root
        # Optional "old=new" path prefix remapping for image_paths.
        self.image_path_remap = None
        if image_path_remap:
            old, new = image_path_remap.split("=", 1)
            self.image_path_remap = (old, new)
        self.precomputed_det_dir = precomputed_det_tokens
        self.precomputed_map_dir = precomputed_map_tokens
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_training = is_training
        self.num_detection_queries = int(num_detection_queries)
        self.num_map_queries = int(num_map_queries)
        if planning_table3_mode not in PLANNING_TABLE3_MODES:
            raise ValueError(
                f"Unsupported planning_table3_mode: {planning_table3_mode}. "
                f"Expected one of {PLANNING_TABLE3_MODES}."
            )
        self.planning_table3_mode = planning_table3_mode
        self.use_nuscenes_calibration = bool(use_nuscenes_calibration)

        # Accept a bare int as a square image size.
        if isinstance(image_size, int):
            self.image_size = (image_size, image_size)
        else:
            self.image_size = image_size

        # json_file may be a comma-separated list; concatenate all items.
        paths = [p.strip() for p in str(json_file).split(",") if p.strip()]
        if len(paths) == 0:
            raise RuntimeError("json_file is empty")
        self.data = []
        for p in paths:
            with open(p, "r", encoding="utf-8") as f:
                chunk = json.load(f)
            if not isinstance(chunk, list):
                raise RuntimeError(f"JSON must be a list: {p}")
            self.data.extend(chunk)

        if len(paths) == 1:
            print(f"Loaded {len(self.data)} samples from {paths[0]}")
        else:
            print(f"Loaded {len(self.data)} samples from {len(paths)} json files")
        print(f"Image size: {self.image_size[0]}x{self.image_size[1]} (HxW)")

        # Resolve each item's task type up front (raises on undecidable items)
        # and audit the planning schema.
        self._task_types = [infer_task_type(item) for item in self.data]
        from collections import Counter
        print(f"Task distribution: {dict(Counter(self._task_types))}")
        self._audit_planning_route_command_schema()

        # ImageNet-normalized tensor transform; v2 API when available.
        if _HAS_V2:
            self.image_transform = transforms.Compose([
                transforms.ToImage(),
                transforms.ToDtype(torch.float32, scale=True),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ])
        else:
            self.image_transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ])

        # Preprocessing configs for the two token extractors.
        self.streampetr_conf = {
            "H": 900, "W": 1600,
            "final_dim": (800, 1600),
            "resize_lim": (1.0, 1.2),
            "bot_pct_lim": (0.0, 0.0),
        }

        self.topomlp_conf = {
            "target_size": (1600, 800),
        }

        # Optional nuScenes calibration tables; per-item "sensor" fields are
        # the fallback when metadata is absent.
        self.calibration = None
        if self.use_nuscenes_calibration:
            self.calibration = self._load_nuscenes_calibration()
            if self.calibration is None:
                print("[WARN] nuScenes calibration metadata not found, will use per-item sensor field.")

        # The "<query>" placeholder token must exist in the tokenizer vocab.
        self.query_token = "<query>"
        vocab = tokenizer.get_vocab()
        if self.query_token not in vocab:
            raise RuntimeError(f"Tokenizer missing required special token: {self.query_token}")
        if tokenizer.pad_token_id is None:
            raise RuntimeError("tokenizer.pad_token_id is None")

        self.query_token_id = tokenizer.convert_tokens_to_ids(self.query_token)
        print(f"<query> token ID: {self.query_token_id}")
283
+
284
    def _load_nuscenes_calibration(self) -> Optional[Dict]:
        """Load nuScenes metadata tables needed for camera calibration.

        Searches image_root for a v1.0-{trainval,mini,test} folder and indexes
        sample_data / calibrated_sensor / ego_pose / sample records.

        Returns:
            Dict of lookup tables (by filename / token), plus a map from
            sample token to its LIDAR_TOP keyframe record, or None when any
            metadata file is missing or loading fails (best-effort by design).
        """
        try:
            nuscenes_root = Path(self.image_root)
            version_dir = None
            # First existing version folder wins.
            for v in ["v1.0-trainval", "v1.0-mini", "v1.0-test"]:
                if (nuscenes_root / v).exists():
                    version_dir = nuscenes_root / v
                    break
            if version_dir is None:
                print("nuScenes metadata folder not found under image_root")
                return None

            sample_data_file = version_dir / "sample_data.json"
            calibrated_sensor_file = version_dir / "calibrated_sensor.json"
            ego_pose_file = version_dir / "ego_pose.json"
            sample_file = version_dir / "sample.json"
            if (not sample_data_file.exists()) or (not calibrated_sensor_file.exists()) or (not ego_pose_file.exists()) or (not sample_file.exists()):
                print(f"nuScenes metadata missing under {version_dir}")
                return None

            with open(sample_data_file, "r") as f:
                sample_data = json.load(f)
            with open(calibrated_sensor_file, "r") as f:
                calibrated_sensor = json.load(f)
            with open(ego_pose_file, "r") as f:
                ego_pose = json.load(f)
            with open(sample_file, "r") as f:
                sample = json.load(f)

            # Index records for O(1) lookup during __getitem__.
            sample_data_by_filename = {rec["filename"]: rec for rec in sample_data if "filename" in rec}
            calibrated_sensor_by_token = {rec["token"]: rec for rec in calibrated_sensor if "token" in rec}
            ego_pose_by_token = {rec["token"]: rec for rec in ego_pose if "token" in rec}
            sample_by_token = {rec["token"]: rec for rec in sample if "token" in rec}

            # One LIDAR_TOP keyframe per sample token (first occurrence kept).
            lidar_sd_by_sample_token: Dict[str, Dict] = {}
            for rec in sample_data:
                if not isinstance(rec, dict):
                    continue
                fn = str(rec.get("filename", "")).replace("\\", "/")
                if "/LIDAR_TOP/" not in fn:
                    continue
                # Only keyframe sweeps stored under samples/ (not sweeps/).
                if not fn.startswith("samples/"):
                    continue
                if not bool(rec.get("is_key_frame", False)):
                    continue
                st = rec.get("sample_token", None)
                if st is None:
                    continue
                lidar_sd_by_sample_token.setdefault(str(st), rec)

            print(f"Loaded nuScenes metadata from {version_dir.name}:")
            print(f"  sample_data: {len(sample_data_by_filename)}")
            print(f"  calibrated_sensor: {len(calibrated_sensor_by_token)}")
            print(f"  ego_pose: {len(ego_pose_by_token)}")
            print(f"  sample: {len(sample_by_token)}")
            print(f"  lidar_keyframes: {len(lidar_sd_by_sample_token)}")

            return {
                "sample_data_by_filename": sample_data_by_filename,
                "calibrated_sensor_by_token": calibrated_sensor_by_token,
                "ego_pose_by_token": ego_pose_by_token,
                "sample_by_token": sample_by_token,
                "lidar_sd_by_sample_token": lidar_sd_by_sample_token,
            }
        except Exception as e:
            # Deliberate best-effort: the caller falls back to per-item
            # sensor fields when calibration is unavailable.
            print(f"Failed to load nuScenes calibration: {e}")
            return None
351
+
352
    def __len__(self) -> int:
        """Total number of samples loaded across all JSON files."""
        return len(self.data)
354
+
355
    def _audit_planning_route_command_schema(self) -> None:
        """Print a coverage audit of route_command fields on planning samples.

        Counts how many planning items resolve a top-level route command, how
        many still carry the legacy ``ego_motion.command`` field, and the
        per-command distribution. Emits warnings only; never raises.
        """
        planning_indices = [i for i, task in enumerate(self._task_types) if task == "planning"]
        if not planning_indices:
            return

        total = len(planning_indices)
        top_level_count = 0
        legacy_ego_motion_command = 0
        route_command_dist: Dict[str, int] = {}

        for idx in planning_indices:
            item = self.data[idx]
            try:
                command = self._resolve_route_command(item)
            except Exception as exc:
                # Invalid commands are reported but do not abort the audit.
                sample_id = item.get("id", f"planning_idx_{idx}")
                print(f"[WARN] invalid planning route_command for sample {sample_id}: {exc}")
                command = None
            if command is not None:
                top_level_count += 1
                route_command_dist[command] = route_command_dist.get(command, 0) + 1
            # Legacy schema stored the command under ego_motion.command.
            ego = item.get("ego_motion")
            if isinstance(ego, dict) and "command" in ego:
                legacy_ego_motion_command += 1

        print(
            "[Planning route_command audit] "
            f"mode={self.planning_table3_mode} "
            f"top_level_coverage={top_level_count}/{total} "
            f"legacy_ego_motion_command={legacy_ego_motion_command}/{total} "
            f"distribution={route_command_dist}"
        )
        # High-level planning modes require every item to carry a command.
        if self.planning_table3_mode != "atlas_base" and top_level_count < total:
            print(
                "[WARN] planning high-level mode requested but top-level "
                "route_command coverage is incomplete."
            )
        if legacy_ego_motion_command > 0:
            print(
                "[WARN] legacy planning schema detected: ego_motion.command "
                "is still present in the loaded JSON."
            )
397
+
398
+ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
399
+ item = self.data[idx]
400
+ task_type = self._task_types[idx]
401
+
402
+ img_paths = item["image_paths"]
403
+ if self.image_path_remap:
404
+ old, new = self.image_path_remap
405
+ img_paths = [p.replace(old, new) for p in img_paths]
406
+
407
+ cam_out = self._load_images_with_cameras(
408
+ img_paths,
409
+ item=item,
410
+ )
411
+ pixel_values = cam_out["pixel_values"]
412
+ intrinsics = cam_out["intrinsics"]
413
+ extrinsics = cam_out["extrinsics"]
414
+ lidar2img = cam_out["lidar2img"]
415
+ ego_pose = cam_out.get("ego_pose")
416
+ ego_pose_inv = cam_out.get("ego_pose_inv")
417
+ timestamp = cam_out.get("timestamp")
418
+
419
+ prompt_raw, answer_raw = self._extract_conversation(item)
420
+ if task_type == "planning":
421
+ prompt_raw = self._rewrite_planning_prompt(prompt_raw, item)
422
+ expected_num_queries = self._infer_expected_query_count(prompt_raw, item=item)
423
+ prompt_text = self._expand_query_placeholders(
424
+ prompt_raw,
425
+ expected_num_queries,
426
+ item=item,
427
+ )
428
+
429
+ prompt_str = build_prompt(prompt_text, mode="train" if self.is_training else "infer")
430
+ answer_str = f" {answer_raw}"
431
+
432
+ prompt_ids = self.tokenizer(prompt_str, add_special_tokens=False)["input_ids"]
433
+ answer_ids = self.tokenizer(answer_str, add_special_tokens=False)["input_ids"]
434
+
435
+ bos = [self.tokenizer.bos_token_id] if self.tokenizer.bos_token_id is not None else []
436
+ eos = [self.tokenizer.eos_token_id] if self.tokenizer.eos_token_id is not None else []
437
+
438
+ if self.is_training:
439
+ input_ids_full = bos + prompt_ids + answer_ids + eos
440
+ else:
441
+ input_ids_full = bos + prompt_ids
442
+
443
+ input_ids = input_ids_full
444
+ if len(input_ids) > self.max_length:
445
+ input_ids = input_ids[: self.max_length]
446
+ attention_mask = [1] * len(input_ids)
447
+
448
+ num_query_tokens = sum(1 for t in input_ids if t == self.query_token_id)
449
+ if num_query_tokens != expected_num_queries:
450
+ raise ValueError(f"<query> mismatch: expected {expected_num_queries}, got {num_query_tokens}")
451
+
452
+ if self.is_training:
453
+ labels = input_ids.copy()
454
+ prompt_len = len(bos) + len(prompt_ids)
455
+ labels[:prompt_len] = [-100] * prompt_len
456
+
457
+ first_nonmasked_index = -1
458
+ for i, t in enumerate(labels):
459
+ if t != -100:
460
+ first_nonmasked_index = i
461
+ break
462
+ labels_nonmasked_count = sum(1 for t in labels if t != -100)
463
+
464
+ assert len(labels) == len(input_ids), "labels/input_ids length mismatch"
465
+ if labels_nonmasked_count > 0:
466
+ assert first_nonmasked_index == prompt_len, (
467
+ f"first_nonmasked_index={first_nonmasked_index} != prompt_len={prompt_len}"
468
+ )
469
+ assert labels_nonmasked_count > 0, (
470
+ f"all labels are -100: len_full={len(input_ids_full)} len_trunc={len(input_ids)} max_length={self.max_length}"
471
+ )
472
+ if audit_enabled():
473
+ if not hasattr(self, "_audit_trunc_total"):
474
+ self._audit_trunc_total = 0
475
+ self._audit_trunc_hits = 0
476
+ self._audit_trunc_total += 1
477
+ truncated = int(len(input_ids_full) > self.max_length)
478
+ self._audit_trunc_hits += truncated
479
+ trunc_rate = float(self._audit_trunc_hits) / float(self._audit_trunc_total)
480
+ min_ans = int(os.getenv("ATLAS_MIN_ANSWER_TOKENS", "16"))
481
+ ok = (labels_nonmasked_count >= min_ans)
482
+ audit_check(
483
+ "A6",
484
+ ok,
485
+ once=False,
486
+ truncated=truncated,
487
+ trunc_rate=trunc_rate,
488
+ labels_nonmasked_count=labels_nonmasked_count,
489
+ min_answer_tokens=min_ans,
490
+ )
491
+ else:
492
+ labels = [-100] * len(input_ids)
493
+ prompt_len = len(input_ids)
494
+ first_nonmasked_index = -1
495
+ labels_nonmasked_count = 0
496
+ scene_id = self._get_scene_id(item)
497
+ sample_id = str(item.get("id", idx))
498
+
499
+ gt_boxes, gt_labels = self._load_gt_boxes(item)
500
+
501
+ result = {
502
+ "pixel_values": pixel_values,
503
+ "pixel_values_det": cam_out["pixel_values_det"],
504
+ "pixel_values_map": cam_out["pixel_values_map"],
505
+ "intrinsics": intrinsics,
506
+ "intrinsics_det": cam_out["intrinsics_det"],
507
+ "intrinsics_map": cam_out["intrinsics_map"],
508
+ "extrinsics": extrinsics,
509
+ "lidar2img": lidar2img,
510
+ "lidar2img_det": cam_out["lidar2img_det"],
511
+ "lidar2img_map": cam_out["lidar2img_map"],
512
+ "input_ids": torch.tensor(input_ids, dtype=torch.long),
513
+ "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
514
+ "labels": torch.tensor(labels, dtype=torch.long),
515
+ "scene_id": scene_id,
516
+ "sample_id": sample_id,
517
+ "dataset_idx": torch.tensor(idx, dtype=torch.long),
518
+ "task_type": task_type,
519
+ "audit_prompt_len": torch.tensor(prompt_len, dtype=torch.long),
520
+ "audit_answer_len": torch.tensor(len(answer_ids), dtype=torch.long),
521
+ "audit_labels_nonmasked_count": torch.tensor(labels_nonmasked_count, dtype=torch.long),
522
+ "audit_first_nonmasked_index": torch.tensor(first_nonmasked_index, dtype=torch.long),
523
+ "audit_num_query_tokens_in_input_ids": torch.tensor(num_query_tokens, dtype=torch.long),
524
+ "audit_expected_num_queries": torch.tensor(expected_num_queries, dtype=torch.long),
525
+ "audit_truncated": torch.tensor(int(len(input_ids_full) > self.max_length), dtype=torch.long),
526
+ }
527
+
528
+ if ego_pose is not None:
529
+ result["ego_pose"] = ego_pose
530
+ if ego_pose_inv is not None:
531
+ result["ego_pose_inv"] = ego_pose_inv
532
+ if timestamp is not None:
533
+ result["timestamp"] = timestamp
534
+ try:
535
+ ego_motion = self._get_ego_motion_data(item)
536
+ if isinstance(ego_motion, dict) and "velocity" in ego_motion:
537
+ result["velocity"] = torch.tensor(ego_motion["velocity"], dtype=torch.float32)
538
+ except Exception:
539
+ pass
540
+
541
+ if gt_boxes is not None:
542
+ result["gt_boxes"] = gt_boxes
543
+ result["gt_labels"] = gt_labels
544
+
545
+ if self.precomputed_det_dir:
546
+ pt = self._load_precomputed(self.precomputed_det_dir, item)
547
+ if pt is not None:
548
+ result["precomputed_det"] = pt["detection"]
549
+ result["precomputed_det_ref"] = pt["detection_ref_points"]
550
+
551
+ if self.precomputed_map_dir:
552
+ mpt = self._load_precomputed(self.precomputed_map_dir, item)
553
+ if mpt is not None:
554
+ result["precomputed_map"] = mpt
555
+
556
+ if os.getenv("ATLAS_AUDIT", "0") not in ("", "0", "false", "False"):
557
+ max_samples = int(os.getenv("ATLAS_AUDIT_MAX_SAMPLES", "1"))
558
+ if idx < max_samples:
559
+ print(
560
+ "[ATLAS_AUDIT][A1/A3] "
561
+ f"idx={idx} "
562
+ f"prompt_len={prompt_len} answer_len={len(answer_ids)} "
563
+ f"first_nonmasked_index={first_nonmasked_index} labels_nonmasked_count={labels_nonmasked_count} "
564
+ f"num_query_tokens_in_input_ids={num_query_tokens} expected_num_queries={expected_num_queries} "
565
+ f"seq_len={len(input_ids)} truncated={int(len(input_ids_full) > self.max_length)}"
566
+ )
567
+ return result
568
+
569
+ def _resolve_category_id(self, label) -> int:
570
+ """将类别标签转换为类别 ID,支持整数、浮点数和字符串类别名"""
571
+ if isinstance(label, (int, float)):
572
+ return int(label)
573
+ if isinstance(label, str):
574
+ label_lower = label.lower().strip()
575
+ if label_lower in NUSCENES_CATEGORY_MAP:
576
+ return NUSCENES_CATEGORY_MAP[label_lower]
577
+ # 对 human.pedestrian.* 子类使用前缀匹配
578
+ if label_lower.startswith('human.pedestrian.'):
579
+ return 8 # pedestrian
580
+ # 对 vehicle.bus.* 子类使用前缀匹配
581
+ if label_lower.startswith('vehicle.bus.'):
582
+ return 3 # bus
583
+ return 0
584
+
585
+ def _load_gt_boxes(self, item: Dict) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
586
+ annotations = item.get("annotations", item.get("gt_boxes_3d", None))
587
+
588
+ if annotations is None:
589
+ return None, None
590
+
591
+ if not annotations:
592
+ return torch.zeros(0, 7), torch.zeros(0, dtype=torch.long)
593
+
594
+ boxes_list = []
595
+ labels_list = []
596
+ bev_range = 51.2
597
+
598
+ for ann in annotations:
599
+ if isinstance(ann, dict):
600
+ if "translation" in ann:
601
+ pos_x, pos_y, pos_z = ann["translation"][:3]
602
+ size_w, size_l, size_h = ann.get("size", [1, 1, 1])[:3]
603
+ yaw = ann.get("rotation", 0)
604
+ if isinstance(yaw, (list, tuple)):
605
+ if len(yaw) == 1:
606
+ yaw = float(yaw[0])
607
+ elif len(yaw) == 4:
608
+ # Quaternion (w, x, y, z) -> yaw
609
+ qw, qx, qy, qz = [float(v) for v in yaw]
610
+ t0 = 2.0 * (qw * qz + qx * qy)
611
+ t1 = 1.0 - 2.0 * (qy * qy + qz * qz)
612
+ yaw = math.atan2(t0, t1)
613
+ else:
614
+ yaw = 0.0
615
+ # 支持多种类别字段: category_id, category, category_name
616
+ label = ann.get("category_id", ann.get("category", ann.get("category_name", 0)))
617
+ x, y, z = pos_x, pos_y, pos_z
618
+ w, l, h = size_w, size_l, size_h
619
+
620
+ x_norm = (x + bev_range) / (2 * bev_range)
621
+ y_norm = (y + bev_range) / (2 * bev_range)
622
+ z_norm = (z - Z_MIN) / (Z_MAX - Z_MIN)
623
+
624
+ boxes_list.append([x_norm, y_norm, z_norm, w, l, h, yaw])
625
+ labels_list.append(self._resolve_category_id(label))
626
+
627
+ elif "box" in ann:
628
+ box = ann["box"]
629
+ x, y, z = box[:3]
630
+ w, l, h = box[3:6] if len(box) >= 6 else (1, 1, 1)
631
+ yaw = box[6] if len(box) >= 7 else 0
632
+ label = ann.get("category_id", ann.get("label", ann.get("category_name", 0)))
633
+
634
+ x_norm = (x + bev_range) / (2 * bev_range)
635
+ y_norm = (y + bev_range) / (2 * bev_range)
636
+ z_norm = (z - Z_MIN) / (Z_MAX - Z_MIN)
637
+
638
+ boxes_list.append([x_norm, y_norm, z_norm, w, l, h, yaw])
639
+ labels_list.append(self._resolve_category_id(label))
640
+
641
+ elif "world_coords" in ann:
642
+ wc = ann["world_coords"]
643
+ x, y, z = wc[0], wc[1], wc[2] if len(wc) > 2 else 0.0
644
+ w = ann.get("w", 1.0)
645
+ l = ann.get("l", 1.0)
646
+ h = ann.get("h", 1.0)
647
+ yaw = ann.get("yaw", 0.0)
648
+ label = ann.get("category_id", ann.get("category", ann.get("category_name", 0)))
649
+
650
+ x_norm = (x + bev_range) / (2 * bev_range)
651
+ y_norm = (y + bev_range) / (2 * bev_range)
652
+ z_norm = (z - Z_MIN) / (Z_MAX - Z_MIN)
653
+
654
+ boxes_list.append([x_norm, y_norm, z_norm, w, l, h, yaw])
655
+ labels_list.append(self._resolve_category_id(label))
656
+
657
+ elif isinstance(ann, (list, tuple)) and len(ann) >= 3:
658
+ x, y, z = ann[:3]
659
+ w, l, h = ann[3:6] if len(ann) >= 6 else (1, 1, 1)
660
+ yaw = ann[6] if len(ann) >= 7 else 0
661
+
662
+ x_norm = (x + bev_range) / (2 * bev_range)
663
+ y_norm = (y + bev_range) / (2 * bev_range)
664
+ z_norm = (z - Z_MIN) / (Z_MAX - Z_MIN)
665
+
666
+ boxes_list.append([x_norm, y_norm, z_norm, w, l, h, yaw])
667
+ labels_list.append(0)
668
+
669
+ if not boxes_list:
670
+ return torch.zeros(0, 7), torch.zeros(0, dtype=torch.long)
671
+
672
+ if os.getenv("ATLAS_AUDIT", "0") not in ("", "0", "false", "False"):
673
+ if not hasattr(self, "_audit_gt_calls"):
674
+ self._audit_gt_calls = 0
675
+ max_calls = int(os.getenv("ATLAS_AUDIT_MAX_GT", "1"))
676
+ if self._audit_gt_calls < max_calls:
677
+ yaws = [float(b[6]) for b in boxes_list if len(b) >= 7]
678
+ if yaws:
679
+ y_min = float(min(yaws))
680
+ y_max = float(max(yaws))
681
+ y_abs = float(max(abs(y) for y in yaws))
682
+ print(f"[ATLAS_AUDIT][E2/E3] gt_yaw_min={y_min:.3e} gt_yaw_max={y_max:.3e} gt_yaw_absmax={y_abs:.3e}")
683
+ if y_abs > 10.0:
684
+ print("[ATLAS_AUDIT][E2] yaw_absmax>10 (possible degrees instead of radians)")
685
+ self._audit_gt_calls += 1
686
+
687
+ gt_boxes = torch.tensor(boxes_list, dtype=torch.float32)
688
+ gt_labels = torch.tensor(labels_list, dtype=torch.long)
689
+
690
+ return gt_boxes, gt_labels
691
+
692
+ def _rewrite_planning_prompt(self, prompt_text: str, item: Dict) -> str:
693
+ ego_motion = item.get("ego_motion", {})
694
+ if not isinstance(ego_motion, dict):
695
+ ego_motion = {}
696
+
697
+ route_command = self._resolve_route_command(item)
698
+ velocity = ego_motion.get("velocity")
699
+ acceleration = ego_motion.get("acceleration")
700
+
701
+ velocity_bins = None
702
+ acceleration_bins = None
703
+ if velocity is not None:
704
+ if not isinstance(velocity, (list, tuple)) or len(velocity) < 2:
705
+ raise RuntimeError("planning ego_motion.velocity must be a 2D vector")
706
+ velocity_bins = (
707
+ planning_state_to_bin(float(velocity[0])),
708
+ planning_state_to_bin(float(velocity[1])),
709
+ )
710
+ if acceleration is not None:
711
+ if not isinstance(acceleration, (list, tuple)) or len(acceleration) < 2:
712
+ raise RuntimeError("planning ego_motion.acceleration must be a 2D vector")
713
+ acceleration_bins = (
714
+ planning_state_to_bin(float(acceleration[0])),
715
+ planning_state_to_bin(float(acceleration[1])),
716
+ )
717
+
718
+ return rewrite_planning_prompt_for_table3(
719
+ prompt_text,
720
+ mode=self.planning_table3_mode,
721
+ command=route_command,
722
+ velocity_bins=velocity_bins,
723
+ acceleration_bins=acceleration_bins,
724
+ )
725
+
726
+ def _resolve_route_command(self, item: Dict) -> Optional[str]:
727
+ candidates = [
728
+ item.get("route_command"),
729
+ item.get("nav_command"),
730
+ item.get("high_level_command"),
731
+ item.get("navigation_command"),
732
+ ]
733
+ meta = item.get("meta_data")
734
+ if isinstance(meta, dict):
735
+ candidates.extend([
736
+ meta.get("route_command"),
737
+ meta.get("nav_command"),
738
+ meta.get("high_level_command"),
739
+ meta.get("navigation_command"),
740
+ ])
741
+ for candidate in candidates:
742
+ normalized = normalize_route_command(candidate)
743
+ if normalized is not None:
744
+ return normalized
745
+ return None
746
+
747
+ def _extract_conversation(self, item: Dict) -> Tuple[str, str]:
748
+ conv = item.get("conversations", None)
749
+ if not conv or not isinstance(conv, list):
750
+ raise RuntimeError("missing conversations field")
751
+ prompt = None
752
+ answer = None
753
+ for turn in conv:
754
+ if not isinstance(turn, dict):
755
+ continue
756
+ if turn.get("from") in ("human", "user") and prompt is None:
757
+ prompt = turn.get("value")
758
+ if turn.get("from") in ("gpt", "assistant") and answer is None:
759
+ answer = turn.get("value")
760
+ if prompt is None or answer is None:
761
+ raise RuntimeError("conversations missing human/gpt pair")
762
+
763
+ return str(prompt), str(answer)
764
+
765
+ def _infer_expected_query_count(self, prompt_text: str, item: Optional[Dict] = None) -> int:
766
+ if isinstance(item, dict):
767
+ if "num_map_queries" in item:
768
+ try:
769
+ num_map = int(item.get("num_map_queries", 0))
770
+ if num_map < 0:
771
+ num_map = 0
772
+ return self.num_detection_queries + num_map
773
+ except Exception:
774
+ pass
775
+ use_map = item.get("use_map_queries", None)
776
+ if use_map is not None:
777
+ return self.num_detection_queries + (self.num_map_queries if bool(use_map) else 0)
778
+ p = prompt_text.lower()
779
+ if "map query" in p:
780
+ return self.num_detection_queries + self.num_map_queries
781
+ return self.num_detection_queries
782
+
783
    def _expand_query_placeholders(
        self,
        prompt_text: str,
        expected_num_queries: int,
        item: Optional[Dict] = None,
    ) -> str:
        """Expand <query> placeholder(s) so the prompt carries exactly
        `expected_num_queries` query tokens.

        Accepted input shapes:
          * 1 placeholder  -> replaced by all query slots;
          * 2 placeholders -> first expands to detection slots, second to map
            slots (used by the latest planning prompts);
          * already fully expanded (count == expected) -> only whitespace
            normalization is applied.

        Raises:
            ValueError: for any other placeholder count.
        """
        cnt = prompt_text.count(self.query_token)
        if cnt == 1:
            placeholder = " ".join([self.query_token] * expected_num_queries)
            out = prompt_text.replace(self.query_token, f" {placeholder} ")
            # Collapse the doubled spaces introduced around the replacement.
            return " ".join(out.split())
        if cnt == 2 and expected_num_queries > self.num_detection_queries:
            num_map = expected_num_queries - self.num_detection_queries
            # Prefer the per-item map-query count when the item provides one.
            if isinstance(item, dict) and "num_map_queries" in item:
                try:
                    num_map = int(item.get("num_map_queries", num_map))
                except Exception:
                    pass
            # Clamp so the detection share can never go negative.
            num_map = max(0, min(num_map, expected_num_queries))
            num_det = expected_num_queries - num_map
            parts = prompt_text.split(self.query_token)
            if len(parts) == 3 and num_det > 0 and num_map > 0:
                # Latest planning prompts use two placeholders: the first stands
                # for detection query slots and the second for map query slots.
                det_placeholder = " ".join([self.query_token] * num_det)
                map_placeholder = " ".join([self.query_token] * num_map)
                out = (
                    f"{parts[0]} {det_placeholder} "
                    f"{parts[1]} {map_placeholder} "
                    f"{parts[2]}"
                )
                return " ".join(out.split())
        if cnt == expected_num_queries:
            # Already expanded upstream; just normalize whitespace.
            return " ".join(prompt_text.split())
        if expected_num_queries > self.num_detection_queries:
            raise ValueError(
                f"<query> count mismatch: got {cnt}, expected 1, 2, or {expected_num_queries}"
            )
        raise ValueError(f"<query> count mismatch: got {cnt}, expected 1 or {expected_num_queries}")
822
+
823
+ def _load_precomputed(self, directory: str, item: Dict) -> Optional[Dict]:
824
+ item_id = str(item.get("id", ""))
825
+ pt_path = os.path.join(directory, f"{item_id}.pt")
826
+ if os.path.isfile(pt_path):
827
+ try:
828
+ return torch.load(pt_path, map_location="cpu")
829
+ except Exception:
830
+ return None
831
+ meta = item.get("meta_data", {})
832
+ if isinstance(meta, dict):
833
+ source_id = meta.get("source_id")
834
+ if source_id:
835
+ pt_path2 = os.path.join(directory, f"{source_id}.pt")
836
+ if os.path.isfile(pt_path2):
837
+ try:
838
+ return torch.load(pt_path2, map_location="cpu")
839
+ except Exception:
840
+ pass
841
+ return None
842
+
843
+ def _get_scene_id(self, item: Dict) -> str:
844
+ if "segment_id" in item and item["segment_id"]:
845
+ return str(item["segment_id"])
846
+ try:
847
+ p0 = item.get("image_paths", [None])[0]
848
+ if not p0:
849
+ return "unknown"
850
+ fname = os.path.basename(p0)
851
+ parts = fname.split("__")
852
+ if len(parts) > 1:
853
+ return parts[0]
854
+ return "unknown"
855
+ except Exception:
856
+ return "unknown"
857
+
858
+ def _get_timestamp(self, item: Dict) -> int:
859
+ if "timestamp" in item and item["timestamp"]:
860
+ return int(item["timestamp"])
861
+ try:
862
+ p0 = item.get("image_paths", [None])[0]
863
+ fname = os.path.basename(p0)
864
+ parts = fname.replace(".jpg", "").split("__")
865
+ return int(parts[-1]) if parts else 0
866
+ except Exception:
867
+ return 0
868
+
869
+ def get_scene_groups(self) -> Dict[str, List[int]]:
870
+ groups: Dict[str, List[int]] = {}
871
+ for idx, item in enumerate(self.data):
872
+ sid = self._get_scene_id(item)
873
+ groups.setdefault(sid, []).append(idx)
874
+ for sid in groups:
875
+ groups[sid].sort(key=lambda i: self._get_timestamp(self.data[i]))
876
+ return groups
877
+
878
+ def _get_ego_motion_data(self, item: Dict) -> Dict:
879
+ route_cmd = self._resolve_route_command(item) or "go straight"
880
+
881
+ if 'ego_motion' in item:
882
+ ego = item['ego_motion']
883
+
884
+ # JSON already stores paper-frame values (gen_atlas_planning_qa.py
885
+ # applies _nuscenes_to_paper_xy before writing). Do NOT transform again.
886
+ vel = ego.get('velocity', [0.0, 0.0])
887
+ acc = ego.get('acceleration', [0.0, 0.0])
888
+ vx, vy = float(vel[0]), float(vel[1])
889
+ ax, ay = float(acc[0]), float(acc[1])
890
+
891
+ if os.getenv("ATLAS_AUDIT", "0") not in ("", "0", "false", "False"):
892
+ if not hasattr(self, "_audit_ego_calls"):
893
+ self._audit_ego_calls = 0
894
+ max_calls = int(os.getenv("ATLAS_AUDIT_MAX_EGO", "1"))
895
+ if self._audit_ego_calls < max_calls:
896
+ print(f"[ATLAS_AUDIT][ego] vel=({vx:.3e},{vy:.3e}) acc=({ax:.3e},{ay:.3e}) [paper-frame, no transform]")
897
+ self._audit_ego_calls += 1
898
+
899
+ if 'waypoints' in ego:
900
+ waypoints_raw = ego['waypoints']
901
+ waypoints = [[float(wp[0]), float(wp[1])] for wp in waypoints_raw]
902
+ else:
903
+ waypoints = self._generate_waypoints(route_cmd, [vx, vy], [ax, ay])
904
+
905
+ return {
906
+ 'velocity': [vx, vy],
907
+ 'acceleration': [ax, ay],
908
+ 'waypoints': waypoints,
909
+ }
910
+
911
+ if not hasattr(self, "_warned_missing_ego_motion"):
912
+ self._warned_missing_ego_motion = True
913
+ print("[WARN] ego_motion missing, using stationary default (velocity=[0,0])")
914
+
915
+ velocity = [0.0, 0.0]
916
+ acceleration = [0.0, 0.0]
917
+ waypoints = self._generate_waypoints(route_cmd, velocity, acceleration)
918
+
919
+ return {
920
+ 'velocity': velocity,
921
+ 'acceleration': acceleration,
922
+ 'waypoints': waypoints,
923
+ }
924
+
925
+ def _generate_waypoints(
926
+ self,
927
+ command: str,
928
+ velocity: List[float] = None,
929
+ acceleration: List[float] = None,
930
+ ) -> List[List[float]]:
931
+ if velocity is None:
932
+ velocity = [0.0, 5.0]
933
+ if acceleration is None:
934
+ acceleration = [0.0, 0.0]
935
+
936
+ vx, vy = velocity
937
+ ax, ay = acceleration
938
+ waypoints = []
939
+
940
+ for i in range(1, 7):
941
+ t = i * 0.5
942
+
943
+ x = vx * t + 0.5 * ax * t * t
944
+ y = vy * t + 0.5 * ay * t * t
945
+
946
+ if command == "turn left":
947
+ curvature = -0.3 * t * t
948
+ x += curvature
949
+ elif command == "turn right":
950
+ curvature = 0.3 * t * t
951
+ x += curvature
952
+
953
+ waypoints.append([round(x, 2), round(y, 2)])
954
+
955
+ return waypoints
956
+
957
+ def _preprocess_streampetr(self, img_pil: Image.Image, K: np.ndarray):
958
+ W, H = img_pil.size
959
+ conf = self.streampetr_conf
960
+ fH, fW = conf["final_dim"]
961
+ resize = max(fH / H, fW / W)
962
+ rW, rH = int(W * resize), int(H * resize)
963
+ crop_h = int(rH) - fH
964
+ crop_w = max(0, rW - fW) // 2
965
+
966
+ if resize != 1.0:
967
+ img_pil = img_pil.resize((rW, rH), Image.BILINEAR)
968
+ img_pil = img_pil.crop((crop_w, crop_h, crop_w + fW, crop_h + fH))
969
+
970
+ K_new = K.copy()
971
+ K_new[0, 0] *= resize
972
+ K_new[1, 1] *= resize
973
+ K_new[0, 2] = K_new[0, 2] * resize - crop_w
974
+ K_new[1, 2] = K_new[1, 2] * resize - crop_h
975
+ return img_pil, K_new
976
+
977
+ def _preprocess_topomlp(self, img_pil: Image.Image, K: np.ndarray):
978
+ W, H = img_pil.size
979
+ tW, tH = self.topomlp_conf["target_size"]
980
+ w_scale = tW / W
981
+ h_scale = tH / H
982
+
983
+ img_pil = img_pil.resize((tW, tH), Image.BILINEAR)
984
+
985
+ K_new = K.copy()
986
+ K_new[0, 0] *= w_scale
987
+ K_new[0, 2] *= w_scale
988
+ K_new[1, 1] *= h_scale
989
+ K_new[1, 2] *= h_scale
990
+ return img_pil, K_new
991
+
992
    def _load_images_with_cameras(
        self,
        image_paths: List[str],
        item: Optional[Dict] = None,
    ) -> Dict:
        """Load all camera images for one sample and build per-camera geometry.

        For each path: identify the camera by filename, open the image,
        recover intrinsics/extrinsics (from the nuScenes calibration tables
        when a sample_data record matches, else from item["sensor"]), produce
        the StreamPETR ("det") and TopoMLP ("map") preprocessed variants, and
        compute lidar2img projection matrices. An ego pose is extracted with
        the following priority: lidar-frame pose from the calibration tables,
        then the camera's ego_pose record, then item["pose"]. Timestamps are
        converted from microseconds to seconds.

        Returns a dict of stacked tensors: pixel_values_det/map,
        intrinsics_det/map, extrinsics, lidar2img_det/map, plus ego_pose /
        ego_pose_inv / timestamp (each may be None); "pixel_values",
        "intrinsics" and "lidar2img" alias the det variants.

        Raises:
            RuntimeError: when an image cannot be opened, or when no camera
                parameters can be recovered for a path.
        """
        images_det = []
        images_map = []
        intrinsics_det_list = []
        intrinsics_map_list = []
        extrinsics_list = []
        lidar2img_det_list = []
        lidar2img_map_list = []
        ego_pose_out = None
        ego_pose_inv_out = None
        timestamp_out = None

        for i, img_path in enumerate(image_paths):
            # Positional fallback first; an explicit camera-name substring in
            # the path wins (longest names checked first so e.g.
            # CAM_FRONT_LEFT is not shadowed by CAM_FRONT).
            camera_name = CAMERA_NAMES[i] if i < len(CAMERA_NAMES) else f"CAM_{i}"
            for cam in sorted(CAMERA_NAMES, key=len, reverse=True):
                if cam in img_path:
                    camera_name = cam
                    break

            def _normalize_path(p: str) -> str:
                return str(p).replace("\\", "/").lstrip("./")

            def _lookup_sample_data(path: str) -> Optional[Dict]:
                # Match the path against the calibration index under several
                # spellings (raw, normalized, and suffix from samples|sweeps/).
                if self.calibration is None:
                    return None
                by_name = self.calibration["sample_data_by_filename"]
                candidates = []
                raw = str(path)
                norm = _normalize_path(raw)
                candidates.append(raw)
                candidates.append(norm)
                for key in ("samples/", "sweeps/"):
                    if key in norm:
                        candidates.append(norm[norm.index(key):])
                for cand in candidates:
                    rec = by_name.get(cand, None)
                    if rec is not None:
                        return rec
                return None

            # NOTE(review): this first assignment is dead — both branches of
            # the if/else below immediately overwrite full_path.
            full_path = os.path.normpath(os.path.join(self.image_root, img_path))
            if not os.path.isabs(img_path):
                full_path = os.path.normpath(os.path.join(self.image_root, img_path))
            else:
                full_path = img_path
            try:
                img = Image.open(full_path).convert("RGB")
            except Exception as e:
                # Enrich the error with identifying context before re-raising.
                sample_id = "unknown"
                task_type = "unknown"
                if isinstance(item, dict):
                    sample_id = str(item.get("id", item.get("sample_id", "unknown")))
                    try:
                        task_type = infer_task_type(item)
                    except Exception:
                        task_type = str(item.get("task_type", item.get("task", "unknown")))
                raise RuntimeError(
                    "failed to load image for AtlasDataset: "
                    f"sample_id={sample_id} task_type={task_type} "
                    f"camera_name={camera_name} image_path={img_path} "
                    f"full_path={full_path} image_root={self.image_root}"
                ) from e

            K = None
            E = None
            ep = None
            sd = None

            # Preferred source: nuScenes calibration tables keyed by filename.
            if self.calibration is not None:
                sd = _lookup_sample_data(img_path)
                if sd is not None:
                    cs_token = sd.get("calibrated_sensor_token", None)
                    if cs_token is None:
                        raise RuntimeError(f"sample_data missing calibrated_sensor_token: {img_path}")
                    cs = self.calibration["calibrated_sensor_by_token"].get(cs_token, None)
                    if cs is None:
                        raise RuntimeError(f"calibrated_sensor not found: {cs_token}")

                    ep_token = sd.get("ego_pose_token", None)
                    if ep_token is None:
                        raise RuntimeError(f"sample_data missing ego_pose_token: {img_path}")
                    ep = self.calibration["ego_pose_by_token"].get(ep_token, None)
                    if ep is None:
                        raise RuntimeError(f"ego_pose not found: {ep_token}")

                    K = np.array(cs["camera_intrinsic"], dtype=np.float32)
                    # wxyz quaternion -> rotation matrix for the cam->ego extrinsic.
                    q = cs["rotation"]
                    t = cs["translation"]
                    w, x, y, z = q
                    R = np.array(
                        [
                            [1 - 2 * (y * y + z * z), 2 * (x * y - z * w), 2 * (x * z + y * w)],
                            [2 * (x * y + z * w), 1 - 2 * (x * x + z * z), 2 * (y * z - x * w)],
                            [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x * x + y * y)],
                        ],
                        dtype=np.float32,
                    )
                    E = np.eye(4, dtype=np.float32)
                    E[:3, :3] = R
                    E[:3, 3] = np.array(t, dtype=np.float32)

            # Ego pose, priority 1: lidar->global transform from the lidar
            # sample_data of this sample (also supplies the timestamp).
            if ego_pose_out is None and self.calibration is not None:
                _item_id = str(item.get("id", "")) if isinstance(item, dict) else ""
                _lidar_sd = self.calibration.get("lidar_sd_by_sample_token", {}).get(_item_id)
                if _lidar_sd is not None:
                    _lidar_cs = self.calibration["calibrated_sensor_by_token"].get(
                        _lidar_sd.get("calibrated_sensor_token"), None)
                    _lidar_ep = self.calibration["ego_pose_by_token"].get(
                        _lidar_sd.get("ego_pose_token"), None)
                    if _lidar_cs is not None and _lidar_ep is not None:
                        def _q2R(q):
                            # wxyz quaternion -> 3x3 rotation matrix.
                            ww, xx, yy, zz = q
                            return np.array([
                                [1-2*(yy*yy+zz*zz), 2*(xx*yy-zz*ww), 2*(xx*zz+yy*ww)],
                                [2*(xx*yy+zz*ww), 1-2*(xx*xx+zz*zz), 2*(yy*zz-xx*ww)],
                                [2*(xx*zz-yy*ww), 2*(yy*zz+xx*ww), 1-2*(xx*xx+yy*yy)],
                            ], dtype=np.float32)
                        _l2e = np.eye(4, dtype=np.float32)
                        _l2e[:3, :3] = _q2R(_lidar_cs["rotation"])
                        _l2e[:3, 3] = np.array(_lidar_cs["translation"], dtype=np.float32)
                        _e2g = np.eye(4, dtype=np.float32)
                        _e2g[:3, :3] = _q2R(_lidar_ep["rotation"])
                        _e2g[:3, 3] = np.array(_lidar_ep["translation"], dtype=np.float32)
                        # lidar->global = (ego->global) @ (lidar->ego)
                        _lidar2global = (_e2g @ _l2e).astype(np.float32)
                        ego_pose_out = torch.tensor(_lidar2global, dtype=torch.float32)
                        try:
                            ego_pose_inv_out = torch.tensor(np.linalg.inv(_lidar2global), dtype=torch.float32)
                        except Exception:
                            ego_pose_inv_out = None
                        _lidar_ts = _lidar_sd.get("timestamp", None)
                        if _lidar_ts is not None:
                            # Microseconds -> seconds.
                            timestamp_out = torch.tensor(float(_lidar_ts) * 1e-6, dtype=torch.float32)
            # Ego pose, priority 2: this camera's ego_pose record.
            if ego_pose_out is None and ep is not None:
                q_ep = ep.get("rotation", None)
                t_ep = ep.get("translation", None)
                if q_ep is not None and t_ep is not None:
                    w, x, y, z = q_ep
                    R_ep = np.array(
                        [
                            [1 - 2 * (y * y + z * z), 2 * (x * y - z * w), 2 * (x * z + y * w)],
                            [2 * (x * y + z * w), 1 - 2 * (x * x + z * z), 2 * (y * z - x * w)],
                            [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x * x + y * y)],
                        ],
                        dtype=np.float32,
                    )
                    ego_pose_m = np.eye(4, dtype=np.float32)
                    ego_pose_m[:3, :3] = R_ep
                    ego_pose_m[:3, 3] = np.array(t_ep, dtype=np.float32)
                    ego_pose_out = torch.tensor(ego_pose_m, dtype=torch.float32)
                    try:
                        ego_pose_inv_out = torch.tensor(np.linalg.inv(ego_pose_m), dtype=torch.float32)
                    except Exception:
                        ego_pose_inv_out = None
            # Timestamp fallback: this camera's sample_data record.
            if timestamp_out is None and sd is not None:
                ts = sd.get("timestamp", None)
                if ts is not None:
                    timestamp_out = torch.tensor(float(ts) * 1e-6, dtype=torch.float32)

            # Camera-params fallback: per-item "sensor" dict (non-nuScenes data).
            if K is None or E is None:
                sensor = (item or {}).get("sensor", None) if isinstance(item, dict) else None
                if not isinstance(sensor, dict):
                    raise RuntimeError(f"no camera params for {img_path}")
                if camera_name not in sensor:
                    raise RuntimeError(f"sensor missing camera {camera_name}")
                cam_s = sensor[camera_name]
                try:
                    K = np.array(cam_s["intrinsic"]["K"], dtype=np.float32)
                    R = np.array(cam_s["extrinsic"]["rotation"], dtype=np.float32)
                    t = np.array(cam_s["extrinsic"]["translation"], dtype=np.float32)
                    E = np.eye(4, dtype=np.float32)
                    E[:3, :3] = R
                    E[:3, 3] = t
                except Exception as e:
                    # NOTE(review): no `from e` here, so the original traceback
                    # is only reflected in the message text.
                    raise RuntimeError(f"failed to parse sensor for {camera_name}: {e}")

            # Branch-specific preprocessing; each adjusts its own intrinsics copy.
            img_det, K_det = self._preprocess_streampetr(img.copy(), K.copy())
            img_map, K_map = self._preprocess_topomlp(img.copy(), K.copy())

            images_det.append(self.image_transform(img_det))
            images_map.append(self.image_transform(img_map))

            intrinsics_det_list.append(torch.tensor(K_det, dtype=torch.float32))
            intrinsics_map_list.append(torch.tensor(K_map, dtype=torch.float32))
            extrinsics_list.append(torch.tensor(E, dtype=torch.float32))

            def _quat_wxyz_to_R(qwxyz):
                # wxyz quaternion -> 3x3 rotation matrix.
                ww, xx, yy, zz = qwxyz
                return np.array(
                    [
                        [1 - 2 * (yy * yy + zz * zz), 2 * (xx * yy - zz * ww), 2 * (xx * zz + yy * ww)],
                        [2 * (xx * yy + zz * ww), 1 - 2 * (xx * xx + zz * zz), 2 * (yy * zz - xx * ww)],
                        [2 * (xx * zz - yy * ww), 2 * (yy * zz + xx * ww), 1 - 2 * (xx * xx + yy * yy)],
                    ],
                    dtype=np.float32,
                )

            def _T_from_Rt(Rm, tv):
                # Assemble a 4x4 homogeneous transform from R (3x3) and t (3,).
                T = np.eye(4, dtype=np.float32)
                T[:3, :3] = Rm
                T[:3, 3] = np.array(tv, dtype=np.float32)
                return T

            def _compute_lidar2img(K_adj, E_mat, sd_rec, ep_rec):
                # Projection matrix K @ (lidar->camera). When lidar records are
                # unavailable, degrades to K @ (ego->camera), i.e. treats the
                # ego frame as the point frame.
                cam2ego = E_mat.astype(np.float32)
                ego2cam = np.linalg.inv(cam2ego)
                K4 = np.eye(4, dtype=np.float32)
                K4[:3, :3] = K_adj.astype(np.float32)

                if sd_rec is None or ep_rec is None:
                    return K4 @ ego2cam

                sample_tk = sd_rec.get("sample_token", None)
                if sample_tk is None:
                    return K4 @ ego2cam

                ego2global_c = _T_from_Rt(_quat_wxyz_to_R(ep_rec["rotation"]), ep_rec["translation"])
                global2ego_c = np.linalg.inv(ego2global_c)

                lidar_sd_rec = self.calibration.get("lidar_sd_by_sample_token", {}).get(str(sample_tk), None)
                if lidar_sd_rec is None:
                    return K4 @ ego2cam

                lidar_cs_rec = self.calibration["calibrated_sensor_by_token"].get(
                    lidar_sd_rec.get("calibrated_sensor_token"), None)
                lidar_ep_rec = self.calibration["ego_pose_by_token"].get(
                    lidar_sd_rec.get("ego_pose_token"), None)
                if lidar_cs_rec is None or lidar_ep_rec is None:
                    return K4 @ ego2cam

                # lidar -> lidar-ego -> global -> camera-ego -> camera; the two
                # ego poses may differ because the sensors fire at different times.
                lidar2ego = _T_from_Rt(_quat_wxyz_to_R(lidar_cs_rec["rotation"]), lidar_cs_rec["translation"])
                ego2global_lidar = _T_from_Rt(_quat_wxyz_to_R(lidar_ep_rec["rotation"]), lidar_ep_rec["translation"])
                lidar2cam = ego2cam @ global2ego_c @ ego2global_lidar @ lidar2ego
                return K4 @ lidar2cam

            lidar2img_det_list.append(
                torch.tensor(_compute_lidar2img(K_det, E, sd, ep), dtype=torch.float32))
            lidar2img_map_list.append(
                torch.tensor(_compute_lidar2img(K_map, E, sd, ep), dtype=torch.float32))

        # Fallback: if nuScenes calibration lookup failed (e.g. OpenLane samples),
        # try to recover ego_pose from item["pose"] and timestamp from item["timestamp"].
        if ego_pose_out is None and isinstance(item, dict):
            pose_data = item.get("pose", None)
            if isinstance(pose_data, dict):
                try:
                    rot_raw = pose_data.get("rotation", None)
                    t_p = pose_data.get("translation", None)
                    if rot_raw is not None and t_p is not None:
                        arr = np.array(rot_raw, dtype=np.float32)
                        # Rotation may be a 3x3 matrix or a wxyz quaternion.
                        if arr.shape == (3, 3):
                            R_p = arr
                        elif arr.shape == (4,):
                            w, x, y, z = arr
                            R_p = np.array([
                                [1-2*(y*y+z*z), 2*(x*y-z*w), 2*(x*z+y*w)],
                                [2*(x*y+z*w), 1-2*(x*x+z*z), 2*(y*z-x*w)],
                                [2*(x*z-y*w), 2*(y*z+x*w), 1-2*(x*x+y*y)],
                            ], dtype=np.float32)
                        else:
                            raise ValueError(f"Unsupported rotation shape: {arr.shape}")
                        T_p = np.eye(4, dtype=np.float32)
                        T_p[:3, :3] = R_p
                        T_p[:3, 3] = np.array(t_p, dtype=np.float32)
                        ego_pose_out = torch.tensor(T_p, dtype=torch.float32)
                        try:
                            ego_pose_inv_out = torch.tensor(np.linalg.inv(T_p), dtype=torch.float32)
                        except Exception:
                            ego_pose_inv_out = None
                except Exception as e:
                    # Best-effort fallback: log and continue without a pose.
                    print(f"WARNING: Failed to parse item['pose']: {e}")

        if timestamp_out is None and isinstance(item, dict):
            ts_raw = item.get("timestamp", None)
            if ts_raw is not None:
                try:
                    timestamp_out = torch.tensor(float(ts_raw) * 1e-6, dtype=torch.float32)
                except Exception:
                    pass

        result = {
            "pixel_values_det": torch.stack(images_det, dim=0),
            "pixel_values_map": torch.stack(images_map, dim=0),
            "intrinsics_det": torch.stack(intrinsics_det_list, dim=0),
            "intrinsics_map": torch.stack(intrinsics_map_list, dim=0),
            "extrinsics": torch.stack(extrinsics_list, dim=0),
            "lidar2img_det": torch.stack(lidar2img_det_list, dim=0),
            "lidar2img_map": torch.stack(lidar2img_map_list, dim=0),
            "ego_pose": ego_pose_out,
            "ego_pose_inv": ego_pose_inv_out,
            "timestamp": timestamp_out,
        }
        # Unsuffixed keys alias the detection-branch tensors (same objects).
        result["pixel_values"] = result["pixel_values_det"]
        result["intrinsics"] = result["intrinsics_det"]
        result["lidar2img"] = result["lidar2img_det"]
        return result
1291
+
1292
+
1293
def atlas_collate_fn(
    batch: List[Dict[str, torch.Tensor]],
    pad_token_id: Optional[int] = None,
) -> Dict[str, torch.Tensor]:
    """Collate Atlas dataset samples into a training batch.

    Stacks the camera/geometry tensors, right-pads the text fields
    (``input_ids`` / ``attention_mask`` / ``labels``) to the longest
    sequence in the batch, and passes through optional per-sample
    metadata (lidar2img matrices, ego poses, timestamps, velocity,
    precomputed query tokens, audit counters, GT boxes) only when every
    sample can supply it consistently.

    Args:
        batch: list of per-sample dicts produced by the dataset.
        pad_token_id: tokenizer pad id used to fill ``input_ids``.
            Must be supplied explicitly; label padding always uses -100.

    Returns:
        Dict of batched tensors plus list-valued metadata fields
        (``scene_id``, ``sample_id``, ``task_type``, ...).

    Raises:
        RuntimeError: if ``pad_token_id`` is None, or if a sample's own
            padding disagrees with the batch-level padding invariants.
    """
    # Fail fast before doing any tensor work.
    if pad_token_id is None:
        raise RuntimeError("atlas_collate_fn requires explicit pad_token_id")

    def _stack(key):
        # Required keys: a missing key raises KeyError, as before.
        return torch.stack([item[key] for item in batch])

    def _try_stack(key):
        # Optional keys: stack only when every sample has the key and the
        # shapes agree; otherwise omit the field from the batch entirely.
        if all(key in item for item in batch):
            try:
                return torch.stack([item[key] for item in batch])
            except Exception:
                pass
        return None

    # --- Text fields: right-pad to the longest sequence in the batch ---
    max_length = max(len(item['input_ids']) for item in batch)
    batch_size = len(batch)

    input_ids = torch.full((batch_size, max_length), fill_value=pad_token_id, dtype=torch.long)
    attention_mask = torch.full((batch_size, max_length), fill_value=0, dtype=torch.long)
    labels = torch.full((batch_size, max_length), fill_value=-100, dtype=torch.long)

    for i, item in enumerate(batch):
        seq_len = len(item['input_ids'])
        input_ids[i, :seq_len] = item['input_ids']
        attention_mask[i, :seq_len] = item['attention_mask']
        labels[i, :seq_len] = item['labels']

    # Sanity check: every masked position must carry pad_token_id / -100.
    pad_mask = attention_mask == 0
    if pad_mask.any():
        if not torch.all(input_ids[pad_mask] == pad_token_id):
            raise RuntimeError("padding inconsistent: input_ids")
        if not torch.all(labels[pad_mask] == -100):
            raise RuntimeError("padding inconsistent: labels")

    result = {
        'pixel_values': _stack('pixel_values'),
        'pixel_values_det': _stack('pixel_values_det'),
        'pixel_values_map': _stack('pixel_values_map'),
        'intrinsics': _stack('intrinsics'),
        'intrinsics_det': _stack('intrinsics_det'),
        'intrinsics_map': _stack('intrinsics_map'),
        'extrinsics': _stack('extrinsics'),
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'scene_id': [item.get('scene_id', 'unknown') for item in batch],
        'sample_id': [item.get('sample_id', 'unknown') for item in batch],
        'dataset_idx': _stack('dataset_idx'),
        'task_type': [item.get('task_type', 'detection') for item in batch],
    }

    # Optional tensor metadata (incl. velocity, now via the same helper).
    for key in ("lidar2img", "lidar2img_det", "lidar2img_map",
                "ego_pose", "ego_pose_inv", "timestamp", "velocity"):
        stacked = _try_stack(key)
        if stacked is not None:
            result[key] = stacked

    # Precomputed StreamPETR query tokens: samples without them get a
    # zero placeholder. Use zeros_like so the placeholder matches the
    # dtype/device of the real tensors — torch.zeros(shape) would default
    # to float32 on CPU and make torch.stack fail on e.g. fp16 batches.
    if any("precomputed_det" in item for item in batch):
        ref_det = next(item["precomputed_det"] for item in batch if "precomputed_det" in item)
        ref_ref = next(item["precomputed_det_ref"] for item in batch if "precomputed_det" in item)
        dets, refs = [], []
        for item in batch:
            if "precomputed_det" in item:
                dets.append(item["precomputed_det"])
                refs.append(item["precomputed_det_ref"])
            else:
                dets.append(torch.zeros_like(ref_det))
                refs.append(torch.zeros_like(ref_ref))
        result["precomputed_det"] = torch.stack(dets)
        result["precomputed_det_ref"] = torch.stack(refs)

    # Map tokens stay per-sample (ragged), so keep them as a list.
    if any("precomputed_map" in item for item in batch):
        result["precomputed_map"] = [item.get("precomputed_map") for item in batch]

    # Tokenization/audit counters: batched only if present on all samples.
    audit_keys = [
        "audit_prompt_len",
        "audit_answer_len",
        "audit_labels_nonmasked_count",
        "audit_first_nonmasked_index",
        "audit_num_query_tokens_in_input_ids",
        "audit_expected_num_queries",
        "audit_truncated",
    ]
    for k in audit_keys:
        if all(k in item for item in batch):
            try:
                result[k] = torch.stack([item[k] for item in batch])
            except Exception:
                pass

    # GT boxes/labels are variable-length, so they stay as lists.
    has_gt = all('gt_boxes' in item for item in batch)
    if has_gt:
        result['gt_boxes'] = [item['gt_boxes'] for item in batch]
        result['gt_labels'] = [item['gt_labels'] for item in batch]

    return result
1412
+
1413
+
1414
def make_atlas_collate_fn(pad_token_id: int):
    """Return a DataLoader-ready collate callable with *pad_token_id* bound."""

    def _collate(batch):
        return atlas_collate_fn(batch, pad_token_id=pad_token_id)

    return _collate
src/dataset/scene_sampler.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DDP-safe scene-sequential sampler for online temporal training.
2
+
3
+ Guarantees:
4
+ 1. Within each scene, frames are yielded in strict timestamp order.
5
+ 2. Scene order is shuffled per-epoch for training diversity.
6
+ 3. All ranks yield exactly the same number of micro-steps per epoch
7
+ (balanced by greedy scene assignment + deterministic replay padding).
8
+ 4. Epoch boundaries and replay-scene starts are detectable by the caller
9
+ via timestamp regression, so StreamPETR memory can be reset correctly.
10
+ """
11
+
12
+ import random
13
+ from typing import Dict, Iterator, List, Sequence
14
+
15
+ from torch.utils.data import Sampler
16
+
17
+
18
class SceneSequentialSampler(Sampler[int]):
    """Distributed temporal sampler with equal-step guarantee.

    Frames within a scene are yielded in their stored order; scenes are
    load-balanced across ranks (greedy longest-first), and every rank is
    padded to the same per-epoch length by deterministically replaying
    whole scenes, so all ranks take the same number of micro-steps.
    """

    def __init__(
        self,
        scene_groups: Dict[str, List[int]],
        num_replicas: int = 1,
        rank: int = 0,
        seed: int = 0,
        shuffle_scenes: bool = True,
        pad_to_multiple: int = 1,
    ):
        if num_replicas < 1:
            raise ValueError(f"num_replicas must be >= 1, got {num_replicas}")
        if rank < 0 or rank >= num_replicas:
            raise ValueError(f"rank must be in [0, {num_replicas}), got {rank}")
        if not scene_groups:
            raise ValueError("scene_groups must not be empty")

        self.scene_groups = scene_groups
        # Sorted ids give a deterministic base ordering across ranks.
        self.scene_ids = sorted(scene_groups.keys())
        self.num_replicas = int(num_replicas)
        self.rank = int(rank)
        self.seed = int(seed)
        self.shuffle_scenes = bool(shuffle_scenes)
        self.pad_to_multiple = max(1, int(pad_to_multiple))
        self.epoch = 0
        self._cached: List[int] = []
        self._target_len = 0

    def set_epoch(self, epoch: int) -> None:
        """Advance the shuffling epoch and invalidate any cached indices."""
        self.epoch = int(epoch)
        self._cached = []
        self._target_len = 0

    def _scene_len(self, sid: str) -> int:
        # Number of frames in scene *sid*.
        return len(self.scene_groups[sid])

    def _build_indices(self) -> List[int]:
        # Seeded identically on every rank so scene assignment agrees.
        gen = random.Random(self.seed + self.epoch)
        order = list(self.scene_ids)
        if self.shuffle_scenes:
            gen.shuffle(order)

        # Greedy longest-first bin packing keeps per-rank frame counts close.
        assignments: List[List[str]] = [[] for _ in range(self.num_replicas)]
        loads = [0] * self.num_replicas
        for sid in sorted(order, key=self._scene_len, reverse=True):
            lightest = min(range(self.num_replicas), key=lambda r: loads[r])
            assignments[lightest].append(sid)
            loads[lightest] += self._scene_len(sid)

        # A rank left without a scene borrows one so it can still iterate.
        for rid, scenes in enumerate(assignments):
            if not scenes:
                borrowed = order[rid % len(order)]
                scenes.append(borrowed)
                loads[rid] += self._scene_len(borrowed)

        # Equal-step quota: heaviest rank, rounded up to pad_to_multiple.
        quota = max(loads)
        remainder = quota % self.pad_to_multiple
        if remainder:
            quota += self.pad_to_multiple - remainder
        self._target_len = quota

        mine = assignments[self.rank]
        if self.shuffle_scenes:
            # Rank-local shuffle of scene order (frames stay in order).
            random.Random(self.seed + self.epoch + self.rank).shuffle(mine)

        out: List[int] = []
        for sid in mine:
            out.extend(self.scene_groups[sid])

        # Deterministic replay padding: append whole scenes, then truncate.
        # The caller detects a replay start via timestamp regression.
        replay = 0
        while len(out) < quota:
            out.extend(self.scene_groups[mine[replay % len(mine)]])
            replay += 1
        return out[:quota]

    def __iter__(self) -> Iterator[int]:
        self._cached = self._build_indices()
        return iter(self._cached)

    def __len__(self) -> int:
        # Build lazily so len() is valid before the first iteration.
        if self._target_len == 0:
            self._cached = self._build_indices()
        return self._target_len
src/eval/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (403 Bytes). View file
 
src/eval/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (451 Bytes). View file
 
src/eval/__pycache__/metrics.cpython-310.pyc ADDED
Binary file (23.6 kB). View file
 
src/eval/__pycache__/metrics.cpython-38.pyc ADDED
Binary file (23.9 kB). View file
 
src/eval/metrics.py ADDED
@@ -0,0 +1,852 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Atlas evaluation metrics."""
2
+
3
+ import re
4
+ import numpy as np
5
+ from typing import List, Dict, Tuple, Optional
6
+
7
+ # scipy only affects match_lanes() / calculate_lane_detection_metrics(),
8
+ # which are NOT used in the main eval path (eval_atlas.py).
9
+ # Main eval uses: greedy matching for detection, OpenLane-V2 LaneEval.bench() for lanes.
10
+ try:
11
+ from scipy.optimize import linear_sum_assignment
12
+ SCIPY_AVAILABLE = True
13
+ except ImportError:
14
+ SCIPY_AVAILABLE = False
15
+
16
+
17
# Lookup table from raw nuScenes category strings (both base names and
# full dotted names) to the 10 canonical detection classes.
NUSCENES_CLASS_MAP = {
    # Base class names
    'car': 'car',
    'truck': 'truck',
    'construction_vehicle': 'construction_vehicle',
    'bus': 'bus',
    'trailer': 'trailer',
    'barrier': 'barrier',
    'motorcycle': 'motorcycle',
    'bicycle': 'bicycle',
    'pedestrian': 'pedestrian',
    'traffic_cone': 'traffic_cone',
    # Full nuScenes category names - vehicles
    'vehicle.car': 'car',
    'vehicle.truck': 'truck',
    'vehicle.construction': 'construction_vehicle',
    'vehicle.bus.bendy': 'bus',
    'vehicle.bus.rigid': 'bus',
    'vehicle.trailer': 'trailer',
    'vehicle.motorcycle': 'motorcycle',
    'vehicle.bicycle': 'bicycle',
    # Full nuScenes category names - pedestrians (all subtypes)
    'human.pedestrian.adult': 'pedestrian',
    'human.pedestrian.child': 'pedestrian',
    'human.pedestrian.construction_worker': 'pedestrian',
    'human.pedestrian.police_officer': 'pedestrian',
    'human.pedestrian.wheelchair': 'pedestrian',
    'human.pedestrian.stroller': 'pedestrian',
    'human.pedestrian.personal_mobility': 'pedestrian',
    # Full nuScenes category names - movable objects
    'movable_object.barrier': 'barrier',
    'movable_object.trafficcone': 'traffic_cone',
    'movable_object.traffic_cone': 'traffic_cone',
}


def normalize_category(category: str) -> str:
    """Map a raw nuScenes category string onto its canonical base class.

    Tries an exact (case-insensitive, stripped) lookup first, then a
    fuzzy substring match in either direction; unknown categories are
    returned unchanged (lowercased/stripped).
    """
    needle = category.lower().strip()
    exact = NUSCENES_CLASS_MAP.get(needle)
    if exact is not None:
        return exact
    for known, base in NUSCENES_CLASS_MAP.items():
        if known in needle or needle in known:
            return base
    return needle
62
+
63
+
64
def normalize_ground_truths(ground_truths: List[Dict]) -> List[Dict]:
    """Return copies of *ground_truths* with canonical categories and a
    guaranteed ``world_coords`` entry.

    Accepted input layouts:
    - {"translation": [x, y, z], "category_name": ...} (from regenerate_atlas_with_gt.py)
    - {"box": [x, y, z, w, l, h, yaw], "category_name": ...} (from gen_atlas_full_data.py)
    - {"world_coords": [x, y, z], "category": ...} (already normalized)
    """
    out = []
    for gt in ground_truths:
        entry = dict(gt)

        # Canonicalize the category, preferring 'category' over
        # 'category_name', and keep the raw value for reference.
        for key in ('category', 'category_name'):
            if key in entry:
                entry['category_raw'] = entry[key]
                entry['category'] = normalize_category(entry[key])
                break

        # Derive world_coords from whichever positional field exists.
        if 'world_coords' not in entry:
            if 'translation' in entry:
                entry['world_coords'] = list(entry['translation'][:3])
            elif 'box' in entry:
                entry['world_coords'] = list(entry['box'][:3])

        out.append(entry)
    return out
92
+
93
+
94
def bin_to_meters(bin_val: int, bin_range: Tuple[float, float] = (-51.2, 51.2), num_bins: int = 1000) -> float:
    """Map a discrete bin index back to meters (inverse of meters_to_bin)."""
    lo, hi = bin_range
    # Bin 0 -> lo, bin (num_bins - 1) -> hi, linear in between.
    return lo + (hi - lo) * (bin_val / (num_bins - 1))
99
+
100
+
101
def meters_to_bin(meters: float, bin_range: Tuple[float, float] = (-51.2, 51.2), num_bins: int = 1000) -> int:
    """Quantize a metric coordinate into one of *num_bins* uniform bins.

    Values outside *bin_range* are clipped to the boundary bins.
    """
    lo, hi = bin_range
    clipped = np.clip(meters, lo, hi)
    frac = (clipped - lo) / (hi - lo)
    idx = round(frac * (num_bins - 1))
    return int(np.clip(idx, 0, num_bins - 1))
108
+
109
+
110
def _parse_lane_points(points_str: str) -> List[Dict]:
    """Parse a sequence of ``[x, y, z]`` bin triples into lane point dicts.

    Each point carries both the raw bin indices and their metric
    conversion (XY in [-51.2, 51.2] m, Z in [-5.0, 3.0] m).
    """
    triples = re.findall(r'\[(\d+),\s*(\d+),\s*(\d+)\]', points_str)
    parsed = []
    for xs, ys, zs in triples:
        xb, yb, zb = int(xs), int(ys), int(zs)
        parsed.append({
            'bin_coords': [xb, yb, zb],
            'world_coords': [
                bin_to_meters(xb, bin_range=(-51.2, 51.2)),
                bin_to_meters(yb, bin_range=(-51.2, 51.2)),
                bin_to_meters(zb, bin_range=(-5.0, 3.0)),
            ],
        })
    return parsed
125
+
126
+
127
def parse_atlas_output(text: str) -> List[Dict]:
    """
    Parse Atlas model output. Supports two canonical formats (checked in order):
    1. Paper lane:  Lane: [x, y, z], [x, y, z]; [x, y, z], [x, y, z]; ...
    2. Detection:   category: [x, y, z], [x, y, z]; category: [x, y, z].

    Returns a list of dicts: lane entries have 'type'=='lane' with a
    'points' list; detection entries have 'type'=='detection' with
    'category', 'bin_coords' and 'world_coords' (meters). Returns []
    when neither format matches.
    """
    results = []

    # --- 1. Paper lane format: "Lane: [pts], [pts]; [pts], [pts]; ..." ---
    paper_lane_match = re.search(r'Lane:\s*(.*)', text, re.DOTALL)
    if paper_lane_match:
        # Strip trailing period/whitespace, then split lanes on ';'.
        content = paper_lane_match.group(1).rstrip('. \t\n')
        lane_strs = content.split(';')
        for lane_idx, lane_str in enumerate(lane_strs):
            lane_str = lane_str.strip()
            if not lane_str:
                continue
            lane_points = _parse_lane_points(lane_str)
            if lane_points:
                results.append({
                    'type': 'lane',
                    'lane_id': str(lane_idx),
                    'points': lane_points,
                })
    # Lane format wins outright: if any lane parsed, skip detection parsing.
    if results:
        return results

    # --- 2. Detection grouped format ---
    # Canonical: "car: [pt1], [pt2]; truck: [pt3]."

    def _make_det(category: str, x_b: int, y_b: int, z_b: int) -> Dict:
        # Bin -> meter conversion uses the fixed ranges:
        # XY in [-51.2, 51.2] m, Z in [-5.0, 3.0] m.
        return {
            'type': 'detection',
            'category': normalize_category(category),
            'category_raw': category,
            'bin_coords': [x_b, y_b, z_b],
            'world_coords': [
                bin_to_meters(x_b, bin_range=(-51.2, 51.2)),
                bin_to_meters(y_b, bin_range=(-51.2, 51.2)),
                bin_to_meters(z_b, bin_range=(-5.0, 3.0)),
            ],
        }

    point_re = re.compile(r'\[(\d+),\s*(\d+),\s*(\d+)\]')
    group_re = re.compile(r'(\S+)\s*:\s*((?:\[\d+,\s*\d+,\s*\d+\][\s,]*)+)')

    stripped = text.strip().rstrip('.')

    # Functional lane syntax is not handled by this parser; bail out.
    if stripped.startswith('lane_centerline('):
        return []

    if ';' in stripped:
        # Multiple "category: points" groups separated by ';'.
        for seg in stripped.split(';'):
            seg = seg.strip()
            if not seg:
                continue
            gm = group_re.match(seg)
            if gm:
                for x_b, y_b, z_b in point_re.findall(gm.group(2)):
                    results.append(_make_det(gm.group(1), int(x_b), int(y_b), int(z_b)))

    if not results:
        # Single-group fallback: accept only if the one group consumed
        # every point in the text (guards against partial garbage matches).
        gm = group_re.match(stripped)
        if gm:
            pts_in_group = point_re.findall(gm.group(2))
            pts_in_text = point_re.findall(stripped)
            if len(pts_in_group) == len(pts_in_text):
                for x_b, y_b, z_b in pts_in_group:
                    results.append(_make_det(gm.group(1), int(x_b), int(y_b), int(z_b)))

    return results
198
+
199
+
200
def calculate_distance(
    pred_coord: List[float],
    gt_coord: List[float],
    use_2d: bool = False,
) -> float:
    """Euclidean distance between a predicted and a ground-truth center.

    Args:
        pred_coord: predicted [x, y, z] in meters.
        gt_coord: ground-truth [x, y, z] in meters.
        use_2d: if True, measure only in the XY (BEV) plane and ignore
            the Z axis — the usual convention for BEV 3D detection
            matching.

    Returns:
        The distance in meters as a plain float.
    """
    p = np.array(pred_coord)
    g = np.array(gt_coord)
    if use_2d:
        # BEV distance: drop the Z component.
        p, g = p[:2], g[:2]
    return float(np.linalg.norm(p - g))
225
+
226
+
227
def match_detections(
    predictions: List[Dict],
    ground_truths: List[Dict],
    threshold: float = 2.0,
    use_2d_distance: bool = True,
    use_hungarian: bool = False,
) -> Tuple[List[Tuple[int, int]], List[int], List[int]]:
    """
    Match predicted detections against ground-truth detections, per class.

    Args:
        predictions: predicted detections (need 'category' and 'world_coords').
        ground_truths: ground-truth detections (same keys).
        threshold: match distance threshold in meters.
        use_2d_distance: if True, use 2D BEV (XY-plane) distance — the
            standard convention for BEV detection matching.
        use_hungarian: if True, use Hungarian optimal assignment (needs
            scipy); default False uses greedy nearest-first matching
            (nuScenes standard).

    Returns:
        (matches, false_positives, false_negatives) where matches is a list
        of (pred_idx, gt_idx) pairs and the other two are index lists into
        predictions / ground_truths respectively.
    """
    if len(predictions) == 0:
        return [], [], list(range(len(ground_truths)))

    if len(ground_truths) == 0:
        return [], list(range(len(predictions))), []

    # Matching is done independently within each category.
    all_categories = set(p['category'] for p in predictions) | set(g['category'] for g in ground_truths)

    matched_preds = set()
    matched_gts = set()
    matches = []

    for category in all_categories:
        cat_preds = [(i, p) for i, p in enumerate(predictions) if p['category'] == category]
        cat_gts = [(i, g) for i, g in enumerate(ground_truths) if g['category'] == category]

        if not cat_preds or not cat_gts:
            continue

        # Pairwise distance matrix; inf marks pairs beyond the threshold.
        n_preds = len(cat_preds)
        n_gts = len(cat_gts)
        cost_matrix = np.full((n_preds, n_gts), float('inf'))

        for pi, (pred_idx, pred) in enumerate(cat_preds):
            for gi, (gt_idx, gt) in enumerate(cat_gts):
                dist = calculate_distance(pred['world_coords'], gt['world_coords'], use_2d=use_2d_distance)
                if dist < threshold:
                    cost_matrix[pi, gi] = dist

        # Choose Hungarian (optimal) or greedy (nuScenes standard) matching.
        if use_hungarian and SCIPY_AVAILABLE and n_preds > 0 and n_gts > 0:
            # Hungarian optimal assignment.
            # NOTE(review): cost_matrix may contain inf rows/columns; scipy's
            # linear_sum_assignment raises "cost matrix is infeasible" when no
            # full feasible assignment exists — confirm inputs avoid that case
            # (this path is off by default and unused in the main eval).
            row_ind, col_ind = linear_sum_assignment(cost_matrix)
            for pi, gi in zip(row_ind, col_ind):
                if cost_matrix[pi, gi] < threshold:
                    pred_idx = cat_preds[pi][0]
                    gt_idx = cat_gts[gi][0]
                    matches.append((pred_idx, gt_idx))
                    matched_preds.add(pred_idx)
                    matched_gts.add(gt_idx)
        else:
            # Greedy matching: sort candidate pairs by distance, then take
            # each pair whose pred and gt are both still unmatched.
            distances = []
            for pi, (pred_idx, pred) in enumerate(cat_preds):
                for gi, (gt_idx, gt) in enumerate(cat_gts):
                    dist = cost_matrix[pi, gi]
                    if dist < threshold:
                        distances.append((dist, pred_idx, gt_idx))

            distances.sort(key=lambda x: x[0])
            for dist, pred_idx, gt_idx in distances:
                if pred_idx not in matched_preds and gt_idx not in matched_gts:
                    matches.append((pred_idx, gt_idx))
                    matched_preds.add(pred_idx)
                    matched_gts.add(gt_idx)

    false_positives = [i for i in range(len(predictions)) if i not in matched_preds]
    false_negatives = [i for i in range(len(ground_truths)) if i not in matched_gts]

    return matches, false_positives, false_negatives
307
+
308
+
309
def calculate_detection_f1(
    predictions: List[Dict],
    ground_truths: List[Dict],
    threshold: float = 2.0,
) -> Dict[str, float]:
    """Precision / recall / F1 for one sample at a center-distance threshold.

    Uses :func:`match_detections` (greedy, per-class) to pair predictions
    with ground truths, then reduces the match counts to the usual metrics.
    """
    matched, fps, fns = match_detections(predictions, ground_truths, threshold)

    tp, fp, fn = len(matched), len(fps), len(fns)

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    denom = precision + recall
    f1 = (2 * precision * recall / denom) if denom else 0.0

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'num_predictions': len(predictions),
        'num_ground_truths': len(ground_truths),
    }
338
+
339
+
340
def denormalize_ref_points_01(
    ref_points_01: np.ndarray,
    pc_range: Tuple[float, float, float, float, float, float] = (-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
) -> np.ndarray:
    """Convert normalized ref points in [0, 1] back to meters.

    Args:
        ref_points_01: array-like [..., 3] of values in [0, 1]
            (out-of-range values are clipped).
        pc_range: (x_min, y_min, z_min, x_max, y_max, z_max)
    Returns:
        np.ndarray [..., 3] in meters
    """
    pts = np.clip(np.asarray(ref_points_01, dtype=np.float64), 0.0, 1.0)
    lo = np.asarray(pc_range[:3], dtype=np.float64)
    hi = np.asarray(pc_range[3:], dtype=np.float64)
    # Floor the span to avoid division-by-zero style degeneracy downstream.
    span = np.clip(hi - lo, 1e-6, None)
    return lo + pts * span
358
+
359
+
360
def snap_detections_to_ref_points(
    predictions: List[Dict],
    ref_points_01: np.ndarray,
    pc_range: Tuple[float, float, float, float, float, float] = (-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
    keep_z: bool = True,
) -> List[Dict]:
    """Snap predicted detection centers to the nearest reference point (BEV XY).

    Post-processing that constrains predictions to lie on the StreamPETR
    proposal set (ref points); this reduces the sensitivity of the small
    metric thresholds (0.5 m / 1 m) to free-form numeric drift.

    Args:
        predictions: detection dicts with 'world_coords' in meters.
        ref_points_01: [Q, 3] or [B, Q, 3] normalized ref points in [0, 1].
        pc_range: point cloud range for denormalization.
        keep_z: if True, keep each prediction's original z; else use ref z.
    Returns:
        New list of predictions (shallow-copied dicts) with snapped
        'world_coords'; the input is returned unchanged (as a new list)
        when the ref points are unusable.
    """
    if not predictions:
        return []

    ref = np.asarray(ref_points_01, dtype=np.float64)
    if ref.ndim == 3:
        # Batched input: take the first sample's proposals.
        ref = ref[0]
    if ref.ndim != 2 or ref.shape[1] != 3 or ref.shape[0] == 0:
        return list(predictions)

    ref_m = denormalize_ref_points_01(ref, pc_range=pc_range)

    pred_xy = np.array([p.get("world_coords", [0.0, 0.0, 0.0])[:2] for p in predictions], dtype=np.float64)
    if pred_xy.ndim != 2 or pred_xy.shape[0] == 0:
        return list(predictions)

    # Nearest-neighbor in the BEV plane via squared distances.
    sq_dists = ((pred_xy[:, None, :] - ref_m[None, :, :2]) ** 2).sum(-1)
    nearest = sq_dists.argmin(axis=1)

    snapped = []
    for pred, j in zip(predictions, nearest):
        copy = dict(pred)
        orig_xyz = list(copy.get("world_coords", [0.0, 0.0, 0.0]))
        new_xyz = ref_m[int(j)].tolist()
        if keep_z and len(orig_xyz) >= 3:
            # Preserve the model's own height estimate.
            new_xyz[2] = float(orig_xyz[2])
        copy["world_coords"] = [float(v) for v in new_xyz]
        snapped.append(copy)
    return snapped
410
+
411
+
412
def calculate_per_class_metrics(
    predictions: List[Dict],
    ground_truths: List[Dict],
    threshold: float = 2.0,
) -> Dict[str, Dict[str, float]]:
    """Compute detection P/R/F1 separately for every category present
    in either the predictions or the ground truths."""
    categories = {p['category'] for p in predictions} | {g['category'] for g in ground_truths}
    return {
        cat: calculate_detection_f1(
            [p for p in predictions if p['category'] == cat],
            [g for g in ground_truths if g['category'] == cat],
            threshold,
        )
        for cat in categories
    }
430
+
431
+
432
def parse_planning_output(text: str, require_full_vap: bool = False) -> Optional[Dict]:
    """Parse a planning answer into velocity / acceleration / waypoints.

    Expected answer shape (chained speed -> acceleration -> waypoints):
        "ego car speed value: [a, b]. ego car acceleration value: [c, d].
         ... request the ego car planning waypoint in 3-seconds: [x, y], ..."

    Args:
        text: raw model output.
        require_full_vap: if True, also require the speed and acceleration
            fields; otherwise parsed waypoints alone are sufficient.

    Returns:
        Dict with 'waypoints' (meters) and, when present, 'velocity_bins' /
        'acceleration_bins' (raw bin pairs); None when no waypoints parse.
    """
    result = {}
    vel_pattern = r'ego car speed value:\s*\[(\d+),\s*(\d+)\]\.?'
    acc_pattern = r'ego car acceleration value:\s*\[(\d+),\s*(\d+)\]\.?'
    # NOTE(review): the pattern deliberately accepts both "requeset" and
    # "request" — presumably the typo exists in the training answers;
    # confirm against the QA generation scripts before removing it.
    wp_pattern = (
        r'(?:based on the ego car speed and acceleration you predicted,\s*)?'
        r'(?:requeset|request)\s+the ego car planning waypoint(?:s)? in 3-seconds:\s*'
        r'((?:\[\d+,\s*\d+\](?:,\s*)?)+)\.?'
    )

    vel_m = re.search(vel_pattern, text, flags=re.IGNORECASE)
    if vel_m:
        result['velocity_bins'] = [int(vel_m.group(1)), int(vel_m.group(2))]

    acc_m = re.search(acc_pattern, text, flags=re.IGNORECASE)
    if acc_m:
        result['acceleration_bins'] = [int(acc_m.group(1)), int(acc_m.group(2))]

    wp_m = re.search(wp_pattern, text, flags=re.IGNORECASE)
    if wp_m:
        # Waypoint bins use the same XY range as detection bins.
        point_pattern = r'\[(\d+),\s*(\d+)\]'
        points = re.findall(point_pattern, wp_m.group(1))
        wps = []
        for xb, yb in points:
            x = bin_to_meters(int(xb), bin_range=(-51.2, 51.2))
            y = bin_to_meters(int(yb), bin_range=(-51.2, 51.2))
            wps.append([x, y])
        result['waypoints'] = wps

    if 'waypoints' not in result or len(result['waypoints']) == 0:
        return None

    # Planning answers use a Figure 5-style chained speed + acceleration +
    # waypoint protocol. The main evaluation path can require all three fields.
    if require_full_vap and (
        'velocity_bins' not in result or 'acceleration_bins' not in result
    ):
        return None
    return result
471
+
472
+
473
+ def _pad_waypoints(waypoints: List[List[float]], target_n: int = 6) -> List[List[float]]:
474
+ """Pad waypoint list to target_n by repeating last waypoint.
475
+
476
+ This prevents short model outputs from gaming the L2 / collision metrics.
477
+ """
478
+ if len(waypoints) >= target_n:
479
+ return waypoints[:target_n]
480
+ if len(waypoints) == 0:
481
+ return [[0.0, 0.0]] * target_n
482
+ last = list(waypoints[-1])
483
+ return list(waypoints) + [list(last)] * (target_n - len(waypoints))
484
+
485
+
486
def calculate_planning_l2(
    pred_waypoints: List[List[float]],
    gt_waypoints: List[List[float]],
    timestamps: Optional[List[float]] = None,
) -> Dict[str, float]:
    """L2 waypoint errors at the 1 s / 2 s / 3 s horizons plus their average.

    Predictions shorter than GT are padded (last waypoint repeated) so a
    truncated output cannot lower its error.

    Args:
        pred_waypoints: predicted [x, y] waypoints in meters.
        gt_waypoints: ground-truth [x, y] waypoints in meters.
        timestamps: per-waypoint times in seconds; defaults to a 0.5 s grid.

    Returns:
        Dict with 'L2_1s' / 'L2_2s' / 'L2_3s' (when those times exist in
        *timestamps*) and 'L2_avg' (mean of the horizon errors when any
        were recorded, else mean over all steps, else 0.0).
    """
    n_gt = len(gt_waypoints)
    if timestamps is None:
        timestamps = [0.5 * (i + 1) for i in range(n_gt)]

    # Pad predictions to match GT length to prevent short-output bias
    pred_padded = _pad_waypoints(pred_waypoints, target_n=n_gt)

    errors = {}
    all_l2 = []
    for i in range(n_gt):
        pred = np.array(pred_padded[i][:2])
        gt = np.array(gt_waypoints[i][:2])
        l2 = float(np.linalg.norm(pred - gt))
        all_l2.append(l2)
        t = timestamps[i] if i < len(timestamps) else 0.5 * (i + 1)
        # Record horizon errors at (approximately) 1 s / 2 s / 3 s.
        if abs(t - 1.0) < 0.01:
            errors['L2_1s'] = l2
        if abs(t - 2.0) < 0.01:
            errors['L2_2s'] = l2
        if abs(t - 3.0) < 0.01:
            errors['L2_3s'] = l2

    # Average over the named horizons when present, else over all steps.
    key_steps = [v for k, v in errors.items() if k in ('L2_1s', 'L2_2s', 'L2_3s')]
    errors['L2_avg'] = float(np.mean(key_steps)) if key_steps else (float(np.mean(all_l2)) if all_l2 else 0.0)

    return errors
517
+
518
+
519
+ def _box_corners_2d(cx: float, cy: float, w: float, l: float, yaw: float) -> np.ndarray:
520
+ """Build oriented box corners for yaw-from-x headings.
521
+
522
+ In planning eval JSON, yaw is measured from +X (right) axis:
523
+ - yaw = 0 -> vehicle length points to +X
524
+ - yaw = +pi/2 -> vehicle length points to +Y
525
+ This matches the qualitative visualization helper.
526
+ """
527
+ c = np.cos(yaw)
528
+ s = np.sin(yaw)
529
+ center = np.array([cx, cy], dtype=np.float64)
530
+
531
+ # Heading axis follows the vehicle length, with width perpendicular to it.
532
+ d_len = np.array([c, s], dtype=np.float64) * (l / 2.0)
533
+ d_wid = np.array([-s, c], dtype=np.float64) * (w / 2.0)
534
+
535
+ corners = np.stack([
536
+ center + d_len + d_wid,
537
+ center + d_len - d_wid,
538
+ center - d_len - d_wid,
539
+ center - d_len + d_wid,
540
+ ], axis=0)
541
+ return corners
542
+
543
+
544
+ def _boxes_overlap(box1_corners: np.ndarray, box2_corners: np.ndarray) -> bool:
545
+ for box in [box1_corners, box2_corners]:
546
+ for i in range(4):
547
+ j = (i + 1) % 4
548
+ edge = box[j] - box[i]
549
+ normal = np.array([-edge[1], edge[0]])
550
+ proj1 = box1_corners @ normal
551
+ proj2 = box2_corners @ normal
552
+ if proj1.max() < proj2.min() or proj2.max() < proj1.min():
553
+ return False
554
+ return True
555
+
556
+
557
def _check_collision_at_waypoints(
    waypoints: List[List[float]],
    gt_boxes: List[Dict],
    ego_w: float,
    ego_l: float,
    gt_boxes_per_timestep: Optional[List[List[Dict]]] = None,
) -> List[bool]:
    """Check collision between ego at each waypoint and GT boxes.

    When *gt_boxes_per_timestep* is provided (ST-P3 aligned), each waypoint
    is checked against the boxes at the corresponding future timestep.
    Otherwise falls back to using the same static *gt_boxes* for all waypoints.

    Args:
        waypoints: ego [x, y] positions per future timestep, in meters.
        gt_boxes: obstacle dicts with 'world_coords' and optional
            'w' / 'l' / 'yaw' (defaults 2.0 m / 4.0 m / 0.0).
        ego_w: ego vehicle width in meters.
        ego_l: ego vehicle length in meters.
        gt_boxes_per_timestep: optional per-timestep obstacle lists.

    Returns:
        One bool per waypoint: True if the ego box overlaps any obstacle.
    """
    collisions = []
    for i, wp in enumerate(waypoints):
        # Estimate ego heading from the forward difference to the next
        # waypoint; fall back to the backward difference at the last step,
        # and to yaw = 0 when there is only one (or a stationary) waypoint.
        if i + 1 < len(waypoints):
            dx = waypoints[i + 1][0] - wp[0]
            dy = waypoints[i + 1][1] - wp[1]
            ego_yaw = float(np.arctan2(dy, dx)) if (abs(dx) + abs(dy)) > 1e-4 else 0.0
        elif i > 0:
            dx = wp[0] - waypoints[i - 1][0]
            dy = wp[1] - waypoints[i - 1][1]
            ego_yaw = float(np.arctan2(dy, dx)) if (abs(dx) + abs(dy)) > 1e-4 else 0.0
        else:
            ego_yaw = 0.0
        ego_corners = _box_corners_2d(wp[0], wp[1], ego_w, ego_l, ego_yaw)

        # Pick the obstacle set for this timestep (static fallback).
        boxes_at_t = gt_boxes
        if gt_boxes_per_timestep is not None and i < len(gt_boxes_per_timestep):
            boxes_at_t = gt_boxes_per_timestep[i]

        collided = False
        for box in boxes_at_t:
            # Boxes without a position cannot be tested; skip them.
            if 'world_coords' not in box:
                continue
            bx, by = box['world_coords'][0], box['world_coords'][1]
            bw = box.get('w', 2.0)
            bl = box.get('l', 4.0)
            byaw = box.get('yaw', 0.0)
            obj_corners = _box_corners_2d(bx, by, bw, bl, byaw)
            if _boxes_overlap(ego_corners, obj_corners):
                collided = True
                break
        collisions.append(collided)
    return collisions
602
+
603
+
604
def calculate_collision_rate(
    pred_waypoints: List[List[float]],
    gt_boxes: List[Dict],
    ego_w: float = 1.85,
    ego_l: float = 4.084,
    timestamps: List[float] = None,
    num_waypoints: int = 6,
    gt_waypoints: Optional[List[List[float]]] = None,
    gt_boxes_per_timestep: Optional[List[List[Dict]]] = None,
) -> Dict[str, float]:
    """Collision indicators for a predicted trajectory at 1s/2s/3s horizons.

    Waypoints are padded/truncated to *num_waypoints*; when *timestamps* is
    omitted a 0.5s step is assumed, placing the horizons at indices 1/3/5.
    ST-P3 alignment: timesteps where the GT trajectory itself collides are
    forced to False, so only avoidable predicted collisions are counted.
    Ego footprint defaults to 1.85m x 4.084m — presumably the nuScenes ego
    vehicle; confirm against the dataset config.

    Returns ``collision_1s/2s/3s`` (0.0 or 1.0, present only when a matching
    timestamp exists) and ``collision_avg`` over the present keys.
    """
    pred_padded = _pad_waypoints(pred_waypoints, target_n=num_waypoints)
    if timestamps is None:
        timestamps = [0.5 * (i + 1) for i in range(num_waypoints)]

    # ST-P3 aligned: exclude timesteps where the GT trajectory itself collides
    gt_collides = [False] * num_waypoints
    if gt_waypoints is not None:
        gt_padded = _pad_waypoints(gt_waypoints, target_n=num_waypoints)
        gt_collides = _check_collision_at_waypoints(
            gt_padded, gt_boxes, ego_w, ego_l,
            gt_boxes_per_timestep=gt_boxes_per_timestep,
        )

    pred_collides = _check_collision_at_waypoints(
        pred_padded, gt_boxes, ego_w, ego_l,
        gt_boxes_per_timestep=gt_boxes_per_timestep,
    )

    # Map each timestep's timestamp to its (GT-masked) collision flag.
    collisions_at_t = {}
    for i in range(num_waypoints):
        t = timestamps[i] if i < len(timestamps) else 0.5 * (i + 1)
        if gt_collides[i]:
            collisions_at_t[t] = False
        else:
            collisions_at_t[t] = pred_collides[i]

    results = {}
    for target_t, key in [(1.0, 'collision_1s'), (2.0, 'collision_2s'), (3.0, 'collision_3s')]:
        # Tolerant timestamp match (10 ms) against the provided horizon grid.
        matched = [v for t, v in collisions_at_t.items() if abs(t - target_t) < 0.01]
        if matched:
            results[key] = float(matched[0])

    key_cols = [v for k, v in results.items() if k in ('collision_1s', 'collision_2s', 'collision_3s')]
    results['collision_avg'] = float(np.mean(key_cols)) if key_cols else 0.0

    return results
650
+
651
+
652
def calculate_planning_metrics(
    predictions: List[Dict],
    ground_truths: List[Dict],
) -> Dict[str, float]:
    """Aggregate per-sample planning L2 errors and collision rates.

    A pair contributes L2 metrics when both trajectories are non-empty, and
    collision metrics when GT boxes (static or per-timestep) are available.
    Metrics with no contributing samples average to 0.0.
    """
    l2_acc: Dict[str, List[float]] = {k: [] for k in ('L2_1s', 'L2_2s', 'L2_3s', 'L2_avg')}
    col_acc: Dict[str, List[float]] = {
        k: [] for k in ('collision_1s', 'collision_2s', 'collision_3s', 'collision_avg')
    }

    for pred, gt in zip(predictions, ground_truths):
        pred_traj = pred.get('waypoints', [])
        gt_traj = gt.get('waypoints', [])

        if pred_traj and gt_traj:
            for name, value in calculate_planning_l2(pred_traj, gt_traj).items():
                if name in l2_acc:
                    l2_acc[name].append(value)

        boxes = gt.get('gt_boxes', [])
        boxes_per_ts = gt.get('gt_boxes_per_timestep', None)
        if pred_traj and (boxes or boxes_per_ts):
            col_metrics = calculate_collision_rate(
                pred_traj, boxes, gt_waypoints=gt_traj,
                gt_boxes_per_timestep=boxes_per_ts,
            )
            for name, value in col_metrics.items():
                if name in col_acc:
                    col_acc[name].append(value)

    summary: Dict[str, float] = {}
    for acc in (l2_acc, col_acc):
        for name, values in acc.items():
            summary[name] = float(np.mean(values)) if values else 0.0
    return summary
686
+
687
+
688
# Shared physical range (in value units) covered by velocity/acceleration bins.
VEL_ACC_RANGE = (-50.0, 50.0)


def vel_acc_bin_to_meters(bin_val: int, num_bins: int = 1000) -> float:
    """Decode a velocity/acceleration token bin back to a physical value.

    Delegates to `bin_to_meters` over VEL_ACC_RANGE (-50..50), so with the
    default 1000 bins each bin spans 0.1 units.
    """
    return bin_to_meters(bin_val, bin_range=VEL_ACC_RANGE, num_bins=num_bins)
693
+
694
+
695
def chamfer_distance_polyline(
    pred_pts: np.ndarray,
    gt_pts: np.ndarray,
) -> float:
    """Symmetric Chamfer distance between two 2-D/3-D point sets.

    Averages, in both directions, the distance from each point to its nearest
    neighbour in the other set, then takes the mean of the two directions.

    Args:
        pred_pts: predicted polyline points, array-like of shape [N, D].
        gt_pts: ground-truth polyline points, array-like of shape [M, D].

    Returns:
        The symmetric Chamfer distance, or +inf if either set is empty.
    """
    if len(pred_pts) == 0 or len(gt_pts) == 0:
        return float('inf')
    pred_pts = np.asarray(pred_pts, dtype=np.float64)
    gt_pts = np.asarray(gt_pts, dtype=np.float64)
    # One vectorized [N, M] pairwise-distance pass via broadcasting instead of
    # a Python-level loop per point (same O(N*M) work, done in C).
    dists = np.linalg.norm(pred_pts[:, None, :] - gt_pts[None, :, :], axis=-1)
    d_p2g = dists.min(axis=1).mean()  # pred -> nearest gt
    d_g2p = dists.min(axis=0).mean()  # gt -> nearest pred
    return float(0.5 * (d_p2g + d_g2p))
712
+
713
+
714
+ def _lane_points_array(lane) -> np.ndarray:
715
+ pts = lane.get('points', [])
716
+ if not pts:
717
+ return np.zeros((0, 3))
718
+ rows = []
719
+ for pt in pts:
720
+ if isinstance(pt, dict):
721
+ rows.append(pt.get('world_coords', [0, 0, 0])[:3])
722
+ else:
723
+ rows.append(list(pt)[:3])
724
+ return np.array(rows, dtype=np.float64)
725
+
726
+
727
def match_lanes(
    pred_lanes: List[Dict],
    gt_lanes: List[Dict],
    threshold: float = 1.5,
) -> Tuple[List[Tuple[int, int]], List[int], List[int]]:
    """Match predicted lanes to GT lanes by Chamfer distance.

    Builds a cost matrix of Chamfer distances (entries >= *threshold* stay
    infinite), then solves a one-to-one assignment with scipy when available,
    falling back to greedy lowest-cost-first matching otherwise.

    Returns:
        (matches, fp, fn): matched (pred_idx, gt_idx) pairs, unmatched
        prediction indices, and unmatched GT indices.
    """
    if not pred_lanes:
        return [], [], list(range(len(gt_lanes)))
    if not gt_lanes:
        return [], list(range(len(pred_lanes))), []

    n_p = len(pred_lanes)
    n_g = len(gt_lanes)
    cost = np.full((n_p, n_g), float('inf'))

    for i, pl in enumerate(pred_lanes):
        p_pts = _lane_points_array(pl)
        if len(p_pts) == 0:
            continue
        for j, gl in enumerate(gt_lanes):
            g_pts = _lane_points_array(gl)
            if len(g_pts) == 0:
                continue
            cd = chamfer_distance_polyline(p_pts, g_pts)
            if cd < threshold:
                cost[i, j] = cd

    matches = []
    matched_p = set()
    matched_g = set()

    if SCIPY_AVAILABLE and n_p > 0 and n_g > 0 and np.isfinite(cost).any():
        # BUG FIX: linear_sum_assignment raises ValueError on matrices that
        # contain inf, which is the common case (any pair beyond threshold).
        # The previous except-fallback then returned ZERO matches. Substitute
        # a large finite penalty instead; such pairs are discarded below by
        # the `< threshold` filter anyway.
        big_penalty = max(threshold, 1.0) * 1e6
        finite_cost = np.where(np.isfinite(cost), cost, big_penalty)
        try:
            row_ind, col_ind = linear_sum_assignment(finite_cost)
        except ValueError:
            # Defensive: keep the old behavior for any remaining infeasibility.
            row_ind, col_ind = [], []
        for pi, gi in zip(row_ind, col_ind):
            if cost[pi, gi] < threshold:
                matches.append((pi, gi))
                matched_p.add(pi)
                matched_g.add(gi)
    else:
        # Greedy fallback: take admissible pairs in ascending cost order.
        pairs = []
        for i in range(n_p):
            for j in range(n_g):
                if cost[i, j] < threshold:
                    pairs.append((cost[i, j], i, j))
        pairs.sort()
        for _, i, j in pairs:
            if i not in matched_p and j not in matched_g:
                matches.append((i, j))
                matched_p.add(i)
                matched_g.add(j)

    fp = [i for i in range(n_p) if i not in matched_p]
    fn = [j for j in range(n_g) if j not in matched_g]
    return matches, fp, fn
783
+
784
+
785
def calculate_lane_detection_metrics(
    pred_lanes: List[Dict],
    gt_lanes: List[Dict],
    threshold: float = 1.5,
) -> Dict[str, float]:
    """Precision/recall/F1 for lane detection via Chamfer-distance matching."""
    matched, unmatched_pred, unmatched_gt = match_lanes(pred_lanes, gt_lanes, threshold)
    tp, fp, fn = len(matched), len(unmatched_pred), len(unmatched_gt)

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom > 0 else 0.0

    return {
        'lane_precision': precision,
        'lane_recall': recall,
        'lane_f1': f1,
        'lane_tp': tp,
        'lane_fp': fp,
        'lane_fn': fn,
    }
805
+
806
+
807
def calculate_multi_threshold_detection_f1(
    predictions: List[Dict],
    ground_truths: List[Dict],
    thresholds: Tuple[float, ...] = (0.5, 1.0, 2.0, 4.0),
) -> Dict[str, float]:
    """Detection P/R/F1 at several distance thresholds plus the mean F1."""
    out: Dict[str, float] = {}
    per_threshold_f1: List[float] = []
    for thr in thresholds:
        metrics = calculate_detection_f1(predictions, ground_truths, threshold=thr)
        out[f'P@{thr}m'] = metrics['precision']
        out[f'R@{thr}m'] = metrics['recall']
        out[f'F1@{thr}m'] = metrics['f1']
        per_threshold_f1.append(metrics['f1'])
    out['F1_avg'] = float(np.mean(per_threshold_f1)) if per_threshold_f1 else 0.0
    return out
822
+
823
+
824
def evaluate_all(
    task_predictions: Dict[str, List],
    task_ground_truths: Dict[str, List],
) -> Dict[str, Dict[str, float]]:
    """Run every task evaluator for which both predictions and GT exist.

    Supported task keys: 'detection' (multi-threshold F1), 'lane'
    (per-sample P/R/F1 averaged over samples), 'planning' (L2 + collision).
    """
    summary: Dict[str, Dict[str, float]] = {}

    def _both_present(task: str) -> bool:
        # Evaluate a task only when predictions AND ground truths exist.
        return task in task_predictions and task in task_ground_truths

    if _both_present('detection'):
        summary['detection'] = calculate_multi_threshold_detection_f1(
            task_predictions['detection'],
            task_ground_truths['detection'],
        )

    if _both_present('lane'):
        lane_acc = {'lane_precision': [], 'lane_recall': [], 'lane_f1': []}
        for pred_set, gt_set in zip(task_predictions['lane'], task_ground_truths['lane']):
            # Each sample may be a single lane or a list of lanes.
            preds = pred_set if isinstance(pred_set, list) else [pred_set]
            gts = gt_set if isinstance(gt_set, list) else [gt_set]
            sample_metrics = calculate_lane_detection_metrics(preds, gts)
            for name in lane_acc:
                lane_acc[name].append(sample_metrics[name])
        summary['lane'] = {name: float(np.mean(vals)) for name, vals in lane_acc.items() if vals}

    if _both_present('planning'):
        summary['planning'] = calculate_planning_metrics(
            task_predictions['planning'],
            task_ground_truths['planning'],
        )

    return summary
src/model/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Atlas model module."""

from .topomlp_adapter import TopoMLPToAtlasMapTokens
from .streampetr_adapter import extract_streampetr_topk_tokens

# Atlas (LLM) side depends on `transformers`. StreamPETR/TopoMLP pretraining does not.
# Best-effort import: if the LLM-side modules (or their deps) fail to import,
# the adapter exports above remain usable and the LLM symbols degrade to None.
try:
    from .configuration_atlas import AtlasConfig
    from .modeling_atlas import AtlasProjector, AtlasForCausalLM
    _ATLAS_AVAILABLE = True
except Exception:
    AtlasConfig = None  # type: ignore[assignment]
    AtlasProjector = None  # type: ignore[assignment]
    AtlasForCausalLM = None  # type: ignore[assignment]
    _ATLAS_AVAILABLE = False

__all__ = [
    "TopoMLPToAtlasMapTokens",
    "extract_streampetr_topk_tokens",
]

# Advertise LLM-side symbols only when their import succeeded above.
if _ATLAS_AVAILABLE:
    __all__ += [
        "AtlasConfig",
        "AtlasProjector",
        "AtlasForCausalLM",
    ]
+ ]
28
+
src/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (622 Bytes). View file
 
src/model/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (622 Bytes). View file
 
src/model/__pycache__/configuration_atlas.cpython-310.pyc ADDED
Binary file (852 Bytes). View file
 
src/model/__pycache__/modeling_atlas.cpython-310.pyc ADDED
Binary file (15.3 kB). View file
 
src/model/__pycache__/modeling_atlas.cpython-38.pyc ADDED
Binary file (15.2 kB). View file
 
src/model/__pycache__/streampetr_adapter.cpython-310.pyc ADDED
Binary file (4.25 kB). View file
 
src/model/__pycache__/streampetr_adapter.cpython-38.pyc ADDED
Binary file (4.22 kB). View file
 
src/model/__pycache__/topomlp_adapter.cpython-310.pyc ADDED
Binary file (3.91 kB). View file
 
src/model/__pycache__/topomlp_adapter.cpython-38.pyc ADDED
Binary file (3.81 kB). View file
 
src/model/modeling_atlas.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Atlas model (LLM + visual token injection)."""
2
+
3
+ import os
4
+ import torch
5
+ import torch.nn as nn
6
+ from typing import Optional, Tuple, List, Union, Dict
7
+ from transformers import (
8
+ AutoModelForCausalLM,
9
+ AutoConfig,
10
+ BitsAndBytesConfig,
11
+ )
12
+ from transformers.modeling_outputs import CausalLMOutputWithPast
13
+
14
+ try:
15
+ from peft import (
16
+ LoraConfig,
17
+ get_peft_model,
18
+ prepare_model_for_kbit_training,
19
+ TaskType,
20
+ )
21
+ PEFT_AVAILABLE = True
22
+ except ImportError:
23
+ PEFT_AVAILABLE = False
24
+ from src.audit.audit_utils import audit_enabled, audit_check
25
+
26
+
27
def get_quantization_config(
    load_in_4bit: bool = True,
    bnb_4bit_compute_dtype: torch.dtype = torch.bfloat16,
    bnb_4bit_quant_type: str = "nf4",
    bnb_4bit_use_double_quant: bool = True,
) -> Optional[BitsAndBytesConfig]:
    """Build a BitsAndBytes 4-bit quantization config.

    Returns None when *load_in_4bit* is False (no quantization requested).
    """
    if not load_in_4bit:
        return None

    quant_kwargs = dict(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    )
    return BitsAndBytesConfig(**quant_kwargs)
45
+
46
+
47
class ReferencePointProjector(nn.Module):
    """Embed reference points [B, Q, 3] as features [B, Q, D].

    The linear map is zero-initialized (per paper) so that reference points
    contribute nothing until training updates the weights.
    """

    def __init__(self, visual_hidden_size: int = 256):
        super().__init__()
        self.visual_hidden_size = visual_hidden_size
        linear = nn.Linear(3, visual_hidden_size)
        nn.init.zeros_(linear.weight)
        if linear.bias is not None:
            nn.init.zeros_(linear.bias)
        self.projector_rp = linear

    def forward(self, ref_points: torch.Tensor) -> torch.Tensor:
        # Cast input to the layer's parameter dtype before projecting.
        target_dtype = self.projector_rp.weight.dtype
        return self.projector_rp(ref_points.to(target_dtype))
60
+
61
+
62
class AtlasProjector(nn.Module):
    """Linear projection of visual features [B, Q, Dv] into LLM hidden
    space [B, Q, H] (Xavier weight init, zero bias)."""

    def __init__(
        self,
        visual_hidden_size: int = 256,
        llm_hidden_size: int = 4096,
    ):
        super().__init__()
        self.visual_hidden_size = visual_hidden_size
        self.llm_hidden_size = llm_hidden_size
        linear = nn.Linear(visual_hidden_size, llm_hidden_size)
        nn.init.xavier_uniform_(linear.weight)
        if linear.bias is not None:
            nn.init.zeros_(linear.bias)
        self.projector = linear

    def forward(self, visual_features: torch.Tensor) -> torch.Tensor:
        # Match the projection layer's parameter dtype before applying it.
        target_dtype = self.projector.weight.dtype
        return self.projector(visual_features.to(target_dtype))
80
+
81
+
82
class AtlasUnifiedProjector(nn.Module):
    """Bundle of the Atlas projection heads.

    Separate detection/map projectors into LLM hidden space, plus one shared
    zero-initialized reference-point embedder whose output is added to the
    visual features before projection.
    """

    def __init__(self, visual_hidden_size: int, llm_hidden_size: int):
        super().__init__()
        self.projector_det = AtlasProjector(visual_hidden_size, llm_hidden_size)
        self.projector_map = AtlasProjector(visual_hidden_size, llm_hidden_size)
        self.projector_rp = ReferencePointProjector(visual_hidden_size)

    def forward(
        self,
        detection_features: torch.Tensor,
        map_features: Optional[torch.Tensor] = None,
        detection_ref_points: Optional[torch.Tensor] = None,
        map_ref_points: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        result: Dict[str, torch.Tensor] = {}

        det_in = detection_features
        if detection_ref_points is not None:
            det_in = det_in + self.projector_rp(detection_ref_points)
        result["detection"] = self.projector_det(det_in)

        # Map branch is optional; only projected when features are supplied.
        if map_features is not None:
            map_in = map_features
            if map_ref_points is not None:
                map_in = map_in + self.projector_rp(map_ref_points)
            result["map"] = self.projector_map(map_in)

        return result
108
+
109
+
110
class AtlasForCausalLM(nn.Module):
    """Atlas model: a causal LLM with visual query-token injection.

    Projected detection/map features are written into the <query> placeholder
    positions of the prompt embeddings before the LLM forward/generate call.
    Supports optional 4-bit quantization (requires LoRA) and flash-attention 2.
    """

    def __init__(
        self,
        llm_model_name: str = "lmsys/vicuna-7b-v1.5",
        visual_hidden_size: int = 256,
        num_queries: int = 256,
        num_map_queries: int = 256,
        load_in_4bit: bool = False,
        use_flash_attention: bool = True,
        device_map: Optional[str] = None,
        torch_dtype: torch.dtype = torch.bfloat16,
        use_lora: bool = False,
        lora_r: int = 64,
        lora_alpha: int = 64,
        lora_dropout: float = 0.1,
        lora_target_modules: List[str] = None,
    ):
        super().__init__()

        self.llm_model_name = llm_model_name
        self.visual_hidden_size = visual_hidden_size
        self.num_queries = num_queries
        self.num_map_queries = num_map_queries
        # Set later via set_query_token_id() once the tokenizer is known.
        self.query_token_id = None

        # 4-bit base weights are frozen; training then requires LoRA adapters.
        if load_in_4bit and not use_lora:
            raise ValueError(
                "load_in_4bit=True requires use_lora=True (4-bit weights are not trainable)."
            )

        if use_lora and not PEFT_AVAILABLE:
            raise ImportError("LoRA requires peft library")
        self.llm_config = AutoConfig.from_pretrained(llm_model_name)
        self.llm_hidden_size = self.llm_config.hidden_size

        quantization_config = None
        if load_in_4bit:
            quantization_config = get_quantization_config(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch_dtype,
            )

        # Fail fast if flash-attn was requested but is not installed.
        attn_implementation = None
        if use_flash_attention:
            try:
                import flash_attn  # noqa: F401
                attn_implementation = "flash_attention_2"
            except ImportError:
                raise ImportError(
                    "use_flash_attention=True but flash_attn is not installed. "
                    "Install with: pip install flash-attn --no-build-isolation"
                )

        self.llm = AutoModelForCausalLM.from_pretrained(
            llm_model_name,
            quantization_config=quantization_config,
            device_map=device_map,
            torch_dtype=torch_dtype,
            attn_implementation=attn_implementation,
            trust_remote_code=True,
        )

        if use_lora:
            if load_in_4bit:
                self.llm = prepare_model_for_kbit_training(self.llm)
            if lora_target_modules is None:
                lora_target_modules = [
                    "q_proj", "k_proj", "v_proj", "o_proj",  # Attention
                    "gate_proj", "up_proj", "down_proj",  # MLP
                ]

            lora_config = LoraConfig(
                r=lora_r,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                target_modules=lora_target_modules,
                bias="none",
                task_type=TaskType.CAUSAL_LM,
            )

            self.llm = get_peft_model(self.llm, lora_config)

        self.projector = AtlasUnifiedProjector(
            visual_hidden_size=visual_hidden_size,
            llm_hidden_size=self.llm_hidden_size,
        )

        # Co-locate the projector with the LLM weights when a device exists.
        try:
            llm_device = next(self.llm.parameters()).device
            self.projector = self.projector.to(llm_device)
        except StopIteration:
            pass

    def set_query_token_id(self, query_token_id: int) -> None:
        """Register the tokenizer id of the <query> placeholder token."""
        self.query_token_id = query_token_id

    def get_input_embeddings(self):
        return self.llm.get_input_embeddings()

    def resize_token_embeddings(self, new_num_tokens: int) -> None:
        self.llm.resize_token_embeddings(new_num_tokens)

    def gradient_checkpointing_enable(self) -> None:
        # Delegate to the wrapped LLM when it supports checkpointing.
        if hasattr(self.llm, 'gradient_checkpointing_enable'):
            self.llm.gradient_checkpointing_enable()

    # Parameter-name substrings that are excluded from weight decay.
    _NO_DECAY_KEYWORDS = {"bias", "LayerNorm.weight", "layernorm.weight",
                          "layer_norm.weight", "norm.weight"}

    def get_trainable_param_groups(self, lr: float, weight_decay: float = 0.0) -> List[Dict]:
        """Split trainable params into decay / no-decay optimizer groups."""
        decay, no_decay = [], []
        for module in [self.projector, self.llm]:
            for name, param in module.named_parameters():
                if not param.requires_grad:
                    continue
                if any(nd in name for nd in self._NO_DECAY_KEYWORDS):
                    no_decay.append(param)
                else:
                    decay.append(param)

        # The projector must always be trainable; a frozen projector would
        # silently detach the visual pathway.
        if not any(p.requires_grad for p in self.projector.parameters()):
            raise RuntimeError("projector has no trainable parameters (requires_grad=False).")

        groups: List[Dict] = []
        if decay:
            groups.append({"params": decay, "lr": lr, "weight_decay": weight_decay})
        if no_decay:
            groups.append({"params": no_decay, "lr": lr, "weight_decay": 0.0})
        return groups

    def get_expected_trainable_param_ids(self, lr: float) -> set:
        """Convenience helper for 'optimizer coverage' hard checks."""
        param_ids: set = set()
        for g in self.get_trainable_param_groups(lr):
            for p in g["params"]:
                param_ids.add(id(p))
        return param_ids

    def parameters(self, recurse: bool = True):
        # Yield projector params first, then LLM params.
        for param in self.projector.parameters(recurse):
            yield param
        for param in self.llm.parameters(recurse):
            yield param

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        visual_features: Optional[Union[torch.Tensor, Dict]] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
        query_token_id: Optional[int] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the LLM on the prompt with visual tokens already injected.

        Injection happens in `_prepare_llm_inputs`; the LLM is then called
        with `inputs_embeds` (never raw `input_ids`). `use_cache` is forced
        off in training mode.
        """
        inputs_embeds, attention_mask, position_ids = self._prepare_llm_inputs(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_features=visual_features,
            labels=labels,
            query_token_id=query_token_id,
        )
        if self.training:
            use_cache = False

        # Forward through LLM
        outputs = self.llm(
            input_ids=None,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            labels=labels,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs

    def _prepare_llm_inputs(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        visual_features: Optional[Union[torch.Tensor, Dict]] = None,
        labels: Optional[torch.LongTensor] = None,
        query_token_id: Optional[int] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
        """
        Shared path for forward/generate:
        - build inputs_embeds
        - inject visual tokens into <query> positions
        - ensure attention_mask / position_ids are consistent

        Expects `visual_features` either as a dict with keys "detection"
        (required), "map", "detection_ref_points", "map_ref_points", or as a
        bare tensor (detection features only). The number of <query> tokens
        per sample must equal det (or det+map) token counts, else raises.
        """
        inputs_embeds = self.llm.get_input_embeddings()(input_ids).clone()

        # Snapshot the pre-injection mask so audits can verify <query>
        # positions were already attended to before injection.
        _attention_mask_before_inject = None
        if attention_mask is not None:
            attention_mask = attention_mask.clone()
            _attention_mask_before_inject = attention_mask.clone()

        assert inputs_embeds.shape[:2] == input_ids.shape[:2], (
            f"shape mismatch: inputs_embeds={inputs_embeds.shape[:2]} != input_ids={input_ids.shape[:2]}"
        )
        if attention_mask is not None:
            assert attention_mask.shape[:2] == input_ids.shape[:2], (
                f"shape mismatch: attention_mask={attention_mask.shape[:2]} != input_ids={input_ids.shape[:2]}"
            )

        # Explicit argument wins over the id registered on the model.
        qid = query_token_id if query_token_id is not None else self.query_token_id

        if visual_features is not None and qid is not None:
            batch_size = input_ids.shape[0]
            device = inputs_embeds.device
            dtype = inputs_embeds.dtype

            if isinstance(visual_features, dict):
                detection_features = visual_features["detection"]
                map_features = visual_features.get("map", None)
                det_ref_points = visual_features.get("detection_ref_points", None)
                map_ref_points = visual_features.get("map_ref_points", None)

                projected = self.projector(
                    detection_features=detection_features,
                    map_features=map_features,
                    detection_ref_points=det_ref_points,
                    map_ref_points=map_ref_points,
                )
                detection_embeds = projected["detection"].to(dtype=dtype, device=device)
                map_embeds = projected.get("map", None)
                if map_embeds is not None:
                    map_embeds = map_embeds.to(dtype=dtype, device=device)
                num_det = detection_embeds.shape[1]
                num_map = map_embeds.shape[1] if map_embeds is not None else 0
            else:
                # Bare tensor: detection-only path through the det projector.
                detection_embeds = self.projector.projector_det(visual_features).to(dtype=dtype, device=device)
                map_embeds = None
                num_det = detection_embeds.shape[1]
                num_map = 0

            for b in range(batch_size):
                query_positions = torch.where(input_ids[b] == qid)[0]
                num_query_slots = int(query_positions.numel())
                if num_query_slots == 0:
                    continue

                # Audit B4: all <query> slots must precede the first
                # supervised (label != -100) answer token.
                if labels is not None:
                    ans_pos = (labels[b] != -100).nonzero(as_tuple=True)[0]
                    if ans_pos.numel() > 0 and audit_enabled():
                        first_answer_token_pos = int(ans_pos[0].item())
                        q_min = int(query_positions.min().item())
                        q_max = int(query_positions.max().item())
                        audit_check(
                            "B4",
                            q_max < first_answer_token_pos,
                            once=True,
                            min_query_pos=q_min,
                            max_query_pos=q_max,
                            first_answer_token_pos=first_answer_token_pos,
                        )

                # Injection: detection tokens fill the first slots, then map
                # tokens; slot count must match exactly, otherwise raise.
                if map_embeds is not None and num_query_slots == (num_det + num_map):
                    for i in range(num_det):
                        inputs_embeds[b, int(query_positions[i].item())] = detection_embeds[b, i]
                    for i in range(num_map):
                        inputs_embeds[b, int(query_positions[int(num_det) + i].item())] = map_embeds[b, i]
                    num_injected = int(num_det + num_map)
                    inj_det = int(num_det)
                    inj_map = int(num_map)
                elif num_query_slots == num_det:
                    for i in range(num_det):
                        inputs_embeds[b, int(query_positions[i].item())] = detection_embeds[b, i]
                    num_injected = int(num_det)
                    inj_det = int(num_det)
                    inj_map = 0
                else:
                    raise ValueError(
                        f"<query> slot mismatch: slots={num_query_slots}, "
                        f"det={num_det}, map={num_map if map_embeds is not None else 0}. "
                        f"Ensure visual_features provides the correct number of tokens "
                        f"matching the prompt's <query> placeholders."
                    )

                # Injected positions must be attended to.
                if attention_mask is not None and num_injected > 0:
                    attention_mask[b, query_positions[:num_injected]] = 1

                # One-shot (up to ATLAS_AUDIT_MAX_FWD) diagnostics on the
                # first batch element: injection counts, sequence lengths,
                # slot/source layout, mask sanity, round-trip embed equality
                # and text-vs-visual norm ratio.
                if audit_enabled():
                    if not hasattr(self, "_audit_forward_calls"):
                        self._audit_forward_calls = 0
                    max_calls = int(os.getenv("ATLAS_AUDIT_MAX_FWD", "1"))
                    if self._audit_forward_calls < max_calls and b == 0:
                        total_injected = int(num_injected)
                        print(
                            "[ATLAS_AUDIT][A3] "
                            f"num_query_tokens_in_input_ids={num_query_slots} "
                            f"num_detection_tokens_injected={inj_det} "
                            f"num_map_tokens_injected={inj_map} "
                            f"total_injected={total_injected}"
                        )
                        seq_len_input_ids = int(input_ids.shape[1])
                        seq_len_embeds = int(inputs_embeds.shape[1])
                        seq_len_mask = int(attention_mask.shape[1]) if attention_mask is not None else -1
                        print(f"[ATLAS_AUDIT][B1] seq_len_input_ids={seq_len_input_ids} seq_len_embeds={seq_len_embeds} seq_len_mask={seq_len_mask}")
                        n_show = min(20, num_query_slots)
                        pos_first = query_positions[:n_show].tolist()
                        src_first = []
                        for i in range(n_show):
                            src_first.append("DET" if i < int(num_det) else "MAP")
                        packed = ",".join([f"{s}@{p}" for s, p in zip(src_first, pos_first)])
                        print(f"[ATLAS_AUDIT][A4] first20={packed}")
                        if attention_mask is not None and n_show > 0 and _attention_mask_before_inject is not None:
                            q_mask = _attention_mask_before_inject[b, query_positions[:num_injected]]
                            bad = int((q_mask == 0).sum().item())
                            q_min2 = int(q_mask.min().item()) if q_mask.numel() else -1
                            q_max2 = int(q_mask.max().item()) if q_mask.numel() else -1
                            print(f"[ATLAS_AUDIT][A5/B2] query_mask_bad_count={bad} query_mask_min={q_min2} query_mask_max={q_max2}")
                            assert bad == 0, "<query> positions have attention_mask==0"
                        n = min(8, num_query_slots, num_injected)
                        if n > 0:
                            diffs = []
                            for i in range(n):
                                pos = int(query_positions[i].item())
                                if i < int(num_det):
                                    ref = detection_embeds[b, i]
                                else:
                                    j = i - int(num_det)
                                    ref = map_embeds[b, j] if map_embeds is not None else detection_embeds[b, min(i, int(num_det) - 1)]
                                diffs.append(float((inputs_embeds[b, pos] - ref).abs().max().item()))
                            max_diff = max(diffs) if diffs else 0.0
                            print(f"[ATLAS_AUDIT][C1] sampled_max_diff={max_diff:.3e} (n={n})")
                            assert max_diff < 1e-5, f"injected embed diff too large: {max_diff}"
                        # C3 audit (post-inject)
                        if attention_mask is not None:
                            text_pos2 = (attention_mask[b] == 1) & (input_ids[b] != qid)
                        else:
                            text_pos2 = (input_ids[b] != qid)
                        text_vec2 = inputs_embeds[b, text_pos2].float()
                        vis_vec2 = inputs_embeds[b, query_positions[:num_injected]].float()
                        if text_vec2.numel() > 0 and vis_vec2.numel() > 0:
                            text_norm = text_vec2.norm(dim=-1)
                            vis_norm = vis_vec2.norm(dim=-1)
                            t_mean = float(text_norm.mean().item())
                            t_std = float(text_norm.std(unbiased=False).item())
                            v_mean = float(vis_norm.mean().item())
                            v_std = float(vis_norm.std(unbiased=False).item())
                            ratio = v_mean / (t_mean + 1e-8)
                            rmin = float(os.getenv("ATLAS_VIS_TEXT_NORM_RATIO_MIN", "0.1"))
                            rmax = float(os.getenv("ATLAS_VIS_TEXT_NORM_RATIO_MAX", "10.0"))
                            audit_check(
                                "C3",
                                (ratio >= rmin and ratio <= rmax),
                                once=True,
                                ratio=ratio,
                                ratio_min=rmin,
                                ratio_max=rmax,
                                text_norm_mean=t_mean,
                                text_norm_std=t_std,
                                vis_norm_mean=v_mean,
                                vis_norm_std=v_std,
                                dtype=str(inputs_embeds.dtype),
                            )
                        self._audit_forward_calls += 1

        # Position ids: cumulative-sum form only for left-padded batches
        # (mask starts with 0 somewhere and ends with 1); plain arange else.
        batch_size, seq_length = inputs_embeds.shape[:2]
        device = inputs_embeds.device
        if attention_mask is not None:
            has_token = (attention_mask.sum(dim=1) > 0)
            left_padded = bool(((attention_mask[:, 0] == 0) & has_token).any().item() and ((attention_mask[:, -1] == 1) & has_token).any().item())
            if left_padded:
                position_ids = attention_mask.long().cumsum(dim=1) - 1
                position_ids.masked_fill_(attention_mask == 0, 0)
            else:
                position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device).unsqueeze(0).expand(batch_size, -1)
        else:
            position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device).unsqueeze(0).expand(batch_size, -1)

        return inputs_embeds, attention_mask, position_ids

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        visual_features: Optional[Union[torch.Tensor, Dict]] = None,
        max_new_tokens: int = 256,
        query_token_id: Optional[int] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Greedy/standard generation using the SAME projector+injection+pos/mask path as forward.
        """
        inputs_embeds, attention_mask, position_ids = self._prepare_llm_inputs(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_features=visual_features,
            labels=None,
            query_token_id=query_token_id,
        )
        gen_kwargs = dict(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
        )
        # Propagate the model's pad token unless the caller overrides it.
        if "pad_token_id" not in kwargs and hasattr(self.llm.config, "pad_token_id"):
            gen_kwargs["pad_token_id"] = self.llm.config.pad_token_id
        gen_kwargs.update(kwargs)
        return self.llm.generate(**gen_kwargs)
522
+
523
+
524
def load_atlas_model(
    llm_model_name: str = "lmsys/vicuna-7b-v1.5",
    visual_hidden_size: int = 256,
    num_queries: int = 256,
    num_map_queries: int = 256,
    load_in_4bit: bool = False,
    use_flash_attention: bool = True,
    use_lora: bool = False,
    lora_r: int = 64,
    lora_alpha: int = 64,
    lora_dropout: float = 0.1,
    lora_target_modules: List[str] = None,
) -> AtlasForCausalLM:
    """Factory wrapper: construct an AtlasForCausalLM with the given options."""
    model_kwargs = dict(
        llm_model_name=llm_model_name,
        visual_hidden_size=visual_hidden_size,
        num_queries=num_queries,
        num_map_queries=num_map_queries,
        load_in_4bit=load_in_4bit,
        use_flash_attention=use_flash_attention,
        use_lora=use_lora,
        lora_r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        lora_target_modules=lora_target_modules,
    )
    return AtlasForCausalLM(**model_kwargs)
src/model/streampetr_adapter.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ StreamPETR -> Atlas detection token adapter WITHOUT modifying StreamPETR source code.
3
+
4
+ Rationale:
5
+ - StreamPETRHead updates internal memory with Top-K proposals (topk_proposals, typically 256).
6
+ - After a forward pass, the head stores:
7
+ - self.memory_embedding: [B, memory_len + topk_proposals, D] (prepend new Top-K)
8
+ - self.memory_reference_point: [B, memory_len + topk_proposals, 3]
9
+ (exact ordering: new Top-K are concatenated in front)
10
+
11
+ IMPORTANT: after post_update_memory, memory_reference_point is in the **GLOBAL**
12
+ coordinate frame (ego_pose applied). We must invert the ego_pose to bring
13
+ ref_points back to ego frame before normalizing with pc_range.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import Any, Dict, Optional, Tuple
19
+
20
+ import torch
21
+
22
+ PC_RANGE = (-51.2, -51.2, -5.0, 51.2, 51.2, 3.0)
23
+
24
+
25
+ def _normalize_ref_points(
26
+ ref: torch.Tensor,
27
+ pc_range: Tuple[float, float, float, float, float, float] = PC_RANGE,
28
+ ) -> torch.Tensor:
29
+ pc_min = ref.new_tensor(pc_range[:3])
30
+ pc_max = ref.new_tensor(pc_range[3:])
31
+ denom = (pc_max - pc_min).clamp(min=1e-6)
32
+ return ((ref - pc_min) / denom).clamp(0.0, 1.0)
33
+
34
+
35
+ def _global_to_ego(ref: torch.Tensor, ego_pose: torch.Tensor) -> torch.Tensor:
36
+ """Transform reference points from global frame back to ego frame.
37
+
38
+ Args:
39
+ ref: [B, N, 3] in global coordinates
40
+ ego_pose: [B, 4, 4] ego-to-global transform
41
+ Returns:
42
+ [B, N, 3] in ego coordinates
43
+ """
44
+ B, N, _ = ref.shape
45
+ ones = torch.ones(B, N, 1, device=ref.device, dtype=ref.dtype)
46
+ ref_homo = torch.cat([ref, ones], dim=-1) # [B, N, 4]
47
+ ego_pose_inv = torch.inverse(ego_pose) # [B, 4, 4]
48
+ ref_ego = (ego_pose_inv.unsqueeze(1) @ ref_homo.unsqueeze(-1)).squeeze(-1)[..., :3]
49
+ return ref_ego
50
+
51
+
52
+ def _nuscenes_ego_to_paper(ref: torch.Tensor) -> torch.Tensor:
53
+ """Convert nuScenes ego coords to Atlas paper frame.
54
+
55
+ nuScenes ego uses x=forward, y=left. Atlas detection QA uses
56
+ x=right, y=forward, so (x_p, y_p) = (-y_n, x_n).
57
+ """
58
+ ref_paper = ref.clone()
59
+ ref_paper[..., 0] = -ref[..., 1]
60
+ ref_paper[..., 1] = ref[..., 0]
61
+ return ref_paper
62
+
63
+
64
@torch.no_grad()
def extract_streampetr_topk_tokens(
    pts_bbox_head: Any,
    topk: int = 256,
    pc_range: Tuple[float, float, float, float, float, float] = PC_RANGE,
    ego_pose: Optional[torch.Tensor] = None,
) -> Dict[str, torch.Tensor]:
    """Export the Top-K StreamPETR memory tokens for Atlas.

    Args:
        pts_bbox_head: the StreamPETRHead instance (``model.pts_bbox_head``).
        topk: number of tokens to export; should match ``topk_proposals``.
        pc_range: point-cloud range used by StreamPETR, for normalization.
        ego_pose: optional [B, 4, 4] ego-to-global transform. When given,
            reference points are mapped back from global to ego frame and
            rotated into the Atlas paper frame before normalization.

    Returns:
        Dict with ``detection`` [B, topk, D] and ``detection_ref_points``
        [B, topk, 3] normalized into [0, 1].
    """
    has_buffers = hasattr(pts_bbox_head, "memory_embedding") and hasattr(
        pts_bbox_head, "memory_reference_point"
    )
    if not has_buffers:
        raise RuntimeError("pts_bbox_head missing memory buffers; ensure you have run a forward pass first.")

    memory = pts_bbox_head.memory_embedding
    ref_pts = pts_bbox_head.memory_reference_point
    if memory is None or ref_pts is None:
        raise RuntimeError("pts_bbox_head memory is None; ensure you have run a forward pass and prev_exists is set.")

    if memory.ndim != 3 or ref_pts.ndim != 3 or ref_pts.shape[-1] != 3:
        raise RuntimeError(f"unexpected shapes: memory_embedding={getattr(memory,'shape',None)} memory_reference_point={getattr(ref_pts,'shape',None)}")

    if memory.shape[1] < topk or ref_pts.shape[1] < topk:
        raise RuntimeError(f"memory length too small: mem_len={memory.shape[1]} ref_len={ref_pts.shape[1]} topk={topk}")

    # New Top-K proposals are concatenated in FRONT of the memory, so slicing
    # the first `topk` entries yields the current frame's tokens.
    tokens = memory[:, :topk, :].contiguous()
    ref_topk = ref_pts[:, :topk, :].contiguous()

    # post_update_memory stores ref_points in the global frame; undo that and
    # rotate into the Atlas paper frame so projector_rp sees the same XY
    # semantics as the detection QA/GT.
    if ego_pose is not None:
        ref_topk = _nuscenes_ego_to_paper(_global_to_ego(ref_topk, ego_pose))

    return {
        "detection": tokens,
        "detection_ref_points": _normalize_ref_points(ref_topk, pc_range),
    }
src/model/topomlp_adapter.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TopoMLP -> Atlas map token adapter.
2
+
3
+ Paper-aligned: Top-K selection from TopoMLP decoder outputs,
4
+ followed by a single linear projector (handled by AtlasUnifiedProjector).
5
+ No Perceiver resampler -- queries and ref_points are passed through directly.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Dict, Optional, Tuple
11
+
12
+ import torch
13
+
14
+
15
+ def _lane_control_points_to_center_xyz(lane_preds: torch.Tensor) -> torch.Tensor:
16
+ if lane_preds.ndim != 3 or lane_preds.shape[-1] % 3 != 0:
17
+ raise ValueError(f"lane_preds expected [B,Q,3*K], got {tuple(lane_preds.shape)}")
18
+ B, Q, D = lane_preds.shape
19
+ K = D // 3
20
+ pts = lane_preds.view(B, Q, K, 3)
21
+ return pts.mean(dim=2)
22
+
23
+
24
+ def _normalize_xyz(xyz: torch.Tensor, xyz_min: torch.Tensor, xyz_max: torch.Tensor) -> torch.Tensor:
25
+ denom = (xyz_max - xyz_min).clamp(min=1e-6)
26
+ out = (xyz - xyz_min) / denom
27
+ return out.clamp(0.0, 1.0)
28
+
29
+
30
+ class TopoMLPToAtlasMapTokens(torch.nn.Module):
31
+ """Select top-K lane queries from TopoMLP and return them with reference points.
32
+
33
+ Aligned with Atlas paper Section 3.1:
34
+ "these queries are streamlined through a single linear layer"
35
+ The linear projection itself is in AtlasUnifiedProjector.projector_map.
36
+ This module only does Top-K selection + ref_point computation.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ num_map_tokens: int = 256,
42
+ hidden_size: int = 256,
43
+ bev_range: Tuple[float, float, float, float, float, float] = (-51.2, -25.6, -8.0, 51.2, 25.6, 4.0),
44
+ **kwargs,
45
+ ):
46
+ super().__init__()
47
+ self.num_map_tokens = int(num_map_tokens)
48
+ self.hidden_size = int(hidden_size)
49
+ self.bev_range = tuple(float(x) for x in bev_range)
50
+
51
+ xyz_min = torch.tensor(self.bev_range[:3], dtype=torch.float32)
52
+ xyz_max = torch.tensor(self.bev_range[3:], dtype=torch.float32)
53
+ self.register_buffer("_xyz_min", xyz_min, persistent=False)
54
+ self.register_buffer("_xyz_max", xyz_max, persistent=False)
55
+
56
+ @torch.no_grad()
57
+ def infer_lane_centers_from_outs(self, outs: Dict) -> torch.Tensor:
58
+ one2one_preds = outs["all_lc_preds_list"][-1]
59
+ return _lane_control_points_to_center_xyz(one2one_preds)
60
+
61
+ def forward(self, outs: Dict) -> Dict[str, torch.Tensor]:
62
+ lane_tokens = outs["lc_outs_dec_list"][-1]
63
+ lane_scores = outs["all_lc_cls_scores_list"][-1].squeeze(-1)
64
+
65
+ lane_centers = self.infer_lane_centers_from_outs(outs)
66
+ lane_ref_norm = _normalize_xyz(lane_centers, self._xyz_min, self._xyz_max)
67
+
68
+ B, N, D = lane_tokens.shape
69
+ if N == 0:
70
+ return {
71
+ "map": torch.zeros(B, self.num_map_tokens, D, dtype=lane_tokens.dtype, device=lane_tokens.device),
72
+ "map_ref_points": torch.zeros(B, self.num_map_tokens, 3, dtype=lane_ref_norm.dtype, device=lane_ref_norm.device),
73
+ }
74
+
75
+ k = min(self.num_map_tokens, N)
76
+ topk_idx = torch.topk(lane_scores, k=k, dim=1, largest=True, sorted=True).indices
77
+ tok_idx = topk_idx.unsqueeze(-1).expand(-1, -1, D)
78
+ ref_idx = topk_idx.unsqueeze(-1).expand(-1, -1, 3)
79
+ map_tokens = lane_tokens.gather(dim=1, index=tok_idx)
80
+ map_ref = lane_ref_norm.gather(dim=1, index=ref_idx)
81
+
82
+ if k < self.num_map_tokens:
83
+ pad_t = torch.zeros(B, self.num_map_tokens - k, D, dtype=map_tokens.dtype, device=map_tokens.device)
84
+ pad_r = torch.zeros(B, self.num_map_tokens - k, 3, dtype=map_ref.dtype, device=map_ref.device)
85
+ map_tokens = torch.cat([map_tokens, pad_t], dim=1)
86
+ map_ref = torch.cat([map_ref, pad_r], dim=1)
87
+
88
+ return {"map": map_tokens, "map_ref_points": map_ref}
src/prompting.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import re
3
+ from typing import Literal, Optional, Tuple
4
+
5
+ from src.audit.audit_utils import audit_enabled, h1b_record_prompt
6
+
7
+
8
# Prompt variants for the Atlas paper's Table 3 planning ablation.
PLANNING_TABLE3_MODES = (
    "atlas_base",
    "atlas_high_level",
    "atlas_high_level_ego",
)

# Matches (and strips) a previously injected high-level maneuver sentence.
_PLANNING_COMMAND_RE = re.compile(
    r"The ego car will (?:turn left|turn right|go straight) in future\.\s*"
)
# Matches (and strips) previously injected ego-state sentences.
# NOTE(review): \d+ only matches non-negative integers — if velocity or
# acceleration bins can be negative, an already-injected state sentence would
# not be stripped on a second rewrite. Confirm the bin value range.
_PLANNING_STATE_RE = re.compile(
    r"The current speed value of the ego car is \[\d+,\s*\d+\]\.\s*"
    r"The current acceleration value of the ego car is \[\d+,\s*\d+\]\.\s*"
)
# Exact sentence used as the insertion anchor when re-adding state/command text.
_PLANNING_ACCEL_SENTENCE = (
    "The acceleration of the vehicle is defined as "
    "[acceleration along the x-axis, acceleration along the y-axis]."
)
25
+
26
+
27
+ # Paper Table 6 prompts 1-3, plus two repository paraphrases.
28
+ DETECTION_PROMPTS = [
29
+ # Paper Table 6, prompt 1.
30
+ (
31
+ "There are six images captured by the surround view cameras in driving vehicle. "
32
+ "They are uniformly represented as queries embeddings<query>. "
33
+ "Define the positive y-axis as the forward direction and the positive x-axis as the right direction. "
34
+ "Please complete the visual detection task under the Bird's Eye View (BEV) perspective. "
35
+ "Ensure that the detection range does not exceed 50 meters."
36
+ ),
37
+ # Paper Table 6, prompt 2.
38
+ (
39
+ "There are six images captured by the surround view cameras in driving vehicle. "
40
+ "They are uniformly represented as queries embeddings<query>. "
41
+ "Establish the positive y-axis as the frontward direction and the positive x-axis as the rightward direction. "
42
+ "Kindly execute the visual detection task within the Bird's Eye View (BEV) framework. "
43
+ "Be mindful not to exceed a detection range of 50 meters."
44
+ ),
45
+ # Paper Table 6, prompt 3.
46
+ (
47
+ "There are six images captured by the surround view cameras in driving vehicle. "
48
+ "They are uniformly represented as queries embeddings<query>. "
49
+ "Set the forward direction as the positive y-axis and the right direction as the positive x-axis. "
50
+ "Please carry out the visual detection task within the Bird's Eye View (BEV) context. "
51
+ "Ensure that the detection range remains within 50 meters."
52
+ ),
53
+ # Additional paraphrase variant 4.
54
+ (
55
+ "There are six images captured by the surround view cameras in driving vehicle. "
56
+ "They are uniformly represented as queries embeddings<query>. "
57
+ "Let the positive y-axis denote the forward direction and the positive x-axis denote the right direction. "
58
+ "Please perform the visual detection task from the Bird's Eye View (BEV) perspective. "
59
+ "Keep the detection range within 50 meters."
60
+ ),
61
+ # Additional paraphrase variant 5.
62
+ (
63
+ "There are six images captured by the surround view cameras in driving vehicle. "
64
+ "They are uniformly represented as queries embeddings<query>. "
65
+ "Take the positive y-axis to be the forward direction and the positive x-axis to be the right direction. "
66
+ "Kindly carry out the visual detection task under the Bird's Eye View (BEV) perspective. "
67
+ "Ensure that all detections stay within 50 meters."
68
+ ),
69
+ ]
70
+
71
+ # Paper Table 7 prompts 1-3, plus two repository paraphrases.
72
+ LANE_PROMPTS = [
73
+ # Paper Table 7, prompt 1.
74
+ (
75
+ "There are six images captured by the surround view cameras in driving vehicle. "
76
+ "They are uniformly represented as queries embeddings<query>. "
77
+ "Please complete the centerline detection task under the Bird's Eye View (BEV) perspective. "
78
+ "Ensure that the detection range does not exceed 50 meters."
79
+ ),
80
+ # Paper Table 7, prompt 2. The published paper text appears truncated and is kept verbatim.
81
+ (
82
+ "There are six images captured by the surround view cameras in driving vehicle. "
83
+ "They are uniformly represented as queries embeddings<query>. "
84
+ "Kindly execute the centerline detection task within the Bird's Eye View (BEV) framework. "
85
+ "Be mindful not to exceed a detection range of 50 meters."
86
+ ),
87
+ # Paper Table 7, prompt 3.
88
+ (
89
+ "There are six images captured by the surround view cameras in driving vehicle. "
90
+ "They are uniformly represented as queries embeddings<query>. "
91
+ "Could you complete the task of detecting the centerline from the Bird's Eye View (BEV) perspective? "
92
+ "Ensure that the detection range remains within 50 meters."
93
+ ),
94
+ # Additional paraphrase variant 4.
95
+ (
96
+ "There are six images captured by the surround view cameras in driving vehicle. "
97
+ "They are uniformly represented as queries embeddings<query>. "
98
+ "Kindly execute the centerline detection task within the Bird's Eye View (BEV) framework. "
99
+ "Be mindful not to exceed a detection range of 50 meters."
100
+ ),
101
+ # Additional paraphrase variant 5.
102
+ (
103
+ "There are six images captured by the surround view cameras in driving vehicle. "
104
+ "They are uniformly represented as queries embeddings<query>. "
105
+ "Please carry out the task of detecting the centerline from the Bird's Eye View (BEV) perspective. "
106
+ "Ensure that the detection range remains within 50 meters."
107
+ ),
108
+ ]
109
+
110
+ # Paper Table 9 prompts 1-3, plus two repository paraphrases.
111
+ PLANNING_PROMPTS = [
112
+ # Paper Table 9, prompt 1. The paper uses fixed maneuver words; {command} keeps the same slot dynamic.
113
+ (
114
+ "The six images include objects that are uniformly represented as 3D detection query embeddings<query> "
115
+ "and map query embeddings<query>. "
116
+ "Define the positive y-axis as the forward direction and the positive x-axis as the right direction. "
117
+ "The speed of the vehicle is defined as [velocity along the x-axis, velocity along the y-axis]. "
118
+ "The acceleration of the vehicle is defined as [acceleration along the x-axis, acceleration along the y-axis]. "
119
+ "The ego car will {command} in future. "
120
+ "Kindly furnish suitable waypoints for the vehicle's trajectory based on the provided particulars. "
121
+ "Waypoints ought to adhere to the [x, y] format, with each waypoint spaced at 0.5-second intervals "
122
+ "within a continuous 3.0-second timeframe. "
123
+ "For planning tasks, please pay attention to driving safety and avoid vehicle collisions during driving in continous time."
124
+ ),
125
+ # Paper Table 9, prompt 2. The paper uses fixed maneuver words; {command} keeps the same slot dynamic.
126
+ (
127
+ "The six images include objects that are uniformly represented as 3D detection query embeddings<query> "
128
+ "and map query embeddings<query>. "
129
+ "Define the positive y-axis as the forward direction and the positive x-axis as the right direction. "
130
+ "The speed of the vehicle is defined as [velocity along the x-axis, velocity along the y-axis]. "
131
+ "The acceleration of the vehicle is defined as [acceleration along the x-axis, acceleration along the y-axis]. "
132
+ "The ego car will {command} in future. "
133
+ "We request your provision of pertinent waypoints for the vehicle's route in accordance with the given information. "
134
+ "Waypoints should conform to the format [x, y], with spacing set at 0.5-second intervals "
135
+ "over a continuous duration of 3.0 seconds. "
136
+ "For planning tasks, please pay attention to driving safety and avoid vehicle collisions during driving in continous time."
137
+ ),
138
+ # Paper Table 9, prompt 3. The paper uses fixed maneuver words; {command} keeps the same slot dynamic.
139
+ (
140
+ "The six images include objects that are uniformly represented as 3D detection query embeddings<query> "
141
+ "and map query embeddings<query>. "
142
+ "Define the positive y-axis as the forward direction and the positive x-axis as the right direction. "
143
+ "The speed of the vehicle is defined as [velocity along the x-axis, velocity along the y-axis]. "
144
+ "The acceleration of the vehicle is defined as [acceleration along the x-axis, acceleration along the y-axis]. "
145
+ "The ego car will {command} in future. "
146
+ "Please submit fitting waypoints for the vehicle's course based on the supplied data. "
147
+ "Ensure waypoints are structured as [x, y] and spaced at intervals of 0.5 seconds across a continuous 3.0-second period. "
148
+ "For planning tasks, please pay attention to driving safety and avoid vehicle collisions during driving in continous time."
149
+ ),
150
+ # Additional paraphrase variant 4.
151
+ (
152
+ "The six images include objects that are uniformly represented as 3D detection query embeddings<query> "
153
+ "and map query embeddings<query>. "
154
+ "Define the positive y-axis as the forward direction and the positive x-axis as the right direction. "
155
+ "The speed of the vehicle is defined as [velocity along the x-axis, velocity along the y-axis]. "
156
+ "The acceleration of the vehicle is defined as [acceleration along the x-axis, acceleration along the y-axis]. "
157
+ "The ego car will {command} in future. "
158
+ "Please provide suitable waypoints for the ego car in [x, y] format at 0.5-second intervals "
159
+ "over a continuous 3.0-second period. "
160
+ "For planning tasks, please pay attention to driving safety and avoid vehicle collisions during driving in continous time."
161
+ ),
162
+ # Additional paraphrase variant 5.
163
+ (
164
+ "The six images include objects that are uniformly represented as 3D detection query embeddings<query> "
165
+ "and map query embeddings<query>. "
166
+ "Define the positive y-axis as the forward direction and the positive x-axis as the right direction. "
167
+ "The speed of the vehicle is defined as [velocity along the x-axis, velocity along the y-axis]. "
168
+ "The acceleration of the vehicle is defined as [acceleration along the x-axis, acceleration along the y-axis]. "
169
+ "The ego car will {command} in future. "
170
+ "Could you generate appropriate waypoints for the vehicle's trajectory in [x, y] format, "
171
+ "with each waypoint separated by 0.5 seconds across the next 3.0 seconds? "
172
+ "For planning tasks, please pay attention to driving safety and avoid vehicle collisions during driving in continous time."
173
+ ),
174
+ ]
175
+
176
+ # Figure 5-style single-view caption prompt, parameterized by camera_name.
177
+ CAPTION_PROMPTS = [
178
+ (
179
+ "There are six images captured by the surround view cameras in driving vehicle. "
180
+ "They are uniformly represented as queries embeddings<query>. "
181
+ "Communicate a narrative of the setting within {camera_name} view image."
182
+ ),
183
+ ]
184
+
185
+
186
+ _TASK_POOLS = {
187
+ "detection": DETECTION_PROMPTS,
188
+ "lane": LANE_PROMPTS,
189
+ "planning": PLANNING_PROMPTS,
190
+ "caption": CAPTION_PROMPTS,
191
+ }
192
+
193
+
194
+ def get_prompt_pool(task: str):
195
+ return _TASK_POOLS.get(task, DETECTION_PROMPTS)
196
+
197
+
198
+ def sample_prompt(task: str, **kwargs) -> str:
199
+ pool = get_prompt_pool(task)
200
+ template = random.choice(pool)
201
+ if kwargs:
202
+ try:
203
+ return template.format(**kwargs)
204
+ except KeyError:
205
+ return template
206
+ return template
207
+
208
+
209
+ def rewrite_planning_prompt_for_table3(
210
+ prompt_text: str,
211
+ mode: str,
212
+ command: Optional[str] = None,
213
+ velocity_bins: Optional[Tuple[int, int]] = None,
214
+ acceleration_bins: Optional[Tuple[int, int]] = None,
215
+ ) -> str:
216
+ """Rewrite planning prompts to match Atlas Table 3 variants.
217
+
218
+ Modes:
219
+ - atlas_base: no high-level command, no explicit ego-state values
220
+ - atlas_high_level: keep high-level command only
221
+ - atlas_high_level_ego: keep high-level command and inject ego-state bins
222
+
223
+ In this repository, the top-level route command is a UniAD-style
224
+ future-GT-derived coarse planning command, not a raw nuScenes field.
225
+ """
226
+ if mode not in PLANNING_TABLE3_MODES:
227
+ raise ValueError(f"Unsupported planning mode: {mode}")
228
+
229
+ prompt = " ".join(str(prompt_text).split())
230
+ prompt = _PLANNING_STATE_RE.sub("", prompt)
231
+ prompt = _PLANNING_COMMAND_RE.sub("", prompt)
232
+
233
+ if mode == "atlas_base":
234
+ return " ".join(prompt.split())
235
+
236
+ if not command:
237
+ raise ValueError(
238
+ f"{mode} requires an explicit top-level route command field"
239
+ )
240
+
241
+ command_sentence = f"The ego car will {command} in future."
242
+
243
+ if mode == "atlas_high_level":
244
+ if _PLANNING_ACCEL_SENTENCE in prompt:
245
+ prompt = prompt.replace(
246
+ _PLANNING_ACCEL_SENTENCE,
247
+ f"{_PLANNING_ACCEL_SENTENCE} {command_sentence}",
248
+ 1,
249
+ )
250
+ return " ".join(prompt.split())
251
+ return " ".join(f"{command_sentence} {prompt}".split())
252
+
253
+ if velocity_bins is None or acceleration_bins is None:
254
+ raise ValueError(
255
+ "atlas_high_level_ego requires velocity_bins and acceleration_bins"
256
+ )
257
+
258
+ state_sentence = (
259
+ f"The current speed value of the ego car is [{int(velocity_bins[0])}, {int(velocity_bins[1])}]. "
260
+ f"The current acceleration value of the ego car is [{int(acceleration_bins[0])}, {int(acceleration_bins[1])}]."
261
+ )
262
+ if _PLANNING_ACCEL_SENTENCE in prompt:
263
+ prompt = prompt.replace(
264
+ _PLANNING_ACCEL_SENTENCE,
265
+ f"{_PLANNING_ACCEL_SENTENCE} {state_sentence} {command_sentence}",
266
+ 1,
267
+ )
268
+ return " ".join(prompt.split())
269
+
270
+ return " ".join(f"{state_sentence} {command_sentence} {prompt}".split())
271
+
272
+
273
+ def build_prompt(prompt_text: str, mode: Literal["train", "infer"]) -> str:
274
+ s = f"USER: {prompt_text}\nASSISTANT:"
275
+ if audit_enabled():
276
+ h1b_record_prompt(mode, s)
277
+ return s
train_atlas.py ADDED
@@ -0,0 +1,1018 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import sys
5
+ import math
6
+ import time
7
+ import json
8
+ import logging
9
+ from datetime import timedelta
10
+ from pathlib import Path
11
+ from typing import Dict, Optional, List
12
+
13
+ import torch
14
+ import torch.distributed as dist
15
+ from torch.utils.data import DataLoader
16
+
17
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
18
+
19
+ from src.model.modeling_atlas import AtlasForCausalLM
20
+ from src.model.topomlp_adapter import TopoMLPToAtlasMapTokens
21
+ from src.model.streampetr_adapter import extract_streampetr_topk_tokens
22
+ from src.dataset.atlas_dataset import (
23
+ AtlasDataset, make_atlas_collate_fn, load_tokenizer,
24
+ )
25
+ from src.dataset.scene_sampler import SceneSequentialSampler
26
+ from src.prompting import PLANNING_TABLE3_MODES
27
+
28
+ logger = logging.getLogger("train_atlas")
29
+
30
+
31
def parse_args():
    """Build and parse the CLI arguments for Atlas training.

    Argument groups: model/encoder selection, data paths, optimization
    hyperparameters, LoRA / quantization switches, checkpointing and logging
    cadence, distributed-launch plumbing, and visual-token sourcing (live
    frozen encoders vs precomputed offline token directories).
    """
    p = argparse.ArgumentParser()
    # --- Model / frozen encoders ---
    p.add_argument("--llm_model", default="lmsys/vicuna-7b-v1.5")
    p.add_argument("--visual_hidden_size", type=int, default=256)
    p.add_argument("--num_det_queries", type=int, default=256)
    p.add_argument("--num_map_queries", type=int, default=256)
    p.add_argument("--streampetr_config", default=None)
    p.add_argument("--streampetr_ckpt", default=None)
    p.add_argument("--topomlp_config", default=None)
    p.add_argument("--topomlp_ckpt", default=None)
    # --- Data ---
    p.add_argument("--data_json", required=True)
    p.add_argument("--data_root", default="/mnt/data/nuscenes")
    p.add_argument("--max_length", type=int, default=4096)
    p.add_argument("--output_dir", default="work_dirs/atlas")
    # --- Optimization ---
    p.add_argument("--lr", type=float, default=2e-5)
    p.add_argument("--weight_decay", type=float, default=1e-4)
    p.add_argument("--batch_size", type=int, default=1)
    p.add_argument("--epochs", type=int, default=8)
    p.add_argument("--warmup_ratio", type=float, default=0.03)
    p.add_argument("--gradient_accumulation_steps", type=int, default=2)
    p.add_argument("--max_grad_norm", type=float, default=1.0)
    # --- LoRA / quantization ---
    p.add_argument("--use_lora", action="store_true")
    p.add_argument("--lora_r", type=int, default=64)
    p.add_argument("--lora_alpha", type=int, default=64)
    p.add_argument("--lora_dropout", type=float, default=0.1)
    p.add_argument("--load_in_4bit", action="store_true")
    # --- Checkpointing / logging ---
    p.add_argument("--save_steps", type=int, default=0)
    p.add_argument("--save_epochs", type=int, default=1)
    p.add_argument("--log_steps", type=int, default=10)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--num_workers", type=int, default=4)
    p.add_argument("--resume", default=None)
    # torchrun supplies LOCAL_RANK via env; accept both flag spellings.
    p.add_argument("--local_rank", "--local-rank", type=int, default=int(os.environ.get("LOCAL_RANK", -1)))
    p.add_argument("--fp16", action="store_true")
    p.add_argument("--bf16", action="store_true")
    p.add_argument("--image_path_remap", default=None,
                   help="old=new path remap, e.g. /mnt/data=/local/data")
    # --- Visual token sourcing (validated later by _validate_visual_token_mode) ---
    p.add_argument("--precomputed_det_tokens", default=None,
                   help="[offline only] Dir with precomputed det tokens (.pt files)")
    p.add_argument("--precomputed_map_tokens", default=None,
                   help="[offline only] Dir with precomputed TopoMLP map tokens (.pt files)")
    p.add_argument("--visual_token_mode", choices=("online", "offline"), default="online",
                   help="Visual token source: online=live frozen encoders (default), offline=read *_offline dirs")
    p.add_argument("--deepspeed", default=None,
                   help="Path to DeepSpeed config JSON (enables ZeRO)")
    p.add_argument("--keep_last_n_ckpts", type=int, default=0,
                   help="Keep only the N most recent epoch checkpoints (0=keep all)")
    p.add_argument(
        "--planning_table3_mode",
        choices=PLANNING_TABLE3_MODES,
        default="atlas_base",
        help=(
            "Planning prompt variant matching Atlas Table 3: "
            "atlas_base=no command/no explicit ego state; "
            "atlas_high_level=requires top-level route_command "
            "(this repo uses a UniAD-style future-GT-derived command); "
            "atlas_high_level_ego=requires top-level route_command plus "
            "velocity/acceleration bins."
        ),
    )
    return p.parse_args()
92
+
93
+
94
+ def _validate_visual_token_mode(args):
95
+ """Enforce mode-specific constraints. Fail hard, never silently degrade."""
96
+ if args.visual_token_mode == "online":
97
+ if args.precomputed_det_tokens or args.precomputed_map_tokens:
98
+ raise RuntimeError(
99
+ "visual_token_mode=online forbids --precomputed_det_tokens / "
100
+ "--precomputed_map_tokens. Use --visual_token_mode offline to "
101
+ "read offline token directories."
102
+ )
103
+ missing = []
104
+ if not args.streampetr_config or not args.streampetr_ckpt:
105
+ missing.append("--streampetr_config/--streampetr_ckpt")
106
+ if not args.topomlp_config or not args.topomlp_ckpt:
107
+ missing.append("--topomlp_config/--topomlp_ckpt")
108
+ if missing:
109
+ raise RuntimeError(
110
+ "visual_token_mode=online requires live encoder configs and "
111
+ "checkpoints. Missing: " + ", ".join(missing)
112
+ )
113
+ for p in (args.streampetr_config, args.streampetr_ckpt, args.topomlp_config, args.topomlp_ckpt):
114
+ if not os.path.exists(p):
115
+ raise RuntimeError(f"Required online asset does not exist: {p}")
116
+ if args.batch_size != 1:
117
+ raise RuntimeError(
118
+ "visual_token_mode=online with temporal memory requires "
119
+ "--batch_size 1 (paper-aligned). Got: %d" % args.batch_size
120
+ )
121
+ else:
122
+ if not args.precomputed_det_tokens and not args.precomputed_map_tokens:
123
+ raise RuntimeError(
124
+ "visual_token_mode=offline requires at least one "
125
+ "--precomputed_*_tokens directory."
126
+ )
127
+ for p in (args.precomputed_det_tokens, args.precomputed_map_tokens):
128
+ if p and not os.path.isdir(p):
129
+ raise RuntimeError(f"Offline token directory does not exist: {p}")
130
+
131
+
132
def set_seed(seed):
    """Seed python, numpy, and torch (incl. all CUDA devices) RNGs."""
    import random

    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Only touch CUDA generators when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
140
+
141
+
142
def setup_distributed(local_rank):
    """Initialize the NCCL process group for distributed training.

    Returns ``(device, is_distributed, rank, world_size)``. A *local_rank*
    of -1 means single-process mode and skips process-group setup.
    """
    if local_rank == -1:
        dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return dev, False, 0, 1
    dist.init_process_group(backend="nccl", timeout=timedelta(seconds=1800))
    torch.cuda.set_device(local_rank)
    return (
        torch.device("cuda", local_rank),
        True,
        dist.get_rank(),
        dist.get_world_size(),
    )
152
+
153
+
154
def is_main_process(distributed, rank):
    """True on the rank-0 process, or always in non-distributed runs."""
    if not distributed:
        return True
    return rank == 0
156
+
157
+
158
def load_frozen_encoder(config_path, ckpt_path, model_type, device):
    """Build a frozen (eval-mode, requires_grad=False) mm-series encoder.

    Args:
        config_path: mm config file path, or None to skip loading entirely.
        ckpt_path: checkpoint path, or None to skip loading entirely.
        model_type: "streampetr" or "topomlp"; selects which external plugin
            package is put on sys.path and imported before model build.
        device: torch device the frozen model is moved to.

    Returns:
        The frozen model, or None when config/checkpoint were not provided.

    Raises:
        RuntimeError: when mmcv/mmdet3d or the required plugin is missing.
    """
    if config_path is None or ckpt_path is None:
        return None
    try:
        from mmcv import Config
        from mmdet3d.models import build_model
        from mmcv.runner import load_checkpoint
    except ImportError as e:
        raise RuntimeError(
            f"mmcv/mmdet3d not installed but --{model_type}_config and "
            f"--{model_type}_ckpt were explicitly provided. "
            f"Install mmcv/mmdet3d or remove these arguments to train without {model_type}."
        ) from e

    if model_type == "streampetr":
        sp_root = str(Path(__file__).resolve().parent / "external" / "StreamPETR")
        if sp_root not in sys.path:
            sys.path.insert(0, sp_root)
        try:
            import projects.mmdet3d_plugin  # noqa: F401
        except ImportError as e:
            raise RuntimeError(
                f"StreamPETR plugin not found under {sp_root}/projects/mmdet3d_plugin. "
                f"Ensure the submodule is checked out, or remove --streampetr_config/--streampetr_ckpt."
            ) from e
    elif model_type == "topomlp":
        tp_root = str(Path(__file__).resolve().parent / "external" / "TopoMLP_Repo")
        if tp_root not in sys.path:
            sys.path.insert(0, tp_root)
        try:
            os.environ["ATLAS_TOPOMLP_MODELS_ONLY"] = "1"
            from mmcv.utils import registry as _reg
            _orig = _reg.Registry._register_module

            # TopoMLP re-registers modules already present in the mmcv
            # registries; temporarily force-register to tolerate duplicates.
            def _tolerant_register(self, module, module_name=None, force=False):
                return _orig(self, module, module_name=module_name, force=True)

            _reg.Registry._register_module = _tolerant_register
            try:
                import projects.topomlp  # noqa: F401
            finally:
                # BUGFIX: restore the registry even if the plugin import
                # fails; previously a failed import left the force-register
                # monkeypatch installed for the rest of the process.
                _reg.Registry._register_module = _orig
        except ImportError as e:
            raise RuntimeError(
                f"TopoMLP plugin not found under {tp_root}/projects/topomlp. "
                f"Ensure the submodule is checked out, or remove --topomlp_config/--topomlp_ckpt."
            ) from e

    cfg = Config.fromfile(config_path)
    model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"))
    load_checkpoint(model, ckpt_path, map_location="cpu")
    model.eval()
    model.to(device)
    for param in model.parameters():
        param.requires_grad_(False)
    logger.info("Loaded frozen %s from %s", model_type, ckpt_path)
    return model
211
+
212
+
213
def build_img_metas_streampetr(batch, device, idx):
    """Build the img_metas entry for sample `idx` in StreamPETR's format."""
    num_cams = batch["pixel_values_det"].shape[1]
    cam_h, cam_w = 800, 1600  # fixed detection input resolution
    scene_ids = batch.get("scene_id", ["__atlas__"] * (idx + 1))
    scene_token = scene_ids[idx] if idx < len(scene_ids) else "__atlas__"
    meta = {
        "pad_shape": [(cam_h, cam_w, 3) for _ in range(num_cams)],
        "img_shape": [(cam_h, cam_w, 3) for _ in range(num_cams)],
        "scene_token": scene_token,
    }
    if "lidar2img_det" in batch:
        meta["lidar2img"] = batch["lidar2img_det"][idx].cpu().numpy()
    return meta
225
+
226
+
227
def build_img_metas_topomlp(batch, device, idx):
    """Build the img_metas entry for sample `idx` in TopoMLP's format."""
    meta = {}
    if "lidar2img_map" in batch:
        meta["lidar2img"] = batch["lidar2img_map"][idx].cpu().numpy()
    num_cams = batch["pixel_values_map"].shape[1]
    per_cam_shape = ((800, 1600, 3),) * num_cams  # fixed map input resolution
    meta["img_shape"] = per_cam_shape
    meta["pad_shape"] = per_cam_shape
    meta["scale_factor"] = 1.0
    meta["te_yolov8"] = None  # no external YOLOv8 traffic-element proposals
    return meta
238
+
239
+
240
@torch.no_grad()
def run_streampetr_forward(model, imgs, img_metas, batch, device, prev_exists=None):
    """Run one frozen StreamPETR forward pass; return pts_bbox_head outputs.

    Missing calibration / pose / timestamp entries in `batch` fall back to
    identity matrices or zeros so the pass never fails on partial metadata.
    """
    batch_size, num_cams = imgs.shape[:2]

    def _identity_per_cam():
        eye = torch.eye(4, device=device)
        return eye.unsqueeze(0).unsqueeze(0).expand(batch_size, num_cams, -1, -1).contiguous()

    if prev_exists is None:
        prev_exists = imgs.new_zeros(batch_size)

    data = {
        "img": imgs,
        "img_feats": model.extract_img_feat(imgs, 1),
        "prev_exists": prev_exists,
    }

    # Camera intrinsics: embed the provided 3x3 K into a homogeneous 4x4.
    if "intrinsics_det" in batch:
        k3 = batch["intrinsics_det"].to(device)
        k4 = torch.zeros(batch_size, num_cams, 4, 4, device=device, dtype=k3.dtype)
        k4[:, :, :3, :3] = k3
        k4[:, :, 3, 3] = 1.0
        data["intrinsics"] = k4
    else:
        data["intrinsics"] = _identity_per_cam()

    if "lidar2img_det" in batch:
        data["lidar2img"] = batch["lidar2img_det"].to(device)
    else:
        data["lidar2img"] = _identity_per_cam()

    ego_pose = batch.get("ego_pose")
    if ego_pose is not None:
        data["ego_pose"] = ego_pose.to(device)
    else:
        data["ego_pose"] = torch.eye(4, device=device).unsqueeze(0).expand(batch_size, -1, -1).contiguous()

    ego_pose_inv = batch.get("ego_pose_inv")
    if ego_pose_inv is not None:
        data["ego_pose_inv"] = ego_pose_inv.to(device)
    else:
        data["ego_pose_inv"] = torch.inverse(data["ego_pose"])

    timestamp = batch.get("timestamp")
    if timestamp is not None:
        data["timestamp"] = timestamp.to(device)
    else:
        data["timestamp"] = torch.zeros(batch_size, device=device)

    location = model.prepare_location(img_metas, **data)
    roi_outs = model.forward_roi_head(location, **data)
    return model.pts_bbox_head(location, img_metas, roi_outs["topk_indexes"], **data)
287
+
288
+
289
@torch.no_grad()
def run_topomlp_forward(model, imgs, img_metas):
    """Thin no-grad wrapper around TopoMLP's simple_forward."""
    outs = model.simple_forward(imgs, img_metas)
    return outs
292
+
293
+
294
+ def _reconstruct_topomlp_outs(saved: dict, device, dtype):
295
+ """Convert precomputed .pt dict back to the format adapter.forward() expects."""
296
+ def _restore(t):
297
+ return t.to(device=device, dtype=dtype).unsqueeze(0)
298
+ return {
299
+ "lc_outs_dec_list": [_restore(saved["lc_outs_dec"])],
300
+ "all_lc_cls_scores_list": [_restore(saved["lc_cls_scores"])],
301
+ "all_lc_preds_list": [_restore(saved["lc_preds"])],
302
+ "lc_outs_dec_one2many_list": [_restore(saved["lc_outs_dec_o2m"])],
303
+ "all_lc_cls_scores_one2many_list": [_restore(saved["lc_cls_scores_o2m"])],
304
+ "all_lc_preds_one2many_list": [_restore(saved["lc_preds_o2m"])],
305
+ }
306
+
307
+
308
def extract_visual_tokens(
    streampetr_model,
    topomlp_model,
    topomlp_adapter,
    batch,
    device,
    num_det_queries=256,
    visual_hidden_size=256,
    query_token_id=None,
    visual_token_mode="online",
    streaming_state=None,
):
    """Extract det + map visual tokens.

    In online mode with streaming_state, StreamPETR temporal memory is managed
    per-scene and duplicate physical frames are protected: if the current
    sample_id equals the previous one, we reuse cached det tokens and skip the
    StreamPETR forward to avoid pushing the same frame into memory twice.

    Args:
        streampetr_model: frozen StreamPETR model, or None.
        topomlp_model: frozen TopoMLP model, or None.
        topomlp_adapter: adapter turning TopoMLP outputs into map tokens;
            also defines `num_map_tokens` and the dtype to cast map inputs to.
        batch: collated batch dict; must contain "pixel_values_det".
        device: target torch device for all produced tensors.
        num_det_queries: number of detection tokens kept (top-k).
        visual_hidden_size: channel width used when zero-filling tokens.
        query_token_id: tokenizer id of "<query>"; map tokens are only needed
            when the prompt has more <query> slots than det tokens.
        visual_token_mode: "online" (run frozen encoders) or "offline"
            (use precomputed tokens from the batch; never zero-fill).
        streaming_state: mutable dict tracking previous scene/sample/timestamp
            for StreamPETR temporal memory; only used in online mode.

    Returns:
        Dict with "detection", "detection_ref_points", "map" and
        "map_ref_points" tensors (batch-first).

    Raises:
        RuntimeError: when required tokens/models are missing for the mode.
    """
    B = batch["pixel_values_det"].shape[0]
    vis: Dict[str, torch.Tensor] = {}

    # Map tokens are needed only when the prompt contains more <query>
    # placeholders than the detection branch can fill.
    needs_map = False
    if query_token_id is not None and "input_ids" in batch:
        n_queries = int((batch["input_ids"] == query_token_id).sum(dim=-1).max().item())
        needs_map = n_queries > num_det_queries

    # ---- Detection tokens ----
    if visual_token_mode == "offline" and "precomputed_det" in batch and "precomputed_det_ref" in batch:
        vis["detection"] = batch["precomputed_det"].to(device)
        vis["detection_ref_points"] = batch["precomputed_det_ref"].to(device)
    elif visual_token_mode == "offline":
        raise RuntimeError(
            "visual_token_mode=offline but detection precomputed tokens are missing "
            "for the current batch. Refusing to zero-fill."
        )
    elif streampetr_model is not None:
        # Temporal memory is per-sample; batching would interleave scenes.
        if B != 1 and streaming_state is not None:
            raise RuntimeError("online temporal det requires batch_size=1")

        current_sample_id = batch.get("sample_id", [None])[0]
        current_scene = batch.get("scene_id", ["__atlas__"])[0]
        reuse_cache = False

        if streaming_state is not None:
            prev_scene = streaming_state.get("prev_scene_token")
            prev_sample_id = streaming_state.get("prev_sample_id")
            ts_tensor = batch.get("timestamp")
            current_ts = float(ts_tensor[0].item()) if ts_tensor is not None else None
            prev_ts = streaming_state.get("prev_timestamp")

            # A scene change or a non-increasing timestamp marks a segment
            # boundary; StreamPETR temporal memory must be reset there.
            is_new_segment = (
                prev_scene is None
                or current_scene != prev_scene
                or (current_ts is not None and prev_ts is not None and current_ts <= prev_ts)
            )

            # Same physical frame as last step: reuse cached tokens so the
            # frame is not pushed into temporal memory twice.
            if current_sample_id is not None and current_sample_id == prev_sample_id:
                cached = streaming_state.get("cached_det")
                if cached is not None:
                    reuse_cache = True
                    vis["detection"] = cached["detection"]
                    vis["detection_ref_points"] = cached["detection_ref_points"]

            if not reuse_cache:
                if is_new_segment:
                    streampetr_model.pts_bbox_head.reset_memory()
                # prev_exists=0 tells StreamPETR there is no usable memory.
                prev_exists_val = 0.0 if is_new_segment else 1.0
                imgs_det = batch["pixel_values_det"].to(device)
                prev_exists = imgs_det.new_full((B,), prev_exists_val)

                img_metas = [build_img_metas_streampetr(batch, device, b) for b in range(B)]
                run_streampetr_forward(streampetr_model, imgs_det, img_metas, batch, device, prev_exists=prev_exists)
                ego_pose_for_ref = batch.get("ego_pose")
                if ego_pose_for_ref is not None:
                    ego_pose_for_ref = ego_pose_for_ref.to(device)
                det_out = extract_streampetr_topk_tokens(
                    streampetr_model.pts_bbox_head,
                    topk=num_det_queries,
                    ego_pose=ego_pose_for_ref,
                )
                vis["detection"] = det_out["detection"]
                vis["detection_ref_points"] = det_out["detection_ref_points"]

                # Cache for possible duplicate-frame reuse on the next step.
                streaming_state["cached_det"] = {
                    "detection": vis["detection"],
                    "detection_ref_points": vis["detection_ref_points"],
                }

            streaming_state["prev_scene_token"] = current_scene
            streaming_state["prev_sample_id"] = current_sample_id
            if batch.get("timestamp") is not None:
                streaming_state["prev_timestamp"] = float(batch["timestamp"][0].item())
        else:
            # Stateless online path: single forward with no temporal memory.
            imgs_det = batch["pixel_values_det"].to(device)
            img_metas = [build_img_metas_streampetr(batch, device, b) for b in range(B)]
            run_streampetr_forward(streampetr_model, imgs_det, img_metas, batch, device)
            ego_pose_for_ref = batch.get("ego_pose")
            if ego_pose_for_ref is not None:
                ego_pose_for_ref = ego_pose_for_ref.to(device)
            det_out = extract_streampetr_topk_tokens(
                streampetr_model.pts_bbox_head,
                topk=num_det_queries,
                ego_pose=ego_pose_for_ref,
            )
            vis["detection"] = det_out["detection"]
            vis["detection_ref_points"] = det_out["detection_ref_points"]
    elif visual_token_mode == "online":
        raise RuntimeError(
            "visual_token_mode=online but StreamPETR model is None. "
            "Provide --streampetr_config and --streampetr_ckpt."
        )
    else:
        # No encoder and not strictly offline: zero-fill detection tokens.
        vis["detection"] = torch.zeros(B, num_det_queries, visual_hidden_size, device=device)
        vis["detection_ref_points"] = torch.zeros(B, num_det_queries, 3, device=device)

    # ---- Map tokens ----
    num_map_queries = num_det_queries
    if topomlp_adapter is not None:
        num_map_queries = topomlp_adapter.num_map_tokens

    if topomlp_adapter is not None:
        # Infer the adapter's working dtype from its params/buffers so
        # precomputed or TopoMLP outputs can be cast to match.
        _params = list(topomlp_adapter.parameters())
        _bufs = list(topomlp_adapter.buffers())
        adapter_dtype = _params[0].dtype if _params else (_bufs[0].dtype if _bufs else torch.float32)

    map_filled = False
    if visual_token_mode == "offline" and needs_map and "precomputed_map" in batch:
        if B == 1:
            outs = _reconstruct_topomlp_outs(batch["precomputed_map"][0], device, adapter_dtype)
        else:
            # Rebuild each sample then concatenate along the batch dim,
            # list-entry by list-entry.
            per_sample = [_reconstruct_topomlp_outs(batch["precomputed_map"][b], device, adapter_dtype) for b in range(B)]
            outs = {}
            for k in per_sample[0]:
                outs[k] = [torch.cat([s[k][i] for s in per_sample], dim=0) for i in range(len(per_sample[0][k]))]
        map_out = topomlp_adapter(outs)
        vis["map"] = map_out["map"]
        vis["map_ref_points"] = map_out["map_ref_points"]
        map_filled = True
    elif visual_token_mode == "offline" and needs_map:
        raise RuntimeError(
            "visual_token_mode=offline but map precomputed tokens are missing "
            "for a batch that requires map queries. Refusing to zero-fill."
        )
    elif needs_map and topomlp_model is not None:
        imgs_map = batch["pixel_values_map"].to(device)
        img_metas = [build_img_metas_topomlp(batch, device, b) for b in range(B)]
        outs = run_topomlp_forward(topomlp_model, imgs_map, img_metas)
        # Cast TopoMLP outputs (tensors or lists of tensors) to adapter dtype.
        for k, v in outs.items():
            if isinstance(v, torch.Tensor):
                outs[k] = v.to(adapter_dtype)
            elif isinstance(v, list):
                outs[k] = [x.to(adapter_dtype) if isinstance(x, torch.Tensor) else x for x in v]
        map_out = topomlp_adapter(outs)
        vis["map"] = map_out["map"]
        vis["map_ref_points"] = map_out["map_ref_points"]
        map_filled = True
    elif needs_map and visual_token_mode == "online":
        raise RuntimeError(
            "visual_token_mode=online but TopoMLP model is None. "
            "Provide --topomlp_config and --topomlp_ckpt."
        )

    # Prompts that never reference map queries still get placeholder tokens.
    if not map_filled:
        vis["map"] = torch.zeros(B, num_map_queries, visual_hidden_size, device=device)
        vis["map_ref_points"] = torch.zeros(B, num_map_queries, 3, device=device)

    return vis
476
+
477
+
478
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, min_lr_ratio=0.0):
    """Linear warmup to the base LR, then cosine decay down to min_lr_ratio."""
    def _scale(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        frac = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
        cosine = 0.5 * (1.0 + math.cos(math.pi * frac))
        return max(min_lr_ratio, cosine)

    return torch.optim.lr_scheduler.LambdaLR(optimizer, _scale)
485
+
486
+
487
+ def _optimizer_steps_per_epoch(num_batches: int, grad_accum_steps: int) -> int:
488
+ if num_batches <= 0:
489
+ return 0
490
+ return int(math.ceil(float(num_batches) / float(max(1, grad_accum_steps))))
491
+
492
+
493
+ def _accum_window_size_for_batch(
494
+ batch_idx: int,
495
+ num_batches: int,
496
+ grad_accum_steps: int,
497
+ ) -> int:
498
+ """Return the effective accumulation window size for this batch.
499
+
500
+ Full windows use `grad_accum_steps`. The final partial window uses the
501
+ remainder so that tail batches are not under-scaled when we flush them.
502
+ """
503
+ grad_accum_steps = max(1, int(grad_accum_steps))
504
+ remainder = int(num_batches % grad_accum_steps)
505
+ tail_start = int(num_batches - remainder)
506
+ if remainder > 0 and batch_idx >= tail_start:
507
+ return remainder
508
+ return grad_accum_steps
509
+
510
+
511
+ def _is_optimizer_step_batch(
512
+ batch_idx: int,
513
+ num_batches: int,
514
+ grad_accum_steps: int,
515
+ ) -> bool:
516
+ grad_accum_steps = max(1, int(grad_accum_steps))
517
+ natural_boundary = ((batch_idx + 1) % grad_accum_steps) == 0
518
+ is_last_batch = (batch_idx + 1) == num_batches
519
+ return natural_boundary or is_last_batch
520
+
521
+
522
def save_checkpoint(path, atlas, adapter, optimizer, scheduler, global_step, epoch, args):
    """Serialize training state (model, adapter, optimizer, scheduler) to `path`."""
    payload = {
        "global_step": global_step,
        "epoch": epoch,
        "args": vars(args),
        # State dicts are moved to CPU so checkpoints load on any device.
        "atlas_state_dict": {name: tensor.cpu() for name, tensor in atlas.state_dict().items()},
        "optimizer": optimizer.state_dict(),
        "scheduler": None if scheduler is None else scheduler.state_dict(),
    }
    if adapter is not None:
        payload["adapter_state_dict"] = {name: tensor.cpu() for name, tensor in adapter.state_dict().items()}
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    torch.save(payload, path)
535
+
536
+
537
def cleanup_old_checkpoints(output_dir: Path, keep_n: int):
    """Remove the oldest epoch-* checkpoint dirs so at most *keep_n* remain."""
    if keep_n <= 0:
        return
    import shutil

    def _epoch_num(d):
        return int(d.name.split("-")[1])

    candidates = [d for d in output_dir.iterdir() if d.is_dir() and d.name.startswith("epoch-")]
    candidates.sort(key=_epoch_num)
    n_delete = max(0, len(candidates) - keep_n)
    for stale in candidates[:n_delete]:
        shutil.rmtree(stale, ignore_errors=True)
        logger.info("Deleted old checkpoint: %s", stale)
550
+
551
+
552
+ def main():
553
+ args = parse_args()
554
+ class _FlushHandler(logging.StreamHandler):
555
+ def emit(self, record):
556
+ super().emit(record)
557
+ self.flush()
558
+ logging.root.handlers.clear()
559
+ _h = _FlushHandler(sys.stderr)
560
+ _fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
561
+ _h.setFormatter(_fmt)
562
+ logging.root.addHandler(_h)
563
+ logging.root.setLevel(logging.INFO)
564
+
565
+ _validate_visual_token_mode(args)
566
+
567
+ device, distributed, rank, world_size = setup_distributed(args.local_rank)
568
+ set_seed(args.seed + rank)
569
+ _main = is_main_process(distributed, rank)
570
+
571
+ is_online = args.visual_token_mode == "online"
572
+
573
+ output_dir = Path(args.output_dir)
574
+ if _main:
575
+ output_dir.mkdir(parents=True, exist_ok=True)
576
+ _fh = logging.FileHandler(str(output_dir / "train.log"), mode="a")
577
+ _fh.setFormatter(_fmt)
578
+ logging.root.addHandler(_fh)
579
+ with open(output_dir / "args.json", "w") as f:
580
+ json.dump(vars(args), f, indent=2)
581
+
582
+ if _main:
583
+ logger.info("Loading tokenizer: %s", args.llm_model)
584
+ tokenizer = load_tokenizer(args.llm_model)
585
+ if "<query>" not in tokenizer.get_vocab():
586
+ tokenizer.add_tokens(["<query>"])
587
+
588
+ _precomp_det = args.precomputed_det_tokens if not is_online else None
589
+ _precomp_map = args.precomputed_map_tokens if not is_online else None
590
+ dataset = AtlasDataset(
591
+ json_file=args.data_json,
592
+ image_root=args.data_root,
593
+ tokenizer=tokenizer,
594
+ max_length=args.max_length,
595
+ is_training=True,
596
+ planning_table3_mode=args.planning_table3_mode,
597
+ image_path_remap=args.image_path_remap,
598
+ precomputed_det_tokens=_precomp_det,
599
+ precomputed_map_tokens=_precomp_map,
600
+ )
601
+
602
+ if is_online:
603
+ scene_groups = dataset.get_scene_groups()
604
+ sampler = SceneSequentialSampler(
605
+ scene_groups,
606
+ num_replicas=world_size,
607
+ rank=rank,
608
+ seed=args.seed,
609
+ pad_to_multiple=args.gradient_accumulation_steps,
610
+ )
611
+ if _main:
612
+ logger.info("Online mode: SceneSequentialSampler (%d scenes, %d samples, world=%d)",
613
+ len(scene_groups), len(dataset), world_size)
614
+ else:
615
+ from torch.utils.data import DistributedSampler
616
+ sampler = DistributedSampler(dataset, shuffle=True) if distributed else None
617
+
618
+ collate_fn = make_atlas_collate_fn(tokenizer.pad_token_id)
619
+ dataloader = DataLoader(
620
+ dataset,
621
+ batch_size=args.batch_size,
622
+ shuffle=(not is_online and sampler is None),
623
+ sampler=sampler,
624
+ num_workers=args.num_workers,
625
+ collate_fn=collate_fn,
626
+ pin_memory=True,
627
+ drop_last=not is_online,
628
+ )
629
+
630
+ streampetr_model = load_frozen_encoder(
631
+ args.streampetr_config, args.streampetr_ckpt, "streampetr", device
632
+ )
633
+ topomlp_model = load_frozen_encoder(
634
+ args.topomlp_config, args.topomlp_ckpt, "topomlp", device
635
+ )
636
+
637
+ topomlp_adapter = None
638
+ if topomlp_model is not None or _precomp_map:
639
+ _tp_bev_range = (-51.2, -25.6, -8.0, 51.2, 25.6, 4.0)
640
+ if args.topomlp_config:
641
+ try:
642
+ from mmcv import Config as _Cfg
643
+ _tp_cfg = _Cfg.fromfile(args.topomlp_config)
644
+ if hasattr(_tp_cfg, "point_cloud_range"):
645
+ _tp_bev_range = tuple(float(v) for v in _tp_cfg.point_cloud_range)
646
+ logger.info("TopoMLP bev_range from config: %s", _tp_bev_range)
647
+ except Exception as e:
648
+ logger.warning("Failed to read point_cloud_range from TopoMLP config: %s. Using default: %s", e, _tp_bev_range)
649
+ topomlp_adapter = TopoMLPToAtlasMapTokens(
650
+ num_map_tokens=args.num_map_queries,
651
+ hidden_size=args.visual_hidden_size,
652
+ bev_range=_tp_bev_range,
653
+ ).to(device)
654
+
655
+ dtype = torch.float32
656
+ if args.bf16:
657
+ dtype = torch.bfloat16
658
+ elif args.fp16:
659
+ dtype = torch.float16
660
+
661
+ if args.load_in_4bit:
662
+ dm = {"": device} if distributed else "auto"
663
+ else:
664
+ dm = None
665
+
666
+ _ds_bf16 = False
667
+ _ds_fp16 = False
668
+ if args.deepspeed:
669
+ with open(args.deepspeed) as _f:
670
+ _ds_cfg_peek = json.load(_f)
671
+ _ds_bf16 = _ds_cfg_peek.get("bf16", {}).get("enabled", False)
672
+ _ds_fp16 = _ds_cfg_peek.get("fp16", {}).get("enabled", False)
673
+ _use_half = args.bf16 or args.fp16 or _ds_bf16 or _ds_fp16
674
+ if _use_half and dtype == torch.float32:
675
+ dtype = torch.bfloat16 if (args.bf16 or _ds_bf16) else torch.float16
676
+
677
+ atlas = AtlasForCausalLM(
678
+ llm_model_name=args.llm_model,
679
+ visual_hidden_size=args.visual_hidden_size,
680
+ num_queries=args.num_det_queries,
681
+ num_map_queries=args.num_map_queries,
682
+ load_in_4bit=args.load_in_4bit,
683
+ use_flash_attention=_use_half,
684
+ device_map=dm,
685
+ torch_dtype=dtype,
686
+ use_lora=args.use_lora,
687
+ lora_r=args.lora_r,
688
+ lora_alpha=args.lora_alpha,
689
+ lora_dropout=args.lora_dropout,
690
+ )
691
+ atlas.resize_token_embeddings(len(tokenizer))
692
+ query_token_id = tokenizer.convert_tokens_to_ids("<query>")
693
+ atlas.set_query_token_id(query_token_id)
694
+ if topomlp_adapter is not None:
695
+ atlas.topomlp_adapter = topomlp_adapter
696
+ if dm is None and args.deepspeed is None:
697
+ atlas = atlas.to(device)
698
+ atlas.gradient_checkpointing_enable()
699
+
700
+ num_batches_per_epoch = len(dataloader)
701
+ steps_per_epoch = _optimizer_steps_per_epoch(
702
+ num_batches_per_epoch, args.gradient_accumulation_steps
703
+ )
704
+ total_steps = steps_per_epoch * args.epochs
705
+ warmup_steps = int(total_steps * args.warmup_ratio)
706
+
707
+ global_step = 0
708
+ start_epoch = 0
709
+
710
+ _resume_ckpt = None
711
+ if args.resume:
712
+ _resume_ckpt = torch.load(args.resume, map_location="cpu")
713
+ if "atlas_state_dict" not in _resume_ckpt:
714
+ raise RuntimeError(f"Checkpoint missing 'atlas_state_dict'. Keys: {list(_resume_ckpt.keys())}")
715
+ missing, _ = atlas.load_state_dict(_resume_ckpt["atlas_state_dict"], strict=False)
716
+ if _main and missing:
717
+ logger.warning("Resume: %d missing keys (first 10): %s", len(missing), missing[:10])
718
+ if topomlp_adapter is not None and "adapter_state_dict" in _resume_ckpt:
719
+ _m, _u = topomlp_adapter.load_state_dict(_resume_ckpt["adapter_state_dict"], strict=False)
720
+ if _main and _u:
721
+ logger.info("Adapter resume: ignored %d legacy keys: %s", len(_u), _u[:5])
722
+ global_step = _resume_ckpt.get("global_step", 0)
723
+ start_epoch = _resume_ckpt.get("epoch", 0)
724
+ if _main:
725
+ logger.info("Resumed from %s (step=%d, epoch=%d)", args.resume, global_step, start_epoch)
726
+
727
+ use_deepspeed = args.deepspeed is not None
728
+ if use_deepspeed:
729
+ import deepspeed
730
+ ds_config = json.load(open(args.deepspeed))
731
+ ds_config["optimizer"] = {
732
+ "type": "Adam",
733
+ "params": {
734
+ "lr": args.lr, "weight_decay": args.weight_decay,
735
+ "betas": [0.9, 0.999], "torch_adam": True, "adam_w_mode": True,
736
+ },
737
+ }
738
+ ds_config["scheduler"] = {
739
+ "type": "WarmupCosineLR",
740
+ "params": {
741
+ "total_num_steps": total_steps,
742
+ "warmup_num_steps": warmup_steps,
743
+ "warmup_type": "linear",
744
+ },
745
+ }
746
+ ds_config["gradient_accumulation_steps"] = args.gradient_accumulation_steps
747
+ ds_config["train_micro_batch_size_per_gpu"] = args.batch_size
748
+ ds_config["train_batch_size"] = args.batch_size * args.gradient_accumulation_steps * world_size
749
+
750
+ ds_bf16 = ds_config.get("bf16", {}).get("enabled", False)
751
+ ds_fp16 = ds_config.get("fp16", {}).get("enabled", False)
752
+ if ds_bf16:
753
+ atlas.to(device=device, dtype=torch.bfloat16)
754
+ elif ds_fp16:
755
+ atlas.to(device=device, dtype=torch.float16)
756
+ else:
757
+ atlas.to(device)
758
+
759
+ all_params = atlas.get_trainable_param_groups(args.lr, weight_decay=args.weight_decay)
760
+ if topomlp_adapter is not None:
761
+ _adapter_trainable = [p for p in topomlp_adapter.parameters() if p.requires_grad]
762
+ if _adapter_trainable:
763
+ all_params.append({"params": _adapter_trainable, "lr": args.lr, "weight_decay": 0.0})
764
+
765
+ atlas_ddp, optimizer, _, scheduler = deepspeed.initialize(
766
+ model=atlas, model_parameters=all_params,
767
+ config=ds_config, dist_init_required=False,
768
+ )
769
+
770
+ if _resume_ckpt is not None and "optimizer" in _resume_ckpt:
771
+ try:
772
+ optimizer.load_state_dict(_resume_ckpt["optimizer"])
773
+ if _main:
774
+ logger.info("Restored DeepSpeed optimizer state from checkpoint")
775
+ except Exception as e:
776
+ if _main:
777
+ logger.warning("Failed to restore DeepSpeed optimizer state: %s", e)
778
+
779
+ if global_step > 0 and scheduler is not None:
780
+ for _ in range(global_step):
781
+ scheduler.step()
782
+ _ff_lr = scheduler.get_lr()
783
+ if _main:
784
+ logger.info(
785
+ "Fast-forwarded DeepSpeed LR scheduler to step %d (lr=%s)",
786
+ global_step,
787
+ [f"{x:.6e}" for x in _ff_lr] if isinstance(_ff_lr, (list, tuple)) else f"{_ff_lr:.6e}",
788
+ )
789
+ else:
790
+ param_groups = atlas.get_trainable_param_groups(args.lr, weight_decay=args.weight_decay)
791
+ if topomlp_adapter is not None:
792
+ _adapter_trainable = [p for p in topomlp_adapter.parameters() if p.requires_grad]
793
+ if _adapter_trainable:
794
+ param_groups.append({"params": _adapter_trainable, "lr": args.lr, "weight_decay": 0.0})
795
+ optimizer = torch.optim.AdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay)
796
+ scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
797
+ if distributed:
798
+ atlas_ddp = torch.nn.parallel.DistributedDataParallel(
799
+ atlas, device_ids=[args.local_rank], find_unused_parameters=True,
800
+ )
801
+ else:
802
+ atlas_ddp = atlas
803
+
804
+ if _resume_ckpt is not None and not use_deepspeed:
805
+ if "optimizer" in _resume_ckpt:
806
+ try:
807
+ optimizer.load_state_dict(_resume_ckpt["optimizer"])
808
+ except Exception as e:
809
+ if _main:
810
+ logger.warning("Failed to restore optimizer state: %s", e)
811
+ if "scheduler" in _resume_ckpt and _resume_ckpt["scheduler"] is not None:
812
+ try:
813
+ scheduler.load_state_dict(_resume_ckpt["scheduler"])
814
+ except Exception as e:
815
+ if _main:
816
+ logger.warning("Failed to restore scheduler state: %s", e)
817
+ _resume_ckpt = None
818
+
819
+ atlas_ddp.train()
820
+ if topomlp_adapter is not None:
821
+ topomlp_adapter.train()
822
+
823
+ if _main:
824
+ logger.info("=== Training Config ===")
825
+ logger.info(" epochs: %d, lr: %s, batch: %d, accum: %d",
826
+ args.epochs, args.lr, args.batch_size, args.gradient_accumulation_steps)
827
+ logger.info(" total_steps: %d, warmup_steps: %d", total_steps, warmup_steps)
828
+ logger.info(" use_lora: %s, load_in_4bit: %s, fp16: %s, bf16: %s, deepspeed: %s",
829
+ args.use_lora, args.load_in_4bit, args.fp16, args.bf16, use_deepspeed)
830
+ n_trainable = sum(p.numel() for p in atlas.parameters() if p.requires_grad)
831
+ if topomlp_adapter is not None:
832
+ n_trainable += sum(p.numel() for p in topomlp_adapter.parameters() if p.requires_grad)
833
+ logger.info(" trainable params: %s", f"{n_trainable:,}")
834
+ logger.info(" visual_token_mode: %s", args.visual_token_mode)
835
+ logger.info(" streampetr: %s", "online-temporal" if (is_online and streampetr_model) else ("loaded" if streampetr_model else ("precomputed" if _precomp_det else "NONE (should not happen)")))
836
+ logger.info(" topomlp: %s", "online" if (is_online and topomlp_model) else ("loaded" if topomlp_model else ("precomputed" if _precomp_map else "NONE (should not happen)")))
837
+ logger.info("=======================")
838
+
839
+ streaming_state = {} if is_online else None
840
+
841
+ for epoch in range(start_epoch, args.epochs):
842
+ if sampler is not None:
843
+ sampler.set_epoch(epoch)
844
+
845
+ if streaming_state is not None:
846
+ streaming_state.clear()
847
+ if streampetr_model is not None:
848
+ streampetr_model.pts_bbox_head.reset_memory()
849
+
850
+ epoch_loss = 0.0
851
+ num_batches = 0
852
+ t0 = time.time()
853
+
854
+ if not use_deepspeed:
855
+ optimizer.zero_grad()
856
+
857
+ for batch_idx, batch in enumerate(dataloader):
858
+ do_step = _is_optimizer_step_batch(
859
+ batch_idx, num_batches_per_epoch, args.gradient_accumulation_steps
860
+ )
861
+ accum_window_size = _accum_window_size_for_batch(
862
+ batch_idx, num_batches_per_epoch, args.gradient_accumulation_steps
863
+ )
864
+ scaled_loss = None
865
+ if use_deepspeed:
866
+ if not hasattr(atlas_ddp, "set_gradient_accumulation_boundary"):
867
+ raise RuntimeError(
868
+ "DeepSpeed engine is missing set_gradient_accumulation_boundary(); "
869
+ "cannot enforce epoch-tail flush semantics."
870
+ )
871
+ atlas_ddp.set_gradient_accumulation_boundary(do_step)
872
+
873
+ if _main and batch_idx < 5:
874
+ _has_map = "precomputed_map" in batch
875
+ _nq = int((batch["input_ids"] == query_token_id).sum(dim=-1).max().item()) if query_token_id else 0
876
+ logger.info("[DBG] batch_idx=%d nq=%d has_map=%s sid=%s mode=%s",
877
+ batch_idx, _nq, _has_map,
878
+ batch.get("sample_id", ["?"])[0][:20],
879
+ args.visual_token_mode)
880
+ for _handler in logging.root.handlers:
881
+ _handler.flush()
882
+
883
+ input_ids = batch["input_ids"].to(device)
884
+ attention_mask = batch["attention_mask"].to(device)
885
+ labels = batch["labels"].to(device)
886
+
887
+ visual_features = extract_visual_tokens(
888
+ streampetr_model, topomlp_model, topomlp_adapter,
889
+ batch, device, args.num_det_queries, args.visual_hidden_size,
890
+ query_token_id=query_token_id,
891
+ visual_token_mode=args.visual_token_mode,
892
+ streaming_state=streaming_state,
893
+ )
894
+
895
+ if _main and batch_idx < 5:
896
+ logger.info("[DBG] vis_keys=%s", list(visual_features.keys()))
897
+ for _handler in logging.root.handlers:
898
+ _handler.flush()
899
+
900
+ if _main and batch_idx < 5:
901
+ logger.info("[DBG] pre-forward batch_idx=%d seqlen=%d", batch_idx, input_ids.shape[1])
902
+ for _handler in logging.root.handlers:
903
+ _handler.flush()
904
+
905
+ outputs = atlas_ddp(
906
+ input_ids=input_ids,
907
+ attention_mask=attention_mask,
908
+ visual_features=visual_features,
909
+ labels=labels,
910
+ )
911
+ loss = outputs.loss
912
+
913
+ if _main and batch_idx < 5:
914
+ logger.info("[DBG] post-forward batch_idx=%d loss=%.4f", batch_idx, loss.item())
915
+ for _handler in logging.root.handlers:
916
+ _handler.flush()
917
+
918
+
919
+
920
+ if _main and batch_idx < 5:
921
+ logger.info("[DBG] pre-backward batch_idx=%d", batch_idx)
922
+ for _handler in logging.root.handlers:
923
+ _handler.flush()
924
+
925
+ if use_deepspeed:
926
+ scaled_loss = loss / accum_window_size
927
+ atlas_ddp.backward(scaled_loss, scale_wrt_gas=False)
928
+
929
+ if _main and batch_idx < 5:
930
+ logger.info("[DBG] pre-step batch_idx=%d", batch_idx)
931
+ for _handler in logging.root.handlers:
932
+ _handler.flush()
933
+
934
+ atlas_ddp.step()
935
+ else:
936
+ scaled_loss = loss / accum_window_size
937
+ scaled_loss.backward()
938
+
939
+ if not use_deepspeed and distributed and topomlp_adapter is not None:
940
+ for p in topomlp_adapter.parameters():
941
+ if p.requires_grad and p.grad is not None:
942
+ dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
943
+ p.grad.div_(world_size)
944
+
945
+ epoch_loss += loss.item()
946
+ num_batches += 1
947
+ if _main and num_batches <= 3:
948
+ logger.info("batch=%d loss=%.4f", num_batches, loss.item())
949
+ for _handler in logging.root.handlers:
950
+ _handler.flush()
951
+
952
+ if do_step:
953
+ if not use_deepspeed:
954
+ all_params = list(atlas.parameters()) + (
955
+ list(topomlp_adapter.parameters()) if topomlp_adapter is not None else []
956
+ )
957
+ trainable = [p for p in all_params if p.requires_grad]
958
+ torch.nn.utils.clip_grad_norm_(trainable, args.max_grad_norm)
959
+ optimizer.step()
960
+ scheduler.step()
961
+ optimizer.zero_grad()
962
+ global_step += 1
963
+
964
+ if _main and global_step % args.log_steps == 0:
965
+ # DeepSpeed LR scheduler may not expose get_last_lr() before first scheduler.step().
966
+ if use_deepspeed and hasattr(atlas_ddp, "get_lr"):
967
+ try:
968
+ _lrs = atlas_ddp.get_lr()
969
+ if isinstance(_lrs, (list, tuple)) and len(_lrs) > 0:
970
+ lr_now = float(_lrs[0])
971
+ else:
972
+ lr_now = float(_lrs)
973
+ except Exception:
974
+ lr_now = optimizer.param_groups[0]["lr"] if getattr(optimizer, "param_groups", None) else args.lr
975
+ elif hasattr(scheduler, "get_last_lr"):
976
+ try:
977
+ lr_now = scheduler.get_last_lr()[0]
978
+ except Exception:
979
+ lr_now = optimizer.param_groups[0]["lr"] if getattr(optimizer, "param_groups", None) else args.lr
980
+ else:
981
+ lr_now = args.lr
982
+ elapsed = time.time() - t0
983
+ samples_sec = num_batches * args.batch_size / max(elapsed, 1e-6)
984
+ avg_loss = epoch_loss / max(num_batches, 1)
985
+ logger.info(
986
+ "epoch=%d step=%d loss=%.4f lr=%.2e samples/s=%.1f",
987
+ epoch, global_step, avg_loss, lr_now, samples_sec,
988
+ )
989
+ for _handler in logging.root.handlers:
990
+ _handler.flush()
991
+
992
+ if _main and args.save_steps > 0 and global_step % args.save_steps == 0:
993
+ ckpt_path = output_dir / f"checkpoint-{global_step}" / "checkpoint.pt"
994
+ save_checkpoint(ckpt_path, atlas, topomlp_adapter, optimizer, scheduler, global_step, epoch, args)
995
+ logger.info("Saved step checkpoint: %s", ckpt_path)
996
+
997
+ avg_loss = epoch_loss / max(num_batches, 1)
998
+ if _main:
999
+ logger.info("Epoch %d done — avg_loss=%.4f (%.1f min)", epoch, avg_loss, (time.time() - t0) / 60)
1000
+
1001
+ if _main and (epoch + 1) % args.save_epochs == 0:
1002
+ ckpt_path = output_dir / f"epoch-{epoch}" / "checkpoint.pt"
1003
+ save_checkpoint(ckpt_path, atlas, topomlp_adapter, optimizer, scheduler, global_step, epoch + 1, args)
1004
+ logger.info("Saved epoch checkpoint: %s", ckpt_path)
1005
+ if args.keep_last_n_ckpts > 0:
1006
+ cleanup_old_checkpoints(output_dir, args.keep_last_n_ckpts)
1007
+
1008
+ if _main:
1009
+ final_path = output_dir / "final" / "checkpoint.pt"
1010
+ save_checkpoint(final_path, atlas, topomlp_adapter, optimizer, scheduler, global_step, args.epochs, args)
1011
+ logger.info("Training complete. Final checkpoint: %s", final_path)
1012
+
1013
+ if distributed:
1014
+ dist.destroy_process_group()
1015
+
1016
+
1017
+ if __name__ == "__main__":  # entry-point guard: run training only when executed as a script, not on import
1018
+     main()